In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to library source take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Notebook-level settings. They are merged with the other config layers later
# in the notebook, where the notebook values take precedence.
config = dict(
    domain='protein',
    postgres=False,
    debug=False,
    # max_docs=3,  (disabled override)
    babbler_candidate_split=0,
    babbler_label_split=2,
    # Every babbler filter is switched off for this run.
    do_filter_duplicate_semantics=False,
    do_filter_consistency=False,
    do_filter_duplicate_signatures=False,
    do_filter_uniform_signatures=False,
)

In [3]:
# Build the database connection string and export it as $SNORKELDB.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# Default database name is 'babble_<domain>', with a '_debug' suffix on debug
# runs; an explicit config['db_name'] takes precedence over it.
default_db_name = 'babble_' + config['domain'] + ('_debug' if config.get('debug', False) else '')
DB_NAME = config.get('db_name', default_db_name)
if config.get('postgres'):
    DB_TYPE = 'postgres'
    # Only a server-backed database has a host:port part in its URL.
    DB_ADDR = "localhost:{0}".format(config['db_port']) if 'db_port' in config else ""
else:
    DB_TYPE = 'sqlite'
    DB_NAME += '.db'
    # SQLite is file-based, so the URL must not carry a host or port: any
    # 'db_port' setting is ignored here instead of producing a URL of the form
    # 'sqlite://localhost:<port>/<file>', which is not a valid SQLite URL.
    DB_ADDR = ""
os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_protein.db


In [None]:
# snorkel imports for this cell, grouped up front. They must only run once
# $SNORKELDB has been exported, which an earlier cell takes care of.
from snorkel import SnorkelSession
from snorkel.contrib.babble.pipelines import merge_configs, get_local_pipeline
from snorkel.models import candidate_subclass

session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
config = merge_configs(config)

# Debug runs overwrite several parameters with small values.
if config['debug']:
    print("NOTE: --debug=True: modifying parameters...")
    config['max_docs'] = 100
    config['gen_model_search_space'] = config['disc_model_search_space'] = 2
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

# Candidate class built from the name and entity list given in the config.
candidate_class = candidate_subclass(
    config['candidate_name'],
    config['candidate_entities'],
)

# Look up the pipeline for this domain and instantiate it.
pipeline = get_local_pipeline(config['domain'])
pipe = pipeline(session, candidate_class, config)

Overwriting domain=None to domain=protein
Overwriting do_filter_duplicate_semantics=True to do_filter_duplicate_semantics=False
Overwriting babbler_label_split=0 to babbler_label_split=2
Overwriting do_filter_consistency=True to do_filter_consistency=False
Overwriting do_filter_uniform_signatures=True to do_filter_uniform_signatures=False
Overwriting do_filter_duplicate_signatures=True to do_filter_duplicate_signatures=False


In [None]:
# Run the pipeline's parse stage; %time reports the CPU and wall-clock time
# of the call.
%time pipe.parse()

Clearing existing...
Running UDF...

Documents: 53
Sentences: 1551
CPU times: user 4.74 s, sys: 541 ms, total: 5.28 s
Wall time: 42.6 s


Could not kill CoreNLP server [50258] [Errno 3] No such process


In [None]:
# Run the pipeline's extract stage (timed). Presumably this produces the
# candidates for each split -- confirm against the pipeline class.
%time pipe.extract()

Clearing existing...
Running UDF...

Candidates [Split 0]: 664
Clearing existing...
Running UDF...

Candidates [Split 1]: 126
Clearing existing...
Running UDF...

Candidates [Split 2]: 203
CPU times: user 4.4 s, sys: 364 ms, total: 4.76 s
Wall time: 4.67 s


In [None]:
# Run the pipeline's load_gold stage (timed). Presumably this loads the gold
# annotations used for evaluation -- confirm against the pipeline class.
%time pipe.load_gold()

AnnotatorLabels created: 126
CPU times: user 5.54 s, sys: 63.9 ms, total: 5.61 s
Wall time: 5.68 s


In [None]:
# Run the pipeline's collect stage (timed). The cells below read
# pipe.explanations after this call.
%time pipe.collect()

Skipping malformed or header row 23...
Linking candidates...
Building list of target candidate ids...
Collected 29 unique target candidate ids from 56 explanations.
Gathering desired candidates...
Found 29/29 desired candidates
Linking explanations to candidates...
Linked 56/56 explanations
Calling babbler...
Created grammar with 480 rules
57 parses created from 41 out of 56 explanation(s)
Parsed 57 LFs from 56 explanations.
Clearing existing...
Running UDF...

CPU times: user 9.68 s, sys: 860 ms, total: 10.5 s
Wall time: 10.9 s


In [None]:
# Fetch every candidate of the pipeline's candidate class, ordered by id,
# and collect the stable id of each one.
candidates = (
    pipe.session
    .query(pipe.candidate_class)
    .order_by(pipe.candidate_class.id)
    .all()
)
candidate_ids_found = {cand.get_stable_id() for cand in candidates}

In [None]:
# Collect the stable ids of the candidates that the explanations refer to.
explanations = pipe.explanations
# An explanation's `candidate` may already be a linked candidate object rather
# than a raw id. Normalise to stable ids so that this set can be compared with
# a set of stable ids; values without get_stable_id() are kept unchanged.
candidate_ids_needed = set(
    e.candidate.get_stable_id() if hasattr(e.candidate, 'get_stable_id') else e.candidate
    for e in explanations
)

In [None]:
# Split the needed ids into those that were found and those that were not,
# then print the size of each group (matches first, misses second).
matches = candidate_ids_needed & candidate_ids_found
misses = candidate_ids_needed.difference(candidate_ids_found)
print(len(matches))
print(len(misses))

needed: set([ProteinKinase(Span("Tau", sentence=66, chars=[146,148], words=[20,20]), Span("LRRK2", sentence=66, chars=[0,4], words=[0,0])), ProteinKinase(Span("4E-BP1", sentence=61, chars=[44,49], words=[7,7]), Span("LRRK2", sentence=61, chars=[13,17], words=[3,3])), ProteinKinase(Span("microtubule", sentence=74, chars=[157,167], words=[22,22]), Span("LRRK2", sentence=74, chars=[41,45], words=[6,6])), ProteinKinase(Span("microtubule", sentence=76, chars=[120,130], words=[22,22]), Span("LRRK2", sentence=76, chars=[163,167], words=[28,28])), ProteinKinase(Span("receptor", sentence=84, chars=[43,50], words=[8,8]), Span("LRRK2", sentence=84, chars=[30,34], words=[6,6])), ProteinKinase(Span("4E-BP1", sentence=62, chars=[31,36], words=[5,5]), Span("LRRK2", sentence=62, chars=[21,25], words=[3,3])), ProteinKinase(Span("4E-BP1", sentence=61, chars=[117,122], words=[19,19]), Span("LRRK2", sentence=61, chars=[64,68], words=[11,11])), ProteinKinase(Span("TUBB4", sentence=70, chars=[100,104], word

In [None]:
# Run the pipeline's label stage (timed). Presumably this applies the parsed
# labeling functions to each split -- confirm against the pipeline class.
%time pipe.label()

Clearing existing...
Running UDF...


Labeled split 0: (664,57) sparse (nnz = 245)
Clearing existing...
Running UDF...


Labeled split 1: (126,57) sparse (nnz = 44)
                  j  Coverage  Overlaps  Conflicts  TP  FP  FN  TN  \
Explanation0_0    0  0.000000  0.000000   0.000000   0   0   0   0   
Explanation1_0    1  0.000000  0.000000   0.000000   0   0   0   0   
Explanation1_1    2  0.000000  0.000000   0.000000   0   0   0   0   
Explanation10_0   3  0.000000  0.000000   0.000000   0   0   0   0   
Explanation10_1   4  0.000000  0.000000   0.000000   0   0   0   0   
Explanation11_0   5  0.000000  0.000000   0.000000   0   0   0   0   
Explanation12_0   6  0.000000  0.000000   0.000000   0   0   0   0   
Explanation17_0   7  0.000000  0.000000   0.000000   0   0   0   0   
Explanation18_0   8  0.000000  0.000000   0.000000   0   0   0   0   
Explanation2_0    9  0.015873  0.015873   0.007937   0   2   0   0   
Explanation24_0  10  0.000000  0.000000   0.000000   0   0   0   

  ac = (tp+tn).astype(float) / (tp+tn+fp+fn)




Labeled split 2: (203,57) sparse (nnz = 16)
CPU times: user 12.4 s, sys: 471 ms, total: 12.9 s
Wall time: 12.8 s


In [None]:
# Run the pipeline's supervise stage (timed). Presumably this fits the
# generative model over the labels -- confirm against the pipeline class.
%time pipe.supervise()

In [None]:
# Run the pipeline's classify stage (timed). Presumably this trains and
# evaluates the discriminative model -- confirm against the pipeline class.
%time pipe.classify()