In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
config = {
    'domain': 'protein',
    'postgres': False,
    'debug': True,
#     'max_docs': 3,
    'babbler_candidate_split': 0,
    'babbler_label_split': 2,
    'do_filter_duplicate_semantics': False, 
    'do_filter_consistency': False, 
    'do_filter_duplicate_signatures': False, 
    'do_filter_uniform_signatures': False,
}

In [3]:
# Get DB connection string and add to globals
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

default_db_name = 'babble_' + config['domain'] + ('_debug' if config.get('debug', False) else '')
DB_NAME = config.get('db_name', default_db_name)
if 'postgres' in config and config['postgres']:
    DB_TYPE = 'postgres'
else:
    DB_TYPE = 'sqlite'
    DB_NAME += '.db'
DB_ADDR = "localhost:{0}".format(config['db_port']) if 'db_port' in config else ""
os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_protein_debug.db


In [4]:
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
from snorkel.contrib.babble.pipelines import merge_configs, get_local_pipeline
config = merge_configs(config)

if config['debug']:
    print("NOTE: --debug=True: modifying parameters...")
    config['max_docs'] = 100
    config['gen_model_search_space'] = 2
    config['disc_model_search_space'] = 2
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

from snorkel.models import candidate_subclass
candidate_class = candidate_subclass(config['candidate_name'], config['candidate_entities'])

pipeline = get_local_pipeline(config['domain'])
pipe = pipeline(session, candidate_class, config)


Overwriting domain=None to domain=protein
Overwriting do_filter_duplicate_semantics=True to do_filter_duplicate_semantics=False
Overwriting babbler_label_split=0 to babbler_label_split=2
Overwriting debug=False to debug=True
Overwriting do_filter_consistency=True to do_filter_consistency=False
Overwriting do_filter_uniform_signatures=True to do_filter_uniform_signatures=False
Overwriting do_filter_duplicate_signatures=True to do_filter_duplicate_signatures=False
NOTE: --debug=True: modifying parameters...


In [5]:
%time pipe.parse()

Clearing existing...
Running UDF...

Documents: 53
Sentences: 1551
CPU times: user 4.86 s, sys: 484 ms, total: 5.35 s
Wall time: 41.9 s


Could not kill CoreNLP server [40322] [Errno 3] No such process


In [6]:
%time pipe.extract()

('train, dev, test', set(['28582894', '28582912', '28582890', '28582899', '28582869', '28582822', '28582865', '28582920', '28582921', '28582922', '28582923', '28582847', '28582846', '28582845', '28582901', '28582928', '28582902', '28582849', '28582909', '28582884', '28582926', '28582880', '28582918', '28582819', '28582834', '28582835', '28582910', '28582839', '28582810', '28582917', '28582850', '28582855', '28582935', '28582934']), set(['28582893', '28582861', '28582838', '28582915', '28582933', '28582930', '28582927', '28582858', '28582859', '28582878']), set(['28582919', '28582895', '28582826', '28582911', '28582829', '28582906', '28582932', '28582852', '28582871', '28582939']))
Clearing existing...
Running UDF...

Candidates [Split 0]: 664
Clearing existing...
Running UDF...

Candidates [Split 1]: 126
Clearing existing...
Running UDF...

Candidates [Split 2]: 203
CPU times: user 4.32 s, sys: 361 ms, total: 4.68 s
Wall time: 4.49 s


In [7]:
%time pipe.load_gold()

AnnotatorLabels created: 126
CPU times: user 5.31 s, sys: 45 ms, total: 5.36 s
Wall time: 5.38 s


In [8]:
%time pipe.collect()

Skipping malformed or header row 23...
Linking candidates...
Building list of target candidate ids...
Collected 29 unique target candidate ids from 56 explanations.
Gathering desired candidates...
Found 29/29 desired candidates
Linking explanations to candidates...
Linked 56/56 explanations
Calling babbler...
Created grammar with 480 rules
57 parses created from 41 out of 56 explanation(s)
Parsed 57 LFs from 56 explanations.
Clearing existing...
Running UDF...

split is 2
: X is , cid_query.all(): [(791,), (792,), (793,), (794,), (795,), (796,), (797,), (798,), (799,), (800,), (801,), (802,), (803,), (804,), (805,), (806,), (807,), (808,), (809,), (810,), (811,), (812,), (813,), (814,), (815,), (816,), (817,), (818,), (819,), (820,), (821,), (822,), (823,), (824,), (825,), (826,), (827,), (828,), (829,), (830,), (831,), (832,), (833,), (834,), (835,), (836,), (837,), (838,), (839,), (840,), (841,), (842,), (843,), (844,), (845,), (846,), (847,), (848,), (849,), (850,), (851,), (852,), 

In [9]:
candidates = pipe.session.query(pipe.candidate_class).order_by(pipe.candidate_class.id).all()
candidate_ids_found = set(c.get_stable_id() for c in candidates)


In [10]:
explanations = pipe.explanations
candidate_ids_needed = set(e.candidate for e in explanations)

In [11]:
print("needed: {}\n found: {}".format(candidate_ids_needed,candidate_ids_found))

matches = candidate_ids_needed.intersection(candidate_ids_found)
misses = candidate_ids_needed - candidate_ids_found
print(len(matches))
print(len(misses))

needed: set([ProteinKinase(Span("Tau", sentence=66, chars=[146,148], words=[20,20]), Span("LRRK2", sentence=66, chars=[0,4], words=[0,0])), ProteinKinase(Span("4E-BP1", sentence=61, chars=[44,49], words=[7,7]), Span("LRRK2", sentence=61, chars=[13,17], words=[3,3])), ProteinKinase(Span("microtubule", sentence=74, chars=[157,167], words=[22,22]), Span("LRRK2", sentence=74, chars=[41,45], words=[6,6])), ProteinKinase(Span("microtubule", sentence=76, chars=[120,130], words=[22,22]), Span("LRRK2", sentence=76, chars=[163,167], words=[28,28])), ProteinKinase(Span("receptor", sentence=84, chars=[43,50], words=[8,8]), Span("LRRK2", sentence=84, chars=[30,34], words=[6,6])), ProteinKinase(Span("4E-BP1", sentence=62, chars=[31,36], words=[5,5]), Span("LRRK2", sentence=62, chars=[21,25], words=[3,3])), ProteinKinase(Span("4E-BP1", sentence=61, chars=[117,122], words=[19,19]), Span("LRRK2", sentence=61, chars=[64,68], words=[11,11])), ProteinKinase(Span("TUBB4", sentence=70, chars=[100,104], word

In [12]:
%time pipe.label()

Clearing existing...
Running UDF...

split is 0
: X is , cid_query.all(): [(1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,), (10,), (11,), (12,), (13,), (14,), (15,), (16,), (17,), (18,), (19,), (20,), (21,), (22,), (23,), (24,), (25,), (26,), (27,), (28,), (29,), (30,), (31,), (32,), (33,), (34,), (35,), (36,), (37,), (38,), (39,), (40,), (41,), (42,), (43,), (44,), (45,), (46,), (47,), (48,), (49,), (50,), (51,), (52,), (53,), (54,), (55,), (56,), (57,), (58,), (59,), (60,), (61,), (62,), (63,), (64,), (65,), (66,), (67,), (68,), (69,), (70,), (71,), (72,), (73,), (74,), (75,), (76,), (77,), (78,), (79,), (80,), (81,), (82,), (83,), (84,), (85,), (86,), (87,), (88,), (89,), (90,), (91,), (92,), (93,), (94,), (95,), (96,), (97,), (98,), (99,), (100,), (101,), (102,), (103,), (104,), (105,), (106,), (107,), (108,), (109,), (110,), (111,), (112,), (113,), (114,), (115,), (116,), (117,), (118,), (119,), (120,), (121,), (122,), (123,), (124,), (125,), (126,), (127,), (128,), (129,), (

  ac = (tp+tn).astype(float) / (tp+tn+fp+fn)



split is 2
: X is , cid_query.all(): [(791,), (792,), (793,), (794,), (795,), (796,), (797,), (798,), (799,), (800,), (801,), (802,), (803,), (804,), (805,), (806,), (807,), (808,), (809,), (810,), (811,), (812,), (813,), (814,), (815,), (816,), (817,), (818,), (819,), (820,), (821,), (822,), (823,), (824,), (825,), (826,), (827,), (828,), (829,), (830,), (831,), (832,), (833,), (834,), (835,), (836,), (837,), (838,), (839,), (840,), (841,), (842,), (843,), (844,), (845,), (846,), (847,), (848,), (849,), (850,), (851,), (852,), (853,), (854,), (855,), (856,), (857,), (858,), (859,), (860,), (861,), (862,), (863,), (864,), (865,), (866,), (867,), (868,), (869,), (870,), (871,), (872,), (873,), (874,), (875,), (876,), (877,), (878,), (879,), (880,), (881,), (882,), (883,), (884,), (885,), (886,), (887,), (888,), (889,), (890,), (891,), (892,), (893,), (894,), (895,), (896,), (897,), (898,), (899,), (900,), (901,), (902,), (903,), (904,), (905,), (906,), (907,), (908,), (909,), (910,), (

In [13]:
%time pipe.supervise()

self.config splits [0, 1, 2]
split is 0
: X is , cid_query.all(): [(1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,), (10,), (11,), (12,), (13,), (14,), (15,), (16,), (17,), (18,), (19,), (20,), (21,), (22,), (23,), (24,), (25,), (26,), (27,), (28,), (29,), (30,), (31,), (32,), (33,), (34,), (35,), (36,), (37,), (38,), (39,), (40,), (41,), (42,), (43,), (44,), (45,), (46,), (47,), (48,), (49,), (50,), (51,), (52,), (53,), (54,), (55,), (56,), (57,), (58,), (59,), (60,), (61,), (62,), (63,), (64,), (65,), (66,), (67,), (68,), (69,), (70,), (71,), (72,), (73,), (74,), (75,), (76,), (77,), (78,), (79,), (80,), (81,), (82,), (83,), (84,), (85,), (86,), (87,), (88,), (89,), (90,), (91,), (92,), (93,), (94,), (95,), (96,), (97,), (98,), (99,), (100,), (101,), (102,), (103,), (104,), (105,), (106,), (107,), (108,), (109,), (110,), (111,), (112,), (113,), (114,), (115,), (116,), (117,), (118,), (119,), (120,), (121,), (122,), (123,), (124,), (125,), (126,), (127,), (128,), (129,), (130,), (

AssertionError: 

In [None]:
%time pipe.classify()