In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to source files on disk take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Notebook-level settings. A later cell passes this dict through
# merge_configs(), where these values override the defaults.
config = dict(
    # Task domain and database backend.
    domain='protein',
    postgres=False,
    debug=False,
    # Optional: add max_docs=<n> here (presumably caps the number of
    # documents processed -- confirm against the pipeline).
    # Splits used by the babbler.
    babbler_candidate_split=1,
    babbler_label_split=1,
    # All babbler filters are switched off in this notebook.
    do_filter_duplicate_semantics=False,
    do_filter_consistency=False,
    do_filter_duplicate_signatures=False,
    do_filter_uniform_signatures=False,
)

In [3]:
# Build the database connection string and export it as $SNORKELDB.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# Default name is 'babble_<domain>', with a '_debug' suffix in debug mode;
# an explicit config['db_name'] takes precedence.
default_db_name = 'babble_' + config['domain'] + ('_debug' if config.get('debug', False) else '')
DB_NAME = config.get('db_name', default_db_name)
if config.get('postgres'):
    DB_TYPE = 'postgres'
    # Only the server-backed database gets a network address; it stays empty
    # when no 'db_port' is configured.
    DB_ADDR = "localhost:{0}".format(config['db_port']) if 'db_port' in config else ""
else:
    DB_TYPE = 'sqlite'
    DB_NAME += '.db'
    # SQLite is file-based (URL form sqlite:///<file>), so a configured
    # 'db_port' is deliberately ignored instead of producing an invalid
    # sqlite://localhost:<port>/<file> URL.
    DB_ADDR = ""
os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_protein.db


In [5]:
# All snorkel imports for this cell, grouped up front.
from snorkel import SnorkelSession
from snorkel.contrib.babble.pipelines import get_local_pipeline, merge_configs
from snorkel.models import candidate_subclass

# Open the database session shared by the rest of the notebook.
session = SnorkelSession()

# Resolve config conflicts; precedence is nb_config > local_config > global_config.
config = merge_configs(config)

# Debug mode overrides the document cap, the model search-space sizes and the
# epoch counts (presumably to keep a test run short -- confirm the defaults).
if config['debug']:
    print("NOTE: --debug=True: modifying parameters...")
    config['max_docs'] = 100
    config['gen_model_search_space'] = config['disc_model_search_space'] = 2
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

# Candidate class built from the name and entity list in the merged config.
candidate_class = candidate_subclass(
    config['candidate_name'],
    config['candidate_entities'],
)

# Look up the pipeline class for this domain and instantiate it.
pipeline = get_local_pipeline(config['domain'])
pipe = pipeline(session, candidate_class, config)

Overwriting domain=None to domain=protein
Overwriting babbler_candidate_split=0 to babbler_candidate_split=1
Overwriting do_filter_duplicate_semantics=True to do_filter_duplicate_semantics=False
Overwriting babbler_label_split=0 to babbler_label_split=1
Overwriting do_filter_consistency=True to do_filter_consistency=False
Overwriting do_filter_uniform_signatures=True to do_filter_uniform_signatures=False
Overwriting do_filter_duplicate_signatures=True to do_filter_duplicate_signatures=False


In [None]:
# Pipeline stage: parse; %time reports how long the call takes.
# NOTE(review): presumably parses the raw documents into the database --
# confirm against the pipeline class.
%time pipe.parse()

In [None]:
# Pipeline stage: extract; %time reports how long the call takes.
# NOTE(review): presumably extracts candidates from the parsed documents --
# confirm against the pipeline class.
%time pipe.extract()

In [None]:
# Pipeline stage: load_gold; %time reports how long the call takes.
# NOTE(review): presumably loads gold labels for the candidates -- confirm
# against the pipeline class.
%time pipe.load_gold()

In [12]:
# Pipeline stage: collect; %time reports how long the call takes.
# Per the log it emits, this stage links explanations to their target
# candidates and then calls the babbler to parse the explanations into LFs.
# NOTE(review): in the saved run it found 0/30 target candidates and linked
# 0/61 explanations -- investigate the stable-id mismatch before trusting
# the later stages.
%time pipe.collect()

Linking candidates...
Building list of target candidate ids...
Collected 30 unique target candidate ids from 61 explanations.
Gathering desired candidates...
Could not find 30 target candidates with the following stable_ids (first 5):
28582422::span:437:441~~28582422::span:381:386
28582422::span:886:890~~28582422::span:829:834
24275654::span:282:285~~24275654::span:194:199
24275654::span:1313:1325~~24275654::span:1259:1264
28582422::span:842:850~~28582422::span:829:834
Found 0/30 desired candidates
Linking explanations to candidates...
Linked 0/61 explanations
Calling babbler...
Created grammar with 480 rules
74 parses created from 47 out of 61 explanation(s)
Parsed 74 LFs from 61 explanations.
Clearing existing...
Running UDF...

CPU times: user 4.88 s, sys: 466 ms, total: 5.34 s
Wall time: 5.41 s


In [16]:
# Fetch every candidate of the pipeline's candidate class, ordered by id.
candidates = (
    pipe.session
    .query(pipe.candidate_class)
    .order_by(pipe.candidate_class.id)
    .all()
)
# Stable ids of the candidates that actually exist in the database.
candidate_ids_found = {cand.get_stable_id() for cand in candidates}

In [18]:
# Distinct `candidate` values referenced by the pipeline's explanations.
# NOTE(review): these are compared against stable ids in the next cell, so
# they are presumably stable-id strings at this point -- confirm.
explanations = pipe.explanations
candidate_ids_needed = {expl.candidate for expl in explanations}

In [21]:
# Compare the ids the explanations need against the ids present in the database.
matches = candidate_ids_needed & candidate_ids_found
misses = candidate_ids_needed.difference(candidate_ids_found)
# Report the number of needed ids that were found, then the number missing.
print(len(matches))
print(len(misses))

0
25


In [None]:
# Pipeline stage: label; %time reports how long the call takes.
# NOTE(review): presumably applies the collected LFs to the candidates --
# confirm against the pipeline class.
%time pipe.label()

In [None]:
# Pipeline stage: supervise; %time reports how long the call takes.
# NOTE(review): presumably trains the generative model (the config has
# gen_* settings) -- confirm against the pipeline class.
%time pipe.supervise()

In [None]:
# Pipeline stage: classify; %time reports how long the call takes.
# NOTE(review): presumably trains/evaluates the discriminative model (the
# config has disc_* settings) -- confirm against the pipeline class.
%time pipe.classify()