In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports all loaded modules
# before each cell executes, so edits to project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Notebook-level settings for this run. A later cell passes this dict through
# merge_configs(), whose stated precedence is
# nb_config > local_config > global_config.
config = dict(
    domain='cdr',
    # Flagged UNUSUAL by the original author; the reason is not recorded here.
    splits=[0, 1, 2],  # UNUSUAL
    disc_model_class='logreg',
    supervision='majority',
)

In [3]:
# Assemble the database connection URL and publish it as $SNORKELDB.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# Default name is 'babble_<domain>', plus a '_debug' suffix when the
# 'debug' option is truthy; an explicit 'db_name' entry overrides it.
default_db_name = 'babble_' + config['domain']
if config.get('debug', False):
    default_db_name += '_debug'
DB_NAME = config.get('db_name', default_db_name)

# A truthy 'postgres' entry selects PostgreSQL; anything else falls back to
# a SQLite file, whose name gets a '.db' extension.
if not config.get('postgres'):
    DB_TYPE = 'sqlite'
    DB_NAME += '.db'
else:
    DB_TYPE = 'postgres'

# The host part of the URL is only filled in when a port is configured.
DB_ADDR = ""
if 'db_port' in config:
    DB_ADDR = "localhost:{0}".format(config['db_port'])

os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_cdr.db


In [4]:
# Open a Snorkel database session. The snorkel import happens in this cell
# (not earlier) because $SNORKELDB has to be exported before snorkel is loaded.
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
# NOTE(review): `config` is rebound to the merged result, so re-running this
# cell merges an already-merged config -- confirm merge_configs tolerates that.
from snorkel.contrib.babble.pipelines import merge_configs, get_local_pipeline
config = merge_configs(config)

# Read the flag with .get(), matching how the DB-name cell reads it, so a
# merged config without a 'debug' key means "not debugging" instead of
# raising KeyError.
if config.get('debug', False):
    print("NOTE: --debug=True: modifying parameters...")
    # Shrink the run: fewer documents, smaller search spaces, fewer epochs.
    config['max_docs'] = 100
    config['gen_model_search_space'] = 2
    config['disc_model_search_space'] = 2
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

# Build the candidate class from the name and entity types in the merged config.
from snorkel.models import candidate_subclass
candidate_class = candidate_subclass(config['candidate_name'], config['candidate_entities'])

# Look up the pipeline class registered for this domain and instantiate it.
pipeline = get_local_pipeline(config['domain'])
pipe = pipeline(session, candidate_class, config)

Overwriting epochs=[25, 50, 75] to epochs=[10, 25, 50, 100]
Overwriting reg_param=[0.0, 0.01, 0.1, 0.25, 0.5] to reg_param=[0.0, 0.01, 0.1, 0.25, 0.5, 0.75]
Overwriting domain=None to domain=cdr
Overwriting print_freq=1 to print_freq=5
Overwriting init_class_prior=0 to init_class_prior=-0.695
Overwriting LF_acc_prior_weight_default=1.0 to LF_acc_prior_weight_default=0.5
Overwriting disc_model_class=lstm to disc_model_class=logreg
Overwriting supervision=generative to supervision=majority


In [5]:
# %time pipe.parse()

In [6]:
# %time pipe.extract()

In [7]:
# %time pipe.load_gold()

In [8]:
# Run the pipeline's featurization stage under %time. The recorded output
# shows one feature matrix per split (0, 1, 2) and roughly 13 minutes of
# wall time for the run.
%time pipe.featurize()

Clearing existing...
Running UDF...

Featurized split 0: (6667,132522) sparse (nnz = 492757)
Clearing existing...
Running UDF...

Featurized split 1: (773,132522) sparse (nnz = 33224)
Clearing existing...
Running UDF...

Featurized split 2: (4101,132522) sparse (nnz = 170346)
CPU times: user 13min 15s, sys: 5.18 s, total: 13min 20s
Wall time: 13min 22s


In [9]:
# Run the pipeline's collect stage under %time. Per the recorded log, this
# links explanations to candidates by stable_id and then calls the babbler
# to parse the explanations and filter the resulting parses.
%time pipe.collect()

Loading canonical ID ontologies...
Finished loading canonical ID ontologies.
Linking candidates...
Building list of target candidate ids...
Collected 28 unique target candidate ids from 30 explanations.
Gathering desired candidates...
Could not find 6 target candidates with the following stable_ids (first 5):
19721134::span:1078:1084~~19721134::span:1017:1026
14698717::span:322:330~~14698717::span:302:310
17574447::span:110:122~~17574447::span:311:323
10728962::span:32:40~~10728962::span:50:60
11999899::span:962:970~~11999899::span:1112:1121
Found 22/28 desired candidates
Linking explanations to candidates...
Linked 22/30 explanations
Calling babbler...
Flushing all parses from previous explanation set.
Created grammar with 593 rules
30 explanation(s) out of 30 were parseable.
106 parse(s) generated from 30 explanation(s).
82 parse(s) remain (24 parse(s) removed by DuplicateSemanticsFilter).
Note: 23 LFs did not have candidates and therefore could not be filtered.
57 parse(s) remain (2

In [None]:
from snorkel.contrib.babble import Explanation
from snorkel.contrib.babble.utils import link_explanation_candidates

# Hand-written scratch explanations for exercising the babbler directly.
# NOTE(review): the candidate stable_ids below use UUID-style document names,
# unlike the numeric document ids printed in the collect log -- confirm these
# candidates actually exist in the current database.
explanations = [
    Explanation(
        label=True,
        condition="""'spotted' is within 3 words to the right of X or Y""",
        candidate="1ca2f108-633f-440a-bd6e-d59f7e01316b::span:338:351~~1ca2f108-633f-440a-bd6e-d59f7e01316b::span:426:437"
    ),
    # Explanation with no associated candidate.
    Explanation(
        label=True,
        condition="The moon is full",
        candidate=None
    ),
#     Explanation(
#         label=False,
#         condition="""At least one word right of X or Y is uppercase or lowercase""",
#         candidate="1ca2f108-633f-440a-bd6e-d59f7e01316b::span:338:351~~1ca2f108-633f-440a-bd6e-d59f7e01316b::span:426:437"
#     ),
#     Explanation(
#         label=False,
#         condition="""The word 'spotted' is two words right of X or Y""",
#         candidate="1ca2f108-633f-440a-bd6e-d59f7e01316b::span:338:351~~1ca2f108-633f-440a-bd6e-d59f7e01316b::span:474:478"
#     ),
]

# `candidates` used to be referenced here without being defined in any cell,
# so this cell failed with NameError on a fresh kernel. Load the candidates
# explicitly; all splits are fetched so that a stable_id can be matched
# regardless of which split its candidate lives in.
candidates = session.query(candidate_class).all()
explanations = link_explanation_candidates(explanations, candidates)

In [None]:
from collections import namedtuple
from snorkel.contrib.babble import Babbler

# Build a babbler in text mode over the notebook's candidate class, with no
# user-supplied word lists.
babbler = Babbler(
    session,
    mode='text',
    candidate_class=candidate_class,
    user_lists={},
)

# Apply the explanations on split 1, then fetch the resulting parses
# without translating them.
babbler.apply(explanations, split=1)
parses = babbler.get_parses(translate=False)

In [None]:
# Show the babbler's analysis of filtered parses.
# NOTE(review): presumably this reports which parses each filter removed --
# confirm against Babbler.filtered_analysis.
babbler.filtered_analysis()

In [None]:
# babbler.semparser.grammar.print_chart()

In [None]:
# parses = sorted(parses, key=lambda x: x.explanation.name)
# new_parses = []
# seen = set()
# for parse in parses:
#     if parse.explanation.name not in seen:
#         seen.add(parse.explanation.name)
#         new_parses.append(parse)
#         print(parse.explanation.name)
#         print(babbler.semparser.grammar.translate(parse.semantics))
#         print("")
# print(len(new_parses))
# new_parses

In [10]:
# Run the pipeline's labeling stage under %time. The recorded output shows
# label matrices for splits 0 and 1, followed by a per-LF statistics table
# (coverage, overlaps, conflicts, TP/FP/FN).
%time pipe.label()

Clearing existing...
Running UDF...

Labeled split 0: (6667,31) sparse (nnz = 39146)

Clearing existing...
Running UDF...

Labeled split 1: (773,31) sparse (nnz = 4589)

                              j  Coverage  Overlaps  Conflicts   TP   FP   FN  \
LF_c_d_0                      0  0.020699  0.020699   0.020699    9    7    0   
LF_c_induced_d_0              1  0.002587  0.002587   0.002587    1    1    0   
LF_c_treat_d_0                2  0.050453  0.050453   0.036223    0    0    9   
LF_c_treat_d_wide_2           3  0.102199  0.102199   0.086675    0    0   13   
LF_ctd_marker_c_d_5           4  0.018111  0.018111   0.018111    9    5    0   
LF_ctd_therapy_treat_7        5  0.059508  0.059508   0.053040    0    0    7   
LF_ctd_unspecified_treat_7    6  0.078913  0.078913   0.068564    0    0   13   
LF_d_induced_by_c_1           7  0.050453  0.050453   0.050453   13   26    0   
LF_d_induced_by_c_tight_0     8  0.442432  0.442432   0.442432   78  264    0   
LF_d_induced_by_c_ti

In [None]:
# %time pipe.supervise()

In [None]:
# %time pipe.classify()