In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Notebook-level settings. They are later passed through merge_configs(), where
# notebook values take precedence over the local and global configs.
config = {
    # Task domain and storage backend.
    "domain": "protein",
    "postgres": False,
    "debug": False,
    # "max_docs": 3,

    # Splits handed to the babbler, and the supervision strategy.
    "babbler_candidate_split": 1,
    "babbler_label_split": 2,
    "supervision": "majority_vote",

    # All four labeling-function filter flags are switched off for this run.
    "do_filter_duplicate_semantics": False,
    "do_filter_consistency": False,
    "do_filter_duplicate_signatures": False,
    "do_filter_uniform_signatures": False,
}

In [3]:
# Assemble the database URL from `config` and export it as $SNORKELDB.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# Default database name: "babble_<domain>", with a "_debug" suffix in debug mode.
if config.get('debug', False):
    default_db_name = 'babble_' + config['domain'] + '_debug'
else:
    default_db_name = 'babble_' + config['domain']

# An explicit 'db_name' entry in the config overrides the default.
DB_NAME = config.get('db_name', default_db_name)

# Backend selection: postgres only when the flag is present and truthy;
# otherwise sqlite, whose database name gets a '.db' extension.
DB_TYPE = 'postgres' if config.get('postgres') else 'sqlite'
if DB_TYPE == 'sqlite':
    DB_NAME += '.db'

# The host part is filled in only when a port is configured; otherwise it is
# empty, which produces the three-slash form (e.g. sqlite:///name.db).
if 'db_port' in config:
    DB_ADDR = "localhost:{0}".format(config['db_port'])
else:
    DB_ADDR = ""

os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_protein.db


In [4]:
# Open a Snorkel session. The snorkel import happens only in this cell, after
# $SNORKELDB has been exported by the previous one.
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
# The notebook-level dict goes in; the merged result replaces `config`.
from snorkel.contrib.babble.pipelines import merge_configs, get_local_pipeline
config = merge_configs(config)

# Debug mode overrides: cap the document count and shrink both model search
# spaces and epoch counts. NOTE(review): presumably to keep debug runs fast; confirm.
# The nested '*_params_default' entries are expected to come from the merged
# config, since the notebook-level dict does not define them.
if config['debug']:
    print("NOTE: --debug=True: modifying parameters...")
    config['max_docs'] = 100
    config['gen_model_search_space'] = 2
    config['disc_model_search_space'] = 2
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

# Build the candidate class from the name and entity list held in the merged
# config (neither key is set in the notebook-level dict).
from snorkel.models import candidate_subclass
candidate_class = candidate_subclass(config['candidate_name'], config['candidate_entities'])

# Look up the pipeline for the configured domain and instantiate it with the
# session, the candidate class and the merged config.
pipeline = get_local_pipeline(config['domain'])
pipe = pipeline(session, candidate_class, config)

Overwriting domain=None to domain=protein
Overwriting babbler_candidate_split=0 to babbler_candidate_split=1
Overwriting supervision=generative to supervision=majority_vote
Overwriting do_filter_duplicate_semantics=True to do_filter_duplicate_semantics=False
Overwriting babbler_label_split=0 to babbler_label_split=2
Overwriting do_filter_consistency=True to do_filter_consistency=False
Overwriting do_filter_uniform_signatures=True to do_filter_uniform_signatures=False
Overwriting do_filter_duplicate_signatures=True to do_filter_duplicate_signatures=False


In [5]:
# Parse stage: reports the resulting document and sentence counts; %time prints CPU and wall time.
%time pipe.parse()

Clearing existing...
Running UDF...

Documents: 501
Sentences: 5936
CPU times: user 10 s, sys: 307 ms, total: 10.3 s
Wall time: 1min 54s


In [6]:
# Extract stage: reports the number of candidates found for each split.
%time pipe.extract()

Clearing existing...
Running UDF...

Candidates [Split 0]: 1968
Clearing existing...
Running UDF...

Candidates [Split 1]: 514
Clearing existing...
Running UDF...

Candidates [Split 2]: 479
CPU times: user 9.36 s, sys: 282 ms, total: 9.64 s
Wall time: 9.55 s


In [7]:
# Gold-label stage: reports how many AnnotatorLabels were created on each pass.
# NOTE(review): the three counts presumably correspond to splits 0-2; confirm.
%time pipe.load_gold()

AnnotatorLabels created: 0
AnnotatorLabels created: 74
AnnotatorLabels created: 93
CPU times: user 12.2 s, sys: 179 ms, total: 12.4 s
Wall time: 12.4 s


In [8]:
# Collect stage: links explanations to their target candidates, runs the babbler
# to parse them into labeling functions (LFs), and reports the LF filter results.
%time pipe.collect()

Skipping malformed or header row 23...
Linking candidates...
Building list of target candidate ids...
Collected 29 unique target candidate ids from 56 explanations.
Gathering desired candidates...
Found 29/29 desired candidates
Linking explanations to candidates...
Linked 56/56 explanations
Calling babbler...
Created grammar with 499 rules
41 explanation(s) out of 56 were parseable.
57 parse(s) generated from 56 explanation(s).
Parsed 57 LFs from 56 explanations.
Filtered to 40 LFs with duplicate semantics filter (17 filtered).
Filtered to 23 LFs with consistency filter (17 filtered).
Clearing existing...
Running UDF...

Filtered to 11 LFs with uniform signatures filter (12 filtered).
Filtered to 9 LFs with duplicate signatures filter (2 filtered).
CPU times: user 8.65 s, sys: 960 ms, total: 9.61 s
Wall time: 9.7 s


In [9]:
# Label stage: builds a sparse label matrix for each split and prints per-LF statistics.
%time pipe.label()

Clearing existing...
Running UDF...


Labeled split 0: (1968,9) sparse (nnz = 388)
Clearing existing...
Running UDF...


Labeled split 1: (514,9) sparse (nnz = 119)
                 j  Coverage  Overlaps  Conflicts  TP  FP  FN  TN  \
Explanation0_0   0  0.007782  0.000000   0.000000   0   0   0   0   
Explanation2_0   1  0.001946  0.000000   0.000000   0   0   0   0   
Explanation33_1  2  0.029183  0.000000   0.000000   0   0   0   2   
Explanation4_0   3  0.048638  0.029183   0.029183   0   1   0   0   
Explanation42_0  4  0.005837  0.000000   0.000000   0   2   0   0   
Explanation5_1   5  0.052529  0.029183   0.029183   0   0   1   3   
Explanation50_0  6  0.013619  0.009728   0.009728   0   1   0   0   
Explanation51_1  7  0.015564  0.009728   0.009728   0   0   0   2   
Explanation6_0   8  0.056420  0.003891   0.003891   0   0   0   1   

                 Empirical Acc.  
Explanation0_0              NaN  
Explanation2_0              NaN  
Explanation33_1            1.00  
Explanat

  ac = (tp+tn).astype(float) / (tp+tn+fp+fn)




Labeled split 2: (479,9) sparse (nnz = 73)
CPU times: user 15.8 s, sys: 381 ms, total: 16.2 s
Wall time: 16.4 s


In [10]:
# Supervise stage: reads the label and gold matrices for train/dev/test and saves training marginals.
%time pipe.supervise()

Using L_train: <1968x9 sparse matrix of type '<type 'numpy.int64'>'
	with 388 stored elements in Compressed Sparse Row format>
Using L_gold_train: <1968x1 sparse matrix of type '<type 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>
Positive Fraction: 0.0%

Using L_dev: <514x9 sparse matrix of type '<type 'numpy.int64'>'
	with 119 stored elements in Compressed Sparse Row format>
Using L_gold_dev: <514x1 sparse matrix of type '<type 'numpy.int64'>'
	with 74 stored elements in Compressed Sparse Row format>
Positive Fraction: 1.4%

Using L_test: <479x9 sparse matrix of type '<type 'numpy.int64'>'
	with 73 stored elements in Compressed Sparse Row format>
Using L_gold_test: <479x1 sparse matrix of type '<type 'numpy.int64'>'
	with 93 stored elements in Compressed Sparse Row format>
Positive Fraction: 5.0%

Saved 1968 marginals
CPU times: user 127 ms, sys: 12.8 ms, total: 140 ms
Wall time: 133 ms


In [11]:
# Classify stage: trains the discriminative model, logging loss and dev F1 per epoch.
%time pipe.classify()

### [7.1] Begin training discriminative model
[1] Testing dim = 64, dropout = 2.50e-01, rebalance = 0.00e+00, lr = 1.00e-03


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


[reRNN] Training model
[reRNN] n_train=1968  #epochs=20  batch size=128
[reRNN] Epoch 0 (4.08s)	Average loss=0.530405	Dev F1=0.00
[reRNN] Epoch 1 (8.65s)	Average loss=0.163537	Dev F1=0.00
[reRNN] Epoch 2 (13.26s)	Average loss=0.151931	Dev F1=0.00
[reRNN] Epoch 3 (17.92s)	Average loss=0.151665	Dev F1=0.00
[reRNN] Epoch 4 (22.69s)	Average loss=0.161303	Dev F1=0.00
[reRNN] Epoch 5 (27.52s)	Average loss=0.149935	Dev F1=0.00
[reRNN] Epoch 6 (32.30s)	Average loss=0.155079	Dev F1=0.00
[reRNN] Epoch 7 (37.17s)	Average loss=0.155170	Dev F1=0.00
[reRNN] Epoch 8 (42.13s)	Average loss=0.149080	Dev F1=0.00
[reRNN] Epoch 9 (47.04s)	Average loss=0.150795	Dev F1=0.00
[reRNN] Epoch 10 (51.74s)	Average loss=0.147591	Dev F1=0.00
[reRNN] Epoch 11 (56.84s)	Average loss=0.149325	Dev F1=0.00
[reRNN] Epoch 12 (62.00s)	Average loss=0.148992	Dev F1=0.00
[reRNN] Epoch 13 (66.90s)	Average loss=0.148461	Dev F1=0.00
[reRNN] Epoch 14 (71.58s)	Average loss=0.150072	Dev F1=0.00
[reRNN] Epoch 15 (76.10s)	Average loss=0