In [2]:
# Turn on IPython's autoreload extension so that edits made to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Notebook-level settings. When configs are merged further down, these
# values win over the local and global pipeline configs.
config = dict(
    domain='drink',
    debug=True,
    postgres=False,
    parallelism=1,
    splits=[0, 1, 2],
    disc_model_class='logreg',
    supervision='traditional',
)

In [4]:
# Assemble the database URL Snorkel should use and publish it via $SNORKELDB.
# NOTE: this must run before anything is imported from snorkel.
import os

# Database name: explicit 'db_name' if configured, otherwise derived from the
# domain (with a '_debug' suffix when debug mode is on).
debug_suffix = '_debug' if config.get('debug', False) else ''
default_db_name = 'babble_' + config['domain'] + debug_suffix
DB_NAME = config.get('db_name', default_db_name)

# Postgres only when the config asks for it; otherwise fall back to a
# SQLite file, whose name gets a '.db' extension.
use_postgres = bool(config.get('postgres'))
DB_TYPE = 'postgres' if use_postgres else 'sqlite'
if not use_postgres:
    DB_NAME += '.db'

# A host:port component is only included when a port was configured.
if 'db_port' in config:
    DB_ADDR = "localhost:{0}".format(config['db_port'])
else:
    DB_ADDR = ""

snorkel_db_url = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
os.environ['SNORKELDB'] = snorkel_db_url
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_drink_debug.db


In [5]:
# Everything this cell needs from snorkel, gathered in one place.
# ($SNORKELDB must already be exported by the time these imports run.)
from snorkel import SnorkelSession
from snorkel.contrib.babble.pipelines import merge_configs, get_local_pipeline
from snorkel.models import candidate_subclass

session = SnorkelSession()

# Resolve config conflicts; precedence is nb_config > local_config > global_config.
config = merge_configs(config)

# Debug mode shrinks the run: fewer documents, smaller search spaces and
# fewer training epochs.
if config['debug']:
    print("NOTE: --debug=True: modifying parameters...")
    config['max_docs'] = 100
    for search_space_key in ('gen_model_search_space', 'disc_model_search_space'):
        config[search_space_key] = 2
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

# Candidate class is built from the name/entities supplied by the merged config.
candidate_class = candidate_subclass(
    config['candidate_name'],
    config['candidate_entities'],
)

# Look up the pipeline class registered for this domain and instantiate it.
pipeline = get_local_pipeline(config['domain'])
pipe = pipeline(session, candidate_class, config)

Overwriting splits=[0, 1] to splits=[0, 1, 2]
Overwriting disc_model_class=inception_v3 to disc_model_class=logreg
Overwriting epochs=[25, 50, 75] to epochs=[50]
Overwriting step_size=[0.01, 0.001, 0.0001, 1e-05] to step_size=[1e-05]
Overwriting reg_param=[0.0, 0.01, 0.1, 0.25, 0.5] to reg_param=[0.01]
Overwriting decay=[0.9, 0.95, 0.99] to decay=[0.9]
Overwriting LF_acc_prior_weight_default=1.0 to LF_acc_prior_weight_default=0.5
Overwriting decay=0.95 to decay=0.9
Overwriting epochs=50 to epochs=100
Overwriting reg_param=0.1 to reg_param=0.01
Overwriting domain=None to domain=drink
Overwriting tune_b=True to tune_b=False
Overwriting gen_model_search_space=10 to gen_model_search_space=1
Overwriting disc_model_class=lstm to disc_model_class=logreg
Overwriting supervision=generative to supervision=traditional
Overwriting debug=False to debug=True
Overwriting lr=[0.01, 0.001, 0.0001] to lr=[1, 50]
NOTE: --debug=True: modifying parameters...


In [6]:
# Run the pipeline's parse stage and report how long it takes.
# Per the recorded output, it clears existing data before running its UDFs.
%time pipe.parse()

Clearing existing...
Running UDF...
Running UDF...
CPU times: user 20.7 s, sys: 860 ms, total: 21.5 s
Wall time: 23.8 s


In [7]:
# Run the extract stage (timed). For this pipeline the recorded output says
# extraction already happened during parse, so this call appears to only
# report the number of candidates per split.
%time pipe.extract()

Extraction was performed during parse stage.
Candidates [Split 0]: 15711
Candidates [Split 1]: 3377
Candidates [Split 2]: 0
CPU times: user 24 ms, sys: 24 ms, total: 48 ms
Wall time: 27.3 ms


In [8]:
# Load the gold labels (timed). The recorded output shows the train and val
# CSVs being read and AnnotatorLabels being created.
%time pipe.load_gold()

Reading train CSV!
Num HITs unique: 1525
Num HITs total: 3050
Unanimous: 4542
Majority: 2211
Bad: 450
Reading val CSV!
Num HITs unique: 184
Num HITs total: 368
Unanimous: 474
Majority: 318
Bad: 100
AnnotatorLabels created: 5648
AnnotatorLabels created: 634
CPU times: user 1min, sys: 672 ms, total: 1min 1s
Wall time: 1min 1s


In [10]:
# Featurize stage is disabled for this run (left commented out).
#%time pipe.featurize()

In [11]:
# Run the collect stage (timed), which links explanations to candidates.
# NOTE(review): in the recorded run none of the 139 target candidates were
# found and 0/370 explanations were linked -- possibly because debug mode
# caps the number of documents; TODO confirm.
%time pipe.collect()

Reading train CSV!
Num HITs unique: 44
Num HITs total: 132
Unanimous: 112
Majority: 54
Bad: 37
Building list of target candidate ids...
Collected 139 unique target candidate ids from 370 explanations.
Gathering desired candidates...
Could not find 139 target candidates with the following stable_ids (first 5):
0:266::bbox:14~~0:266::bbox:7
0:379::bbox:1~~0:379::bbox:5
0:724::bbox:21~~0:724::bbox:11
0:518::bbox:4~~0:518::bbox:7
0:731::bbox:2~~0:731::bbox:1
Found 0/139 desired candidates
Linking explanations to candidates...
Linked 0/370 explanations
Linking candidates...
Building list of target candidate ids...
Collected 139 unique target candidate ids from 370 explanations.
Gathering desired candidates...
Could not find 139 target candidates with the following stable_ids (first 5):
0:266::bbox:14~~0:266::bbox:7
0:379::bbox:1~~0:379::bbox:5
0:724::bbox:21~~0:724::bbox:11
0:518::bbox:4~~0:518::bbox:7
0:731::bbox:2~~0:731::bbox:1
Found 0/139 desired candidates
Linking explanations to candi

In [12]:
# Run the label stage (timed). Per the recorded output, this stage is skipped
# when supervision is 'traditional'.
%time pipe.label()

In 'traditional' supervision mode...skipping 'label' stage.
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 317 µs


In [13]:
# Run the supervise stage (timed). Per the recorded output, this stage is
# skipped when supervision is 'traditional'.
%time pipe.supervise()

In 'traditional' supervision mode...skipping 'supervise' stage.
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 303 µs


In [None]:
config['display_marginals'] = False
config['download_data'] = False
config['disc_model_class'] = 'inception_v3'
    
%time pipe.classify(slim_ws_path='/dfs/scratch0/paroma/slim_new/slim_ws/')

In 'traditional' supervision mode...grabbing candidate and gold label subsets.
Train size: 858
Dev size: 78
Test size: 118
Assuming MSCOCO data is already downloaded and converted (download_data = False).
Starting training over space of 2 configurations

Configuration 0.
Running the following configuration:
dim: 128
dropout: 0.25
lr: 50
max_steps: 1000
rebalance: 0.5
weight_decay: 0.0001
Calling TFSlim train...
Calling TFSlim eval on validation...
> /afs/cs.stanford.edu/u/paroma/snorkel_new/babble_snorkel/snorkel/contrib/babble/pipelines/image_pipeline.py(236)classify()
-> accuracy, precision, recall = scrape_output(output_file)
(Pdb) scrape_output(output_file)
*** UnboundLocalError: local variable 'accuracy' referenced before assignment
