In [1]:
%load_ext autoreload
%autoreload 2

## Setup

In [2]:
# Notebook-level settings. Later cells read these keys to build the database
# URL, pick the candidate split, and toggle the debug overrides.
config = dict(
    domain='spouse',
    postgres=False,
    db_name='babble_spouse_demo',
    debug=False,
    babbler_candidate_split=1,
    babbler_label_split=0,
)

In [3]:
# Get DB connection string and add to globals
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# Fallback name, used only when the config carries no explicit 'db_name'.
default_db_name = 'babble_' + config['domain'] + ('_debug' if config.get('debug', False) else '')
DB_NAME = config.get('db_name', default_db_name)
if config.get('postgres'):
    DB_TYPE = 'postgres'
    # Only a server-backed database has a host:port component in its URL.
    DB_ADDR = "localhost:{0}".format(config['db_port']) if 'db_port' in config else ""
else:
    DB_TYPE = 'sqlite'
    DB_NAME += '.db'
    # SQLite URLs have no host part (sqlite:///file.db). Ignore any 'db_port'
    # left in the config so it cannot produce an invalid
    # sqlite://localhost:PORT/... URL.
    DB_ADDR = ""
os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_spouse_demo.db


In [4]:
# First snorkel import shown in this notebook; per the setup note,
# $SNORKELDB must already be set by the time this line runs.
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
# The recorded output shows merge_configs printing one "Overwriting ..." line
# per value it replaces.
# NOTE(review): `config` is rebound here, so re-running this cell feeds the
# already-merged config back into merge_configs.
from snorkel.contrib.babble.pipelines import merge_configs
config = merge_configs(config)

# Debug mode overrides several merged settings with small fixed values
# (presumably to keep a debug run short -- confirm).
if config['debug']:
    print("NOTE: --debug=True: modifying parameters...")
    config['max_docs'] = 100
    config['gen_model_search_space'] = 2
    config['disc_model_search_space'] = 2
    # These two write into nested dicts, so the *_params_default entries must
    # already exist in the merged config.
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

Overwriting domain=None to domain=spouse
Overwriting babbler_candidate_split=0 to babbler_candidate_split=1
Overwriting init_class_prior=0 to init_class_prior=-1.15
Overwriting reg_param=0.1 to reg_param=0.5
Overwriting decay=0.95 to decay=0.99


In [5]:
from snorkel.models import candidate_subclass
from snorkel.contrib.babble import ExplanationIO
from tutorials.babble.spouse import SpousePipeline

# Declare the two-argument 'Spouse' candidate type once and expose it under
# both names used by the rest of the notebook.
candidate_class = Spouse = candidate_subclass('Spouse', ['person1', 'person2'])

# Bind the pipeline to the open session, the candidate type, and the merged config.
pipe = SpousePipeline(session, candidate_class, config)

## Parse, Extract, Load

In [6]:
# %time pipe.parse()

In [7]:
# %time pipe.extract()

In [8]:
# %time pipe.load_gold()

## Now the real work begins...

In [9]:
# Fetch every candidate whose split equals config['babbler_candidate_split'].
candidate_split = config['babbler_candidate_split']
candidates = (
    session.query(candidate_class)
    .filter(candidate_class.split == candidate_split)
    .all()
)

In [10]:
from snorkel.contrib.babble import BabbleStream

# Build the stream over the first ten loaded candidates only.
bs = BabbleStream(
    session,
    candidates[:10],
    strategy='linear',
    preload=False,
)

In [11]:
# Take the next item from the stream; later cells display it in the viewer and
# attach it to an Explanation as its candidate.
# NOTE(review): each call advances the stream, so re-running this cell changes `c`.
c = bs.next()

In [12]:
from snorkel.viewer import SentenceNgramViewer

# Show just the current candidate, one per page, in a compact viewer.
sv = SentenceNgramViewer(
    [c],
    session,
    n_per_page=1,
    height=150,
)
# Bare last expression so the notebook renders the viewer.
sv

<IPython.core.display.Javascript object>

In [16]:
from snorkel.contrib.babble import Explanation

# The natural-language condition and the label it should assign.
condition = "'consultant' is to the left of arg 1 or arg 2"
label = False

# Tie the explanation to the current candidate; the name is left empty.
explanation = Explanation(
    condition,
    label,
    candidate=c,
    name='',
)

In [32]:
# Echo the Explanation object so the notebook shows its repr.
explanation

Explanation("Explanation0: False, 'consultant' is to the left of arg 1")

In [20]:
# Run the explanation through the stream. The call returns three sequences
# that later cells index by position ([0] = first parse). The recorded output
# shows it also prints a summary of parsing and filtering.
parse_list, conf_matrix_list, stats_list = bs.apply(explanation)

1 explanation(s) out of 1 were parseable.
1 parse(s) generated from 1 explanation(s).
1 parse(s) remain (0 parse(s) removed by DuplicateSemanticsFilter).
1 parse(s) remain (0 parse(s) removed by ConsistencyFilter).


In [33]:
# Inspect the `tn` bucket of the first confusion matrix
# (presumably the true negatives -- confirm). The recorded output is a set of
# Spouse candidates.
conf_matrix_list[0].tn

{Spouse(Span("Joe Friedberg", sentence=34076, chars=[21,33], words=[4,5]), Span("Palmer", sentence=34076, chars=[49,54], words=[11,11]))}

In [29]:
# TBD: (will return a pandas DataFrame)
# For now this reads the `precision` attribute of the first stats entry; no
# output was recorded for this cell.
stats_list[0].precision

In [34]:
# Commit the parses produced so far. The recorded output reports how many
# parses were added and the running total.
# NOTE(review): this mutates the stream's state, so re-running the cell is
# unlikely to be idempotent -- confirm.
bs.commit_lfs()

Added 1 parse(s) to set. (Total # parses = 2)
TODO: add to label_matrix...


In [31]:
# TBD: (will return a csr_AnnotationMatrix)
# Ask the stream for its label matrix. The commit step's recorded output still
# prints "TODO: add to label_matrix...", so the result may be incomplete -- confirm.
L_train = bs.get_label_matrix()