In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Setup

In [2]:
# Notebook-level settings. They are merged with the other config sources by
# merge_configs() a few cells further down.
config = dict(
    domain='spouse',
    postgres=False,
    db_name='babble_spouse_demo',
    debug=False,
)

In [3]:
# Build the database connection string and publish it as $SNORKELDB.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# The default name is derived from the domain, with a '_debug' suffix in debug
# mode; an explicit config['db_name'] takes precedence over it.
default_db_name = 'babble_' + config['domain'] + ('_debug' if config.get('debug', False) else '')
DB_NAME = config.get('db_name', default_db_name)
if config.get('postgres'):
    DB_TYPE = 'postgres'
    # A host:port only makes sense for a server database. Without 'db_port'
    # the address stays empty, exactly as before.
    DB_ADDR = "localhost:{0}".format(config['db_port']) if 'db_port' in config else ""
else:
    DB_TYPE = 'sqlite'
    DB_NAME += '.db'
    # SQLite is file-based: inserting a host:port would produce a malformed URL
    # such as sqlite://localhost:5432/name.db, so any 'db_port' is ignored here.
    DB_ADDR = ""
os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_spouse_demo.db


In [4]:
# Snorkel is imported only now, after $SNORKELDB has been set in the previous
# cell (see the NOTE there).
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
from snorkel.contrib.babble.pipelines import merge_configs
config = merge_configs(config)

# Debug mode overrides several settings with small values (document cap,
# search-space sizes, epoch counts) -- presumably to keep test runs short.
if config['debug']:
    print("NOTE: --debug=True: modifying parameters...")
    config['max_docs'] = 100
    config['gen_model_search_space'] = 2
    config['disc_model_search_space'] = 2
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

Overwriting domain=None to domain=spouse
Overwriting init_class_prior=0 to init_class_prior=-1.15
Overwriting reg_param=0.1 to reg_param=0.5
Overwriting decay=0.95 to decay=0.99


In [5]:
from snorkel.models import candidate_subclass
# NOTE(review): ExplanationIO is not used in any cell visible here -- confirm
# whether this import is still needed.
from snorkel.contrib.babble import ExplanationIO
from tutorials.babble.spouse import SpousePipeline

# Candidate type named 'Spouse' with two arguments, person1 and person2.
Spouse = candidate_subclass('Spouse', ['person1', 'person2'])
# Alias used by later cells so they do not hard-code the Spouse class.
candidate_class = Spouse
# Pipeline object whose parse / extract / load_gold stages are run below.
pipe = SpousePipeline(session, Spouse, config)

## Parse, Extract, Load

In [6]:
# Parse stage, timed with %time (about 2.5 minutes wall time in the recorded run).
%time pipe.parse()

Clearing existing...
Running UDF...
Documents: 2591
Sentences: 67820
CPU times: user 2min 31s, sys: 2.21 s, total: 2min 34s
Wall time: 2min 35s


In [7]:
# Extract stage, timed with %time; the recorded run reports candidate counts
# for splits 0, 1 and 2.
%time pipe.extract()

Clearing existing...
Running UDF...

Candidates [Split 0]: 23425
Clearing existing...
Running UDF...

Candidates [Split 1]: 2448
Clearing existing...
Running UDF...

Candidates [Split 2]: 1815
CPU times: user 6min 36s, sys: 4.26 s, total: 6min 41s
Wall time: 6min 44s


In [8]:
# Gold-label loading stage, timed with %time; the recorded run reports the
# number of AnnotatorLabels created.
%time pipe.load_gold()

AnnotatorLabels created: 1230
AnnotatorLabels created: 2448
AnnotatorLabels created: 1815
CPU times: user 2min 6s, sys: 1.25 s, total: 2min 7s
Wall time: 2min 9s


### Collect

**Option 1:** Load the pre-written `user_lists` and `explanations` that ship with the tutorial.

In [9]:
from tutorials.babble.spouse.spouse_examples import get_user_lists
# Load the tutorial's pre-written user lists and print their names.
user_lists = get_user_lists()
print("User lists: {}".format(user_lists.keys()))

User lists: ['known_spouses', 'spouse', 'other', 'family', 'last_names']


In [10]:
# Fetch every candidate in the split that the config designates for the babbler.
candidates = (
    session.query(candidate_class)
    .filter(candidate_class.split == config['babbler_candidate_split'])
    .all()
)

In [11]:
from tutorials.babble.spouse.spouse_examples import get_explanations
# Load the tutorial's pre-written explanations; the recorded output shows them
# being linked to the candidates passed in.
explanations = get_explanations(candidates)

Building list of target candidate ids...
Collected 11 unique target candidate ids from 11 explanations.
Gathering desired candidates...
Found 11/11 desired candidates
Linking explanations to candidates...
Linked 11/11 explanations


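**Option 2:** Write your own `user_lists` and `explanations`. Note that the cells below overwrite the `user_lists` and `explanations` variables loaded in Option 1.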

In [12]:
# Hand-written word lists, keyed by list name; they are passed to Babbler
# below through its user_lists argument.
user_lists = {
    'spouse':  ['spouse', 'wife', 'husband', 'ex-wife', 'ex-husband'],
    # 'father' used to appear twice in this list; every term is now unique.
    'family':  ['father', 'mother', 'sister', 'sisters',
                'brother', 'brothers', 'son', 'sons', 'daughter', 'daughters',
                'grandfather', 'grandmother', 'uncle', 'uncles', 'aunt', 'aunts',
                'cousin', 'cousins'],
}

In [13]:
# All candidates from split 0 (used as the training candidates below).
train_candidates = (
    session.query(candidate_class)
    .filter(candidate_class.split == 0)
    .all()
)

In [14]:
from snorkel.viewer import SentenceNgramViewer
# Viewer over the first 300 training candidates, 3 per page. The next cell
# reads whichever candidate is selected in this widget.
sv = SentenceNgramViewer(train_candidates[:300], session, n_per_page=3, height=300)
sv

<IPython.core.display.Javascript object>

In [15]:
# NOTE(review): the value depends on which candidate is currently selected in
# the viewer widget above, so this cell is not reproducible from code alone.
candidate = sv.get_selected()
print(candidate)

Spouse(Span("Imam Al-Bukhari", sentence=11519, chars=[0,14], words=[0,3]), Span("Prophet Muhammad", sentence=11519, chars=[30,45], words=[6,7]))


In [16]:
# Stable id of the selected candidate; it is handed to Explanation in the next cell.
candidate_id = candidate.get_stable_id()
print(candidate_id)

ac0fd33c-7132-4d58-8acf-407eb05221a3::span:7107:7121~~ac0fd33c-7132-4d58-8acf-407eb05221a3::span:7137:7152


In [17]:
from snorkel.contrib.babble import Explanation

# A single hand-written explanation: label False when the stated condition
# holds. This replaces the `explanations` list loaded under Option 1.
# NOTE(review): `candidate` is given as a stable-id string rather than the
# candidate object; the recorded babbler.apply() output says "1 LFs did not
# have candidates and therefore could not be filtered", which may be related
# -- confirm against the Explanation/Babbler implementation.
explanations = [
    Explanation(
        label=False,
        condition="the word 'winner' is right after arg 1",
        candidate=candidate_id),
]

In [18]:
from snorkel.contrib.babble import Babbler

# Construct the Babbler in text mode from the explanations, candidate class
# and user lists defined in the cells above.
babbler = Babbler(
    mode='text',
    explanations=explanations,
    candidate_class=candidate_class,
    user_lists=user_lists,
)

Created grammar with 480 rules


In [19]:
# Run the babbler. In the recorded run this parses the explanations into LFs,
# passes them through several filters, and displays a sparse label matrix.
babbler.apply()

1 parses created from 1 out of 1 explanation(s)
Parsed 1 LFs from 1 explanations.
Filtered to 1 LFs with duplicate semantics filter (0 filtered).
Note: 1 LFs did not have candidates and therefore could not be filtered.
Filtered to 1 LFs with consistency filter (0 filtered).
Clearing existing...
Running UDF...

Filtered to 1 LFs with uniform signatures filter (0 filtered).
Filtered to 1 LFs with duplicate signatures filter (0 filtered).


<23425x1 sparse matrix of type '<type 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [20]:
# First parse, untranslated: displayed as a nested tuple in the recorded output.
babbler.get_parses(translate=False)[0]

('.root',
 ('.label',
  ('.bool', False),
  ('.call',
   ('.in',
    ('.extract_text',
     ('.right',
      ('.arg', ('.int', 1)),
      ('.string', '.eq'),
      ('.int', 1),
      ('.string', 'words')))),
   ('.string', 'winner'))))

In [21]:
# Same parse with translate=True: displayed as a readable string in the
# recorded output.
babbler.get_parses(translate=True)[0]

"return -1 if call(in text(right(arg1,'.eq',1,'words')), 'winner') else 0"

In [22]:
# Keep the first LF returned by the babbler for scoring in the next cell.
lf = babbler.get_lfs()[0]

In [23]:
from snorkel.lf_helpers import test_LF
# Score the LF on split 1 against the 'gold' annotator's labels; the result is
# unpacked into four values named tp, fp, tn and fn.
tp, fp, tn, fn = test_LF(session, lf, split=1, annotator_name='gold')

Scores (Un-adjusted)
Pos. class accuracy: 0.0
Neg. class accuracy: 1.0
Precision            0.0
Recall               0.0
F1                   0.0
----------------------------------------
TP: 0 | FP: 0 | TN: 3 | FN: 0

