In [1]:
# Enable autoreload (mode 2) so edited modules are picked up without a kernel
# restart, and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import sys
# Add the babble tutorial data directory to the import path. SNORKELHOME must be
# set in the environment; a missing variable raises KeyError here.
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/babble/data/')

# Point SNORKELDB at the local Postgres database before snorkel is imported.
# NOTE(review): presumably snorkel reads SNORKELDB at import time, which is why
# this assignment precedes the import below — confirm.
os.environ['SNORKELDB'] = 'postgres://localhost:5432/babble_test_spouse'

from snorkel import SnorkelSession
# Session object used for every database query in this notebook.
session = SnorkelSession()

In [2]:
from snorkel.models import candidate_subclass

# Candidate class named 'Spouse' with two arguments, 'person1' and 'person2'.
Spouse = candidate_subclass('Spouse', ['person1', 'person2'])

In [3]:
# Fetch every Spouse candidate with split == 1 and keep the first 100.
# NOTE(review): `.all()` materialises the whole result before slicing, and the
# query has no explicit ordering. The same list is later handed to the MTurk
# helper, so confirm the row order is stable before changing this query.
candidates = session.query(Spouse).filter(Spouse.split == 1).all()[:100]
print(len(candidates))

100


In [5]:
# `os` is already imported in the first cell; repeating it here is harmless.
import os
# NOTE(review): mturk_processing is presumably found through the tutorial data
# directory appended to sys.path in the first cell — confirm.
from mturk_processing import MTurkHelper
# Helper over the candidates selected above, configured with num_hits=25.
helper = MTurkHelper(candidates, num_hits=25)

In [7]:
# Location of the MTurk results CSV under $SNORKELHOME. os.path.join replaces
# hand-concatenated separators; SNORKELHOME must be set or KeyError is raised.
output_csv_path = os.path.join(
    os.environ['SNORKELHOME'], 'tutorials', 'babble', 'data', 'mturk_spouse_01_out.csv')
# Post-process the worker responses against the same `candidates` list; the
# result is the collection of explanations used by the rest of the notebook.
explanations = helper.postprocess(output_csv_path, candidates=candidates, verbose=False)

Num HITs unique: 25
Num HITs total: 75
Unanimous: 61
Majority: 14
Split: 0
Bad: 25


In [8]:
# Iterator over the explanations so a later cell can step through them one at a
# time; re-run this cell to start again from the first explanation.
exp_iterator = iter(explanations)

In [13]:
from snorkel.viewer import SentenceNgramViewer

# Advance to the next explanation. The builtin next() works on Python 2.6+ and
# Python 3, whereas the `.next()` method exists only on Python 2 iterators.
# Raises StopIteration once every explanation has been shown.
exp = next(exp_iterator)
print(exp.condition)
# Display the candidate that this explanation is attached to.
sv = SentenceNgramViewer([exp.candidate], session, n_per_page=3, height=150)
sv

The people highlighted here are married to each other.


<IPython.core.display.Javascript object>

In [14]:
# Preview the condition text of the first ten explanations, one per line.
for exp in explanations[:10]:
    print(exp.condition)

No indication of marriage between these two entities.
person1 merely sent a letter to person2.
the given text does not indicate that person1 and person2 are/were married.
person2 is named as person1's wife.
The people highlighted here are married to each other.
the text says "his wife"
nothing in the text indicates that these people are married
No indication of marriage here.
No indication of marriage.
person1 and person2 are acts performing at an event.


In [15]:
from snorkel.contrib.babble import Explanation
# Hand-written explanation with condition "person1 equals person2", second
# positional argument True, and name 'test_exp'.
# NOTE(review): `homemades` is not read by any later cell visible here (the
# Babbler is built from `explanations`), and `exp` rebinds the name used by the
# viewer cell — confirm both are intended.
exp = Explanation("person1 equals person2", True, name='test_exp')
homemades = [exp]

In [35]:
from snorkel.contrib.babble import Babbler
# No user-defined lists are supplied to the Babbler.
user_lists = {}
# Babbler for the Spouse candidate class, built from the MTurk explanations.
babbler = Babbler(Spouse, explanations, user_lists)
# babbler.apply(parallelism=1)

Created grammar with 356 rules


In [36]:
# Generate labeling functions (LFs) from the explanations held by the babbler.
# NOTE(review): `lfs` is not read again in the cells visible here; later cells
# go through `babbler.lfs` / `babbler.parses`. The recorded execution counts are
# also out of order around this cell, so the saved outputs may come from
# different runs — restart and run all to refresh them.
lfs = babbler.generate_lfs()

136 parses created from 26 out of 211 explanation(s)


In [18]:
# Apply the babbler's duplicate-semantics filter; the call reports how many LFs
# remain and how many were filtered out.
babbler.filter_duplicate_semantics()

Filtered to 27 LFs with duplicate semantics filter (10 filtered).


In [19]:
# Apply the babbler's consistency filter; the call reports how many LFs remain
# and how many were filtered out.
babbler.filter_consistency()

Filtered to 15 LFs with consistency filter (12 filtered).


In [20]:
from snorkel.contrib.babble import sem_to_str

# Print the semantics of every parse currently held by the babbler, rendered
# as a string by sem_to_str.
for parse in babbler.parses:
    print(sem_to_str(parse.semantics))

return 1 if any(map(contains('his wife'), [p.text.strip() for p in sentence.phrases])) else 0
return -1 if call((= text(arg2).strip()), 'daughter') else 0
return 1 if any(map(contains('married couple'), [p.text.strip() for p in sentence.phrases])) else 0
return 1 if any(map((= 'married couple'), [p.text.strip() for p in sentence.phrases])) else 0
return -1 if call((= text(arg2).strip()), text(arg1).strip()) else 0
return 1 if True else 0
return -1 if all(map(('.composite_or', ('.eq',), ('.list', ('.string', u'husband'), ('.string', u'wife'), ('.string', u'married'))), ['and',text(arg1).strip(),text(arg2).strip()])) else 0
return -1 if any(map((= text(arg2).strip()), ['daughter','son','child'])) else 0
return -1 if call((>= 1), 0) else 0
return -1 if call((= text(arg1).strip()), 'daughter') else 0
return -1 if call((= text(arg1).strip()), text(arg2).strip()) else 0
return 1 if call(in [p.text.strip() for p in left(arg2)], 'widow') else 0
return 1 if any(map(contains('her husband'), [p.t

In [22]:
# Display the explanations the babbler currently holds (after the filters
# applied so far).
babbler.get_explanations()

[Explanation("Explanation5: True, the text says "his wife""),
 Explanation("Explanation17: False, the word "daughter" occurs within three words to the left of person2."),
 Explanation("Explanation21: True, The text states that they are a "married couple""),
 Explanation("Explanation24: False, person1 is just friends with person2"),
 Explanation("Explanation36: True, ITS TRUE"),
 Explanation("Explanation40: False, the word "and" between person1 and person2 is not also accompanied by the words "husband", "wife", or "married" anywhere in the sentence"),
 Explanation("Explanation42: False, "daughter", "son" or "child" occurs to the left of person2"),
 Explanation("Explanation70: False, No proof, it sounds like a news release"),
 Explanation("Explanation108: False, the word "daughter" occurs within three words to the left of person1."),
 Explanation("Explanation116: False, it indicates that person2 is the beau of person1"),
 Explanation("Explanation152: True, The word "widow" appears before

In [21]:
# Build the label matrix for split 1 with parallelism=1 and time the call.
%time babbler.generate_label_matrix(split=1, parallelism=1)

Clearing existing...
Running UDF...

CPU times: user 1min 15s, sys: 898 ms, total: 1min 16s
Wall time: 1min 22s


<2811x15 sparse matrix of type '<type 'numpy.int64'>'
	with 3387 stored elements in Compressed Sparse Row format>

In [None]:
# babbler.load_matrix(session, split=1)

In [23]:
# Apply the babbler's uniform-signatures filter.
# NOTE(review): presumably this relies on the label matrix generated in an
# earlier cell — confirm before reordering cells.
babbler.filter_uniform_signatures()

Filtered to 7 LFs with uniform signatures filter (8 filtered).


In [24]:
# Apply the babbler's duplicate-signatures filter.
# NOTE(review): presumably this relies on the label matrix generated in an
# earlier cell — confirm before reordering cells.
babbler.filter_duplicate_signatures()

Filtered to 5 LFs with duplicate signatures filter (2 filtered).


In [25]:
# Print the string form of the semantics of each parse that is still held by
# the babbler at this point.
for p in babbler.parses:
    print(sem_to_str(p.semantics))

return 1 if any(map(contains('his wife'), [p.text.strip() for p in sentence.phrases])) else 0
return 1 if any(map(contains('married couple'), [p.text.strip() for p in sentence.phrases])) else 0
return -1 if call((= text(arg2).strip()), text(arg1).strip()) else 0
return 1 if call(in [p.text.strip() for p in left(arg2)], 'widow') else 0
return 1 if any(map(contains('her husband'), [p.text.strip() for p in sentence.phrases])) else 0


In [26]:
# Display the explanations the babbler still holds at this point.
babbler.get_explanations()

[Explanation("Explanation5: True, the text says "his wife""),
 Explanation("Explanation21: True, The text states that they are a "married couple""),
 Explanation("Explanation24: False, person1 is just friends with person2"),
 Explanation("Explanation152: True, The word "widow" appears before person2 and after person 1"),
 Explanation("Explanation163: True, Text says "her husband"")]

In [28]:
from snorkel.annotations import load_gold_labels

# Gold labels stored under annotator name 'gold' for split 1, the same split the
# candidates and the label matrix were built from.
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)

In [29]:
# Per-LF statistics of the label matrix, scored against the gold labels.
babbler.label_matrix.lf_stats(session, labels=L_gold_dev)

Unnamed: 0,j,Coverage,Overlaps,Conflicts,TP,FP,FN,TN,Empirical Acc.
Explanation5_0,0,0.048381,0.003913,0.000356,34,93,0,0,0.267717
Explanation21_0,1,0.001779,0.0,0.0,3,2,0,0,0.6
Explanation24_0,2,0.053362,0.002134,0.002134,0,0,1,148,0.993289
Explanation152_1,3,0.00249,0.000356,0.0,4,3,0,0,0.571429
Explanation163_0,4,0.043757,0.005692,0.001779,29,89,0,0,0.245763


In [None]:
# candidates = session.query(Spouse).filter(Spouse.split == 1).all()
# print(len(candidates))

In [None]:
# Tally, over `candidates`, how often the last LF in babbler.lfs returns a
# truthy (non-zero) value versus a falsy one.
lf = babbler.lfs[-1]
yes = no = 0
for c in candidates:
    fired = bool(lf(c))
    # Adding a bool to an int keeps the counters as plain ints.
    yes += fired
    no += not fired
print(yes)
print(no)

In [None]:
def f(c):
    """Return 1 if both argument spans of `c` occur among the phrase texts, else 0.

    `c` is indexed as `c[0]` and `c[1]`; each must provide `get_span()`. The
    phrase texts come from `get_sentence_phrases(c[0])` and are compared,
    unstripped, against the stripped span text of each argument.

    NOTE(review): `get_sentence_phrases` is not imported in this cell; it is
    presumably supplied by a star import elsewhere in the notebook — confirm.
    """
    # Both membership tests use the phrases of c[0], so compute them once
    # instead of rebuilding the list for each test.
    phrase_texts = [p.text for p in get_sentence_phrases(c[0])]
    both_present = (c[0].get_span().strip() in phrase_texts and
                    c[1].get_span().strip() in phrase_texts)
    return 1 if both_present else 0

# pprint is used below; importing it here keeps the cell runnable in order on a
# fresh kernel instead of relying on an import made in a later cell.
from pprint import pprint

# Report every candidate on which the parsed LF (`lf`) and the hand-written
# reference (`f`) disagree, together with the phrase texts that were compared.
# The unconditional pdb breakpoint was removed so that "Run All" does not stop
# at an interactive prompt; set a temporary breakpoint only while debugging.
# NOTE(review): get_sentence_phrases is presumably supplied by the star import
# from snorkel.contrib.babble made elsewhere in this notebook — confirm.
for c in candidates:
    if lf(c) != f(c):
        print(c[0].get_span(), c[1].get_span())
        pprint([p.text for p in get_sentence_phrases(c[0])])

In [None]:
from pprint import pprint
# NOTE(review): the star import hides where names such as get_sentence_phrases
# come from; prefer explicit imports once the exporting module is confirmed.
from snorkel.contrib.babble import *

# Count the candidates whose two stripped argument spans both occur among the
# phrase texts returned by get_sentence_phrases(c[0]).
yes = 0
no = 0
for c in candidates:
    # Compute the phrase texts once per candidate; both membership tests use
    # the same list.
    phrase_texts = [p.text for p in get_sentence_phrases(c[0])]
    if (c[0].get_span().strip() in phrase_texts and
            c[1].get_span().strip() in phrase_texts):
        yes += 1
    else:
        no += 1
print(yes)
print(no)

In [None]:
import numpy as np
# Sum the absolute label values along axis 0 of the label matrix.
# NOTE(review): presumably each column is one LF, which would make this the
# number of non-abstain labels per LF — confirm against the matrix layout.
print(np.sum(abs(babbler.label_matrix), 0))