In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Active dataset for the whole notebook. The value is used as a key into the
# `db` mapping and selects a branch of the domain setup cell, so leave exactly
# one assignment uncommented.
# DOMAIN = 'test'
# DOMAIN = 'cdr'
# DOMAIN = 'spouse'
DOMAIN = 'bike'

In [3]:
import os

# Per-domain database URLs; the one matching DOMAIN is exported via SNORKELDB.
db = dict(
    test='postgres://localhost:5432/babble_test_unittest',
    spouse='postgres://localhost:5432/babble_test_spouse',
    cdr='postgres://localhost:5432/babble_test_cdr',
    bike='postgres://localhost:5432/babble_test_bike',
)

# A DOMAIN that is not a key of `db` raises KeyError on this line.
os.environ['SNORKELDB'] = db[DOMAIN]

In [4]:
from snorkel import SnorkelSession
# Database session shared by the query, viewer and scoring cells below.
# NOTE(review): presumably bound to the database named by SNORKELDB, which is
# set in the previous cell — confirm.
session = SnorkelSession()

In [5]:
%%time

import sys
    
from snorkel.models import candidate_subclass
from snorkel.contrib.babble import link_explanation_candidates

# Extract domain-specific explanations and user_lists.
# Every branch must define `explanations`, `user_lists` and `candidate_class`,
# because later cells (Babbler, SemanticParser, candidate queries) read all three.
if DOMAIN == 'test':
    sys.path.append(os.path.join(os.environ['SNORKELHOME'], 'test/babble/'))
    import unittest_examples
    user_lists = unittest_examples.get_user_lists()
    explanations = unittest_examples.explanations
    Spouse = candidate_subclass('Spouse', ['person1', 'person2'])
    candidate_class = Spouse
elif DOMAIN == 'spouse':
    import spouse_examples
    # NOTE(review): the domain's user lists are fetched and then immediately
    # replaced by an empty dict; kept as-is, confirm which one is intended.
    user_lists = spouse_examples.get_user_lists()
    user_lists = {}
    explanations = spouse_examples.explanations
    Spouse = candidate_subclass('Spouse', ['person1', 'person2'])
    candidate_class = Spouse
    # Make the tutorials directory importable for later cells.
    spouse_tutorial_path = os.path.join(os.environ['SNORKELHOME'], 'tutorials/')
    sys.path.append(spouse_tutorial_path)
elif DOMAIN == 'cdr':
    import cdr_examples
    user_lists = cdr_examples.get_user_lists()
    explanations = cdr_examples.explanations
    ChemicalDisease = candidate_subclass('ChemicalDisease', ['chemical', 'disease'])
    candidate_class = ChemicalDisease
elif DOMAIN == 'bike':
    # FIXME: the bike domain has no explanations or user lists yet, and it still
    # borrows the ChemicalDisease candidate schema. Empty containers are used so
    # that the linking/len() calls below and the later cells that read
    # `user_lists` do not fail on None or an undefined name.
    explanations = []
    user_lists = {}
    ChemicalDisease = candidate_subclass('ChemicalDisease', ['chemical', 'disease'])
    candidate_class = ChemicalDisease
else:
    raise ValueError("Invalid domain: {}".format(DOMAIN))

# Load every candidate of the selected class and attach the explanations to them.
candidates = session.query(candidate_class).all()
explanations = link_explanation_candidates(explanations, candidates)
print("Domain: {}".format(DOMAIN))
print("Explanations: {}".format(len(explanations)))
print("Candidates: {}".format(len(candidates)))

Building list of target candidate hashes...
Collected 9 target candidate hashes from 9 explanations.
Gathering desired candidates...
Found 9/9 desired candidates
Linking explanations to candidates...
Linked 9/9 explanations
Domain: spouse
Explanations: 9
Candidates: 27766
CPU times: user 18.7 s, sys: 1 s, total: 19.7 s
Wall time: 24.1 s


In [6]:
from snorkel.contrib.babble import Explanation

# Three toy explanations (condition text, label); this replaces the
# domain-linked `explanations` built in the setup cell.
explanations = [
    Explanation("'wife' is in the sentence.", True),
    Explanation("arg 1 is in the sentence.", True),
    Explanation("arg 2 is in the sentence.", False),
]
# Keep the individual names available as well.
good, bad1, bad2 = explanations

In [8]:
from snorkel.contrib.babble import Babbler
# Build the Babbler for the candidate class selected by DOMAIN. Passing
# `candidate_class` instead of the `Spouse` name keeps this cell working for
# domains whose setup branch never defines `Spouse` (e.g. 'cdr'); for the
# 'test' and 'spouse' domains the two names refer to the same class.
babbler = Babbler(mode='text', candidate_class=candidate_class,
                  explanations=explanations, user_lists=user_lists)

Created grammar with 360 rules


In [9]:
# Generate labeling functions from the babbler's explanations and keep them in
# `lfs` (the recorded output reports 3 parses from 3 explanations).
lfs = babbler.generate_lfs()

3 parses created from 3 out of 3 explanation(s)


In [11]:
def LF_wife_in_sentence(c):
    """Vote 1 when the token 'wife' occurs among the parent's words, else 0.

    The lookup is an exact, case-sensitive membership test against
    ``c.get_parent().words``.
    """
    parent_words = c.get_parent().words
    if 'wife' in parent_words:
        return 1
    return 0

In [18]:
from snorkel.lf_helpers import test_LF
# Score the hand-written LF against the 'gold' annotator on split 0 and keep the
# four returned values (named as TP/FP/TN/FN) for later inspection.
# NOTE(review): the recorded output shows TP/FP/TN/FN all 0; presumably no gold
# labels were available for split 0 when this ran (the other cells load and
# score gold labels with split=1) — confirm the intended split.
%time tp, fp, tn, fn = test_LF(session, LF_wife_in_sentence, split=0, annotator_name='gold')

Scores (Un-adjusted)
Pos. class accuracy: 0.0
Neg. class accuracy: 0.0
Precision            0.0
Recall               0.0
F1                   0.0
----------------------------------------
TP: 0 | FP: 0 | TN: 0 | FN: 0

CPU times: user 29.2 s, sys: 1.07 s, total: 30.2 s
Wall time: 38.6 s


In [None]:
# NOTE(review): presumably discards parses that disagree with the label of
# their own explanation — confirm against Babbler.filter_consistency.
babbler.filter_consistency()

In [None]:
from snorkel.contrib.babble import sem_to_str
# Render the semantics of the first ten parses with sem_to_str for display.
first_parses = babbler.parses[:10]
[sem_to_str(parse.semantics) for parse in first_parses]

In [None]:
# Generate the label matrix for split 1 (the same split the later cells query
# as `dev_candidates`).
L_dev = babbler.generate_label_matrix(split=1)
# Alternative kept for reference: presumably loads a stored matrix instead of
# regenerating it — confirm.
# L_dev = babbler.load_matrix(session,split=1)

In [None]:
# Display the split-1 label matrix.
L_dev

In [None]:
# NOTE(review): presumably drops labeling functions that give every candidate
# the same label — confirm. L_dev was generated before this filter ran; check
# whether it has to be regenerated afterwards.
babbler.filter_uniform_signatures()

In [None]:
# NOTE(review): presumably drops labeling functions whose label column
# duplicates another one — confirm. L_dev was generated before this filter ran;
# check whether it has to be regenerated afterwards.
babbler.filter_duplicate_signatures()

In [None]:
# Show the statistics table that the label matrix computes for its labeling
# functions.
L_dev.lf_stats(session)

In [None]:
import numpy as np
# Sum the label matrix down its rows, giving one total per column
# (presumably one column per labeling function — confirm).
np.sum(L_dev, axis=0)

In [None]:
# Print the row indices whose entry in column 1 of the label matrix is zero,
# i.e. the rows that column did not label.
caught = set(L_dev[:,1].nonzero()[0])
# Iterate over the matrix's real row count rather than a hard-coded number, so
# the scan neither misses rows nor reports non-existent ones when the split
# size changes.
for i in range(L_dev.shape[0]):
    if i not in caught:
        print(i)

In [None]:
from pprint import pprint
from snorkel.contrib.babble.text import get_sentence_phrases
# Pretty-print the words of every phrase (n_max=7) derived from element 0 of
# the candidate at row 843 of the label matrix.
first_element = L_dev.get_candidate(session, 843)[0]
phrases = get_sentence_phrases(first_element, n_max=7)
pprint([phrase.words for phrase in phrases])

In [None]:
# Fetch the split-1 candidates of the class selected by DOMAIN. The filter uses
# the same class as the query, so the cell also works when `Spouse` is not
# defined (e.g. the 'cdr' domain).
dev_candidates = session.query(candidate_class).filter(candidate_class.split == 1).all()
print(len(dev_candidates))

In [None]:
# import utils

# # Link examples with their corresponding candidates
# examples = utils.link_example_candidates(examples, dev_candidates)

In [None]:
import re
from snorkel.lf_helpers import (
    get_left_tokens, get_right_tokens, get_between_tokens,
    get_text_between, get_tagged_text,
)

In [None]:
# Path of the gold label TSV for the spouse tutorial, relative to $SNORKELHOME.
spouse_label_path = os.path.join(os.environ['SNORKELHOME'], 'tutorials/intro/data/gold_labels.tsv') 

# NOTE(review): `intro` is presumably importable only because the 'spouse'
# branch of the domain setup cell appends the tutorials/ directory to sys.path
# — confirm before running this with another DOMAIN.
import intro

# Load the external labels for Spouse candidates under the 'gold' annotator name.
%time intro.load_external_labels(session, Spouse, annotator_name='gold', path=spouse_label_path)

In [None]:
# import bz2
# import os 

# spouses_pickle_path = os.path.join(os.environ['SNORKELHOME'], 
#         'tutorials/intro/data/spouses_dbpedia.csv.bz2') 

# def strip_special(s):
#     return ''.join(c for c in s if ord(c) < 128)

# def last_name(s):
#     name_parts = s.split(' ')
#     return name_parts[-1] if len(name_parts) > 1 else None  

# # Read in known spouse pairs and save as set of tuples
# with bz2.BZ2File(spouses_pickle_path, 'rb') as f:
#     known_spouses = set(
#         tuple(strip_special(x).strip().split(',')) for x in f.readlines()
#     )
# # Last name pairs for known spouses
# last_names = set([(last_name(x), last_name(y)) for x, y in known_spouses if last_name(x) and last_name(y)])

# def LF_distant_supervision(c):
#     p1, p2 = c.person1.get_span(), c.person2.get_span()
#     return 1 if (p1, p2) in known_spouses or (p2, p1) in known_spouses else 0

# def LF_distant_supervision_last_names(c):
#     p1, p2 = c.person1.get_span(), c.person2.get_span()
#     p1n, p2n = last_name(p1), last_name(p2)
#     return 1 if (p1 != p2) and ((p1n, p2n) in last_names or (p2n, p1n) in last_names) else 0

In [None]:
# candidate_subset = []
# for c in dev_candidates:
#     if hash(c) == -2597662937532403956:
#         break

In [None]:
from snorkel.viewer import SentenceNgramViewer
# Open the viewer widget on the first 300 split-1 candidates; the bare `sv`
# expression makes the notebook display it.
sv = SentenceNgramViewer(dev_candidates[:300], session)
sv

In [None]:
# Marriage-related keywords. LF_neighbors only votes when none of these words
# occurs among the words of the candidate's parent.
married = ['husband', 'wife', 'spouse', 'marriage', 'married']

In [None]:
def LF_neighbors(c):
    """Vote -1 for single-token argument pairs with no marriage keyword nearby.

    Returns -1 when both ``c[0]`` and ``c[1]`` yield exactly one attribute
    token and none of the words in ``married`` occurs in
    ``c.get_parent().words``; otherwise abstains with 0.
    """
    # Guard clauses are checked in the same order as the original conjunction.
    if len(c[0].get_attrib_tokens()) != 1:
        return 0
    if len(c[1].get_attrib_tokens()) != 1:
        return 0
    if set(married).intersection(c.get_parent().words):
        return 0
    return -1

In [None]:
# Collect the split-1 Spouse candidates on which LF_neighbors does not abstain.
labeled = [
    c for c in session.query(Spouse).filter(Spouse.split == 1).all()
    if LF_neighbors(c) != 0
]
# Format the message explicitly, like the other cells do: a two-argument
# print(...) shows a tuple under Python 2, which this notebook also targets.
print("Number labeled: {}".format(len(labeled)))

In [None]:
from snorkel.viewer import SentenceNgramViewer
# Open the viewer on `fn`, the fourth value unpacked from test_LF(...) in the
# LF test cell (named as the false negatives); that cell must have run first.
sv = SentenceNgramViewer(fn, session)
sv

In [None]:
# Number of split-1 candidates currently held in `dev_candidates`.
print(len(dev_candidates))

In [None]:
from snorkel.annotations import load_gold_labels
# Load the 'gold' annotator's labels for split 1 and display them.
# NOTE(review): the variable is called test_labels although it holds split 1,
# the split the other cells call "dev" — confirm the naming.
test_labels     = load_gold_labels(session, annotator_name='gold', split=1)
test_labels

In [None]:
from snorkel.annotations import load_gold_labels
from snorkel.models import Candidate
from snorkel.learning.utils import MentionScorer
import numpy as np

# Score one labeling function on its own against the split-1 gold labels.
lf = LF_neighbors

dev_candidates = session.query(Candidate).filter(Candidate.split == 1).all()
test_labels = load_gold_labels(session, annotator_name='gold', split=1)
scorer = MentionScorer(dev_candidates, test_labels)

# Map each LF vote v to the marginal 0.5 * (v + 1), so -1 -> 0.0, 0 -> 0.5, 1 -> 1.0.
lf_votes = [lf(c) for c in dev_candidates]
test_marginals = np.array([0.5 * (vote + 1) for vote in lf_votes])
scorer.score(test_marginals, set_unlabeled_as_neg=False, set_at_thresh_as_neg=False)

In [None]:
# Sanity-check the scoring inputs: number of candidates scored, number of
# stored gold labels, and the largest marginal produced by the LF.
# `dev_candidates` is the list the scorer was built from; no `test_candidates`
# name is defined anywhere in this notebook.
print(len(dev_candidates))
print(test_labels.nnz)
max(test_marginals)

In [None]:
# Display the gold label matrix loaded for split 1.
test_labels

In [None]:
from snorkel.contrib.babble import Explanation

# Single hand-written explanation used to exercise the semantic parser. It
# replaces any `explanations` list defined by earlier cells.
explanations = [
    # Tuple: compares the (arg 1, arg 2) pair with a literal tuple.
    Explanation(
        condition="label True because the pair (arg 1, arg 2) is the same as the tuple ('foo', 'bar')",
        candidate=('foo', 'bar'),
        label=1,
        # Nested-tuple semantics written out by hand for the condition above.
        semantics=('.root', ('.label', ('.bool', True), ('.call', ('.eq', ('.tuple', ('.list', ('.string', u'foo'), ('.string', u'bar')))), ('.tuple', ('.list', ('.arg_to_string', ('.arg', ('.int', 1))), ('.arg_to_string', ('.arg', ('.int', 2))))))))),    
]

In [None]:
from snorkel.contrib.babble import SemanticParser

# Build (and time) a SemanticParser for the selected candidate class and user
# lists; `candidate_class` and `user_lists` come from the domain setup cell.
%time sp = SemanticParser(candidate_class, user_lists, beam_width=10, top_k=-1)

In [None]:
# Parse the explanations, evaluate the resulting parses and keep the return
# value in `LFs`; the show_* flags select what the call reports.
# The trailing backslashes keep the whole call on one logical line, which the
# %time line magic needs because it only times a single line.
%time LFs = sp.parse_and_evaluate(explanations,\
                                  show_everything=True,\
                                  show_nothing=False,\
                                  show_explanation=True,\
                                  show_candidate=False,\
                                  show_sentence=False,\
                                  show_parse=True,\
                                  show_semantics=True,\
                                  show_correct=False,\
                                  show_passing=True,\
                                  show_failing=True,\
                                  pseudo_python=True,\
                                  remove_paren=False,\
                                  paraphrases=False,\
                                  only=[])
# Display the results the parser recorded during evaluation.
sp.results

In [None]:
import numpy as np
# Column-wise totals of the parser results. Written as a print() call, like
# every other cell, so it runs on Python 3 as well as Python 2.
print(np.sum(sp.results, axis=0))

In [None]:
# Print the parser grammar's chart in nested form, for debugging parses.
sp.grammar.print_chart(nested=True)

In [None]:
# sp.grammar.print_grammar()

In [None]:
# # Make mini-corpus for test

# import os

# from snorkel import SnorkelSession
# session = SnorkelSession()

# from snorkel.models import Document, candidate_subclass
# from snorkel.parser import TSVDocPreprocessor, CorpusParser
# from snorkel.parser.spacy_parser import Spacy
# from snorkel.candidates import Ngrams, CandidateExtractor
# from snorkel.matchers import PersonMatcher
# from snorkel.contrib.babble import SemanticParser, Example

# test_article_path = os.environ['SNORKELHOME'] + '/test/babble/test_article.tsv'
# doc_preprocessor = TSVDocPreprocessor(test_article_path)
# corpus_parser = CorpusParser(parser=Spacy())
# corpus_parser.apply(doc_preprocessor)
# Spouse = candidate_subclass('Spouse', ['person1', 'person2'])
# ngrams         = Ngrams(n_max=2)
# person_matcher = PersonMatcher(longest_match_only=True)
# cand_extractor = CandidateExtractor(Spouse, [ngrams, ngrams], [person_matcher, person_matcher], symmetric_relations=False)
# docs = session.query(Document).order_by(Document.name).all()
# sents = [s for doc in docs for s in doc.sentences]
# cand_extractor.apply(sents, split=0)

In [None]:
# candidates = session.query(Spouse).all()
# for c in candidates:
#     print(c)