Hardcode a single candidate

In [1]:
import os
import sys

# Point SNORKELHOME at the local Snorkel checkout and make it importable.
# setdefault() keeps a SNORKELHOME that is already configured in the
# environment instead of overwriting it; the fallback is the checkout path
# this notebook was originally written against.
os.environ.setdefault('SNORKELHOME', '/Users/bradenhancock/snorkel')

# Guard the append so that re-running this cell does not pile up duplicate
# sys.path entries.
if os.environ['SNORKELHOME'] not in sys.path:
    sys.path.append(os.environ['SNORKELHOME'])

In [2]:
from snorkel import SnorkelSession
from snorkel.models import candidate_subclass

# Session object shared by the queries and the viewer in the cells below.
session = SnorkelSession()

# Candidate subclass named 'Spouse' with two arguments: person1 and person2.
Spouse = candidate_subclass('Spouse', ['person1', 'person2'])

In [3]:
from snorkel.models import Document, Sentence

print "Documents:", session.query(Document).count()
print "Sentences:", session.query(Sentence).count()
train_candidates = session.query(Spouse).filter(Spouse.split == 0).all()
candidates = train_candidates[387:390]

Documents: 1000
Sentences: 29282


In [4]:
from snorkel.viewer import SentenceNgramViewer

# Build a viewer over the hardcoded candidates, using the shared session.
sv = SentenceNgramViewer(candidates, session)
# Leaving the bare name as the cell's last expression makes the notebook
# display the viewer object.
sv

<IPython.core.display.Javascript object>

In [5]:
for i, cand in enumerate(candidates):
    print i, cand
c = candidates[1]

0 Spouse(Span("Daniel Webster", sentence=13399, chars=[63,76], words=[14,15]), Span("Boehner", sentence=13399, chars=[129,135], words=[23,23]))
1 Spouse(Span("Gao Zhisheng", sentence=9944, chars=[29,40], words=[7,8]), Span("Geng", sentence=9944, chars=[59,62], words=[12,12]))
2 Spouse(Span("Ted Cruz", sentence=19561, chars=[126,133], words=[20,21]), Span("Mike Huckabee", sentence=19561, chars=[108,120], words=[17,18]))


In [6]:
from snorkel.semparser import Example

# Hand-written examples pairing a candidate with an input string, the
# semantics a parse of that string should produce, and the expected label.
spouse_examples = [
    # "A candidate is False if the word 'votes' is to the right of arg 1."
    Example(
        candidate=candidates[0],
        input="false word votes right arg 1",
        semantics=(
            'label', False,
            ('in', ('string', 'votes'), ('right', ('arg', 1))),
        ),
        denotation=-1,
    ),
    # The two examples below are disabled. Presumably this is because the
    # grammar defined in this notebook has no rules yet for 'between', 'left'
    # or 'and' -- TODO confirm. Note the last one also has no denotation.
    #
    # Example(candidate=candidates[1],
    #         input="true word wife between arg 0 arg 1 ",
    #         semantics=('label', True,
    #                    ('in',
    #                     ('string', 'wife'),
    #                     ('between', ('arg', 0), ('arg', 1)))),
    #         denotation=1),
    # Example(candidate=candidates[2],
    #         input="true word presidential left arg 0 and presidential left arg 1",
    #         semantics=('label', True,
    #                        ('and',
    #                             ('in', ('string', 'presidential'), ('left', ('arg', 0))),
    #                             ('in', ('string', 'presidential'), ('left', ('arg', 1))))))
]

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



Valid inputs

In [7]:
from snorkel.semparser import Rule

# Lexical rules, listed as (category, token, semantics) triples and turned
# into Rule objects in the order given.
lexical_rules = list(
    Rule(category, token, meaning)
    for category, token, meaning in (
        # numeric
        ('$Num', '0', 0),
        ('$Num', '1', 1),
        # string
        ('$StrKeyword', 'word', 'string'),
        ('$Text', 'votes', 'votes'),
        ('$Text', 'wife', 'wife'),
        ('$Text', 'presidential', 'presidential'),
        # bool
        ('$Bool', 'true', True),
        ('$Bool', 'false', False),
        # arg
        ('$Arg', 'arg', 'arg'),
        # other
        ('$WordLister', 'right', 'right'),
    )
)

# Unary rules (not enabled yet)

# cond_rules = [
#     Rule('$Cond', '$In')
# ]

# Compositional rules. Each lambda receives `sems` (presumably the semantics
# of the right-hand-side children, in order) and builds the parent's tuple.
compositional_rules = [
    # Rule('$Bool', '$Bool $Conj $Bool')

    # Whole labeling function: ('label', <bool>, <condition>).
    Rule('$Root', '$Bool $In',
         lambda sems: ('label', sems[0], sems[1])),
    # Keyword followed by a word, e.g. ('string', 'votes').
    Rule('$String', '$StrKeyword $Text',
         lambda sems: (sems[0], sems[1])),
    # Membership test: ('in', <string>, <word list>).
    Rule('$In', '$String $WordList',
         lambda sems: ('in', sems[0], sems[1])),
    # Word-list selector applied to an argument, e.g. ('right', ('arg', 1)).
    Rule('$WordList', '$WordLister $ArgX',
         lambda sems: (sems[0], sems[1])),
    # Indexed argument, e.g. ('arg', 1).
    Rule('$ArgX', '$Arg $Num',
         lambda sems: (sems[0], sems[1])),
]

# Full rule set handed to the grammar: lexical rules first, then compositional.
snorkel_rules = lexical_rules + compositional_rules

In [8]:
def _quote_literal(text):
    """Render `text` as a single-quoted Python string literal.

    Backslashes and single quotes are escaped so that words such as "it's"
    still produce valid source. Text without those characters is rendered
    exactly as a plain "'%s'" substitution would render it.
    """
    escaped = ("%s" % (text,)).replace("\\", "\\\\").replace("'", "\\'")
    return "'%s'" % escaped


# Maps each semantic operator name to a function that turns its (already
# rendered) operands into a fragment of Python source text.
snorkel_ops = {
    # Whole labeling function: returns 1 or -1 when the condition holds, else 0.
    'label': lambda x, y: "lambda c: %d if %s else 0" % (1 if x else -1, y),
    # Indexed access into the candidate `c`.
    'arg': lambda x: "c[%d]" % x,
    'string': _quote_literal,
    # Boolean combinations are parenthesized so that nesting them keeps the
    # intended grouping: without parentheses, or-inside-and would render as
    # "a or b and c", which Python reads as "a or (b and c)".
    'and': lambda x, y: "(%s and %s)" % (x, y),
    'or': lambda x, y: "(%s or %s)" % (x, y),
    'in': lambda x, y: "%s in %s" % (x, y),
    'length': lambda x: "len(%s)" % x,
    'upper': lambda x: "%s.isupper()" % x,
    'lower': lambda x: "%s.islower()" % x,
    # Token-window helpers; the emitted names (get_left_tokens, ...) must be
    # resolvable wherever the generated source is evaluated.
    'left': lambda x: "get_left_tokens(%s, window=100)" % x,
    'right': lambda x: "get_right_tokens(%s, window=100)" % x,
    'between': lambda x, y: "get_between_tokens((%s, %s))" % (x, y)
}

In [9]:
# "A candidate is False if: the word 'votes' is to the right of arg1."
# false word votes right arg 1
# semantics=('label', False, ('in', ('string', 'votes'), ('right', ('arg', 1))))

In [10]:
from snorkel.semparser import Grammar, Evaluator

# Grammar built from the rule set, and an evaluator built from the operator table.
snorkel_grammar = Grammar(snorkel_rules)
snorkel_evaluator = Evaluator(snorkel_ops)

# For every example: parse its input, then for each parse show the parse
# itself, its semantics, the generated labeling-function source, and the
# label that function assigns to the example's candidate.
for example in spouse_examples:
    parses = snorkel_grammar.parse_input(example.input)
    print('')
    print('%-12s %s' % ('input', example.input))
    for idx, parse in enumerate(parses):
        print('parse %d' % idx)
        print('\t%s' % parse)
        sem = parse.semantics
        # sem is a tuple, so wrap it to format it as a single value.
        print('\t%s' % (sem,))
        LF, LFstring = snorkel_evaluator.evaluate(sem)
        print('\t%s' % (LFstring,))
        print(LF(example.candidate))

Created grammar with 15 rules.
> /Users/bradenhancock/snorkel/snorkel/semparser.py(163)parse_input()
-> if self.start_symbol:
(Pdb) print parses
[<snorkel.semparser.Parse instance at 0x11bdbe7a0>]
(Pdb) print parses[0]
($Root ($Bool false) ($In ($String ($StrKeyword word) ($Text votes)) ($WordList ($WordLister right) ($ArgX ($Arg arg) ($Num 1)))))
(Pdb) c

input        false word votes right arg 1
