# Table Tutorial

## Preprocessing

In [1]:
import os
os.remove('snorkel.db')

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession
session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Parse the `Corpus`

In [3]:
from snorkel.parser import CorpusParser
from snorkel.parser import HTMLParser
from snorkel.parser import OmniParser

doc_parser = HTMLParser(path='data/hardware/hardware_html/')
context_parser = OmniParser()
cp = CorpusParser(doc_parser, context_parser, max_docs=101)

In [None]:
%time corpus = cp.parse_corpus(name='Hardware Training', session=session)

In [6]:
for doc in corpus.documents[:5]: print doc

Document SCSCS02175-1
Document TWSCS04757-1
Document BC182
Document WEITS00252-1
Document PHGLS20126-1


### Saving the `Corpus`
Finally, we persist the parsed corpus in Snorkel's database backend:

In [7]:
session.add(corpus)
session.commit()

### Reloading the `Corpus`
If the corpus has already been parsed, load it here:

In [8]:
from snorkel.models import Corpus

corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Training').one()
print corpus
print "%s contains %d Documents" % (corpus.name, len(corpus))

Corpus (Hardware Training)
Hardware Training contains 100 Documents


## Extract the `Candidates`

In [9]:
from snorkel.models import candidate_subclass

Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

In [10]:
from snorkel.candidates import OmniNgrams

omni_ngrams = OmniNgrams(n_max=3, split_tokens=None)

In [11]:
from hardware_utils import load_extended_parts_dict

filename='data/hardware/gold_all.csv'
parts_dict = load_extended_parts_dict(filename)
print "Loaded %d part numbers." % len(parts_dict)

Loaded 5331 part numbers.


In [12]:
from snorkel.matchers import DictionaryMatch, RegexMatchSpan

parts_matcher = DictionaryMatch(d=parts_dict, ignore_case=True)
temp_matcher = RegexMatchSpan(rgx=ur'-\s?[5-7][05]')

In [13]:
from snorkel.candidates import CandidateExtractor

ce = CandidateExtractor(Part_Temp, [omni_ngrams, omni_ngrams], [parts_matcher, temp_matcher])

In [None]:
%time candidates = ce.extract(corpus.documents, 'Hardware Training Candidates', session)
print "Number of candidates:", len(candidates)

In [None]:
# session.rollback()
# session.delete(c)
# session.commit()

In [16]:
for c in candidates[:5]:
    print c

Part_Temp(Span("BC548", parent=9040, chars=[16,20], words=[4,4]), Span("-55", parent=124348, chars=[0,2], words=[0,0]))
Part_Temp(Span("BC547", parent=9040, chars=[8,12], words=[2,2]), Span("-55", parent=124348, chars=[0,2], words=[0,0]))
Part_Temp(Span("BC817", parent=9358, chars=[34,38], words=[5,5]), Span("- 50", parent=85151, chars=[0,3], words=[0,1]))
Part_Temp(Span("BC817", parent=9358, chars=[34,38], words=[5,5]), Span("-50", parent=127064, chars=[0,2], words=[0,0]))
Part_Temp(Span("BC807-40", parent=88043, chars=[30,37], words=[8,8]), Span("-50", parent=129202, chars=[0,2], words=[0,0]))


### Saving the extracted candidates

In [17]:
session.add(candidates)
session.commit()

### Reloading the candidates

In [18]:
from snorkel.models import CandidateSet
candidates = session.query(CandidateSet).filter(CandidateSet.name == 'Hardware Training Candidates').one()
print candidates
print "%s contains %d Candidates" % (candidates.name, len(candidates))

Candidate Set (Hardware Training Candidates)
Hardware Training Candidates contains 646 Candidates


## Prepare Gold Data

In [None]:
# Make labels gold, candidates gold

## Generating Features

In [19]:
from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(CandidateSet.name == 'Hardware Training Candidates').one()

In [20]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

### Creating feature matrix

In [21]:
%time F_train = feature_manager.create(session, train, 'Train Features')


Loading sparse Feature matrix...
CPU times: user 21.8 s, sys: 565 ms, total: 22.4 s
Wall time: 22.3 s


### Reloading feature matrix

In [22]:
%time F_train = feature_manager.load(session, train, 'Train Features')

CPU times: user 145 ms, sys: 1.74 ms, total: 147 ms
Wall time: 146 ms


In [23]:
F_train

<646x1714 sparse matrix of type '<type 'numpy.float64'>'
	with 10918 stored elements in Compressed Sparse Row format>

In [24]:
F_train.get_candidate(0)

Part_Temp(Span("BC548", parent=9040, chars=[16,20], words=[4,4]), Span("-55", parent=124348, chars=[0,2], words=[0,0]))

In [25]:
F_train.get_key(0)

AnnotationKey (TDL_INV_LEMMA:PARENTS-OF-BETWEEN-MENTION-and-MENTION[None])

In [26]:
from snorkel.utils import get_keys_by_candidate

for f in get_keys_by_candidate(F_train, F_train.get_candidate(0)): print f

AnnotationKey (TDL_INV_LEMMA:PARENTS-OF-BETWEEN-MENTION-and-MENTION[None])
AnnotationKey (TDL_INV_LEMMA:BETWEEN-MENTION-and-MENTION[bc546])
AnnotationKey (TDL_INV_LEMMA:BETWEEN-MENTION-and-MENTION[bc547])
AnnotationKey (TDL_INV_LEMMA:BETWEEN-MENTION-and-MENTION[bc546 bc547])
AnnotationKey (TDL_INV_DEP_LABEL:BETWEEN-MENTION-and-MENTION[ROOT])
AnnotationKey (TDL_INV_DEP_LABEL:BETWEEN-MENTION-and-MENTION[dep])
AnnotationKey (TDL_INV_DEP_LABEL:BETWEEN-MENTION-and-MENTION[ROOT dep])
AnnotationKey (TDL_INV_DEP_LABEL|LEMMA:BETWEEN-MENTION-and-MENTION[ROOT|bc546])
AnnotationKey (TDL_INV_DEP_LABEL|LEMMA:BETWEEN-MENTION-and-MENTION[dep|bc547])
AnnotationKey (TDL_INV_DEP_LABEL|LEMMA:BETWEEN-MENTION-and-MENTION[ROOT|bc546 dep|bc547])
AnnotationKey (TDL_INV_LEMMA:SEQ-BETWEEN[/])
AnnotationKey (TDL_INV_LEMMA:SEQ-BETWEEN[bc547])
AnnotationKey (TDL_INV_LEMMA:SEQ-BETWEEN[/ bc547])
AnnotationKey (TDL_INV_LEMMA:SEQ-BETWEEN[bc547 /])
AnnotationKey (TDL_INV_LEMMA:SEQ-BETWEEN[/ bc547 /])
AnnotationKey (TDL_

## Applying LFs

In [27]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

In [28]:
# TEMP
# from snorkel import SnorkelSession
# session = SnorkelSession()
# from snorkel.models import candidate_subclass
# Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])
# from snorkel.models import CandidateSet
# candidates = session.query(CandidateSet).filter(CandidateSet.name == 'Hardware Training Candidates').one()
# print candidates
# train = candidates

In [29]:
from snorkel.lf_helpers import get_right_ngrams, get_left_ngrams, contains_token
from snorkel.lf_helpers import get_phrase_ngrams, get_cell_ngrams, get_neighbor_cell_ngrams
from snorkel.lf_helpers import get_row_ngrams, get_col_ngrams, get_aligned_ngrams
cand = train[0]
print cand
print cand[1].parent.cell.text
print get_left_ngrams(cand[1])
print get_right_ngrams(cand[1])
print get_phrase_ngrams(cand[1])
print get_cell_ngrams(cand[1])
print get_neighbor_cell_ngrams(cand[1])
print get_row_ngrams(cand[1])
print get_col_ngrams(cand[1])
print get_aligned_ngrams(cand[1])

Part_Temp(Span("BC548", parent=9040, chars=[16,20], words=[4,4]), Span("-55", parent=124348, chars=[0,2], words=[0,0]))
150,-55~150
[]
[u'~', u'150']
[u'~', u'150']
[u'150', u',', u'-55', u'~', u'150']
[u'625', u't', u'j', u',', u't', u'stg']
[u'junction', u',', u'storage', u'temperature', u't', u'j', u',', u't', u'stg', u'?', u'c']
[u'ratings', u'80', u'50', u'30', u'65', u'45', u'30', u'6', u'100', u'625']
[u'junction', u',', u'storage', u'temperature', u't', u'j', u',', u't', u'stg', u'?', u'c', u'ratings', u'80', u'50', u'30', u'65', u'45', u'30', u'6', u'100', u'625']


In [30]:
from snorkel.lf_helpers import *

LFs = []

def LF_to_range(c):
    return 1 if 'to' in get_right_ngrams(c) else 0
LFs.append(LF_to_range)

def LF_tilde_range(c):
    return 1 if '~' in get_right_ngrams(c) else 0
LFs.append(LF_tilde_range)

def LF_through_range(c):
    return 1 if set(['through','thru']).intersection(set(get_right_ngrams(c))) > 0 else 0
LFs.append(LF_through_range)

def LF_contains_minus(c):
    return 1 if contains_token(c, '-') or contains_token(c,'-50') else -1
LFs.append(LF_contains_minus)

def LF_storage(c):
    return 1 if 'storage' in get_row_ngrams(c, attrib='words') else -1
LFs.append(LF_storage)

def LF_tstg(c):
    return 1 if 'tstg' in get_row_ngrams(c, attrib='words') else -1
LFs.append(LF_tstg)

def LF_tj(c):
    return 1 if 'tj' in get_row_ngrams(c, attrib='words') else -1
LFs.append(LF_tj)

def LF_temperature(c):
    return 1 if 'temperature' in get_row_ngrams(c, attrib='words') else -1
LFs.append(LF_temperature)

def LF_celsius(c):
    return 1 if 'c' in get_row_ngrams(c, attrib='words') else -1
LFs.append(LF_celsius)

def LF_max(c):
    return 1 if 'max' in get_aligned_ngrams(c, attrib='words') else 0
LFs.append(LF_max)

def LF_min(c):
    return 1 if 'min' in get_aligned_ngrams(c, attrib='words') else 0
LFs.append(LF_min)

In [35]:
%time L_train = label_manager.create(session, train, 'LF Labels2', f=LFs)
L_train


Loading sparse Label matrix...
CPU times: user 10.6 s, sys: 480 ms, total: 11 s
Wall time: 10.8 s


<646x11 sparse matrix of type '<type 'numpy.float64'>'
	with 4603 stored elements in Compressed Sparse Row format>

In [37]:
L_train.lf_stats()

Unnamed: 0,conflicts,coverage,j,overlaps
LF_to_range,0.236842,0.236842,0,0.236842
LF_tilde_range,0.040248,0.040248,1,0.040248
LF_through_range,0.978328,0.978328,2,0.978328
LF_contains_minus,0.978328,0.978328,3,0.978328
LF_storage,0.978328,0.978328,4,0.978328
LF_tstg,0.978328,0.978328,5,0.978328
LF_tj,0.978328,0.978328,6,0.978328
LF_temperature,0.978328,0.978328,7,0.978328
LF_celsius,0.978328,0.978328,8,0.978328
LF_max,0.0,0.0,9,0.0


## Train Generative Model

In [38]:
from snorkel.learning import NaiveBayes

gen_model = NaiveBayes()
gen_model.train(L_train, n_iter=3000, rate=1e-5)

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



Training marginals (!= 0.5):	646
Features:			11
Begin training for rate=1e-05, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.913727
	Learning epoch = 250	Gradient mag. = 0.963354
	Learning epoch = 500	Gradient mag. = 0.964895
	Learning epoch = 750	Gradient mag. = 0.966439
	Learning epoch = 1000	Gradient mag. = 0.967987
	Learning epoch = 1250	Gradient mag. = 0.969537
	Learning epoch = 1500	Gradient mag. = 0.971091
	Learning epoch = 1750	Gradient mag. = 0.972648
	Learning epoch = 2000	Gradient mag. = 0.974208
	Learning epoch = 2250	Gradient mag. = 0.975771
	Learning epoch = 2500	Gradient mag. = 0.977337
	Learning epoch = 2750	Gradient mag. = 0.978907
Final gradient magnitude for rate=1e-05, mu=1e-06: 0.980


In [39]:
gen_model.save(session, 'Generative Params')

In [40]:
train_marginals = gen_model.marginals(L_train)

## Train Discriminative Model

In [41]:
from snorkel.learning import LogReg

disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=5000, rate=1e-3)

Training marginals (!= 0.5):	632
Features:			1714
Using gradient descent...
	Learning epoch = 0	Step size = 0.001
	Loss = 438.069018	Gradient magnitude = 441.369984
	Learning epoch = 100	Step size = 0.000904792147114
	Loss = 114.074824	Gradient magnitude = 9.462703
	Learning epoch = 200	Step size = 0.000818648829479
	Loss = 109.571442	Gradient magnitude = 5.597344
	Learning epoch = 300	Step size = 0.000740707032156
	Loss = 107.721648	Gradient magnitude = 4.271160
	Learning epoch = 400	Step size = 0.000670185906007
	Loss = 106.637602	Gradient magnitude = 3.614356
	Learning epoch = 500	Step size = 0.000606378944861
	Loss = 105.896542	Gradient magnitude = 3.221343
	Learning epoch = 600	Step size = 0.000548646907485
	Loss = 105.347617	Gradient magnitude = 2.956399
	Learning epoch = 700	Step size = 0.000496411413431
	Loss = 104.921375	Gradient magnitude = 2.763498
	Learning epoch = 800	Step size = 0.00044914914861
	Loss = 104.580079	Gradient magnitude = 2.615856
	Learning epoch = 900	Step s

In [42]:
disc_model.w.shape

(1714,)

In [43]:
%time disc_model.save(session, "Discriminative Params")

CPU times: user 511 ms, sys: 21.6 ms, total: 532 ms
Wall time: 533 ms


## Assess Performance on Dev Set

In [45]:
# Temporarily test on the training set:
from snorkel.models import CandidateSet
dev = session.query(CandidateSet).filter(CandidateSet.name == 'Hardware Training Candidates').one()

In [46]:
%time F_dev = feature_manager.update(session, dev, 'Train Features', False)


Loading sparse Feature matrix...
CPU times: user 15.9 s, sys: 574 ms, total: 16.5 s
Wall time: 16.1 s


In [None]:
L_dev = label_manager.load(session, dev, "CDR Development Labels -- Gold")

In [None]:
gold_dev_set = session.query(CandidateSet).filter(CandidateSet.name == 'CDR Development Candidates -- Gold').one()

In [None]:
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, gold_dev_set)