# Table Tutorial

## Part I: Preprocessing

In [1]:
import errno
import os

# Start from a clean slate by deleting any database left by a previous run.
# A missing file is fine (fresh checkout, or the cell was already executed);
# every other OS error (e.g. permissions) is still raised.
try:
    os.remove('snorkel.db')
except OSError as err:
    if err.errno != errno.ENOENT:
        raise

In [2]:
# Load IPython's autoreload extension in mode 2 so edited modules are
# re-imported before each cell executes.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline

from snorkel import SnorkelSession
# Session object passed to the parsing, extraction, feature and label calls below.
session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loading the `Corpus`

First, we will load and pre-process the corpus, storing it for convenience in a `Corpus` object.

### Configuring a DocParser & ContextParser

In [3]:
from snorkel.parser import CorpusParser, HTMLParser, OmniParser

# Document-level parser pointed at the hardware HTML directory.
doc_parser = HTMLParser(path='data/hardware/hardware_html/')

# Parser applied to the contents of each document.
context_parser = OmniParser()

# Tie both parsers together; max_docs=101 is the document cap passed through.
cp = CorpusParser(
    doc_parser,
    context_parser,
    max_docs=101,
)

In [4]:
# Run the configured CorpusParser and keep the result as `corpus`; %time
# reports the CPU and wall-clock cost of the parse.
%time corpus = cp.parse_corpus(name='Hardware Training', session=session)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67




68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
CPU times: user 5min 3s, sys: 12.9 s, total: 5min 16s
Wall time: 7min 33s


### Pre-processing & loading the `Corpus`

The `CorpusParser` configured above ties everything together: it executes the parsers and stores the results as a `Corpus`. Let's look at the first few parsed documents:

In [14]:
# Print the first five documents of the corpus, one per line.
first_documents = corpus.documents[:5]
for document in first_documents:
    print(document)

Document SGSTS13702-1
Document SIEMS01215-1
Document TKCGS00622-1
Document RECTS01158-1
Document MOTOS04796-1


In [6]:
# Take the first document and its first phrase, then print the document, the
# phrase, and the phrase's word and part-of-speech lists in that order.
doc = corpus.documents[0]
phrase = doc.phrases[0]
print(doc)
print(phrase)
for attribute_value in (phrase.words, phrase.poses):
    print(attribute_value)

Document SGSTS13702-1
Phrase(Document SGSTS13702-1, 0, u'SMALL SIGNAL NPN TRANSISTORS')
[u'SMALL', u'SIGNAL', u'NPN', u'TRANSISTORS']
[u'NNP', u'NNP', u'NNP', u'NNS']


### Saving the `Corpus`
Finally, we persist the parsed corpus in Snorkel's database backend:

In [7]:
# Stage the parsed corpus on the session, then commit it.
session.add(corpus)
session.commit()

### Loading the `Corpus`
If the corpus has already been parsed, load it here:

In [15]:
from snorkel.models import Corpus

# Look the corpus up by name; .one() is called on the filtered query, so
# exactly one matching row is expected.
corpus_query = session.query(Corpus).filter(
    Corpus.name == 'Hardware Training'
)
corpus = corpus_query.one()
corpus

Corpus (Hardware Training)

In [None]:
from snorkel.models import candidate_subclass

# Declare the candidate class used for extraction, named 'Part_Temp' and built
# from the two argument names 'part' and 'temp'.
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

In [18]:
from snorkel.candidates import OmniNgrams

# Span generator that is passed to the CandidateExtractor for both arguments.
# NOTE(review): n_max=3 presumably caps n-grams at three tokens — confirm.
omni_ngrams = OmniNgrams(n_max=3, split_tokens=None)

In [19]:
from utils import load_extended_parts_dict

# Build the part-number dictionary from the gold CSV and report its size.
filename = 'data/hardware/gold_all.csv'
parts_dict = load_extended_parts_dict(filename)
part_count = len(parts_dict)
print("Loaded %d part numbers." % part_count)

Loaded 5331 part numbers.


In [20]:
from snorkel.matchers import DictionaryMatch, RegexMatchSpan

# Part matcher: dictionary lookup against the loaded part numbers, ignoring case.
parts_matcher = DictionaryMatch(d=parts_dict, ignore_case=True)
# Disabled alternative whose leading character class also accepts the Unicode
# dash/minus code points U+2010-U+2015 and U+2212:
# temp_matcher = RegexMatchSpan(rgx=ur'[\-\u2010\u2011\u2012\u2013\u2014\u2015\u2212]\s?[5-7][05]')
# Active temperature pattern: an ASCII hyphen, optional whitespace, a digit in
# 5-7, then 0 or 5 (e.g. "-55", "-65").
temp_matcher = RegexMatchSpan(rgx=ur'-\s?[5-7][05]')

In [21]:
from snorkel.candidates import CandidateExtractor

# Build the extractor for Part_Temp candidates. The two lists are given in the
# same order: (omni_ngrams, parts_matcher) first, (omni_ngrams, temp_matcher) second.
# NOTE(review): presumably position i feeds argument i of Part_Temp — confirm.
ce = CandidateExtractor(Part_Temp, [omni_ngrams, omni_ngrams], [parts_matcher, temp_matcher])
# ce = CandidateExtractor(Temp, [ngrams], [temp_matcher])

In [22]:
%time c = ce.extract(corpus.documents, 'Hardware Training Candidates', session)
print "Number of candidates:", len(c)


CPU times: user 47.7 s, sys: 913 ms, total: 48.6 s
Wall time: 48.7 s
Number of candidates: 646


In [None]:
# session.rollback()
# session.delete(c)
# session.commit()

In [23]:
# Print the first ten extracted candidates, one per line.
leading_candidates = c[:10]
for cand in leading_candidates:
    print(cand)

Part_Temp(Span("BC547C", parent=82184, chars=[9,14], words=[2,2]), Span("-65", parent=124033, chars=[0,2], words=[0,0]))
Part_Temp(Span("DTC143ZE", parent=125634, chars=[72,79], words=[8,8]), Span("-55", parent=125458, chars=[0,2], words=[0,0]))
Part_Temp(Span("DTC124XE", parent=125634, chars=[81,88], words=[9,9]), Span("-55", parent=125458, chars=[0,2], words=[0,0]))
Part_Temp(Span("DTC144EE", parent=125634, chars=[18,25], words=[2,2]), Span("-55", parent=125458, chars=[0,2], words=[0,0]))
Part_Temp(Span("DTC114TE", parent=125634, chars=[36,43], words=[4,4]), Span("-55", parent=125458, chars=[0,2], words=[0,0]))
Part_Temp(Span("DTC123EE", parent=125634, chars=[54,61], words=[6,6]), Span("-55", parent=125458, chars=[0,2], words=[0,0]))
Part_Temp(Span("DTC123JE", parent=83605, chars=[12,19], words=[2,2]), Span("-55", parent=125458, chars=[0,2], words=[0,0]))
Part_Temp(Span("DTC124EE", parent=125634, chars=[9,16], words=[1,1]), Span("-55", parent=125458, chars=[0,2], words=[0,0]))
Part_T

### Saving the extracted candidates

In [24]:
# Stage the extracted candidate set on the session, then commit it.
session.add(c)
session.commit()

### Reloading the candidates

In [25]:
from snorkel.models import CandidateSet

# Fetch the stored candidate set by name; .one() is called on the filtered
# query, so exactly one matching row is expected.
candidate_set_query = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates'
)
c = candidate_set_query.one()
c

Candidate Set (Hardware Training Candidates)

In [None]:
# TODO: build the gold candidate set and the gold labels

In [None]:
from snorkel.annotations import FeatureManager

# Manager used by later cells to create and update features for candidate sets.
feature_manager = FeatureManager()

In [None]:
# Fetch the training candidate set by name.
train = session.query(CandidateSet).filter(CandidateSet.name == 'Hardware Training Candidates').one()
# NOTE(review): `dev` is used by later cells (feature_manager.update and
# label_manager.load) but its only assignment is the disabled line below, so
# those cells raise NameError on a fresh kernel. Confirm that a development
# candidate set exists before re-enabling.
# dev = session.query(CandidateSet).filter(CandidateSet.name == 'Hardware Development Candidates').one()

In [None]:
# Create the features for the training candidates under the name
# 'Train Features'; %time reports the cost.
%time F_train = feature_manager.create(session, train, 'Train Features')

In [None]:
# Display the training feature object.
F_train

In [None]:
# Show the candidate at index 0 of F_train as a unicode string.
unicode(F_train.get_candidate(0))

In [None]:
# Show the key at index 0 of F_train (presumably a feature key — confirm).
F_train.get_key(0)

In [None]:
from snorkel.annotations import LabelManager

# Manager used by later cells to create and load labels for candidate sets.
label_manager = LabelManager()

In [28]:
from snorkel.lf_helpers import get_right_tokens, get_left_tokens, contains_token
from snorkel.lf_helpers import get_cell_ngrams

# Explore the labeling-function helpers on the first extracted candidate.
cand = c[0]
print(cand)

# Left/right tokens for the whole candidate ...
print(get_left_tokens(cand))
print(get_right_tokens(cand))
# ... for its first argument (the part span) ...
print(get_left_tokens(cand[0]))
print(get_right_tokens(cand[0]))
# ... and for its second argument (the temperature span).
print(get_left_tokens(cand[1]))
print(get_right_tokens(cand[1]))

# The temperature span itself, then the case-sensitive unigrams that
# get_cell_ngrams returns for it.
print(cand[1])
print(get_cell_ngrams(cand[1], n_max=1, case_sensitive=True))

# The trailing `print get_neighborhood()` call was dropped: that name is never
# imported or defined in this notebook, so it ended the cell with a NameError.

Part_Temp(Span("BC547C", parent=82184, chars=[9,14], words=[2,2]), Span("-65", parent=124033, chars=[0,2], words=[0,0]))
[u'bc547b', u'/']
[u'to', u'150']
[u'bc547b', u'/']
[]
[]
[u'to', u'150']
Span("-65", parent=124033, chars=[0,2], words=[0,0])
[u'-65', u'to', u'150']


In [None]:
from snorkel.lf_helpers import get_left_tokens, get_right_tokens, contains_token

# Registry of labeling functions, filled as each one is defined.
LFs = []

def LF_to_range(c):
    """Vote 1 when the token 'to' is among get_right_tokens(c); otherwise abstain with 0."""
    if 'to' in get_right_tokens(c):
        return 1
    return 0
LFs.append(LF_to_range)

def LF_tilde_range(c):
    """Vote 1 when the token '~' is among get_right_tokens(c); otherwise abstain with 0."""
    if '~' in get_right_tokens(c):
        return 1
    return 0
LFs.append(LF_tilde_range)

def LF_contains_minus(c):
    """Vote 1 when contains_token finds '-' or '-50' in the candidate; otherwise vote -1."""
    # '-' is checked first and '-50' only if that fails, as in an `or` chain.
    if contains_token(c, '-'):
        return 1
    if contains_token(c,'-50'):
        return 1
    return -1
LFs.append(LF_contains_minus)

# def LF_storage(m):
#     return 1 if 'storage' in m.aligned_ngrams('words') else -1
# LFs.append(LF_storage)

# def LF_tstg(m):
#     return 1 if 'tstg' in m.aligned_ngrams('words') else -1
# LFs.append(LF_tstg)

# def LF_tj(m):
#     return 1 if 'tj' in m.aligned_ngrams('words') else -1
# LFs.append(LF_tj)

# def LF_temperature(m):
#     return 1 if 'temperature' in m.aligned_ngrams('words') else -1
# LFs.append(LF_temperature)

# def LF_celsius(m):
#     return 1 if 'c' in m.aligned_ngrams('words') else -1
# LFs.append(LF_celsius)

# def LF_max(m):
#     return 1 if 'max' in m.aligned_ngrams('words') else 0
# LFs.append(LF_max)

# def LF_min(m):
#     return 1 if 'min' in m.aligned_ngrams('words') else 0
# LFs.append(LF_min)

In [None]:
# Create the 'LF Labels' for the training candidates, passing the labeling
# functions collected in LFs; %time reports the cost.
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
# Display the resulting label object.
L_train

In [None]:
# session.rollback()
# session.delete(L_train)
# session.commit()

In [None]:
# Display the labeling-function statistics for the training labels.
L_train.lf_stats()

In [None]:
from snorkel.learning import NaiveBayes

# Train the generative model on the labeling-function outputs.
# NOTE(review): n_iter and rate look like the iteration count and the learning
# rate — confirm. No random seed is set in this notebook.
gen_model = NaiveBayes()
gen_model.train(L_train, n_iter=3000, rate=1e-5)

In [None]:
# Save the generative model through the session under the name 'Generative Params'.
gen_model.save(session, 'Generative Params')

In [None]:
# Compute the generative model's marginals for the training labels; they are
# passed to the discriminative model's train() call.
train_marginals = gen_model.marginals(L_train)

In [None]:
from snorkel.learning import LogReg

# Train the discriminative model on the training features, using the
# generative model's marginals as targets.
# NOTE(review): n_iter and rate look like the iteration count and the learning
# rate — confirm.
disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=5000, rate=1e-3)

In [None]:
# Show the shape of the discriminative model's `w` attribute (presumably the
# learned weights — confirm).
disc_model.w.shape

In [None]:
# Save the discriminative model through the session under the name
# "Discriminative Params"; %time reports the cost.
%time disc_model.save(session, "Discriminative Params")

In [None]:
# Compute features for the development set under the existing 'Train Features' name.
# NOTE(review): `dev` is never assigned in the visible notebook (its only
# definition is commented out), so this cell raises NameError on a fresh kernel.
# NOTE(review): the meaning of the trailing positional `False` is not visible here — confirm.
%time F_dev = feature_manager.update(session, dev, 'Train Features', False)

In [None]:
# Load the gold labels for the development set.
# NOTE(review): `dev` is never assigned in the visible notebook, and the label
# name starts with "CDR" while every other set here is named "Hardware ..." —
# looks like a leftover from another tutorial; confirm the intended name.
L_dev = label_manager.load(session, dev, "CDR Development Labels -- Gold")

In [None]:
# Fetch the gold development candidate set by name.
# NOTE(review): the name starts with "CDR" while every other set here is named
# "Hardware ..." — looks like a leftover from another tutorial; .one() raises
# if no such set exists. Confirm the intended name.
gold_dev_set = session.query(CandidateSet).filter(CandidateSet.name == 'CDR Development Candidates -- Gold').one()

In [None]:
# Score the discriminative model on the development features against the gold
# labels and gold set, unpacking the four results (presumably true/false
# positives and negatives — confirm).
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, gold_dev_set)