# Table Tutorial

## Part I: Preprocessing

In [None]:
import os

# Start from a clean database. Only delete the file when it is actually
# present: os.remove() raises for a missing path, which used to make the very
# first cell fail on a fresh checkout or after a previous cleanup.
if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')

In [None]:
# Notebook setup: auto-reload imported modules and render plots inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Open the Snorkel session; every later cell reads and writes through it.
from snorkel import SnorkelSession
session = SnorkelSession()

## Loading the `Corpus`

First, we load and pre-process the corpus, storing it for convenience in a `Corpus` object.

### Configuring a DocParser & ContextParser

In [None]:
from snorkel.parser import CorpusParser
from snorkel.parser import HTMLParser
from snorkel.parser import OmniParser

# Documents are read as HTML from data/hardware/hardware_html/ and handed to
# an OmniParser for context parsing.
doc_parser = HTMLParser(path='data/hardware/hardware_html/')
context_parser = OmniParser()
# NOTE(review): max_docs=101, yet the recorded output of the reload cell
# reports 100 documents — confirm the intended cap.
cp = CorpusParser(doc_parser, context_parser, max_docs=101)

In [None]:
# Parse the documents into a corpus named 'Hardware Training' (timed).
%time corpus = cp.parse_corpus(name='Hardware Training', session=session)

### Saving the `Corpus`
Finally, we persist the parsed corpus in Snorkel's database backend:

In [None]:
# Stage the parsed corpus in the session, then commit it to the database.
session.add(corpus)
session.commit()

### Reloading the `Corpus`
If the corpus has already been parsed, load it here:

In [46]:
from snorkel.models import Corpus

# Look up the saved corpus by its name and report its size.
corpus = (
    session.query(Corpus)
    .filter(Corpus.name == 'Hardware Training')
    .one()
)
print(corpus)
print("%s contains %d Documents" % (corpus.name, len(corpus)))

Corpus (Hardware Training)
Hardware Training contains 100 Documents


In [47]:
# Peek at the first document, then at its first phrase with the phrase's
# words and part-of-speech tags.
doc = corpus.documents[0]
print(doc)
phrase = doc.phrases[0]
print(phrase)
print(phrase.words)
print(phrase.pos_tags)

Document BC337
Phrase(Document BC337, 0, u'BC337 BC338 - NPN Epitaxial Silicon Transistor')
[u'BC337', u'BC338', u'-', u'NPN', u'Epitaxial', u'Silicon', u'Transistor']
[u'NN', u'NN', u':', u'NNP', u'NNP', u'NNP', u'NNP']


In [None]:
from snorkel.models import candidate_subclass

# Candidate type for this task: a relation with two arguments, 'part' and 'temp'.
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

In [None]:
from snorkel.candidates import OmniNgrams

# Span generator shared by both candidate arguments.
# NOTE(review): n_max=3 is presumably the longest n-gram considered and
# split_tokens=None presumably disables token splitting — confirm.
omni_ngrams = OmniNgrams(n_max=3, split_tokens=None)

In [None]:
from hardware_utils import load_extended_parts_dict

# Load the part numbers from the gold CSV and report how many were read.
filename = 'data/hardware/gold_all.csv'
parts_dict = load_extended_parts_dict(filename)
print("Loaded %d part numbers." % len(parts_dict))

In [None]:
from snorkel.matchers import DictionaryMatch, RegexMatchSpan

# Parts: matched against the loaded part-number dictionary, ignoring case.
parts_matcher = DictionaryMatch(d=parts_dict, ignore_case=True)
# Temperatures: a minus sign, an optional whitespace character, a digit 5-7,
# then 0 or 5 (e.g. "-55", "-65", "- 70").
# NOTE: the ur'' string prefix is Python 2-only syntax.
temp_matcher = RegexMatchSpan(rgx=ur'-\s?[5-7][05]')

In [None]:
from snorkel.candidates import CandidateExtractor

# Build the extractor for Part_Temp candidates from two span generators and
# two matchers.
# NOTE(review): the lists are presumably positional, i.e. one generator and
# one matcher per candidate argument ('part', then 'temp') — confirm.
ce = CandidateExtractor(Part_Temp, [omni_ngrams, omni_ngrams], [parts_matcher, temp_matcher])

In [None]:
%time c = ce.extract(corpus.documents, 'Hardware Training Candidates', session)
print "Number of candidates:", len(c)

In [None]:
# session.rollback()
# session.delete(c)
# session.commit()

In [None]:
# Show the first five extracted candidates.
for cand in c[:5]:
    print(cand)

### Saving the extracted candidates

In [None]:
# Stage the extracted candidate set in the session, then commit it.
session.add(c)
session.commit()

### Reloading the candidates

In [48]:
from snorkel.models import CandidateSet

# Look up the saved candidate set by name and report how many candidates it holds.
candidates = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Training Candidates')
    .one()
)
print(candidates)
print("%s contains %d Candidates" % (candidates.name, len(candidates)))

Candidate Set (Hardware Training Candidates)
Hardware Training Candidates contains 646 Candidates


In [49]:
# TODO: create the gold labels and the gold candidate set.

In [50]:
from snorkel.annotations import FeatureManager

# Manager used below to create and load the feature matrix.
feature_manager = FeatureManager()

In [51]:
from snorkel.models import CandidateSet
# Training split: the candidate set saved as 'Hardware Training Candidates'.
train = session.query(CandidateSet).filter(CandidateSet.name == 'Hardware Training Candidates').one()
# NOTE(review): the development split is disabled here, so `dev` stays
# undefined even though the evaluation cells at the end of the notebook use it.
# dev = session.query(CandidateSet).filter(CandidateSet.name == 'Hardware Development Candidates').one()

In [None]:
# Compute features for the training candidates and store them under the
# name 'Train Features' (timed).
%time F_train = feature_manager.create(session, train, 'Train Features')

### Reloading the feature matrix

In [53]:
# Load the features previously stored as 'Train Features' instead of
# recomputing them (timed).
%time F_train = feature_manager.load(session, train, 'Train Features')

CPU times: user 237 ms, sys: 8.09 ms, total: 245 ms
Wall time: 247 ms


In [54]:
# Display the feature matrix (its repr shows the shape and sparse format).
F_train

<646x1714 sparse matrix of type '<type 'numpy.float64'>'
	with 10918 stored elements in Compressed Sparse Row format>

In [55]:
# Candidate associated with index 0 of the feature matrix.
F_train.get_candidate(0)

Part_Temp(Span("BC338", parent=1016, chars=[6,10], words=[1,1]), Span("-55", parent=93265, chars=[0,2], words=[0,0]))

In [56]:
# Annotation key (feature name) associated with index 0 of the feature matrix.
F_train.get_key(0)

AnnotationKey (TDL_INV_LEMMA:PARENTS-OF-BETWEEN-MENTION-and-MENTION[None])

In [57]:
from snorkel.annotations import LabelManager

# Manager used below to create and load label matrices.
label_manager = LabelManager()

In [4]:
# TEMP
# NOTE(review): scratch cell. It repeats setup already done in earlier cells
# (session, Part_Temp, the training candidate set) and rebinds `train`,
# apparently so the cells below can run without the rest of the notebook.
# Remove it once the notebook runs top to bottom.
from snorkel import SnorkelSession
session = SnorkelSession()
from snorkel.models import candidate_subclass
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])
from snorkel.models import CandidateSet
candidates = session.query(CandidateSet).filter(CandidateSet.name == 'Hardware Training Candidates').one()
print candidates
train = candidates

Candidate Set (Hardware Training Candidates)


In [40]:
from snorkel.lf_helpers import (
    contains_token,
    get_aligned_ngrams,
    get_cell_ngrams,
    get_col_ngrams,
    get_left_ngrams,
    get_neighbor_cell_ngrams,
    get_phrase_ngrams,
    get_right_ngrams,
    get_row_ngrams,
)

# Try each n-gram helper on the second argument (index 1) of the first
# training candidate to see what context it returns.
cand = train[0]
print(cand)
print(cand[1].parent.cell.text)
print(get_left_ngrams(cand[1]))
print(get_right_ngrams(cand[1]))
print(get_phrase_ngrams(cand[1]))
print(get_cell_ngrams(cand[1]))
print(get_neighbor_cell_ngrams(cand[1]))
print(get_row_ngrams(cand[1]))
print(get_col_ngrams(cand[1]))
print(get_aligned_ngrams(cand[1]))

Part_Temp(Span("BC338", parent=1016, chars=[6,10], words=[1,1]), Span("-55", parent=93265, chars=[0,2], words=[0,0]))
-55 to 150
[]
[u'to', u'150']
[u'to', u'150']
[u'-55', u'to', u'150']
[u'150', u'storage', u'temperature', u'?', u'c']
[u't', u'stg', u'storage', u'temperature', u'?', u'c']
[u'value', u'bc337', u'bc337', u'5', u'800', u'150']
[u't', u'stg', u'storage', u'temperature', u'?', u'c', u'value', u'bc337', u'bc337', u'5', u'800', u'150']


In [44]:
# Import only the helpers the labeling functions below actually call; a
# wildcard import hides where each name comes from.
from snorkel.lf_helpers import (
    contains_token,
    get_aligned_ngrams,
    get_right_ngrams,
    get_row_ngrams,
)

# Registry of labeling functions; each LF defined below appends itself.
LFs = []

def LF_to_range(c):
    """Return 1 when 'to' is among the n-grams to the right of `c`, else 0."""
    # NOTE(review): the exploration cell calls this helper on cand[1] (a
    # span), not on a whole candidate — confirm that passing `c` is supported.
    if 'to' in get_right_ngrams(c):
        return 1
    return 0
LFs.append(LF_to_range)

def LF_tilde_range(c):
    """Return 1 when '~' is among the n-grams to the right of `c`, else 0."""
    if '~' in get_right_ngrams(c):
        return 1
    return 0
LFs.append(LF_tilde_range)

def LF_through_range(c):
    """Return 1 when 'through' or 'thru' is among the n-grams right of `c`, else 0.

    The previous version compared the intersection set itself with ``> 0``.
    A set is not a count, so that was never a valid emptiness test; the
    overlap is now checked by truthiness (an empty set is falsy).
    """
    overlap = set(['through', 'thru']).intersection(get_right_ngrams(c))
    return 1 if overlap else 0
LFs.append(LF_through_range)

def LF_contains_minus(c):
    """Return 1 if `c` contains the token '-' or the token '-50', else -1."""
    has_minus = contains_token(c, '-') or contains_token(c, '-50')
    if has_minus:
        return 1
    return -1
LFs.append(LF_contains_minus)

def LF_storage(c):
    """Return 1 when 'storage' is among the row n-grams (words) of `c`, else -1."""
    row_words = get_row_ngrams(c, attrib='words')
    if 'storage' in row_words:
        return 1
    return -1
LFs.append(LF_storage)

def LF_tstg(c):
    """Return 1 when 'tstg' is among the row n-grams (words) of `c`, else -1."""
    row_words = get_row_ngrams(c, attrib='words')
    if 'tstg' in row_words:
        return 1
    return -1
LFs.append(LF_tstg)

def LF_tj(c):
    """Return 1 when 'tj' is among the row n-grams (words) of `c`, else -1."""
    row_words = get_row_ngrams(c, attrib='words')
    if 'tj' in row_words:
        return 1
    return -1
LFs.append(LF_tj)

def LF_temperature(c):
    """Return 1 when 'temperature' is among the row n-grams (words) of `c`, else -1."""
    row_words = get_row_ngrams(c, attrib='words')
    if 'temperature' in row_words:
        return 1
    return -1
LFs.append(LF_temperature)

def LF_celsius(c):
    """Return 1 when 'c' is among the row n-grams (words) of `c`, else -1."""
    row_words = get_row_ngrams(c, attrib='words')
    if 'c' in row_words:
        return 1
    return -1
LFs.append(LF_celsius)

def LF_max(c):
    """Return 1 when 'max' is among the aligned n-grams (words) of `c`, else 0."""
    aligned_words = get_aligned_ngrams(c, attrib='words')
    if 'max' in aligned_words:
        return 1
    return 0
LFs.append(LF_max)

def LF_min(c):
    """Return 1 when 'min' is among the aligned n-grams (words) of `c`, else 0."""
    aligned_words = get_aligned_ngrams(c, attrib='words')
    if 'min' in aligned_words:
        return 1
    return 0
LFs.append(LF_min)

In [58]:
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
L_train

ValueError: AnnotationKeySet with name LF Labels already exists in the database. Please specify a new name.

In [None]:
# session.rollback()
# session.delete(L_train)
# session.commit()

In [59]:
# Show per-labeling-function statistics for the training label matrix.
L_train.lf_stats()

NameError: name 'L_train' is not defined

In [None]:
from snorkel.learning import NaiveBayes

# Fit the generative model on the labeling-function label matrix.
# NOTE(review): n_iter and rate are hard-coded; presumably the iteration count
# and the learning rate — confirm.
gen_model = NaiveBayes()
gen_model.train(L_train, n_iter=3000, rate=1e-5)

In [None]:
# Persist the generative model under the name 'Generative Params'.
gen_model.save(session, 'Generative Params')

In [None]:
# Marginals from the generative model; used as training targets for the
# discriminative model below.
train_marginals = gen_model.marginals(L_train)

In [None]:
from snorkel.learning import LogReg

# Fit the discriminative model on the training features against the
# generative model's marginals.
# NOTE(review): n_iter and rate are hard-coded; presumably the iteration count
# and the learning rate — confirm.
disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=5000, rate=1e-3)

In [None]:
# Inspect the shape of the learned weights.
disc_model.w.shape

In [None]:
# Persist the discriminative model under the name "Discriminative Params" (timed).
%time disc_model.save(session, "Discriminative Params")

In [None]:
# NOTE(review): `dev` is not assigned anywhere in the visible notebook (its
# only assignment is commented out in the cell that defines `train`), so this
# cell raises NameError when run top to bottom.
%time F_dev = feature_manager.update(session, dev, 'Train Features', False)

In [None]:
# NOTE(review): the label name says "CDR", while every other set in this
# notebook is named "Hardware ..." — it looks copied from another tutorial;
# confirm the intended name. Also relies on `dev`, which is not assigned in
# the visible notebook.
L_dev = label_manager.load(session, dev, "CDR Development Labels -- Gold")

In [None]:
# NOTE(review): the set name says "CDR", while every other set in this notebook
# is named "Hardware ..." — confirm that a set with this name exists for the
# hardware task.
gold_dev_set = session.query(CandidateSet).filter(CandidateSet.name == 'CDR Development Candidates -- Gold').one()

In [None]:
# Score the discriminative model on the development features against the gold
# labels and gold candidate set. The four results are presumably the true
# positives, false positives, true negatives and false negatives — confirm.
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, gold_dev_set)