## Part V: Training a Model with Data Programming

In [1]:
# If necessary: reset the working database from the 'snorkel.db features'
# snapshot (presumably written by an earlier part of this tutorial).
import os
import shutil

# shutil.copyfile overwrites the destination, so no separate os.remove() is
# needed. Unlike the old remove-then-`cp` sequence it raises when the snapshot
# is missing instead of silently leaving the notebook without a snorkel.db.
shutil.copyfile('snorkel.db features', 'snorkel.db');

In [2]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# `session` is passed to every query / load / save call in this notebook.
from snorkel import SnorkelSession
session = SnorkelSession()

In [3]:
from snorkel.models import candidate_subclass

# Declare the candidate type with two named arguments, 'part' and 'temp'.
# Presumably this is the type of the candidates inspected later via
# c.part / c.temp in the error-analysis cells -- confirm.
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Loading the `CandidateSet`, feature matrix, and label matrix

In [4]:
from snorkel.models import CandidateSet

# Look up the training candidate set by name. The query is expected to match
# exactly one row (presumably .one() raises otherwise -- SQLAlchemy-style).
train = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Training Candidates')
    .one()
)

In [5]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

# Load the feature matrix for the training candidates using the key
# 'Train Features'; %time prints how long the load took.
%time F_train = feature_manager.load(session, train, 'Train Features')

CPU times: user 28.9 s, sys: 1.08 s, total: 30 s
Wall time: 30.7 s


In [6]:
from snorkel.annotations import LabelManager

# label_manager is reused further down to load the development gold labels.
label_manager = LabelManager()

# Load the label matrix for the training candidates using the key 'LF Labels'.
%time L_train = label_manager.load(session, train, 'LF Labels')

CPU times: user 651 ms, sys: 27.1 ms, total: 678 ms
Wall time: 843 ms


## Train Generative Model

In [7]:
from snorkel.learning import NaiveBayes

# The generative model is trained on the label matrix alone; no feature
# matrix is passed to it.
gen_model = NaiveBayes()
# Alternative setting kept for reference (fewer iterations, smaller rate):
# gen_model.train(L_train, n_iter=3000, rate=1e-5)
gen_model.train(L_train, n_iter=15000, rate=1e-2)

Training marginals (!= 0.5):	11549
Features:			12
Begin training for rate=0.01, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.318576
	Learning epoch = 250	Gradient mag. = 0.358357
	Learning epoch = 500	Gradient mag. = 0.386901
	Learning epoch = 750	Gradient mag. = 0.430373
	Learning epoch = 1000	Gradient mag. = 0.482670
	Learning epoch = 1250	Gradient mag. = 0.537971
	Learning epoch = 1500	Gradient mag. = 0.579665
	Learning epoch = 1750	Gradient mag. = 0.582700
	Learning epoch = 2000	Gradient mag. = 0.540290
	Learning epoch = 2250	Gradient mag. = 0.505575
	Learning epoch = 2500	Gradient mag. = 0.399090
	Learning epoch = 2750	Gradient mag. = 0.315655
	Learning epoch = 3000	Gradient mag. = 0.270960
	Learning epoch = 3250	Gradient mag. = 0.241593
	Learning epoch = 3500	Gradient mag. = 0.219134
	Learning epoch = 3750	Gradient mag. = 0.200601
	Learning epoch = 4000	Gradient mag. = 0.184801
	Learning epoch = 4250	Gradient mag. = 0.171126
	Learning epoch = 4500	Gradient mag. = 0.159186
	Lear

In [8]:
# Save the trained generative model through the session under the name
# 'Generative Params'.
gen_model.save(session, 'Generative Params')

In [9]:
# Marginals computed by the generative model from L_train; they are the
# training targets handed to the discriminative model's train() call below.
train_marginals = gen_model.marginals(L_train)

In [27]:
from snorkel.learning import odds_to_prob

# Show the learned generative-model weights. The former `print LFs` line was
# dropped: `LFs` is never defined in this notebook, so on a fresh kernel the
# cell raised NameError before the weights were printed.
print(gen_model.w)
# print(odds_to_prob(gen_model.w))

[ 6.51630926  6.57825164  9.26448389  9.99912989  0.29148396 -0.78206868
  1.29662605  9.99997929  1.00617436  8.3852786   0.95328618  9.95273676]


## Train Discriminative Model

In [11]:
from snorkel.learning import LogReg

# Train on the feature matrix, using the generative model's marginals
# (train_marginals) as the training targets.
disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=5000, rate=1e-3)

Training marginals (!= 0.5):	11549
Features:			19955
Using gradient descent...
	Learning epoch = 0	Step size = 0.001
	Loss = 8005.156788	Gradient magnitude = 16280.663070
	Learning epoch = 100	Step size = 0.000904792147114
	Loss = 23.203717	Gradient magnitude = 13.645793
	Learning epoch = 200	Step size = 0.000818648829479
	Loss = 10.004246	Gradient magnitude = 8.902445
	Learning epoch = 300	Step size = 0.000740707032156
	Loss = 6.665091	Gradient magnitude = 4.842081
	Learning epoch = 400	Step size = 0.000670185906007
	Loss = 5.655129	Gradient magnitude = 2.793048
	Learning epoch = 500	Step size = 0.000606378944861
	Loss = 5.325653	Gradient magnitude = 1.826740
	Learning epoch = 600	Step size = 0.000548646907485
	Loss = 5.181452	Gradient magnitude = 1.364277
	Learning epoch = 700	Step size = 0.000496411413431
	Loss = 5.102499	Gradient magnitude = 1.108010
	Learning epoch = 800	Step size = 0.00044914914861
	Loss = 5.052816	Gradient magnitude = 0.949504
	Learning epoch = 900	Step size = 0

In [12]:
# Sanity check: the length of the weight vector should equal the number of
# features reported by the training log above.
disc_model.w.shape

(19955,)

In [13]:
# Save the trained discriminative model through the session under the name
# "Discriminative Params"; %time prints how long the save took.
%time disc_model.save(session, "Discriminative Params")

CPU times: user 6.38 s, sys: 287 ms, total: 6.67 s
Wall time: 6.79 s


## Assess Performance on Development Set

In [14]:
from snorkel.models import CandidateSet

# Look up the development candidate set by name; exactly one match is expected.
dev = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Development Candidates')
    .one()
)

### Loading the Development Feature Matrix and Gold Labels

In [15]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()
# NOTE(review): the development candidates are loaded with the
# 'Train Features' key as well -- presumably so that F_dev has the same
# columns the discriminative model was trained on; confirm.
%time F_dev = feature_manager.load(session, dev, 'Train Features')

CPU times: user 17.5 s, sys: 709 ms, total: 18.2 s
Wall time: 18.6 s


In [16]:
# Gold labels for the development candidates, loaded with the label_manager
# created in the training-data section of this notebook.
L_dev = label_manager.load(session, dev, "Hardware Development Labels -- Gold")

In [17]:
# Candidate set holding the gold development candidates; it is handed to
# disc_model.score() together with F_dev and L_dev.
gold_dev_set = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Development Candidates -- Gold')
    .one()
)

In [18]:
# Sanity check: the column count should match the length of disc_model.w.
F_dev.shape

(3163, 19955)

In [19]:
# score() prints the report shown below and returns four collections,
# unpacked here as tp / fp / tn / fn (presumably true/false positives and
# negatives, matching the printed counts). `fp` is iterated in the
# error-analysis cell and all four are passed to entity_level_f1 later.
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, gold_dev_set)

Calibration plot:
Recall-corrected Noise-aware Model
Pos. class accuracy: 0.767955801105
Neg. class accuracy: 0.774723313102
Corpus Precision 0.306
Corpus Recall    0.768
Corpus F1        0.437
----------------------------------------
TP: 278 | FP: 631 | TN: 2170 | FN: 84

Recall-corrected Noise-aware Model
Pos. class accuracy: 0.767955801105
Neg. class accuracy: 0.774723313102
Corpus Precision 0.306
Corpus Recall    0.768
Corpus F1        0.437
----------------------------------------
TP: 278 | FP: 631 | TN: 2170 | FN: 84





In [20]:
def error_analysis(c):
    """Print a debugging summary of one candidate.

    Prints the document of the candidate's part span, then, for the part and
    the temp span in turn, a separator line, a heading, the span itself and
    its table position as reported by ``table_info``.

    Args:
        c: candidate exposing ``part`` and ``temp`` spans; ``c.part.parent``
            must expose ``document``.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Doc: %s" % c.part.parent.document)
    for heading, attr in (("Part", "part"), ("Temp", "temp")):
        # Fetch each span only when its section is reached, as before.
        span = getattr(c, attr)
        print("------------")
        print("%s:" % heading)
        print(span)
        table_info(span)

def table_info(span):
    """Print where ``span`` sits in its document's table structure.

    Prints the table of the span's parent phrase, then -- only when that
    phrase has a cell -- its row and column numbers, and finally the phrase
    itself.

    Args:
        span: object whose ``parent`` exposes ``table``, ``cell``,
            ``row_num`` and ``col_num``.
    """
    phrase = span.parent
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Table: %s" % phrase.table)
    # row_num / col_num are only read for phrases that have a cell.
    if phrase.cell:
        print("Row: %s" % phrase.row_num)
        print("Col: %s" % phrase.col_num)
    print("Phrase: %s" % phrase)

In [26]:
# List the part text of every false-positive candidate returned by score().
# To drill into a single candidate instead: error_analysis(list(fp)[10])
for c in fp:
    print(c.part.get_span())

BC547
BC547
BC547
BC857BT
BC847DG
BC847W
BC548
BC547
BC847AT
BC859W
BC847AT
BC547
BC857BW
BC847T
BC857AW
BC847DG
BC547
BC547
BC857W
BC849W
BC857
BC550
BC549
BC548
BC550
BC547
BC548
BC547
BC847BT
BC550
BC557B
BC546
BC847BT
BC548
BC847BT
BC847W
BC549
BC817KW
BC847BT
BC857AT
BC817K
BC546
BC860
BC847CW
BC549
BC818KW
BC847AT
BC550
BC849W
BC849W
BC547
BC857A
BC546
BC547
BC847BW
BC857A
BC550
BC847W
BC847CW
BC857BM
BC847T
BC546
BC549
BC857
BC548
BC847AW
BC546
BC818K
BC548
BC847AT
BC548
BC547
BC546
BC548
BC857BM
BC849W
BC847BM
BC847T
BC856W
BC549
BC857CT
BC548
BC846W
BC549
BC848W
BC847BW
BC847W
BC847CT
BC847BW
BC557C
BC857C
BC549
BC849W
BC546
BC549
BC546
BC817K
BC847W
BC547
BC847CT
BC847DG
BC818K
BC546
BC847BM
BC818K
BC546
BC847AT
BC846W
BC860
BC847AM
BC847W
BC857CW
BC859
BC847BM
BC847CW
BC846W
BC549
BC550
BC548
BC846W
BC847AT
BC857CT
BC847DG
BC846W
BC817K
BC550
BC847BW
BC557
BC547
BC847BW
BC859W
BC818K
BC849W
BC847CM
BC848W
BC847T
BC817K
BC857AM
BC856W
BC817KW
BC847BM
BC547
BC182BG
BC818KW
BC8

In [22]:
import os

from snorkel.models import Corpus
from hardware_utils import entity_level_f1

# The gold CSV is resolved relative to the directory named by $SNORKELHOME.
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
corpus = (
    session.query(Corpus)
    .filter(Corpus.name == 'Hardware Development')
    .one()
)
# Score the candidate-level tp / fp / tn / fn against the gold file for the
# 'stg_temp_min' attribute.
entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'stg_temp_min')

Scoring on Entity-Level Gold Data
Corpus Precision 0.418
Corpus Recall    0.582
Corpus F1        0.486
----------------------------------------
TP: 71 | FP: 99 | FN: 51



In [23]:
import os
import shutil

# Snapshot the database as 'snorkel.db final'. shutil.copyfile raises if the
# copy fails, whereas the old os.system('cp ...') call ignored a non-zero
# exit status.
shutil.copyfile('snorkel.db', 'snorkel.db final');

The End.