## Part V: Training a Model with Data Programming

In [1]:
# If necessary: reset snorkel.db from the copy named 'snorkel.db features'.
import os
import shutil

# copyfile overwrites snorkel.db in place, so nothing is deleted up front, and it
# raises if the source copy is missing. The previous remove-then-`cp` sequence
# deleted snorkel.db first and ignored a failed copy, leaving no database at all.
shutil.copyfile('snorkel.db features', 'snorkel.db');

In [2]:
# Enable IPython's autoreload extension (mode 2) so edited modules are
# re-imported without restarting the kernel.
%load_ext autoreload
%autoreload 2

from snorkel import SnorkelSession
# Shared session object; later cells pass it to every query, load and save call.
session = SnorkelSession()

In [3]:
from snorkel.models import candidate_subclass

# Candidate type named 'Part_Temp' with two fields, 'part' and 'temp'
# (error_analysis below reads them as `c.part` and `c.temp`).
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Loading the `CandidateSet`, feature matrix, and label matrix

In [4]:
from snorkel.models import CandidateSet

# Fetch the training candidate set by its name.
# NOTE(review): `.one()` presumably raises unless exactly one row matches
# (SQLAlchemy-style query API) -- confirm.
train = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Training Candidates')
    .one()
)

In [5]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

# Load the 'Train Features' annotations for the training candidates into
# F_train; the %time magic reports how long the load takes.
%time F_train = feature_manager.load(session, train, 'Train Features')

CPU times: user 27.9 s, sys: 1.3 s, total: 29.2 s
Wall time: 29.6 s


In [6]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

# Load the 'LF Labels' annotations for the training candidates into L_train;
# this is the matrix the generative model is trained on below.
%time L_train = label_manager.load(session, train, 'LF Labels')

CPU times: user 512 ms, sys: 23.6 ms, total: 536 ms
Wall time: 571 ms


## Train Generative Model

In [34]:
from snorkel.learning import NaiveBayes

gen_model = NaiveBayes()
# Alternative setting (fewer iterations, smaller rate), kept for comparison:
# gen_model.train(L_train, n_iter=3000, rate=1e-5)
# NOTE(review): the log recorded for the run below shows the gradient magnitude
# rising steadily (about 0.33 at epoch 0 to about 0.47 at epoch 4500) --
# confirm that rate=1e-3 actually converges.
gen_model.train(L_train, n_iter=10000, rate=1e-3)

Training marginals (!= 0.5):	11549
Features:			13
Begin training for rate=0.001, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.328638
	Learning epoch = 250	Gradient mag. = 0.349080
	Learning epoch = 500	Gradient mag. = 0.353049
	Learning epoch = 750	Gradient mag. = 0.357757
	Learning epoch = 1000	Gradient mag. = 0.363124
	Learning epoch = 1250	Gradient mag. = 0.369074
	Learning epoch = 1500	Gradient mag. = 0.375534
	Learning epoch = 1750	Gradient mag. = 0.382440
	Learning epoch = 2000	Gradient mag. = 0.389731
	Learning epoch = 2250	Gradient mag. = 0.397351
	Learning epoch = 2500	Gradient mag. = 0.405251
	Learning epoch = 2750	Gradient mag. = 0.413388
	Learning epoch = 3000	Gradient mag. = 0.421722
	Learning epoch = 3250	Gradient mag. = 0.430219
	Learning epoch = 3500	Gradient mag. = 0.438847
	Learning epoch = 3750	Gradient mag. = 0.447580
	Learning epoch = 4000	Gradient mag. = 0.456394
	Learning epoch = 4250	Gradient mag. = 0.465266
	Learning epoch = 4500	Gradient mag. = 0.474179
	Lea

In [8]:
# Save the trained generative model through the session under the name 'Generative Params'.
gen_model.save(session, 'Generative Params')

In [9]:
# Marginals produced by the generative model for the training label matrix;
# they are passed to disc_model.train below as the training targets.
train_marginals = gen_model.marginals(L_train)

In [35]:
# Show the generative model's learned weights. The single-argument print(...)
# form gives the same output under the Python 2 print statement and is also
# valid on Python 3.
print(gen_model.w)

[ 2.46336601  1.42005359  0.99999     2.01689283  3.36379218  0.99999
  0.50634606  0.56627684  1.2349349   4.56555276  1.52739283  0.99653459
  2.31319165]


## Train Discriminative Model

In [10]:
from snorkel.learning import LogReg

disc_model = LogReg()
# Fit the discriminative model on the feature matrix, using the generative
# model's marginals as targets.
# NOTE(review): in the recorded log the loss at every later logged epoch is
# higher than at epoch 0 (about 8.0e3) and swings between about 2.1e4 and
# 6.8e4 -- confirm the step size / n_iter are appropriate.
disc_model.train(F_train, train_marginals, n_iter=5000, rate=1e-3)

Training marginals (!= 0.5):	11549
Features:			19955
Using gradient descent...
	Learning epoch = 0	Step size = 0.001
	Loss = 8005.156788	Gradient magnitude = 11166.959807
	Learning epoch = 100	Step size = 0.000904792147114
	Loss = 35996.843633	Gradient magnitude = 6470.501391
	Learning epoch = 200	Step size = 0.000818648829479
	Loss = 58941.793690	Gradient magnitude = 29104.813215
	Learning epoch = 300	Step size = 0.000740707032156
	Loss = 67702.749222	Gradient magnitude = 7983.638145
	Learning epoch = 400	Step size = 0.000670185906007
	Loss = 50830.283398	Gradient magnitude = 15359.215311
	Learning epoch = 500	Step size = 0.000606378944861
	Loss = 61617.204091	Gradient magnitude = 8399.724903
	Learning epoch = 600	Step size = 0.000548646907485
	Loss = 20968.759949	Gradient magnitude = 5578.592258
	Learning epoch = 700	Step size = 0.000496411413431
	Loss = 35346.350507	Gradient magnitude = 6105.839504
	Learning epoch = 800	Step size = 0.00044914914861
	Loss = 31588.096425	Gradient magn

In [11]:
# Sanity check: shape of the learned weight vector (compare with the feature
# count reported in the training log).
disc_model.w.shape

(19955,)

In [12]:
# Save the trained discriminative model through the session under the name
# "Discriminative Params"; %time reports the duration.
%time disc_model.save(session, "Discriminative Params")

CPU times: user 5.78 s, sys: 193 ms, total: 5.97 s
Wall time: 6.02 s


## Assess Performance on Development Set

In [13]:
from snorkel.models import CandidateSet

# Fetch the development candidate set by its name.
# NOTE(review): `.one()` presumably raises unless exactly one row matches -- confirm.
dev = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Development Candidates')
    .one()
)

### Loading the Development Feature Matrix

In [14]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()
# The development matrix is loaded with the same 'Train Features' name that was
# used for F_train.
# NOTE(review): presumably this keeps the dev columns aligned with the training
# features (the recorded shapes both show 19955 columns) -- confirm.
%time F_dev = feature_manager.load(session, dev, 'Train Features')

CPU times: user 16 s, sys: 688 ms, total: 16.7 s
Wall time: 17 s


In [15]:
# Load the "Hardware Development Labels -- Gold" annotations for the dev
# candidates; L_dev is passed to disc_model.score below.
L_dev = label_manager.load(session, dev, "Hardware Development Labels -- Gold")

In [16]:
# Fetch the gold-labelled development candidate set by its name.
# NOTE(review): `.one()` presumably raises unless exactly one row matches -- confirm.
gold_dev_set = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Development Candidates -- Gold')
    .one()
)

In [17]:
# Sanity check: dimensions of the development feature matrix.
F_dev.shape

(3163, 19955)

In [18]:
# Score the discriminative model on the development set. The four returned
# collections are unpacked as tp, fp, tn, fn and reused by the error-analysis
# and entity-level scoring cells below.
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, gold_dev_set)

Calibration plot:
Recall-corrected Noise-aware Model
Pos. class accuracy: 0.726519337017
Neg. class accuracy: 0.751874330596
Corpus Precision 0.275
Corpus Recall    0.727
Corpus F1        0.398
----------------------------------------
TP: 263 | FP: 695 | TN: 2106 | FN: 99

Recall-corrected Noise-aware Model
Pos. class accuracy: 0.726519337017
Neg. class accuracy: 0.751874330596
Corpus Precision 0.275
Corpus Recall    0.727
Corpus F1        0.398
----------------------------------------
TP: 263 | FP: 695 | TN: 2106 | FN: 99





In [44]:
def error_analysis(c):
    """Print where a candidate's part and temperature spans come from.

    Prints the document reached through ``c[0].parent.document``, then
    ``c.part`` and ``c.temp``, each followed by its table location as
    reported by ``table_info``.

    Args:
        c: a candidate supporting index access (``c[0]``) and exposing
            ``part`` and ``temp`` spans; presumably a ``Part_Temp``
            instance -- confirm against the caller.
    """
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and is also valid on Python 3.
    print("Doc: %s" % c[0].parent.document)
    print("Part:")
    print(c.part)
    table_info(c.part)
    print("------------")
    print("Temp:")
    print(c.temp)
    table_info(c.temp)

def table_info(span):
    """Print the table location of a span.

    Always prints ``span.parent.table``. When ``span.parent.cell`` is truthy,
    also prints ``span.parent.row_num`` and ``span.parent.col_num``. Finally
    prints ``span.parent`` itself under the "Phrase" label.

    Args:
        span: an object whose ``parent`` exposes ``table``, ``cell``,
            ``row_num`` and ``col_num``.
    """
    parent = span.parent
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and is also valid on Python 3.
    print("Table: %s" % parent.table)
    if parent.cell:
        # Row/column are only reported when the parent sits in a table cell.
        print("Row: %s" % parent.row_num)
        print("Col: %s" % parent.col_num)
    # The label used to be printed with no value after it; show the parent
    # object. The 1-tuple keeps %-formatting safe if parent is itself a tuple.
    print("Phrase: %s" % (parent,))

In [45]:
# Inspect a single false positive in detail. To list the first few instead:
# for c in list(fp)[:5]:
#     print c
# NOTE(review): list(fp)[0] raises IndexError when there are no false
# positives, and which element comes first depends on fp's iteration order.
c = list(fp)[0]
error_analysis(c)
# Optional extra detail on the temperature span's parent:
# print c.temp.parent.html_anc_tags

Doc: Document PHGLS18216-1
Part:
ImplicitSpan("BC847DG", parent=95266, words=[0,0], position=[0])
Table: Table(Document PHGLS18216-1, 9)
Row: 8
Col: 0
Temp: ImplicitSpan("-65", parent=95349, words=[0,0], position=[0])


In [20]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

# Location of the entity-level gold CSV under $SNORKELHOME. os.path.join is used
# so a trailing separator in SNORKELHOME does not yield a doubled slash.
gold_file = os.path.join(
    os.environ['SNORKELHOME'],
    'tutorials', 'tables', 'data', 'hardware', 'hardware_gold.csv')
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Development').one()
# Entity-level scoring of the dev-set results (tp/fp/tn/fn come from
# disc_model.score above) against the gold file, for the 'stg_temp_min' attribute.
entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'stg_temp_min')

Scoring on Entity-Level Gold Data
Corpus Precision 0.385
Corpus Recall    0.574
Corpus F1        0.461
----------------------------------------
TP: 70 | FP: 112 | FN: 52



In [None]:
import os
import shutil

# Keep a copy of the database as it stands at the end of this notebook.
# copyfile raises if the copy fails, whereas the exit status of the previous
# os.system('cp ...') call was discarded. It also avoids the shell-escaped
# space inside a Python string literal.
shutil.copyfile('snorkel.db', 'snorkel.db final');

The End.