## Part V: Training a Model with Data Programming

In [1]:
# If necessary: restore the working database from the "snorkel.db features"
# snapshot (the snapshot's file name contains a space).
#
# shutil.copy overwrites snorkel.db directly, so no separate delete step is
# needed and the cell also works when snorkel.db does not exist yet. If the
# snapshot is missing the copy raises instead of failing silently after the
# working database has already been removed.
import os
import shutil

shutil.copy('snorkel.db features', 'snorkel.db');

In [2]:
# Enable the autoreload extension in mode 2 so edited modules are picked up
# without restarting the kernel.
# NOTE: when the extension is already loaded, %load_ext only prints a notice
# (as in the recorded output of this cell); that is harmless.
%load_ext autoreload
%autoreload 2

# Session object used by every query and load/save call in the cells below.
from snorkel import SnorkelSession
session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from snorkel.models import candidate_subclass

# Create the Part_Temp candidate class from its name and the field list
# ['part', 'temp'].
# NOTE(review): Part_Temp is not referenced again in this notebook; presumably
# the class must exist so stored candidates can be loaded -- confirm.
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Loading the `CandidateSet`, feature matrix, and label matrix

In [4]:
from snorkel.models import CandidateSet

# Look up the training candidate set by its name; the result of .one() on the
# filtered query is bound to `train`.
train = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Training Candidates')
    .one()
)

In [5]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

# Load the feature matrix for the training candidates under the name
# 'Train Features'. This is slow: the recorded run took close to five minutes
# of wall time.
%time F_train = feature_manager.load(session, train, 'Train Features')

CPU times: user 3min 48s, sys: 36.7 s, total: 4min 25s
Wall time: 4min 48s


In [6]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

# Load the label matrix for the training candidates under the name
# 'LF Labels' (a few seconds in the recorded run). `label_manager` is reused
# later to load the development labels.
%time L_train = label_manager.load(session, train, 'LF Labels')

CPU times: user 2.83 s, sys: 218 ms, total: 3.05 s
Wall time: 3.22 s


## Train Generative Model

In [7]:
from snorkel.learning import NaiveBayes

gen_model = NaiveBayes()
# Alternative hyperparameter setting, currently disabled:
# gen_model.train(L_train, n_iter=3000, rate=1e-5)
# Active setting: 15000 iterations at rate 1e-2, trained on the label matrix.
gen_model.train(L_train, n_iter=15000, rate=1e-2)

Training marginals (!= 0.5):	75908
Features:			9
Begin training for rate=0.01, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.333918
	Learning epoch = 250	Gradient mag. = 0.401045
	Learning epoch = 500	Gradient mag. = 0.473478
	Learning epoch = 750	Gradient mag. = 0.511538
	Learning epoch = 1000	Gradient mag. = 0.492912
	Learning epoch = 1250	Gradient mag. = 0.460678
	Learning epoch = 1500	Gradient mag. = 0.453865
	Learning epoch = 1750	Gradient mag. = 0.472802
	Learning epoch = 2000	Gradient mag. = 0.463477
	Learning epoch = 2250	Gradient mag. = 0.414402
	Learning epoch = 2500	Gradient mag. = 0.357512
	Learning epoch = 2750	Gradient mag. = 0.311973
	Learning epoch = 3000	Gradient mag. = 0.275074
	Learning epoch = 3250	Gradient mag. = 0.244758
	Learning epoch = 3500	Gradient mag. = 0.219472
	Learning epoch = 3750	Gradient mag. = 0.198067
	Learning epoch = 4000	Gradient mag. = 0.179700
	Learning epoch = 4250	Gradient mag. = 0.163749
	Learning epoch = 4500	Gradient mag. = 0.149761
	Learn

In [8]:
# Save the trained generative model through the session under the name
# 'Generative Params'.
gen_model.save(session, 'Generative Params')

In [9]:
# Marginals produced by the generative model for the training label matrix;
# they are passed to the discriminative model's train() call below.
train_marginals = gen_model.marginals(L_train)

## Train Discriminative Model

In [10]:
from snorkel.learning import LogReg

disc_model = LogReg()
# Fit the discriminative model on the training feature matrix, using the
# generative model's marginals as the training signal.
# NOTE(review): the recorded output of this cell reports `Loss = inf` from
# epoch 100 onward, after an initial gradient magnitude of about 1.3e5. That
# looks like numerical overflow or divergence -- confirm, and consider a
# smaller `rate` before relying on these weights.
disc_model.train(F_train, train_marginals, n_iter=1000, rate=1e-3)

Training marginals (!= 0.5):	75908
Features:			24073
Using gradient descent...
	Learning epoch = 0	Step size = 0.001
	Loss = 52615.416182	Gradient magnitude = 126254.675834
	Learning epoch = 100	Step size = 0.000904792147114
	Loss = inf	Gradient magnitude = 377.806667
	Learning epoch = 200	Step size = 0.000818648829479
	Loss = inf	Gradient magnitude = 366.482727
	Learning epoch = 300	Step size = 0.000740707032156
	Loss = inf	Gradient magnitude = 286.995481
	Learning epoch = 400	Step size = 0.000670185906007
	Loss = inf	Gradient magnitude = 230.197220
	Learning epoch = 500	Step size = 0.000606378944861
	Loss = inf	Gradient magnitude = 200.702270
	Learning epoch = 600	Step size = 0.000548646907485
	Loss = inf	Gradient magnitude = 105.741497
	Learning epoch = 700	Step size = 0.000496411413431
	Loss = inf	Gradient magnitude = 238.286145
	Learning epoch = 800	Step size = 0.00044914914861
	Loss = inf	Gradient magnitude = 85.186766
	Learning epoch = 900	Step size = 0.000406386622545
	Loss = i

In [11]:
# Shape of the trained model's `w` array (recorded run: (24073,), the same as
# the feature count reported during training).
disc_model.w.shape

(24073,)

In [12]:
# Save the trained discriminative model through the session under the name
# "Discriminative Params", timing the call.
%time disc_model.save(session, "Discriminative Params")

CPU times: user 4.33 s, sys: 184 ms, total: 4.52 s
Wall time: 4.74 s


## Assess Performance on Development Set

In [13]:
from snorkel.models import CandidateSet

# Look up the development candidate set by its name; the result of .one() on
# the filtered query is bound to `dev`.
dev = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Development Candidates')
    .one()
)

### Loading the Development Feature Matrix

In [14]:
# FeatureManager is imported and instantiated again here, repeating the
# earlier training-features cell.
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()
# The development matrix is loaded under the 'Train Features' name as well.
# NOTE(review): presumably intentional, so that the columns line up with the
# weights learned on F_train (the recorded F_dev has 24073 columns, matching
# the training feature count) -- confirm.
%time F_dev = feature_manager.load(session, dev, 'Train Features')

CPU times: user 2min 14s, sys: 49 s, total: 3min 3s
Wall time: 3min 28s


In [15]:
# Gold labels for the development candidates, loaded with the LabelManager
# instance created earlier for the training labels.
L_dev = label_manager.load(session, dev, "Hardware Development Labels -- Gold")

In [16]:
# Look up the gold-annotated development candidate set by its name; it is
# passed to the scoring call further down.
gold_dev_set = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Development Candidates -- Gold')
    .one()
)

In [17]:
# Dimensions of the development feature matrix (recorded run: 18971 x 24073).
F_dev.shape

(18971, 24073)

In [18]:
# Score the discriminative model on the development set against the gold
# labels. The four returned values are kept because the entity-level
# evaluation further down takes them as inputs.
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, gold_dev_set)

Calibration plot:
Recall-corrected Noise-aware Model
Pos. class accuracy: 0.554520692382
Neg. class accuracy: 0.698418972332
Corpus Precision 0.734
Corpus Recall    0.555
Corpus F1        0.632
----------------------------------------
TP: 6311 | FP: 2289 | TN: 5301 | FN: 5070

Recall-corrected Noise-aware Model
Pos. class accuracy: 0.554520692382
Neg. class accuracy: 0.698418972332
Corpus Precision 0.734
Corpus Recall    0.555
Corpus F1        0.632
----------------------------------------
TP: 6311 | FP: 2289 | TN: 5301 | FN: 5070





In [19]:
from hardware_utils import part_error_analysis, table_info
# NOTE(review): the commented-out lines below are leftover debugging scratch.
# They refer to a variable `c` whose loop header is not present in this cell,
# so they cannot be re-enabled as written.
#     print c[0]
#     print c[0].parent
#     print (c[0].parent.row_num, c[0].parent.col_num)
#     print c[1]
#     print c[1].parent
#     print (c[1].parent.row_num, c[1].parent.col_num)

In [20]:
# NOTE(review): this cell contains only disabled error-analysis scratch over
# the false positives in `fp`. It calls `error_analysis`, which is not among
# the names imported from hardware_utils in this notebook -- confirm before
# re-enabling any of it.
# for c in list(fp)[:5]:
#     print c
# c = list(fp)[10]
# error_analysis(c)
# c = list(fp)[10]
# part_error_analysis(c)
# print c.temp.parent.row_num

# for c in list(fp)[:5]:
#     print c
# print c.temp.parent.html_anc_tags

In [21]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

# Build the gold CSV path with os.path.join instead of string concatenation,
# so a trailing separator on SNORKELHOME does not produce a doubled slash.
# An unset SNORKELHOME still raises KeyError, exactly as before.
gold_file = os.path.join(
    os.environ['SNORKELHOME'],
    'tutorials', 'tables', 'data', 'hardware', 'hardware_gold.csv')

# Corpus for the development documents, looked up by name.
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Development').one()

# Entity-level evaluation of the dev-set predictions against the gold file
# for the 'stg_temp_min' attribute. The returned TP/FP/FN collections are
# printed in the following cells.
(TP, FP, FN) = entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'stg_temp_min')

Scoring on Entity-Level Gold Data
Corpus Precision 0.749
Corpus Recall    0.98
Corpus F1        0.849
----------------------------------------
TP: 149 | FP: 50 | FN: 3



In [24]:
# Show the entity-level false negatives. The parenthesised form prints the
# same text under Python 2 and is also valid Python 3.
print(FN)

set([('PNJIS00254-1', 'BC807-16W', '-55'), ('PNJIS00254-1', 'BC807-25W', '-55'), ('PNJIS00254-1', 'BC807-40W', '-55')])


In [22]:
# Show the entity-level false positives. The parenthesised form prints the
# same text under Python 2 and is also valid Python 3.
print(FP)

set([(u'PNJIS01593-1', u'BC858B', u'-65'), (u'PNJIS01593-1', u'BC857', u'-65'), (u'ONSMS04099-1', u'BC238C', u'-65'), (u'ZETXS01948-1', u'BC860C-4GZ', u'-65'), (u'ZETXS01948-1', u'BC857C-3G', u'-65'), (u'ZETXS01948-1', u'BC857B-3F', u'-65'), (u'ONSMS04099-1', u'BC238B', u'-65'), (u'ZETXS01948-1', u'BC860A-Z4E', u'-65'), (u'UTCLS02155-1', u'BC808-25', u'-50'), (u'UTCLS02155-1', u'BC808-40', u'-50'), (u'ZETXS01948-1', u'BC858B-3K', u'-65'), (u'ZETXS01948-1', u'BC856B-Z3B', u'-65'), (u'ZETXS01948-1', u'BC859B-4B', u'-65'), (u'PNJIS01593-1', u'BC857B', u'-65'), (u'ZETXS01948-1', u'BC858C-3L', u'-65'), (u'ZETXS01948-1', u'BC858', u'-65'), (u'UTCLS02155-1', u'BC807-16', u'-50'), (u'PNJIS01593-1', u'BC858C', u'-65'), (u'ZETXS01948-1', u'BC859', u'-65'), (u'UTCLS02155-1', u'BC807', u'-50'), (u'ONSMS04099-1', u'BC237A', u'-65'), (u'ZETXS01948-1', u'BC857', u'-65'), (u'PNJIS01593-1', u'BC859B', u'-65'), (u'ZETXS01948-1', u'BC858A-3J', u'-65'), (u'ONSMS04099-1', u'BC237B', u'-65'), (u'ZETXS01948-

In [23]:
# Snapshot the working database as "snorkel.db final" (the file name contains
# a space). shutil.copy raises if the copy cannot be made, whereas a failed
# shell `cp` run through os.system would go unnoticed.
import os
import shutil

shutil.copy('snorkel.db', 'snorkel.db final');

The End.