## Part V: Training a Model with Data Programming

In [1]:
# If necessary: reset the working database to the 'snorkel.db features' snapshot.
import os
import shutil

# shutil.copyfile opens the source before touching the destination, so a
# missing snapshot raises immediately and the current snorkel.db is left
# intact. (The previous remove-then-`cp` sequence deleted snorkel.db first and
# ignored the exit status of `cp`, and relied on the invalid string escape '\ '.)
shutil.copyfile('snorkel.db features', 'snorkel.db');

In [2]:
# IPython autoreload in mode 2: imported modules are reloaded before each cell
# executes, so edits to library code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# `session` is the object handed to every query/load/save call in this notebook.
from snorkel import SnorkelSession
session = SnorkelSession()

In [3]:
from snorkel.models import candidate_subclass

# Candidate class created from the name 'Part_Temp' and the argument names
# 'part' and 'temp'.
# NOTE(review): presumably this must match the definition used when the
# candidates were originally extracted and stored — confirm.
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Loading the `CandidateSet`, feature matrix, and label matrix

In [5]:
from snorkel.models import CandidateSet

# Fetch the training candidate set by name. `.one()` is expected to fail unless
# exactly one row matches (SQLAlchemy-style query — confirm).
train = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Candidates')
    .one()
)

In [6]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

# Load the feature matrix registered as 'Train Features' for the training
# candidates. %time reports the cost; the recorded run took about 1 min 10 s
# of wall time, so this is one of the slow cells.
%time F_train = feature_manager.load(session, train, 'Train Features')

CPU times: user 1min 4s, sys: 3.53 s, total: 1min 8s
Wall time: 1min 10s


In [7]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

# Load the label matrix registered as 'LF Labels' for the training candidates
# (presumably the labeling-function outputs — confirm). Fast in the recorded
# run: roughly half a second of wall time.
%time L_train = label_manager.load(session, train, 'LF Labels')

CPU times: user 512 ms, sys: 22.4 ms, total: 534 ms
Wall time: 563 ms


## Train Generative Model

In [8]:
from snorkel.learning import NaiveBayes

# Fit the generative model on the label matrix alone (no features involved).
# NOTE(review): in the recorded run the gradient magnitude never decreases
# (0.328 at epoch 0, about 0.345 from epoch 250 through the final report), so
# training with n_iter=3000 / rate=1e-5 may not be converging — worth
# revisiting these settings.
gen_model = NaiveBayes()
gen_model.train(L_train, n_iter=3000, rate=1e-5)

Training marginals (!= 0.5):	12617
Features:			13
Begin training for rate=1e-05, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.327890
	Learning epoch = 250	Gradient mag. = 0.345153
	Learning epoch = 500	Gradient mag. = 0.345159
	Learning epoch = 750	Gradient mag. = 0.345165
	Learning epoch = 1000	Gradient mag. = 0.345172
	Learning epoch = 1250	Gradient mag. = 0.345178
	Learning epoch = 1500	Gradient mag. = 0.345184
	Learning epoch = 1750	Gradient mag. = 0.345191
	Learning epoch = 2000	Gradient mag. = 0.345197
	Learning epoch = 2250	Gradient mag. = 0.345204
	Learning epoch = 2500	Gradient mag. = 0.345211
	Learning epoch = 2750	Gradient mag. = 0.345217
Final gradient magnitude for rate=1e-05, mu=1e-06: 0.345


In [9]:
# Store the trained generative model's parameters under the name
# 'Generative Params' (presumably in the database behind `session` — confirm).
gen_model.save(session, 'Generative Params')

In [10]:
# Marginals computed by the generative model from L_train; these are passed to
# the discriminative model as its training targets.
train_marginals = gen_model.marginals(L_train)

## Train Discriminative Model

In [11]:
from snorkel.learning import LogReg

# Fit the discriminative model on the feature matrix, using the generative
# model's marginals as targets.
# NOTE(review): the recorded log shows the loss jumping from about 8.7e3 at
# epoch 0 to about 1.6e5 at epoch 100 and then oscillating between roughly
# 4.6e4 and 1.7e5, which suggests rate=1e-3 is too large for stable gradient
# descent here — consider a smaller rate before trusting the scores.
disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=5000, rate=1e-3)

Training marginals (!= 0.5):	12617
Features:			37807
Using gradient descent...
	Learning epoch = 0	Step size = 0.001
	Loss = 8745.437977	Gradient magnitude = 19025.453036
	Learning epoch = 100	Step size = 0.000904792147114
	Loss = 158778.200331	Gradient magnitude = 10640.831442
	Learning epoch = 200	Step size = 0.000818648829479
	Loss = 166334.229979	Gradient magnitude = 11417.179570
	Learning epoch = 300	Step size = 0.000740707032156
	Loss = 65431.459497	Gradient magnitude = 15820.618016
	Learning epoch = 400	Step size = 0.000670185906007
	Loss = 100685.065112	Gradient magnitude = 7863.008170
	Learning epoch = 500	Step size = 0.000606378944861
	Loss = 132594.788673	Gradient magnitude = 10855.530229
	Learning epoch = 600	Step size = 0.000548646907485
	Loss = 173342.078832	Gradient magnitude = 16105.447479
	Learning epoch = 700	Step size = 0.000496411413431
	Loss = 46455.767236	Gradient magnitude = 9580.642839
	Learning epoch = 800	Step size = 0.00044914914861
	Loss = 83739.935310	Gradi

In [12]:
# Sanity check on the learned weights: the recorded shape (37807,) matches the
# feature count reported during training.
disc_model.w.shape

(37807,)

In [13]:
# Store the discriminative model's parameters under "Discriminative Params";
# timed because the recorded run needed about 11 s.
%time disc_model.save(session, "Discriminative Params")

CPU times: user 10.5 s, sys: 391 ms, total: 10.9 s
Wall time: 10.8 s


## Assess Performance on Development Set

In [15]:
from snorkel.models import CandidateSet

# Candidate set used for the "development" evaluation.
# NOTE(review): this queries the same name ('Hardware Candidates') as the
# training set loaded earlier, so `dev` refers to the training candidates. If
# a separate held-out set exists, its name should be used here — confirm.
dev = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Candidates')
    .one()
)

### Loading the Development Feature Matrix

In [16]:
from snorkel.annotations import FeatureManager

# A fresh FeatureManager is created here, replacing the one bound earlier.
feature_manager = FeatureManager()
# NOTE(review): this loads the annotations registered as 'Train Features', the
# same key used for F_train, so F_dev is presumably the training feature
# matrix again — confirm whether a separate dev feature key was intended.
%time F_dev = feature_manager.load(session, dev, 'Train Features')

CPU times: user 1min 7s, sys: 3.25 s, total: 1min 11s
Wall time: 1min 14s


In [17]:
# Load the labels registered as "Hardware Labels -- Gold" for the dev
# candidates, reusing the LabelManager created for the training labels.
L_dev = label_manager.load(session, dev, "Hardware Labels -- Gold")

In [18]:
# Candidate set named 'Hardware Candidates -- Gold'; passed to the scoring call
# together with the dev features and gold labels.
gold_dev_set = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Candidates -- Gold')
    .one()
)

In [19]:
# Shape check. NOTE(review): the recorded (12617, 37807) equals the candidate
# and feature counts printed during training, consistent with F_dev being the
# training matrix rather than a held-out one.
F_dev.shape

(12617, 37807)

In [20]:
# Score the discriminative model against the gold labels; the call prints a
# precision/recall/F1 report and returns four values unpacked as true
# positives, false positives, true negatives and false negatives
# (presumably collections of candidates — confirm).
# NOTE(review): the recorded output contains the same report twice.
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, gold_dev_set)

Calibration plot:
Recall-corrected Noise-aware Model
Pos. class accuracy: 0.706141732283
Neg. class accuracy: 0.845901292099
Corpus Precision 0.606
Corpus Recall    0.706
Corpus F1        0.653
----------------------------------------
TP: 2242 | FP: 1455 | TN: 7987 | FN: 933

Recall-corrected Noise-aware Model
Pos. class accuracy: 0.706141732283
Neg. class accuracy: 0.845901292099
Corpus Precision 0.606
Corpus Recall    0.706
Corpus F1        0.653
----------------------------------------
TP: 2242 | FP: 1455 | TN: 7987 | FN: 933



In [21]:
# Disabled debugging helper, kept for reference (uses Python 2 print syntax).
# When re-enabled, it prints for each of c[0] and c[1]: the element itself,
# its parent, and the parent's (row_num, col_num).
# def error_analysis(c):
#     print c[0]
#     print c[0].parent
#     print (c[0].parent.row_num, c[0].parent.col_num)
#     print c[1]
#     print c[1].parent
#     print (c[1].parent.row_num, c[1].parent.col_num)

In [22]:
# Disabled example: pick one entry of `tp`, run error_analysis on it (that
# helper must be uncommented first), and print html_anc_tags of c.temp.parent.
# c = list(tp)[10]
# error_analysis(c)
# print c.temp.parent.html_anc_tags

In [25]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

# Entity-level evaluation of the tp/fp/tn/fn results from the scoring cell
# against the gold CSV shipped with the tutorial.
# The path is assembled with os.path.join instead of string concatenation, so
# it is well-formed whether or not SNORKELHOME ends with a path separator.
# SNORKELHOME must be set in the environment; a KeyError here means it is not.
filename = os.path.join(
    os.environ['SNORKELHOME'],
    'tutorials', 'tables', 'data', 'hardware', 'hardware_gold.csv')
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware').one()
# 'stg_temp_min' presumably selects which gold attribute to score — confirm
# against hardware_utils.entity_level_f1.
entity_level_f1(tp, fp, tn, fn, filename, corpus, 'stg_temp_min', relation=True)

Scoring on Entity-Level Gold Data
Corpus Precision 0.635
Corpus Recall    0.706
Corpus F1        0.668
----------------------------------------
TP: 384 | FP: 221 | FN: 160



In [22]:
import os
import shutil

# Snapshot the database as 'snorkel.db final' once training and scoring are done.
# shutil.copyfile replaces the shell `cp` call: no dependence on a POSIX shell,
# no invalid '\ ' string escape, and a failed copy raises instead of being
# silently ignored.
shutil.copyfile('snorkel.db', 'snorkel.db final');

The End.