## Part V: Training a Model with Data Programming

In [1]:
# If necessary: restore the working database ('snorkel.db') from the
# 'snorkel.db features' snapshot.
import os
import shutil

snapshot_db = 'snorkel.db features'
if os.path.exists(snapshot_db):
    # shutil.copy overwrites an existing destination, so the working database
    # is never deleted before the snapshot is known to exist, and a failed
    # copy raises instead of being silently ignored as with os.system('cp ...').
    shutil.copy(snapshot_db, 'snorkel.db')
else:
    print('Snapshot %r not found; leaving snorkel.db unchanged.' % snapshot_db)

In [2]:
%load_ext autoreload
%autoreload 2

from snorkel import SnorkelSession
session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from snorkel.models import candidate_subclass

# Create the candidate class 'Part_Temp' from the two names 'part' and 'temp'.
# NOTE(review): presumably this has to match the definition used when the
# candidates were extracted in an earlier part of the tutorial -- confirm.
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Loading the `CandidateSet`, feature matrix, and label matrix

In [4]:
from snorkel.models import CandidateSet

# Look up the training CandidateSet by its name; .one() requires that exactly
# one row matches the filter.
train = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Training Candidates')
    .one()
)

In [5]:
from snorkel.annotations import FeatureManager

# The manager is kept in a variable because a later cell calls
# feature_manager.update(...) for the development set.
feature_manager = FeatureManager()

# Load the feature matrix for `train`, passing the name 'Train Features';
# %time prints how long the load takes.
%time F_train = feature_manager.load(session, train, 'Train Features')

CPU times: user 505 ms, sys: 13.6 ms, total: 519 ms
Wall time: 521 ms


In [6]:
from snorkel.annotations import LabelManager

# The manager is kept in a variable because a later cell calls
# label_manager.load(...) again for the development set.
label_manager = LabelManager()

# Load the label matrix for `train`, passing the name 'LF Labels';
# %time prints how long the load takes.
%time L_train = label_manager.load(session, train, 'LF Labels')

CPU times: user 37.6 ms, sys: 1.75 ms, total: 39.4 ms
Wall time: 39 ms


## Train Generative Model

In [7]:
from snorkel.learning import NaiveBayes

# Fit the generative model on the training label matrix only.
# NOTE(review): in the recorded run the logged gradient magnitude rises from
# about 0.90 to 0.97 over the 3000 iterations instead of shrinking -- confirm
# that n_iter=3000 and rate=1e-5 are appropriate for this label matrix.
gen_model = NaiveBayes()
gen_model.train(L_train, n_iter=3000, rate=1e-5)

Training marginals (!= 0.5):	147
Features:			11
Begin training for rate=1e-05, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.901992
	Learning epoch = 250	Gradient mag. = 0.950946
	Learning epoch = 500	Gradient mag. = 0.952431
	Learning epoch = 750	Gradient mag. = 0.953919
	Learning epoch = 1000	Gradient mag. = 0.955410
	Learning epoch = 1250	Gradient mag. = 0.956905
	Learning epoch = 1500	Gradient mag. = 0.958402
	Learning epoch = 1750	Gradient mag. = 0.959903
	Learning epoch = 2000	Gradient mag. = 0.961407
	Learning epoch = 2250	Gradient mag. = 0.962914
	Learning epoch = 2500	Gradient mag. = 0.964425
	Learning epoch = 2750	Gradient mag. = 0.965938
Final gradient magnitude for rate=1e-05, mu=1e-06: 0.967


In [8]:
# Save the trained generative model through the session, passing the name
# 'Generative Params'.
gen_model.save(session, 'Generative Params')

In [9]:
# Marginals from the generative model for the training label matrix; they are
# passed as the second argument to the discriminative model's train() call.
train_marginals = gen_model.marginals(L_train)

## Train Discriminative Model

In [10]:
from snorkel.learning import LogReg

# Fit the discriminative model on the training feature matrix, using the
# generative model's marginals (train_marginals) in place of hand labels.
disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=5000, rate=1e-3)

Training marginals (!= 0.5):	147
Features:			5173
Using gradient descent...
	Learning epoch = 0	Step size = 0.001
	Loss = 101.892636	Gradient magnitude = 506.976278
	Learning epoch = 100	Step size = 0.000904792147114
	Loss = 18.862408	Gradient magnitude = 2.505034
	Learning epoch = 200	Step size = 0.000818648829479
	Loss = 18.571929	Gradient magnitude = 1.332697
	Learning epoch = 300	Step size = 0.000740707032156
	Loss = 18.476752	Gradient magnitude = 0.909970
	Learning epoch = 400	Step size = 0.000670185906007
	Loss = 18.431951	Gradient magnitude = 0.696502
	Learning epoch = 500	Step size = 0.000606378944861
	Loss = 18.406512	Gradient magnitude = 0.572222
	Learning epoch = 600	Step size = 0.000548646907485
	Loss = 18.390241	Gradient magnitude = 0.492717
	Learning epoch = 700	Step size = 0.000496411413431
	Loss = 18.378970	Gradient magnitude = 0.438191
	Learning epoch = 800	Step size = 0.00044914914861
	Loss = 18.370718	Gradient magnitude = 0.398775
	Learning epoch = 900	Step size = 0.

In [11]:
# Display the shape of disc_model.w (presumably the learned weight vector);
# the recorded value (5173,) equals the feature count logged during training.
disc_model.w.shape

(5173,)

In [12]:
# Save the trained discriminative model through the session, passing the name
# "Discriminative Params"; %time prints how long the save takes.
%time disc_model.save(session, "Discriminative Params")

CPU times: user 1.56 s, sys: 22.1 ms, total: 1.58 s
Wall time: 1.37 s


## Assess Performance on Dev Set

In [13]:
from snorkel.models import CandidateSet

# Look up the development CandidateSet by its name; .one() requires that
# exactly one row matches the filter.
dev = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Development Candidates')
    .one()
)

In [14]:
# Build the feature matrix for the development set, passing the same name
# ('Train Features') used for the training features. This is the slow step of
# the notebook: the recorded run took about three minutes.
# NOTE(review): the meaning of the positional False is not visible here --
# presumably it stops new feature columns from being added, since F_dev has
# the same 5173 columns as the trained weights; confirm against
# FeatureManager.update.
%time F_dev = feature_manager.update(session, dev, 'Train Features', False)


Loading sparse Feature matrix...
CPU times: user 2min 48s, sys: 9.11 s, total: 2min 57s
Wall time: 2min 58s


In [15]:
# Load the gold label matrix for the development set.
# NOTE(review): the name passed here says "Training Labels" although it is
# loaded for `dev` -- confirm this is the intended gold label set.
L_dev = label_manager.load(session, dev, "Hardware Training Labels -- Gold")

In [16]:
# Query the 'Hardware Development Candidates' set again under a second name;
# it is handed to disc_model.score() as its third argument.
gold_dev_set = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Development Candidates')
    .one()
)

In [17]:
# Display the shape of the development feature matrix (recorded: 405 rows by
# 5173 columns, the same column count as the training features).
F_dev.shape

(405, 5173)

In [18]:
# Score the discriminative model on the development set; the four returned
# values are reused by the entity-level evaluation cell below.
# NOTE(review): in the recorded run the model predicted no positives
# (TP: 0, FP: 0), which is why precision and F1 print as nan -- investigate
# before relying on these numbers.
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, gold_dev_set)

Calibration plot:
Test set size:	405
----------------------------------------
Pos. class accuracy: 0.0
Neg. class accuracy: 1.0
----------------------------------------
Precision:	nan
Recall:		0.0
F1 Score:	nan
----------------------------------------
TP: 0 | FP: 0 | TN: 341 | FN: 64
Recall-corrected Noise-aware Model
Pos. class accuracy: 0.0
Neg. class accuracy: 1.0
Corpus Precision nan
Corpus Recall    0.0
Corpus F1        nan
----------------------------------------
TP: 0 | FP: 0 | TN: 341 | FN: 64





In [19]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

# Build the gold CSV path with os.path.join instead of string concatenation,
# so a trailing separator in SNORKELHOME does not yield a doubled slash.
# os.environ[...] still raises KeyError when SNORKELHOME is not set.
filename = os.path.join(
    os.environ['SNORKELHOME'],
    'tutorials', 'tables', 'data', 'hardware', 'hardware_gold.csv')

# Entity-level scoring of the tp/fp/tn/fn values returned by
# disc_model.score(), against the gold CSV for the 'stg_temp_min' attribute.
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Development').one()
entity_level_f1(tp, fp, tn, fn, filename, corpus, 'stg_temp_min')

Scoring on Entity-Level Gold Data
Corpus Precision nan
Corpus Recall    0.0
Corpus F1        nan
----------------------------------------
TP: 0 | FP: 0 | FN: 159



In [20]:
import os
import shutil

# Snapshot the working database as 'snorkel.db final'. shutil.copy raises if
# the copy fails, whereas os.system('cp ...') returned an exit status that was
# discarded; it also avoids the invalid '\ ' escape in the old shell string.
# The trailing ';' keeps the returned path out of the cell output.
shutil.copy('snorkel.db', 'snorkel.db final');

The End.