## Part V: Training a Model with Data Programming

In [1]:
# If necessary: reset the working database from the saved copy named
# 'snorkel.db features'.
import os
import shutil

# shutil.copy overwrites snorkel.db directly, so the working database is never
# deleted before the copy is known to succeed, and a missing source file raises
# an error instead of failing silently inside a shell command. It also avoids
# depending on a POSIX `cp` binary.
shutil.copy('snorkel.db features', 'snorkel.db');

In [2]:
%load_ext autoreload
%autoreload 2

import os
import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from snorkel.models import candidate_subclass

# Declare the Part_Temp candidate type over the two arguments 'part' and 'temp'.
Part_Temp = candidate_subclass(
    'Part_Temp',
    ['part', 'temp'],
)

### Loading the `CandidateSet`, feature matrix, and label matrix

In [4]:
from snorkel.models import CandidateSet

# Look up the training candidate set by its name.
train = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Training Candidates')
    .one()
)

In [5]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

# Load the feature matrix for the training candidates under the name
# 'Train Features'; %time reports how long the load takes.
%time F_train = feature_manager.load(session, train, 'Train Features')

CPU times: user 4min 1s, sys: 45.5 s, total: 4min 47s
Wall time: 5min 36s


In [6]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

# Load the label matrix for the training candidates under the name
# 'LF Labels'; %time reports how long the load takes.
%time L_train = label_manager.load(session, train, 'LF Labels')

CPU times: user 4.1 s, sys: 242 ms, total: 4.35 s
Wall time: 4.51 s


## Train Generative Model

In [9]:
from snorkel.learning import NaiveBayes

# Fit the generative model on the training label matrix.
gen_model = NaiveBayes()
# Alternative setting kept for reference:
# gen_model.train(L_train, n_iter=3000, rate=1e-5)
gen_model.train(
    L_train,
    n_iter=10000,
    rate=1e-2,
)

Training marginals (!= 0.5):	85377
Features:			16
Begin training for rate=0.01, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.511392
	Learning epoch = 250	Gradient mag. = 0.639607
	Learning epoch = 500	Gradient mag. = 0.741048
	Learning epoch = 750	Gradient mag. = 0.709041
	Learning epoch = 1000	Gradient mag. = 0.683565
	Learning epoch = 1250	Gradient mag. = 0.659029
	Learning epoch = 1500	Gradient mag. = 0.627494
	Learning epoch = 1750	Gradient mag. = 0.504253
	Learning epoch = 2000	Gradient mag. = 0.408535
	Learning epoch = 2250	Gradient mag. = 0.354975
	Learning epoch = 2500	Gradient mag. = 0.324150
	Learning epoch = 2750	Gradient mag. = 0.306037
	Learning epoch = 3000	Gradient mag. = 0.295676
	Learning epoch = 3250	Gradient mag. = 0.290485
	Learning epoch = 3500	Gradient mag. = 0.287667
	Learning epoch = 3750	Gradient mag. = 0.284002
	Learning epoch = 4000	Gradient mag. = 0.280898
	Learning epoch = 4250	Gradient mag. = 0.251675
	Learning epoch = 4500	Gradient mag. = 0.227288
	Lear

In [10]:
# Save the trained generative model through the session under the name
# 'Generative Params'.
gen_model.save(session, 'Generative Params')

In [11]:
# Marginals of the generative model over the training label matrix; these are
# passed to disc_model.train() in the next section.
train_marginals = gen_model.marginals(L_train)

## Train Discriminative Model

In [12]:
from snorkel.learning import LogReg

# Fit the discriminative model on the training feature matrix together with
# the marginals produced by the generative model.
disc_model = LogReg()
disc_model.train(
    F_train,
    train_marginals,
    n_iter=2000,
    rate=1e-4,
)

Training marginals (!= 0.5):	85377
Features:			21487
Using gradient descent...
	Learning epoch = 0	Step size = 0.0001
	Loss = 59178.826835	Gradient magnitude = 126511.313821
	Learning epoch = 100	Step size = 9.04792147114e-05
	Loss = 1462.041410	Gradient magnitude = 114.483107
	Learning epoch = 200	Step size = 8.18648829479e-05
	Loss = 1392.771015	Gradient magnitude = 67.270905
	Learning epoch = 300	Step size = 7.40707032156e-05
	Loss = 1370.311932	Gradient magnitude = 42.407042
	Learning epoch = 400	Step size = 6.70185906007e-05
	Loss = 1360.853769	Gradient magnitude = 32.620546
	Learning epoch = 500	Step size = 6.06378944861e-05
	Loss = 1355.064618	Gradient magnitude = 28.697119
	Learning epoch = 600	Step size = 5.48646907485e-05
	Loss = 1350.789575	Gradient magnitude = 26.613616
	Learning epoch = 700	Step size = 4.96411413431e-05
	Loss = 1347.384439	Gradient magnitude = 25.273002
	Learning epoch = 800	Step size = 4.4914914861e-05
	Loss = 1344.570160	Gradient magnitude = 24.308126
	L

In [13]:
# Display the shape of the discriminative model's weight vector.
disc_model.w.shape

(21487,)

In [14]:
# Save the trained discriminative model through the session under the name
# "Discriminative Params"; %time reports how long the save takes.
%time disc_model.save(session, "Discriminative Params")

CPU times: user 5.56 s, sys: 479 ms, total: 6.04 s
Wall time: 6.5 s


## Assess Performance on Development Set

In [15]:
from snorkel.models import CandidateSet

# Look up the development candidate set by its name.
dev = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Development Candidates')
    .one()
)

### Loading the Development Feature Matrix

In [16]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()
# The development matrix is loaded under the same 'Train Features' name that
# was used for F_train. NOTE(review): this looks deliberate, since F_dev is
# later scored by the model trained on F_train -- confirm against
# FeatureManager.load before changing the name.
%time F_dev = feature_manager.load(session, dev, 'Train Features')

CPU times: user 1min 45s, sys: 36.8 s, total: 2min 22s
Wall time: 2min 51s


In [17]:
# Look up the gold subset of the training candidates by name.
# NOTE(review): train_gold is not referenced by any other cell visible in this
# notebook.
train_gold = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Training Candidates -- Gold')
    .one()
)

In [18]:
from snorkel.models import CandidateSet
from snorkel.annotations import LabelManager

# Load the labels stored under 'Hardware Development Labels -- Gold' for the
# development candidates.
label_manager = LabelManager()
L_dev = label_manager.load(
    session,
    dev,
    'Hardware Development Labels -- Gold',
)

In [19]:
# Look up the gold subset of the development candidates by name.
gold_dev_set = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Development Candidates -- Gold')
    .one()
)

In [20]:
# Display the shape of the development feature matrix.
F_dev.shape

(9502, 21487)

### Calculating F1 Score

In [21]:
# Score the discriminative model on the development set. The four returned
# values are reused by later cells (entity_level_f1, entity_to_candidates).
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, gold_dev_set)

Calibration plot:
Recall-corrected Noise-aware Model
Pos. class accuracy: 0.726750861079
Neg. class accuracy: 1.0
Corpus Precision 1.0
Corpus Recall    0.727
Corpus F1        0.842
----------------------------------------
TP: 3798 | FP: 0 | TN: 4276 | FN: 1428

Recall-corrected Noise-aware Model
Pos. class accuracy: 0.726750861079
Neg. class accuracy: 1.0
Corpus Precision 1.0
Corpus Recall    0.727
Corpus F1        0.842
----------------------------------------
TP: 3798 | FP: 0 | TN: 4276 | FN: 1428





In [23]:
from hardware_utils import part_error_analysis, table_info

In [24]:
# for c in list(fp)[:5]:
#     print c
# c = list(fp)[10]
# error_analysis(c)
# c = list(fp)[10]
# part_error_analysis(c)
# print c.temp.parent.row_num

# for c in list(fp)[:5]:
#     print c
# print c.temp.parent.html_anc_tags

In [25]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

# Entity-level evaluation of the candidate-level results against the gold CSV
# for the 'stg_temp_min' attribute.
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
corpus = (
    session.query(Corpus)
    .filter(Corpus.name == 'Hardware Development')
    .one()
)
TP, FP, FN = entity_level_f1(
    tp, fp, tn, fn,
    gold_file,
    corpus,
    'stg_temp_min',
)

Scoring on Entity-Level Gold Data
Corpus Precision 1.0
Corpus Recall    1.0
Corpus F1        1.0
----------------------------------------
TP: 161 | FP: 0 | FN: 0



In [None]:
from pprint import pprint

# Pretty-print the entity-level false positives in sorted order.
FP_list = sorted(FP)
pprint(FP_list)

In [None]:
# Feature weights of the discriminative model, looked up against F_train.
# NOTE(review): the ordering implied by reverse=False is not visible here --
# confirm against get_feature_weights.
feature_weights = disc_model.get_feature_weights(F_train, reverse=False)

In [None]:
# Show only the first 25 entries rather than dumping the whole list.
pprint(feature_weights[:25])

In [None]:
# TODO: canonize some of these helper functions
from snorkel.utils import get_keys_by_candidate


# Re-enabled: a later cell calls features_by_weight(), which raises NameError
# on a fresh kernel while this definition is commented out.
def features_by_weight(candidate, feature_matrix, feature_weights):
    """Return the entries of `feature_weights` that apply to `candidate`.

    An entry is kept when its first element is among the keys that
    get_keys_by_candidate(feature_matrix, candidate) returns. The kept entries
    are sorted by the absolute value of their second element, largest first.

    Assumes each entry of `feature_weights` is indexable as (key, weight) --
    TODO confirm against disc_model.get_feature_weights().
    """
    feats = set(get_keys_by_candidate(feature_matrix, candidate))
    return sorted([f_w for f_w in feature_weights if f_w[0] in feats],
                  key=lambda x: abs(x[1]), reverse=True)

In [None]:
from hardware_utils import entity_to_candidates
from pprint import pprint

# Pick one entity-level false positive. The index 6 is hard-coded, so this
# raises IndexError unless FP holds at least seven entries.
entity = sorted(FP)[6]
print(entity)
print('')

# Map the entity back to candidates in fp and inspect the first one.
matches = entity_to_candidates(entity, fp)
candidate = matches[0]
print(candidate)
print('')

candidate_weights = disc_model.get_candidate_feature_weights(
    candidate, F_dev, feature_weights=feature_weights)
pprint(candidate_weights)

In [None]:
# Add up the weight of every (feature, weight) pair reported for the candidate.
total = 0
for _feature, weight in features_by_weight(candidate, F_dev, feature_weights):
    total += weight
print(total)

In [None]:
from snorkel.utils import get_keys_by_candidate

# List every feature key that F_dev holds for the selected candidate
# (presumably F_train can be substituted to inspect the training matrix).
for feature_key in get_keys_by_candidate(F_dev, candidate):
    print(feature_key)

In [None]:
import os
import shutil

# Save a copy of the database as 'snorkel.db final'. shutil.copy raises on
# failure (a shell `cp` run through os.system fails silently) and does not
# depend on a POSIX `cp` binary.
shutil.copy('snorkel.db', 'snorkel.db final');

In [None]:
from snorkel.utils import get_keys_by_candidate

# The original cell iterated over `sample`, a name that is never assigned in
# this notebook (NameError on a fresh kernel). It now inspects `candidate`,
# the candidate selected in the error-analysis cell.
for feature_key in sorted(get_keys_by_candidate(F_dev, candidate)):
    print(feature_key)

In [None]:
# Import only the helper this cell uses instead of a wildcard import.
from snorkel.lf_helpers import get_col_ngrams


def LF_min_col(c):
    """Return -1 if get_col_ngrams(c.temp) yields 'min' or 'min.', else 0.

    The helper is called once and scanned in a single pass; the original
    called it twice, once per token.
    """
    has_min = any(ngram in ('min', 'min.') for ngram in get_col_ngrams(c.temp))
    return -1 if has_min else 0

# The original applied the function to `sample`, which is never assigned in
# this notebook; use the candidate selected in the error-analysis cell.
LF_min_col(candidate)

The End.