## Part V: Training a Model with Data Programming

In [1]:
# If necessary: restore the working database from the snapshot named
# 'snorkel.db features'.
import os
import shutil

# shutil.copy overwrites the destination, so no separate os.remove() is needed
# (os.remove raised OSError when snorkel.db did not exist yet). It also avoids
# spawning a shell and the '\ ' escape that was embedded in the command string,
# and it raises if the snapshot is missing instead of failing silently.
shutil.copy('snorkel.db features', 'snorkel.db');

In [2]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to helper modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

from snorkel import SnorkelSession
# Session object passed to the query / load / save calls throughout this notebook.
session = SnorkelSession()

In [3]:
from snorkel.models import candidate_subclass

# Candidate class named 'Part_Temp' with two fields, 'part' and 'temp'; later
# cells read these as attributes (e.g. `.part`, `.temp`) on individual candidates.
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Loading the `CandidateSet`, feature matrix, and label matrix

In [4]:
from snorkel.models import CandidateSet

# Fetch the training CandidateSet by its stored name.
# .one() is presumably SQLAlchemy's Query.one (exactly one matching row expected).
train = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Training Candidates')
    .one()
)

In [5]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

# Load the feature matrix for the training candidates; the third argument names
# the stored feature set (presumably an annotation key set -- confirm).
# %time reports how long the load takes (about a minute in the recorded run).
%time F_train = feature_manager.load(session, train, 'Train Features')

CPU times: user 1min 5s, sys: 1.32 s, total: 1min 6s
Wall time: 1min 6s


In [6]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

# Load the label matrix for the training candidates under the name 'LF Labels'
# (presumably the labeling-function outputs -- confirm); timed with %time.
%time L_train = label_manager.load(session, train, 'LF Labels')

CPU times: user 1.24 s, sys: 24 ms, total: 1.26 s
Wall time: 1.26 s


## Train Generative Model

In [7]:
from snorkel.learning import NaiveBayes

# Hyperparameters for the generative model fit on the label matrix.
# Previously tried: n_iter=3000, rate=1e-5.
gen_train_kwargs = dict(n_iter=15000, rate=1e-2)

gen_model = NaiveBayes()
gen_model.train(L_train, **gen_train_kwargs)

Training marginals (!= 0.5):	75908
Features:			18
Begin training for rate=0.01, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.447559
	Learning epoch = 250	Gradient mag. = 0.643923
	Learning epoch = 500	Gradient mag. = 0.856812
	Learning epoch = 750	Gradient mag. = 1.026238
	Learning epoch = 1000	Gradient mag. = 0.885512
	Learning epoch = 1250	Gradient mag. = 0.730135
	Learning epoch = 1500	Gradient mag. = 0.627682
	Learning epoch = 1750	Gradient mag. = 0.559833
	Learning epoch = 2000	Gradient mag. = 0.514375
	Learning epoch = 2250	Gradient mag. = 0.449767
	Learning epoch = 2500	Gradient mag. = 0.373345
	Learning epoch = 2750	Gradient mag. = 0.327320
	Learning epoch = 3000	Gradient mag. = 0.297182
	Learning epoch = 3250	Gradient mag. = 0.276133
	Learning epoch = 3500	Gradient mag. = 0.260928
	Learning epoch = 3750	Gradient mag. = 0.249632
	Learning epoch = 4000	Gradient mag. = 0.240899
	Learning epoch = 4250	Gradient mag. = 0.233727
	Learning epoch = 4500	Gradient mag. = 0.227319
	Lear

In [8]:
# Save the trained generative model through the session under the name 'Generative Params'.
gen_model.save(session, 'Generative Params')

In [9]:
# Marginals computed by the generative model from the training label matrix;
# these are passed to the discriminative model's train() call below.
train_marginals = gen_model.marginals(L_train)

## Train Discriminative Model

In [10]:
from snorkel.learning import LogReg

# Fit the discriminative model on the training features, using the generative
# model's marginals as targets.
disc_n_iter = 4500
disc_rate = 1e-3

disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=disc_n_iter, rate=disc_rate)

Training marginals (!= 0.5):	58499
Features:			19168
Using gradient descent...
	Learning epoch = 0	Step size = 0.001
	Loss = 40548.416916	Gradient magnitude = 75386.358069
	Learning epoch = 100	Step size = 0.000904792147114
	Loss = 5051.332712	Gradient magnitude = 1373.242792
	Learning epoch = 200	Step size = 0.000818648829479
	Loss = 4308.227357	Gradient magnitude = 1373.217391
	Learning epoch = 300	Step size = 0.000740707032156
	Loss = 3660.680681	Gradient magnitude = 1373.116297
	Learning epoch = 400	Step size = 0.000670185906007
	Loss = 1450.698287	Gradient magnitude = 1336.335015
	Learning epoch = 500	Step size = 0.000606378944861
	Loss = 1490.234457	Gradient magnitude = 3341.868626
	Learning epoch = 600	Step size = 0.000548646907485
	Loss = 1740.641452	Gradient magnitude = 1369.408394
	Learning epoch = 700	Step size = 0.000496411413431
	Loss = 1114.921893	Gradient magnitude = 1330.832552
	Learning epoch = 800	Step size = 0.00044914914861
	Loss = 639.857801	Gradient magnitude = 15

In [11]:
# Shape of the learned weight vector; the recorded output (19168,) matches the
# feature count reported during training.
disc_model.w.shape

(19168,)

In [12]:
# Save the trained discriminative model through the session under the name
# "Discriminative Params"; %time reports the duration of the save.
%time disc_model.save(session, "Discriminative Params")

CPU times: user 2.48 s, sys: 48 ms, total: 2.53 s
Wall time: 2.64 s


## Assess Performance on Development Set

In [None]:
from snorkel.models import CandidateSet

# Look up the development CandidateSet by its stored name.
dev_query = session.query(CandidateSet)
dev = dev_query.filter(CandidateSet.name == 'Hardware Development Candidates').one()

### Loading the Development Feature Matrix

In [None]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()
# Load the feature matrix for the development candidates.
# NOTE(review): the name passed is 'Train Features', the same one used for
# F_train -- presumably intentional so the dev columns line up with the trained
# weights; confirm.
%time F_dev = feature_manager.load(session, dev, 'Train Features')

In [None]:
# Fetch the CandidateSet named 'Hardware Training Candidates -- Silver'.
# NOTE(review): the variable is called `train_gold` but the set name says
# "Silver", and no later cell in this notebook reads it -- confirm it is needed.
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Silver').one()

In [None]:
# from snorkel.loaders import create_or_fetch
# from snorkel.models import Label, AnnotationKeySet
# annotation_key_name = "Hardware Development Labels - Gold"
# annotation_key = create_or_fetch(session, AnnotationKey, annotation_key_name)
# session.query(Label).filter(Label.key == annotation_key).count()

In [None]:
from snorkel.models import CandidateSet
from snorkel.annotations import LabelManager
label_manager = LabelManager()
# Disabled alternative that looked up a 'Gold' annotation key set instead:
# from snorkel.utils import get_ORM_instance
# dev_labels = get_ORM_instance(AnnotationKeySet, session, 'Hardware Development Labels - Gold')
# Load the label matrix for the development candidates under the name
# 'Hardware Development Labels -- Silver'; it is passed to disc_model.score() below.
L_dev = label_manager.load(session, dev, 'Hardware Development Labels -- Silver')

In [None]:
# Candidate set handed to disc_model.score() as the reference set.
# Note the variable says "gold" while the stored set name says "Silver".
gold_dev_set = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Development Candidates -- Silver')
    .one()
)

In [None]:
# Sanity check: display the dimensions of the development feature matrix.
F_dev.shape

In [None]:
# Score the discriminative model on the development set and unpack the four
# returned collections. The names suggest true/false positives/negatives
# (later cells iterate `fn` as candidates) -- confirm against score()'s docs.
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, gold_dev_set)

In [None]:
# Helpers for inspecting individual candidates (only referenced from
# commented-out code in this notebook).
from hardware_utils import part_error_analysis, table_info
# NOTE(review): the lines below appear to be the body of a removed debugging
# loop (the `for c in ...` header is gone); kept for reference only.
#     print c[0]
#     print c[0].parent
#     print (c[0].parent.row_num, c[0].parent.col_num)
#     print c[1]
#     print c[1].parent
#     print (c[1].parent.row_num, c[1].parent.col_num)

In [None]:
# for c in list(fp)[:5]:
#     print c
# c = list(fp)[10]
# error_analysis(c)
# c = list(fp)[10]
# part_error_analysis(c)
# print c.temp.parent.row_num

# for c in list(fp)[:5]:
#     print c
# print c.temp.parent.html_anc_tags

In [None]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

# Location of the gold CSV, resolved relative to the SNORKELHOME environment variable.
snorkel_home = os.environ['SNORKELHOME']
gold_file = snorkel_home + '/tutorials/tables/data/hardware/hardware_gold.csv'

# Development corpus, looked up by its stored name.
corpus = (
    session.query(Corpus)
    .filter(Corpus.name == 'Hardware Development')
    .one()
)

# Entity-level evaluation of the candidate-level results for 'stg_temp_min'.
TP, FP, FN = entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'stg_temp_min')

In [None]:
# Show the third value returned by entity_level_f1 (the entity-level FN result).
# Written as a call so the cell parses under both Python 2 and Python 3 and
# matches the print(...) form used elsewhere in this notebook.
print(FN)

In [None]:
from pprint import pprint
# Pretty-print the second value returned by entity_level_f1 (the entity-level FP result).
pprint(FP)

In [None]:
import os
import shutil

# Snapshot the database as 'snorkel.db final'. shutil.copy avoids spawning a
# shell and the '\ ' escape embedded in the command string, and it raises if the
# copy fails instead of silently discarding the shell's exit status.
shutil.copy('snorkel.db', 'snorkel.db final');

In [None]:
from pprint import pprint
# Feature weights of the discriminative model, obtained against the training
# feature matrix; the next cell prints the first 60 entries.
all_weights = disc_model.get_feature_weights(F_train)

In [None]:
# Show only the first 60 entries to keep the output readable.
# NOTE(review): whether these are the largest weights depends on the ordering
# returned by get_feature_weights -- confirm.
pprint(all_weights[:60])

In [None]:
# Restrict the `fn` candidates to those whose part span belongs to one document.
# Note: despite the `fp_` prefix, this set is built from `fn`.
target_document = "Document PNJIS00254-1"

print(len(fn))
fp_zetx = set(
    cand for cand in fn
    if str(cand.part.parent.document) == target_document
)

print(len(fp_zetx))

In [None]:
# Take one candidate from the document-filtered set for manual inspection.
# NOTE: set.pop() removes the element, so re-running this cell yields a
# different sample and raises KeyError once the set is empty.
sample = fp_zetx.pop()
# print(...) with a single argument behaves the same under Python 2 and 3.
print(sample)
print(sample.temp.parent)

In [None]:
from snorkel.utils import get_keys_by_candidate

# List, in sorted order, the keys that get_keys_by_candidate returns for the
# sampled candidate in the development feature matrix.
for key in sorted(get_keys_by_candidate(F_dev, sample)):
    # Call form parses under both Python 2 and Python 3.
    print(key)

In [None]:
from snorkel.lf_helpers import *
def LF_min_col(c):
    """Labeling function keyed on 'min' appearing in the temp span's column n-grams.

    Args:
        c: candidate exposing a `temp` attribute accepted by `get_col_ngrams`.

    Returns:
        -1 if 'min' or 'min.' is among `get_col_ngrams(c.temp)`, otherwise 0.
    """
    # Materialize once: the helper was previously invoked twice per candidate,
    # once for each membership test.
    col_ngrams = list(get_col_ngrams(c.temp))
    return -1 if ('min' in col_ngrams or 'min.' in col_ngrams) else 0

# Try the labeling function on the sampled candidate; the cell displays its return value.
LF_min_col(sample)

The End.