## Part V: Training a Model with Data Programming

In [1]:
# If necessary: restore the working database from the snapshot that is stored
# next to it under the file name "snorkel.db features".
import os
import shutil

# shutil.copy overwrites snorkel.db in place, so no prior os.remove is needed
# (os.remove raised OSError whenever snorkel.db did not exist yet). A missing
# snapshot now raises an error instead of the shell `cp` failing silently
# after the working database had already been deleted.
shutil.copy('snorkel.db features', 'snorkel.db');

In [2]:
# Enable IPython's autoreload extension (mode 2) so edited modules are
# re-imported automatically while iterating on the notebook.
%load_ext autoreload
%autoreload 2

from snorkel import SnorkelSession
# Session handle used by the query/load/save calls in the rest of the notebook.
session = SnorkelSession()

In [3]:
from snorkel.models import candidate_subclass

# Declare the Part_Temp candidate type with two named arguments, `part` and
# `temp` (read later in this notebook as `.part` and `.temp`).
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Loading the `CandidateSet`, feature matrix, and label matrix

In [4]:
from snorkel.models import CandidateSet

# Look up the training candidate set by its stored name.
train_name_filter = CandidateSet.name == 'Hardware Training Candidates'
train = session.query(CandidateSet).filter(train_name_filter).one()

In [5]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

# Load the training feature matrix stored under the key 'Train Features';
# %time reports how long the load takes.
%time F_train = feature_manager.load(session, train, 'Train Features')

CPU times: user 1min 29s, sys: 2.51 s, total: 1min 32s
Wall time: 1min 31s


In [6]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

# Load the training label matrix stored under the key 'LF Labels';
# %time reports how long the load takes.
%time L_train = label_manager.load(session, train, 'LF Labels')

CPU times: user 632 ms, sys: 24 ms, total: 656 ms
Wall time: 655 ms


## Train Generative Model

In [7]:
from snorkel.learning import NaiveBayes

# Fit the generative model on the training label matrix.
# A previously tried setting was n_iter=3000, rate=1e-5.
gen_train_kwargs = dict(n_iter=15000, rate=1e-2)
gen_model = NaiveBayes()
gen_model.train(L_train, **gen_train_kwargs)

Training marginals (!= 0.5):	75908
Features:			20
Begin training for rate=0.01, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.572158
	Learning epoch = 250	Gradient mag. = 0.982846
	Learning epoch = 500	Gradient mag. = 1.401972
	Learning epoch = 750	Gradient mag. = 1.260039
	Learning epoch = 1000	Gradient mag. = 0.992298
	Learning epoch = 1250	Gradient mag. = 0.857444
	Learning epoch = 1500	Gradient mag. = 0.779610
	Learning epoch = 1750	Gradient mag. = 0.588800
	Learning epoch = 2000	Gradient mag. = 0.487811
	Learning epoch = 2250	Gradient mag. = 0.414160
	Learning epoch = 2500	Gradient mag. = 0.362378
	Learning epoch = 2750	Gradient mag. = 0.323178
	Learning epoch = 3000	Gradient mag. = 0.292466
	Learning epoch = 3250	Gradient mag. = 0.268084
	Learning epoch = 3500	Gradient mag. = 0.248648
	Learning epoch = 3750	Gradient mag. = 0.233141
	Learning epoch = 4000	Gradient mag. = 0.220764
	Learning epoch = 4250	Gradient mag. = 0.210876
	Learning epoch = 4500	Gradient mag. = 0.197469
	Lear

In [8]:
# Store the trained generative model through the session under the name
# 'Generative Params'.
gen_model.save(session, 'Generative Params')

In [9]:
# Marginals computed by the generative model over the training label matrix;
# they are passed to the discriminative model's train() call later on.
train_marginals = gen_model.marginals(L_train)

## Train Discriminative Model

In [10]:
from snorkel.learning import LogReg

# Fit the discriminative model on the training feature matrix, using the
# generative model's marginals as the training signal.
disc_train_kwargs = dict(n_iter=4500, rate=1e-3)
disc_model = LogReg()
disc_model.train(F_train, train_marginals, **disc_train_kwargs)

Training marginals (!= 0.5):	33093
Features:			24073
Using gradient descent...
	Learning epoch = 0	Step size = 0.001
	Loss = 22938.319646	Gradient magnitude = 62831.366282
	Learning epoch = 100	Step size = 0.000904792147114
	Loss = 43391.402484	Gradient magnitude = 5459.416507
	Learning epoch = 200	Step size = 0.000818648829479
	Loss = 31504.367903	Gradient magnitude = 14696.829717
	Learning epoch = 300	Step size = 0.000740707032156
	Loss = 53642.152794	Gradient magnitude = 5459.480778
	Learning epoch = 400	Step size = 0.000670185906007
	Loss = 26562.768417	Gradient magnitude = 5459.444492
	Learning epoch = 500	Step size = 0.000606378944861
	Loss = 27931.479692	Gradient magnitude = 18120.553107
	Learning epoch = 600	Step size = 0.000548646907485
	Loss = 14809.948572	Gradient magnitude = 5456.451445
	Learning epoch = 700	Step size = 0.000496411413431
	Loss = 9496.789460	Gradient magnitude = 5453.709111
	Learning epoch = 800	Step size = 0.00044914914861
	Loss = 18735.689691	Gradient magn

In [11]:
# Sanity check: shape of the learned weight vector (the training log reports
# the feature count to compare against).
disc_model.w.shape

(24073,)

In [12]:
# Store the trained discriminative model through the session under the name
# "Discriminative Params"; %time reports how long the save takes.
%time disc_model.save(session, "Discriminative Params")

CPU times: user 2.8 s, sys: 120 ms, total: 2.92 s
Wall time: 2.92 s


## Assess Performance on Development Set

In [13]:
from snorkel.models import CandidateSet

# Look up the development candidate set by its stored name.
dev_name_filter = CandidateSet.name == 'Hardware Development Candidates'
dev = session.query(CandidateSet).filter(dev_name_filter).one()

### Loading the Development Feature Matrix

In [14]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()
# Load the development feature matrix. The 'Train Features' key is reused on
# purpose, presumably so the dev columns line up with the features the model
# was trained on -- TODO confirm against FeatureManager.load.
%time F_dev = feature_manager.load(session, dev, 'Train Features')

CPU times: user 54.4 s, sys: 1.47 s, total: 55.9 s
Wall time: 55.5 s


In [15]:
# Load the development label matrix stored under the gold-label key.
L_dev = label_manager.load(session, dev, "Hardware Development Labels -- Gold")

In [16]:
# Look up the gold development candidate set by its stored name.
gold_name_filter = CandidateSet.name == 'Hardware Development Candidates -- Gold'
gold_dev_set = session.query(CandidateSet).filter(gold_name_filter).one()

In [17]:
# Dimensions of the development feature matrix; the second dimension should
# equal the length of disc_model.w for scoring to make sense.
F_dev.shape

(18971, 24073)

In [18]:
# Score the discriminative model on the development set. The four returned
# collections (true/false positives and negatives) feed the entity-level
# scoring and the false-positive inspection further down.
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, gold_dev_set)

Calibration plot:
Recall-corrected Noise-aware Model
Pos. class accuracy: 0.782620156401
Neg. class accuracy: 0.627404479578
Corpus Precision 0.759
Corpus Recall    0.783
Corpus F1        0.771
----------------------------------------
TP: 8907 | FP: 2828 | TN: 4762 | FN: 2474

Recall-corrected Noise-aware Model
Pos. class accuracy: 0.782620156401
Neg. class accuracy: 0.627404479578
Corpus Precision 0.759
Corpus Recall    0.783
Corpus F1        0.771
----------------------------------------
TP: 8907 | FP: 2828 | TN: 4762 | FN: 2474



In [19]:
from hardware_utils import part_error_analysis, table_info
# Disabled scratch snippet: prints both spans of a candidate `c` together with
# each span's parent and the parent's (row_num, col_num). `c` is not defined in
# this cell, so bind it (e.g. from a loop over `fp`) before re-enabling.
#     print c[0]
#     print c[0].parent
#     print (c[0].parent.row_num, c[0].parent.col_num)
#     print c[1]
#     print c[1].parent
#     print (c[1].parent.row_num, c[1].parent.col_num)

In [20]:
# for c in list(fp)[:5]:
#     print c
# c = list(fp)[10]
# error_analysis(c)
# c = list(fp)[10]
# part_error_analysis(c)
# print c.temp.parent.row_num

# for c in list(fp)[:5]:
#     print c
# print c.temp.parent.html_anc_tags

In [21]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

# Locate the gold CSV under the Snorkel checkout. Fail with an actionable
# message (still a KeyError, as before) when SNORKELHOME is not set.
snorkel_home = os.environ.get('SNORKELHOME')
if snorkel_home is None:
    raise KeyError('SNORKELHOME must be set to the root of the Snorkel checkout')
# os.path.join copes with a trailing separator in SNORKELHOME, unlike plain
# string concatenation.
gold_file = os.path.join(
    snorkel_home, 'tutorials', 'tables', 'data', 'hardware', 'hardware_gold.csv')
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Development').one()
# Entity-level scoring of the dev-set predictions; 'stg_temp_min' is passed as
# the final argument (presumably the gold attribute to score -- TODO confirm).
(TP, FP, FN) = entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'stg_temp_min')

Scoring on Entity-Level Gold Data
Corpus Precision 0.717
Corpus Recall    1.0
Corpus F1        0.835
----------------------------------------
TP: 152 | FP: 60 | FN: 0



In [22]:
# Entity-level false negatives. The parenthesized single-argument form runs
# under both Python 2 and Python 3 and matches the print(...) calls used
# elsewhere in this notebook.
print(FN)

set([])


In [23]:
from pprint import pprint
# Pretty-print the entity-level false positives returned by entity_level_f1,
# one tuple per line.
pprint(FP)

set([(u'ONSMS04099-1', u'BC237', u'-65'),
     (u'ONSMS04099-1', u'BC237A', u'-65'),
     (u'ONSMS04099-1', u'BC237B', u'-65'),
     (u'ONSMS04099-1', u'BC237C', u'-65'),
     (u'ONSMS04099-1', u'BC238', u'-65'),
     (u'ONSMS04099-1', u'BC238B', u'-65'),
     (u'ONSMS04099-1', u'BC238C', u'-65'),
     (u'ONSMS04099-1', u'BC239', u'-65'),
     (u'ONSMS04099-1', u'BC239C', u'-65'),
     (u'PHGLS19500-1', u'MMBT3904', u'-55'),
     (u'PNJIS01593-1', u'BC856', u'-65'),
     (u'PNJIS01593-1', u'BC856A', u'-65'),
     (u'PNJIS01593-1', u'BC856B', u'-65'),
     (u'PNJIS01593-1', u'BC857', u'-65'),
     (u'PNJIS01593-1', u'BC857A', u'-65'),
     (u'PNJIS01593-1', u'BC857B', u'-65'),
     (u'PNJIS01593-1', u'BC857C', u'-65'),
     (u'PNJIS01593-1', u'BC858', u'-65'),
     (u'PNJIS01593-1', u'BC858A', u'-65'),
     (u'PNJIS01593-1', u'BC858B', u'-65'),
     (u'PNJIS01593-1', u'BC858C', u'-65'),
     (u'PNJIS01593-1', u'BC859', u'-65'),
     (u'PNJIS01593-1', u'BC859B', u'-65'),
     (u'PNJIS015

In [24]:
import os
import shutil

# Snapshot the working database as "snorkel.db final". shutil.copy raises on
# failure, whereas os.system('cp ...') discarded the exit status and relied on
# a backslash-escaped space being interpreted by the shell.
shutil.copy('snorkel.db', 'snorkel.db final');

In [25]:
from pprint import pprint
# Feature weights of the discriminative model, requested for the columns of
# the training feature matrix.
all_weights = disc_model.get_feature_weights(F_train)

In [26]:
# Show only the first 60 entries to keep the output readable.
pprint(all_weights[:60])

[(AnnotationKey (TAB_e2_ROW_WORDS_[temperature]), 11.774387696354319),
 (AnnotationKey (TAB_e2_ROW_WORDS_[t]), 11.571101992002395),
 (AnnotationKey (TAB_e2_ROW_WORDS_[storage]), 10.254517912459201),
 (AnnotationKey (TAB_e2_ROW_WORDS_[% c]), 8.355035669182524),
 (AnnotationKey (TAB_e2_ROW_WORDS_[stg]), 8.243764393149322),
 (AnnotationKey (TAB_e2_ROW_WORDS_[t stg]), 8.242765450291907),
 (AnnotationKey (TAB_e2_ROW_WORDS_[storage temperature]), 8.154853905423959),
 (AnnotationKey (TAB_e2_ROW_WORDS_[+150]), 7.62012982887663),
 (AnnotationKey (TAB_e2_COL_WORDS_[65]), 6.056303695414473),
 (AnnotationKey (TAB_e2_ROW_WORDS_[%]), 5.75508992719667),
 (AnnotationKey (TAB_e2_ROW_WORDS_[c]), 5.170811732721998),
 (AnnotationKey (TAB_e2_COL_WORDS_[+150]), 5.0151702315085345),
 (AnnotationKey (TAB_e2_ROW_WORDS_[range]), 4.909644840384417),
 (AnnotationKey (TAB_e2_ROW_WORDS_[temperature range]), 4.909644840384417),
 (AnnotationKey (TAB_e2_W_LEFT_POS_3_[CD TO CD]), 4.887981932104748),
 (AnnotationKey (TA

In [27]:
# Report the total number of dev-set false positives, then keep only those
# whose part span belongs to the document printed as "Document ZETXS01948-1".
zetx_doc_label = "Document ZETXS01948-1"
print(len(fp))
fp_zetx = set(
    cand for cand in fp
    if str(cand.part.parent.document) == zetx_doc_label
)

print(len(fp_zetx))

2828
265


In [29]:
# Pick one false positive from the ZETXS01948-1 subset for inspection.
# next(iter(...)) leaves fp_zetx intact, so re-running this cell neither
# shrinks the set nor moves on to a different candidate, which is what
# fp_zetx.pop() did on every execution.
if not fp_zetx:
    # Same exception type set.pop() raised on an empty set.
    raise KeyError('fp_zetx is empty: no false positives to sample')
sample = next(iter(fp_zetx))
# Display the chosen candidate as the cell output.
sample

Part_Temp(ImplicitSpan("BC857A-Z3E", parent=106607, words=[0,0], position=[16]), ImplicitSpan("-65", parent=106275, words=[0,0], position=[0]))


In [35]:
from snorkel.utils import get_keys_by_candidate

# Print every key that get_keys_by_candidate returns for the sampled false
# positive in F_dev (presumably the features active for that candidate).
# The single-argument print(...) form runs under both Python 2 and Python 3
# and matches the print(...) calls used elsewhere in this notebook.
for key in sorted(get_keys_by_candidate(F_dev, sample)):
    print(key)

AnnotationKey (TAB_e1_ROW_WORDS_[75])
AnnotationKey (TAB_e1_WORD_SEQ_[Group])
AnnotationKey (TAB_e1_LEMMA_SEQ_[group])
AnnotationKey (TAB_e1_W_LEMMA_L_2_R_1_[group b]_[b])
AnnotationKey (TAB_e1_COL_WORDS_[forward])
AnnotationKey (TAB_e1_COL_WORDS_[typ])
AnnotationKey (TAB_e1_COL_WORDS_[group c])
AnnotationKey (TAB_e2_COL_WORDS_[330])
AnnotationKey (TAB_e1_ROW_WORDS_[group b])
AnnotationKey (TAB_e1_COL_WORDS_[group b])
AnnotationKey (TAB_e1_W_LEFT_2_[group b])
AnnotationKey (TAB_e2_ROW_WORDS_[-45])
AnnotationKey (TAB_e2_ROW_WORDS_[-30])
AnnotationKey (TAB_e1_COL_WORDS_[max])
AnnotationKey (TAB_e2_COL_WORDS_[-200])
AnnotationKey (TAB_e1_ROW_WORDS_[ratio])
AnnotationKey (TAB_e1_COL_WORDS_[group a])
AnnotationKey (TAB_e1_COL_WORDS_[group])
AnnotationKey (TAB_e1_COL_WORDS_[ratio])
AnnotationKey (TAB_e1_COL_WORDS_[collector-base capacitance])
AnnotationKey (TAB_e1_ROW_WORDS_[group a])
AnnotationKey (TAB_e1_ROW_WORDS_[group])
AnnotationKey (TAB_e1_ROW_WORDS_[150])
AnnotationKey (TAB_e1_ROW_WO

In [36]:
# Import only the helper this cell uses instead of `import *`, so it is clear
# where get_row_ngrams comes from and the notebook namespace stays clean.
from snorkel.lf_helpers import get_row_ngrams


def LF_v_row(c):
    """Labeling function keyed on the n-gram 'v' in the temp span's row.

    Args:
        c: a candidate exposing a `temp` span (e.g. a Part_Temp).

    Returns:
        -1 if 'v' is among the values yielded by
        get_row_ngrams(c.temp, infer=True), otherwise 0.
        NOTE(review): 0 presumably means "abstain" and -1 "negative" in the
        labeling-function convention -- confirm against the label manager.
    """
    return -1 if 'v' in get_row_ngrams(c.temp, infer=True) else 0


# Try the labeling function on the sampled false positive.
LF_v_row(sample)

-1

The End.