## Part V: Training a Model with Data Programming

In [1]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ features snorkel.db');

In [2]:
%load_ext autoreload
%autoreload 2

from snorkel import SnorkelSession
session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from snorkel.models import candidate_subclass

Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Loading the `CandidateSet`, feature matrix, and label matrix

In [4]:
from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()

In [63]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

%time F_train = feature_manager.load(session, train, 'Train Features')

CPU times: user 3min 5s, sys: 19.5 s, total: 3min 25s
Wall time: 3min 29s


In [6]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

%time L_train = label_manager.load(session, train, 'LF Labels')

CPU times: user 2.74 s, sys: 136 ms, total: 2.88 s
Wall time: 2.99 s


## Train Generative Model

In [7]:
from snorkel.learning import NaiveBayes

gen_model = NaiveBayes()
# gen_model.train(L_train, n_iter=3000, rate=1e-5)
gen_model.train(L_train, n_iter=15000, rate=1e-2)

Training marginals (!= 0.5):	85377
Features:			18
Begin training for rate=0.01, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.417921
	Learning epoch = 250	Gradient mag. = 0.552370
	Learning epoch = 500	Gradient mag. = 0.685059
	Learning epoch = 750	Gradient mag. = 0.824752
	Learning epoch = 1000	Gradient mag. = 0.908164
	Learning epoch = 1250	Gradient mag. = 0.915530
	Learning epoch = 1500	Gradient mag. = 0.826081
	Learning epoch = 1750	Gradient mag. = 0.727261
	Learning epoch = 2000	Gradient mag. = 0.637984
	Learning epoch = 2250	Gradient mag. = 0.565730
	Learning epoch = 2500	Gradient mag. = 0.506027
	Learning epoch = 2750	Gradient mag. = 0.455836
	Learning epoch = 3000	Gradient mag. = 0.411510
	Learning epoch = 3250	Gradient mag. = 0.345356
	Learning epoch = 3500	Gradient mag. = 0.295165
	Learning epoch = 3750	Gradient mag. = 0.256485
	Learning epoch = 4000	Gradient mag. = 0.226166
	Learning epoch = 4250	Gradient mag. = 0.201976
	Learning epoch = 4500	Gradient mag. = 0.182334
	Lear

In [8]:
gen_model.save(session, 'Generative Params')

In [9]:
train_marginals = gen_model.marginals(L_train)

## Train Discriminative Model

In [38]:
from snorkel.learning import LogReg

disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=2000, rate=5e-4)

Training marginals (!= 0.5):	68829
Features:			19594
Using gradient descent...
	Learning epoch = 0	Step size = 0.0005
	Loss = 47708.627291	Gradient magnitude = 79617.912043
	Learning epoch = 100	Step size = 0.000452396073557
	Loss = 6485.003435	Gradient magnitude = 2435.064917
	Learning epoch = 200	Step size = 0.000409324414739
	Loss = 6057.270125	Gradient magnitude = 2524.610640
	Learning epoch = 300	Step size = 0.000370353516078
	Loss = 5188.565732	Gradient magnitude = 2606.181589
	Learning epoch = 400	Step size = 0.000335092953003
	Loss = 5283.661779	Gradient magnitude = 2620.239240
	Learning epoch = 500	Step size = 0.000303189472431
	Loss = 2414.806603	Gradient magnitude = 1166.856304
	Learning epoch = 600	Step size = 0.000274323453743
	Loss = 3636.575799	Gradient magnitude = 2215.302318
	Learning epoch = 700	Step size = 0.000248205706716
	Loss = 3027.708675	Gradient magnitude = 2336.312523
	Learning epoch = 800	Step size = 0.000224574574305
	Loss = 2687.413415	Gradient magnitude =

In [39]:
disc_model.w.shape

(19594,)

In [12]:
%time disc_model.save(session, "Discriminative Params")

CPU times: user 3.73 s, sys: 171 ms, total: 3.9 s
Wall time: 3.96 s


## Assess Performance on Development Set

In [40]:
from snorkel.models import CandidateSet
dev = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates').one()

### Loading the Development Feature Matrix

In [14]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()
%time F_dev = feature_manager.load(session, dev, 'Train Features')

CPU times: user 1min 33s, sys: 20.2 s, total: 1min 53s
Wall time: 2min 5s


In [15]:
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()

In [16]:
# from snorkel.loaders import create_or_fetch
# from snorkel.models import Label, AnnotationKeySet
# annotation_key_name = "Hardware Development Labels - Gold"
# annotation_key = create_or_fetch(session, AnnotationKey, annotation_key_name)
# session.query(Label).filter(Label.key == annotation_key).count()

In [17]:
from snorkel.models import CandidateSet
from snorkel.annotations import LabelManager
label_manager = LabelManager()
# from snorkel.utils import get_ORM_instance
# dev_labels = get_ORM_instance(AnnotationKeySet, session, 'Hardware Development Labels - Gold')
L_dev = label_manager.load(session, dev, 'Hardware Development Labels -- Gold')

In [18]:
gold_dev_set = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates -- Gold').one()

In [41]:
F_dev.shape

(9502, 19594)

In [42]:
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, gold_dev_set)

Calibration plot:
Recall-corrected Noise-aware Model
Pos. class accuracy: 0.733830845771
Neg. class accuracy: 0.670486435921
Corpus Precision 0.731
Corpus Recall    0.734
Corpus F1        0.733
----------------------------------------
TP: 3835 | FP: 1409 | TN: 2867 | FN: 1391

Recall-corrected Noise-aware Model
Pos. class accuracy: 0.733830845771
Neg. class accuracy: 0.670486435921
Corpus Precision 0.731
Corpus Recall    0.734
Corpus F1        0.733
----------------------------------------
TP: 3835 | FP: 1409 | TN: 2867 | FN: 1391



In [21]:
from hardware_utils import part_error_analysis, table_info
#     print c[0]
#     print c[0].parent
#     print (c[0].parent.row_num, c[0].parent.col_num)
#     print c[1]
#     print c[1].parent
#     print (c[1].parent.row_num, c[1].parent.col_num)

In [22]:
# for c in list(fp)[:5]:
#     print c
# c = list(fp)[10]
# error_analysis(c)
# c = list(fp)[10]
# part_error_analysis(c)
# print c.temp.parent.row_num

# for c in list(fp)[:5]:
#     print c
# print c.temp.parent.html_anc_tags

In [43]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Development').one()
(TP, FP, FN) = entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'stg_temp_min')

Scoring on Entity-Level Gold Data
Corpus Precision 0.801
Corpus Recall    1.0
Corpus F1        0.89
----------------------------------------
TP: 161 | FP: 40 | FN: 0



In [93]:
from pprint import pprint
FP_list = sorted(list(FP))
pprint(FP_list)

[(u'DISES00023-1', u'2N3906', u'-50'),
 (u'DISES00023-1', u'2N3906', u'-75'),
 (u'FAIRS19194-1', u'BC856', u'-50'),
 (u'FAIRS19194-1', u'BC856A', u'-50'),
 (u'FAIRS19194-1', u'BC856B', u'-50'),
 (u'FAIRS19194-1', u'BC856C', u'-50'),
 (u'FAIRS19194-1', u'BC857', u'-50'),
 (u'FAIRS19194-1', u'BC857A', u'-50'),
 (u'FAIRS19194-1', u'BC857B', u'-50'),
 (u'FAIRS19194-1', u'BC857C', u'-50'),
 (u'FAIRS19194-1', u'BC858', u'-50'),
 (u'FAIRS19194-1', u'BC858A', u'-50'),
 (u'FAIRS19194-1', u'BC858B', u'-50'),
 (u'FAIRS19194-1', u'BC858C', u'-50'),
 (u'FAIRS19194-1', u'BC859', u'-50'),
 (u'FAIRS19194-1', u'BC859A', u'-50'),
 (u'FAIRS19194-1', u'BC859B', u'-50'),
 (u'FAIRS19194-1', u'BC859C', u'-50'),
 (u'FAIRS19194-1', u'BC860', u'-50'),
 (u'FAIRS19194-1', u'BC860A', u'-50'),
 (u'FAIRS19194-1', u'BC860B', u'-50'),
 (u'FAIRS19194-1', u'BC860C', u'-50'),
 (u'KECCS03676-1', u'BC856', u'-50'),
 (u'KECCS03676-1', u'BC856A', u'-50'),
 (u'KECCS03676-1', u'BC856B', u'-50'),
 (u'KECCS03676-1', u'BC857', u'

In [100]:
# TODO: canonize some of these helper functions
def entity_to_candidates(entity, candidate_subset):
    matches = []
    for c in candidate_subset:
        if (c.part.parent.document.name, c.part.get_span(), c.temp.get_span()) == entity:
            matches.append(c)
    return matches

def features_by_weight(candidate, feature_matrix, feature_weights):
    feats = set(get_keys_by_candidate(feature_matrix, candidate))
    return sorted([f_w for f_w in feature_weights if f_w[0] in feats], 
                  key=lambda x:abs(x[1]), reverse=True)

In [101]:
entity = sorted(list(FP))[2]
print entity
print

matches = entity_to_candidates(entity, fp)
candidate = matches[0]
print candidate
print

pprint(features_by_weight(candidate, F_dev, feature_weights))

(u'FAIRS19194-1', u'BC856', u'-50')

Part_Temp(ImplicitSpan("BC856", parent=7455, words=[0,1], position=[1]), ImplicitSpan("-50", parent=88803, words=[0,0], position=[0]))

[(AnnotationKey (TAB_e2_HTML_ANC_TAG_[td]), 7.748686425705095),
 (AnnotationKey (TAB_e2_HTML_ANC_TAG_[tr]), 7.748686425705095),
 (AnnotationKey (TAB_e2_HTML_ANC_TAG_[table]), 7.748686425705095),
 (AnnotationKey (TAB_e2_COL_LEMMAS_[)]), 1.3959788387068297),
 (AnnotationKey (TAB_e1_HTML_TAG_h1), 1.2742354017962279),
 (AnnotationKey (TAB_e1_HTML_ANC_TAG_[h1]), 0.5195218411003486),
 (AnnotationKey (TAB_e2_COL_LEMMAS_[:]), 0.49680486281841213),
 (AnnotationKey (TAB_e1_HTML_ANC_TAG_[html]), 0.4710099186281582),
 (AnnotationKey (TAB_e2_HTML_ANC_TAG_[body]), 0.4710099186281582),
 (AnnotationKey (TAB_e2_HTML_ANC_TAG_[html]), 0.4710099186281582),
 (AnnotationKey (TAB_e2_HTML_TAG_p), 0.4710099186281582),
 (AnnotationKey (TAB_e2_HTML_ANC_TAG_[p]), 0.4710099186281582),
 (AnnotationKey (TAB_e2_ROW_LEMMAS_[:]), 0.32175067810987884

In [98]:
from snorkel.utils import get_keys_by_candidate

# F_train
for f in get_keys_by_candidate(F_dev, candidate)[:]: print f

AnnotationKey (TAB_e1_HTML_ATTR_[style])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[html])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[body])
AnnotationKey (TAB_e2_HTML_TAG_p)
AnnotationKey (TAB_e2_HTML_ATTR_[style])
AnnotationKey (TAB_e2_HTML_ATTR_[class])
AnnotationKey (TAB_e2_HTML_ANC_TAG_[html])
AnnotationKey (TAB_e2_HTML_ANC_TAG_[body])
AnnotationKey (TAB_e2_HTML_ANC_TAG_[p])
AnnotationKey (TAB_e1_HTML_TAG_h1)
AnnotationKey (TAB_e1_HTML_ANC_TAG_[h1])
AnnotationKey (TAB_e2_HTML_ANC_TAG_[table])
AnnotationKey (TAB_e2_HTML_ANC_TAG_[tr])
AnnotationKey (TAB_e2_HTML_ANC_TAG_[td])
AnnotationKey (TAB_e2_ROW_NUM_[3])
AnnotationKey (TAB_e2_COL_NUM_[1])
AnnotationKey (TAB_e2_COL_LEMMAS_[collector])
AnnotationKey (TAB_e2_COL_LEMMAS_[collector current])
AnnotationKey (TAB_e2_COL_LEMMAS_[current])
AnnotationKey (TAB_e2_COL_LEMMAS_[:])
AnnotationKey (TAB_e2_COL_LEMMAS_[voltage])
AnnotationKey (TAB_e2_COL_LEMMAS_[(])
AnnotationKey (TAB_e2_COL_LEMMAS_[)])
AnnotationKey (TAB_e2_COL_LEMMAS_[collector-emitter])
Ann

In [26]:
import os
os.system('cp snorkel.db snorkel.db\ final');

In [99]:
from pprint import pprint
feature_weights = disc_model.get_feature_weights(F_train, reverse=False)

TypeError: get_feature_weights() got an unexpected keyword argument 'reverse'

In [72]:
feature_weights[0][0]

AnnotationKey (TAB_e2_ROW_LEMMAS_[temperature])

In [46]:
pprint(feature_weights[:60])

[(AnnotationKey (TAB_e2_ROW_LEMMAS_[temperature]), 21.80470912655765),
 (AnnotationKey (TAB_e2_ROW_LEMMAS_[t]), 21.278126010205835),
 (AnnotationKey (TAB_e2_ROW_LEMMAS_[storage]), 17.90559549236179),
 (AnnotationKey (TAB_e2_ROW_LEMMAS_[% c]), 15.599014367520684),
 (AnnotationKey (TAB_e2_ROW_LEMMAS_[+150]), 15.167438781987517),
 (AnnotationKey (TAB_e2_ROW_LEMMAS_[stg]), 14.291424781156206),
 (AnnotationKey (TAB_e2_ROW_LEMMAS_[t stg]), 14.289430354927065),
 (AnnotationKey (TAB_e2_ROW_LEMMAS_[storage temperature]), 13.409695418499963),
 (AnnotationKey (TAB_e2_ROW_LEMMAS_[%]), 13.107554257735028),
 (AnnotationKey (TAB_e2_COL_LEMMAS_[65]), 12.947177288663257),
 (AnnotationKey (TAB_e2_ROW_LEMMAS_[c]), 12.388329038726159),
 (AnnotationKey (TAB_e2_RIGHT_LEMMAS_[+150]), 11.636808774913009),
 (AnnotationKey (TAB_e2_RIGHT_LEMMAS_[150]), 10.379317105185747),
 (AnnotationKey (TAB_e2_COL_LEMMAS_[100]), 10.130195545606679),
 (AnnotationKey (TAB_e2_RIGHT_LEMMAS_[to +150]), 9.575210481424996),
 (Annota

In [29]:
fp_zetx = set()
print(len(fn))
for f in fn:
    if (str(f.part.parent.document) == "Document PNJIS00254-1"):
        fp_zetx.add(f)

print(len(fp_zetx))

1335
0


In [None]:
from snorkel.utils import get_keys_by_candidate
for a in sorted(get_keys_by_candidate(F_dev, sample)):
    print a

In [None]:
from snorkel.lf_helpers import *
def LF_min_col(c):
    return -1 if 'min' in get_col_ngrams(c.temp) or 'min.' in get_col_ngrams(c.temp) else 0

LF_min_col(sample)

The End.