## Part V: Training a Model with Data Programming

In [None]:
# If necessary: reset the working database from the 'snorkel.db features'
# snapshot.
import os
import shutil

# Only remove the working copy when it exists, so a fresh checkout does not
# fail before the restore happens.
if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')
# shutil.copyfile needs no shell quoting for the space in the file name and
# raises if the snapshot is missing, instead of failing silently as the
# os.system('cp ...') call did.
shutil.copyfile('snorkel.db features', 'snorkel.db');

In [None]:
# Reload imported modules automatically before each cell executes
# (IPython autoreload extension, mode 2).
%load_ext autoreload
%autoreload 2

from snorkel import SnorkelSession
# Session object passed to every query, load and save call in this notebook.
session = SnorkelSession()

In [None]:
from snorkel.models import candidate_subclass

# Candidate class with two named arguments; the error analysis further down
# reads them as `c.part` and `c.temp`.
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Loading the `CandidateSet`, feature matrix, and label matrix

In [None]:
from snorkel.models import CandidateSet

# Look up the training CandidateSet by name. `.one()` is expected to fail
# unless exactly one row matches (SQLAlchemy-style query -- confirm).
train = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Training Candidates')
    .one()
)

In [None]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

# Load the feature matrix for the training candidates under the name
# 'Train Features'; %time reports how long the load takes.
%time F_train = feature_manager.load(session, train, 'Train Features')

In [None]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

# Load the label matrix for the training candidates under the name
# 'LF Labels'; %time reports how long the load takes.
%time L_train = label_manager.load(session, train, 'LF Labels')

## Train Generative Model

In [None]:
from snorkel.learning import NaiveBayes

# Fit the generative model on the training label matrix only (no features).
gen_model = NaiveBayes()
# Earlier hyper-parameter setting, kept for reference:
# gen_model.train(L_train, n_iter=3000, rate=1e-5)
gen_model.train(L_train, n_iter=15000, rate=1e-2)

In [None]:
# Save the trained generative model through the session under the name
# 'Generative Params'.
gen_model.save(session, 'Generative Params')

In [None]:
# Marginals from the generative model for the training label matrix; these
# are passed to the discriminative model's train() call below.
train_marginals = gen_model.marginals(L_train)

In [None]:
from snorkel.learning import odds_to_prob

# `LFs` is not defined anywhere in this notebook, so printing it
# unconditionally raises NameError in a fresh kernel. Show it only when it
# has been brought into the namespace some other way.
if 'LFs' in globals():
    print(LFs)
# Weights learned by the generative model. The single-argument parenthesised
# print works the same on Python 2 and Python 3.
print(gen_model.w)
# print odds_to_prob(gen_model.w)

## Train Discriminative Model

In [None]:
from snorkel.learning import LogReg

# Fit the LogReg model on the training feature matrix, using the generative
# model's marginals as the training targets.
disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=5000, rate=1e-3)

In [None]:
# Display the shape of the learned weight array.
disc_model.w.shape

In [None]:
# Save the discriminative model through the session under the name
# "Discriminative Params"; %time reports how long the save takes.
%time disc_model.save(session, "Discriminative Params")

## Assess Performance on Development Set

In [None]:
from snorkel.models import CandidateSet

# Look up the development CandidateSet by name. `.one()` is expected to
# fail unless exactly one row matches (SQLAlchemy-style query -- confirm).
dev = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Development Candidates')
    .one()
)

### Loading the Development Feature Matrix

In [None]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()
# Load the feature matrix for the development candidates; %time reports how
# long the load takes.
# NOTE(review): the name passed is 'Train Features' even though the
# candidates are the dev set -- presumably dev was featurized against the
# training feature keys so the columns line up with the trained weights;
# confirm.
%time F_dev = feature_manager.load(session, dev, 'Train Features')

In [None]:
# Load the development candidates' labels stored under the name
# "Hardware Development Labels -- Gold", reusing the label manager created
# in the training section.
L_dev = label_manager.load(session, dev, "Hardware Development Labels -- Gold")

In [None]:
# Look up the gold development CandidateSet by name; it is passed to
# disc_model.score() below.
gold_dev_set = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Development Candidates -- Gold')
    .one()
)

In [None]:
# Display the shape of the development feature matrix.
F_dev.shape

In [None]:
# Score the discriminative model on the development set. The four returned
# collections are kept for the error analysis and entity-level scoring below
# (presumably true/false positives and negatives -- confirm against
# LogReg.score).
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, gold_dev_set)

In [None]:
def error_analysis(c):
    """Print a short report for one Part_Temp candidate.

    Prints the document of the part span's parent first. Then, for the
    `part` and `temp` spans in that order, prints a separator, a heading,
    the span itself and the output of `table_info` for that span.

    Args:
        c: candidate exposing `part` and `temp` span attributes.
    """
    # A single-argument parenthesised print produces the same output as the
    # Python 2 print statement and is also valid Python 3.
    print("Doc: %s" % c.part.parent.document)
    for heading, attr in (("Part:", "part"), ("Temp:", "temp")):
        # Fetch each span lazily so the part section is printed before the
        # temp attribute is touched, as in the original sequence.
        span = getattr(c, attr)
        print("------------")
        print(heading)
        print(span)
        table_info(span)

def table_info(span):
    """Print the table context of `span`.

    Prints the table of the span's parent, then the parent's row and column
    numbers when the parent has a truthy `cell`, and finally the parent
    itself.

    Args:
        span: object whose `parent` exposes `table`, `cell`, and (when
            `cell` is truthy) `row_num` and `col_num`.
    """
    # A single-argument parenthesised print produces the same output as the
    # Python 2 print statement and is also valid Python 3.
    phrase = span.parent
    print("Table: %s" % phrase.table)
    if phrase.cell:
        # Row/column are only read for parents that sit inside a cell.
        print("Row: %s" % phrase.row_num)
        print("Col: %s" % phrase.col_num)
    print("Phrase: %s" % phrase)
#     print c[0]
#     print c[0].parent
#     print (c[0].parent.row_num, c[0].parent.col_num)
#     print c[1]
#     print c[1].parent
#     print (c[1].parent.row_num, c[1].parent.col_num)

In [None]:
# for c in list(fp)[:5]:
#     print c
# c = list(fp)[10]
# error_analysis(c)
# Print the text of the `part` span of every candidate in `fp` (the second
# collection returned by disc_model.score above). Iterating `fp` directly
# avoids building a throwaway list copy.
for c in fp:
    print(c.part.get_span())
# print c.temp.parent.html_anc_tags

In [None]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

# The gold CSV lives inside the Snorkel checkout pointed to by $SNORKELHOME
# (KeyError if the variable is unset). os.path.join copes with a trailing
# separator in the variable and uses the platform's separator.
gold_file = os.path.join(
    os.environ['SNORKELHOME'],
    'tutorials', 'tables', 'data', 'hardware', 'hardware_gold.csv')
# Look up the development corpus by name.
corpus = (
    session.query(Corpus)
    .filter(Corpus.name == 'Hardware Development')
    .one()
)
# Entity-level scoring of the dev-set results against the gold file.
# NOTE(review): 'stg_temp_min' is presumably the gold attribute being
# evaluated -- confirm against hardware_utils.entity_level_f1.
entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'stg_temp_min')

In [None]:
import os
import shutil

# Snapshot the working database as 'snorkel.db final'. shutil.copyfile
# needs no shell quoting for the space in the file name and raises if the
# copy fails, instead of failing silently as os.system('cp ...') did.
shutil.copyfile('snorkel.db', 'snorkel.db final');

The End.