# DC_GAIN_MIN

## Setup

In [None]:
# If necessary: delete a stale database left over from a previous run.
import os
# Guard the delete so a fresh checkout (no snorkel.db yet) does not raise OSError.
if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

## Parsing

In [None]:
from snorkel import SnorkelSession
session = SnorkelSession()

### Define Parser

In [None]:
import os
from snorkel.parser import CorpusParser, HTMLParser, OmniParser
from snorkel.utils import get_ORM_instance
from snorkel.queries import split_corpus

# Directory of HTML documents, located relative to $SNORKELHOME.
docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html/'
# HTMLParser is pointed at the document directory; OmniParser is built with its defaults.
doc_parser = HTMLParser(path=docs_path)
context_parser = OmniParser()
# max_docs=100 presumably caps how many documents are parsed -- TODO confirm against CorpusParser.
cp = CorpusParser(doc_parser, context_parser, max_docs=100) 

### Run Parser

In [None]:
# Run the corpus parser (timed) and bind the result, named 'Hardware', to `corpus`.
%time corpus = cp.parse_corpus(name='Hardware', session=session)

# Stage the parsed corpus on the session and commit it.
session.add(corpus)
session.commit()

### Split Corpus

In [None]:
from snorkel.models import Corpus

# Look up the 'Hardware' corpus by name, then split it with fractions
# train=0.8 / development=0.2 / test=0 and a fixed seed.
corpus = get_ORM_instance(Corpus, session, 'Hardware')
split_corpus(session, corpus, train=0.8, development=0.2, test=0, seed=4)

In [None]:
# If necessary: snapshot the database as 'snorkel.db corpus'.
import os
import shutil
# shutil.copy raises if the copy fails, whereas the exit status of
# os.system('cp ...') was silently discarded.
shutil.copy('snorkel.db', 'snorkel.db corpus');

## Extraction

In [1]:
# If necessary: restore the 'snorkel.db corpus' snapshot and reopen the session.
import os
import shutil
# shutil.copy overwrites snorkel.db, so no separate delete is needed (the old
# os.remove raised when snorkel.db was absent). A missing snapshot now raises
# before the current database is touched, instead of failing silently.
shutil.copy('snorkel.db corpus', 'snorkel.db');

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

In [2]:
from snorkel.models import candidate_subclass

# Candidate subclass named 'Part_Gain' declared with two arguments, 'part' and 'gain'.
Part_Gain = candidate_subclass('Part_Gain', ['part','gain'])

### Define Matchers

In [3]:
from snorkel.matchers import RegexMatchSpan, Union

# Part-number patterns, one per naming scheme, combined with Union below.
# Raw strings keep the regex backslashes literal (the patterns themselves are
# unchanged) and match the style already used for gain_matcher.
eeca_matcher = RegexMatchSpan(rgx=r'([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z\/]{0,5}[0-9]?[A-Z]?([-][A-Z0-9]{1,7})?([-][A-Z0-9]{1,2})?)')
jedec_matcher = RegexMatchSpan(rgx=r'([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)')
jis_matcher = RegexMatchSpan(rgx=r'(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})')
others_matcher = RegexMatchSpan(rgx=r'((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)')
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

# Gain values: one or more digits followed by a final 0 or 5.
gain_matcher = RegexMatchSpan(rgx=r'\d+[05]', longest_match_only=False)

### Define ContextSpaces

In [4]:
import os
from collections import defaultdict
from snorkel.candidates import OmniNgrams
from hardware_utils import OmniNgramsPart, get_gold_dict

# Make parts list
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# Each gold entry is indexed below as entry[0] (used as the document key) and
# entry[1] (the part) -- presumably (doc, part) tuples; verify against get_gold_dict.
gold_parts = get_gold_dict(gold_file, doc_on=True, part_on=True, val_on=False)
# Group the gold parts by their document key.
parts_by_doc = defaultdict(set)
for part in gold_parts:
    parts_by_doc[part[0]].add(part[1])
    
# n_max is presumably the maximum n-gram length (3 for parts, 1 for gains) -- TODO confirm.
part_ngrams = OmniNgramsPart(parts_by_doc=parts_by_doc, n_max=3)
gain_ngrams = OmniNgrams(n_max=1)

### Define Candidate Throttler

In [5]:
from snorkel.lf_helpers import *
def gain_throttler(spans):
    """
    Decide whether a (part, attribute) span pair should be kept.

    Args:
        spans: a 2-tuple ``(part_span, attr_span)``. It is unpacked in the body
            because tuple parameters in a signature are Python-2-only syntax
            (removed by PEP 3113); callers still pass one tuple argument.

    Returns:
        A truthy value if:
                the part is global (i.e., is not in a table)
                OR
                the part and attr spans share a row number or a column number
            AND
                the attribute (number) is aligned with a "DC gain min"-like word
        and a falsy value otherwise.
    """
    part_span, attr_span = spans
    well_positioned = (part_span.parent.table is None or
                       part_span.parent.row_num == attr_span.parent.row_num or
                       part_span.parent.col_num == attr_span.parent.col_num)
    # `and` short-circuits: the aligned-ngram lookup only runs for pairs that
    # passed the positional check above.
    return (well_positioned and
            overlap(['dc', 'gain', 'hfe', 'fe', 'min'], get_aligned_ngrams(attr_span, infer=True)))

### Run CandidateExtractor

In [6]:
from snorkel.models import Corpus
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance
# The ngram spaces and matchers are passed as parallel lists (part first, gain
# second); gain_throttler is the final argument (presumably applied as a
# candidate filter -- confirm against CandidateExtractor).
ce = CandidateExtractor(Part_Gain, 
                        [part_ngrams, gain_ngrams], 
                        [parts_matcher, gain_matcher],
                        gain_throttler)

# Extract one candidate set per split; each set is named '<corpus name> Candidates'.
for corpus_name in ['Hardware Training', 'Hardware Development']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = ce.extract(\
        corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
# A single commit after the loop covers both candidate sets added above.
session.commit()

Extracting Candidates from Corpus (Hardware Training)
CPU times: user 13min 4s, sys: 2min 52s, total: 15min 56s
Wall time: 15min 55s
Candidate Set (Hardware Training Candidates) contains 371344 Candidates
Extracting Candidates from Corpus (Hardware Development)
CPU times: user 3min 37s, sys: 1min 32s, total: 5min 10s
Wall time: 5min 9s
Candidate Set (Hardware Development Candidates) contains 52468 Candidates


### Assess Recall

In [7]:
from snorkel.models import Candidate
from hardware_utils import entity_level_total_recall

# Query every Candidate in the database (no filter by split) and score them
# against the gold file for the 'dc_gain_min' attribute.
all_candidates = session.query(Candidate).all()
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# The call returns three collections, bound here to tp, fp and fn.
(tp, fp, fn) = entity_level_total_recall(
    all_candidates, gold_file, 'dc_gain_min', relation=True)

Preparing candidates...
Scoring on Entity-Level Total Recall
Entity-level Candidates extracted: 10993 
Entity-level Gold: 1366
Intersection Candidates: 1197
----------------------------------------
Overlap with Gold:  0.8763



In [8]:
print len(tp), len(fp), len(fn)

1197 9796 169


In [9]:
from pprint import pprint
# Pretty-print the first 30 entries of `fn` in sorted order.
pprint(sorted(fn)[:30])

[('2N6426-D', '2N6426', '20000'),
 ('2N6426-D', '2N6426', '30000'),
 ('2N6426-D', '2N6426G', '20000'),
 ('2N6426-D', '2N6426G', '30000'),
 ('2N6426-D', '2N6426RLRAG', '20000'),
 ('2N6426-D', '2N6426RLRAG', '30000'),
 ('2N6426-D', '2N6427', '10000'),
 ('2N6426-D', '2N6427', '14000'),
 ('2N6426-D', '2N6427', '20000'),
 ('2N6426-D', '2N6427G', '10000'),
 ('2N6426-D', '2N6427G', '14000'),
 ('2N6426-D', '2N6427G', '20000'),
 ('2N6426-D', '2N6427RLRAG', '10000'),
 ('2N6426-D', '2N6427RLRAG', '14000'),
 ('2N6426-D', '2N6427RLRAG', '20000'),
 ('2N6427', '2N6427', '10000'),
 ('2N6427', '2N6427', '14000'),
 ('2N6427', '2N6427', '20000'),
 ('2N6427', 'MMBT6427', '10000'),
 ('2N6427', 'MMBT6427', '14000'),
 ('2N6427', 'MMBT6427', '20000'),
 ('BC337', 'BC337', '60'),
 ('BC337', 'BC337-16', '60'),
 ('BC337', 'BC337-25', '160'),
 ('BC337', 'BC337-40', '170'),
 ('BC337', 'BC337-40', '250'),
 ('BC337', 'BC33716BU', '60'),
 ('BC337', 'BC33716TA', '60'),
 ('BC337', 'BC33716TFR', '60'),
 ('BC337', 'BC3372

In [10]:
# If necessary: snapshot the database as 'snorkel.db candidates'.
import os
import shutil
# shutil.copy raises if the copy fails, whereas the exit status of
# os.system('cp ...') was silently discarded.
shutil.copy('snorkel.db', 'snorkel.db candidates');

## Gold Labels

In [11]:
# If necessary
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ candidates snorkel.db');

# import sys
# sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

# from snorkel import SnorkelSession
# session = SnorkelSession()

# from snorkel.models import candidate_subclass
# Part_Gain = candidate_subclass('Part_Gain', ['part','gain'])

In [12]:
import os
from snorkel.models import CandidateSet
from hardware_utils import load_hardware_labels

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
for set_name in ['Training', 'Development']:
    # Fetch this split's candidate set by name; .one() raises unless exactly one row matches.
    candidate_set_name = 'Hardware %s Candidates' % set_name
    candidates = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name).one()
    # Names passed to load_hardware_labels for the gold set and its annotation key.
    label_set_name = 'Hardware %s Candidates -- Gold' % set_name
    annotation_key_name = 'Hardware %s Labels -- Gold' % set_name
    %time gold_candidates, annotation_key = load_hardware_labels(session,\
                           label_set_name, \
                           annotation_key_name, \
                           candidates, \
                           gold_file, \
                           'dc_gain_min')
    # Re-query the '-- Gold' set by name (the same string as label_set_name) to count its members.
    candidates_gold = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name + ' -- Gold').one()
    print "%d/%d Candidates in %s have positive Labels" % (
        len(candidates_gold), len(candidates), candidates)

Loading 371344 candidate labels
CPU times: user 6min 41s, sys: 5.06 s, total: 6min 46s
Wall time: 6min 46s
33229/371344 Candidates in Candidate Set (Hardware Training Candidates) have positive Labels
Loading 52468 candidate labels
CPU times: user 1min 13s, sys: 996 ms, total: 1min 14s
Wall time: 1min 13s
4836/52468 Candidates in Candidate Set (Hardware Development Candidates) have positive Labels


In [13]:
# If necessary: snapshot the database as 'snorkel.db labels'.
import os
import shutil
# shutil.copy raises if the copy fails, whereas the exit status of
# os.system('cp ...') was silently discarded.
shutil.copy('snorkel.db', 'snorkel.db labels');

## Features

In [14]:
# If necessary:
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ labels snorkel.db');

# from snorkel import SnorkelSession
# session = SnorkelSession()

# from snorkel.models import candidate_subclass
# Part_Gain = candidate_subclass('Part_Gain', ['part','gain'])

### Extract Features

In [15]:
from snorkel.models import CandidateSet
from snorkel.fast_annotations import FeatureManager
from snorkel.utils import get_ORM_instance

train = get_ORM_instance(CandidateSet, session, 'Hardware Training Candidates')
dev   = get_ORM_instance(CandidateSet, session, 'Hardware Development Candidates')

feature_manager = FeatureManager()
# Create the feature annotations for the training set under the name 'Train Features'.
%time F_train = feature_manager.create(session, train, 'Train Features')
# The dev set reuses the 'Train Features' name with expand_key_set=False --
# presumably so dev data adds no new feature keys; TODO confirm.
%time F_dev = feature_manager.update(session, dev, 'Train Features', expand_key_set=False)

Bulk upserting 109404366 annotations...
Done.
Loading sparse Feature matrix...
CPU times: user 1h 38min 33s, sys: 18min 58s, total: 1h 57min 31s
Wall time: 1h 57min 17s
Bulk upserting 12881640 annotations...
Done.
Loading sparse Feature matrix...
CPU times: user 22min 25s, sys: 2min 57s, total: 25min 23s
Wall time: 25min 18s


In [16]:
# If necessary: snapshot the database as 'snorkel.db featurized'.
import os
import shutil
# shutil.copy raises if the copy fails, whereas the exit status of
# os.system('cp ...') was silently discarded.
shutil.copy('snorkel.db', 'snorkel.db featurized');

## LFs

In [1]:
# If necessary: restore the 'snorkel.db featurized' snapshot and rebuild the
# names the following cells use (session, Part_Gain, train).
import os
import shutil
# shutil.copy overwrites snorkel.db, so no separate delete is needed (the old
# os.remove raised when snorkel.db was absent). A missing snapshot now raises
# before the current database is touched, instead of failing silently.
shutil.copy('snorkel.db featurized', 'snorkel.db');

from snorkel import SnorkelSession
session = SnorkelSession()

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel.models import candidate_subclass
Part_Gain = candidate_subclass('Part_Gain', ['part','gain'])

from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()

### Define LFs

In [3]:
from snorkel.fast_annotations import LabelManager
from snorkel.lf_helpers import *
import re
label_manager = LabelManager()

LFs = []

# LABELING FUNCTIONS (each returns 1, -1, or 0; several below return -1, so this group is not positive-only)
def LF_same_row(c):
    """Return 1 when ``same_row(c)`` is truthy, otherwise -1 (this LF never returns 0)."""
    if same_row(c):
        return 1
    return -1
LFs.append(LF_same_row)

def LF_dc_current_gain_row(c):
    """Return 1 when ``overlap`` reports a match between 'dc'/'current'/'gain' and the gain's row ngrams, else 0."""
    keywords = ['dc','current','gain']
    row_ngrams = get_row_ngrams(c.gain, infer=True)
    if overlap(keywords, row_ngrams):
        return 1
    return 0
LFs.append(LF_dc_current_gain_row)

def LF_h_fe_row(c):
    """Return 1 only when both exact strings 'h' and 'FE' occur among the gain's row ngrams, else 0.

    The ngrams are requested with ``lower=False`` and compared as-is.
    """
    row_ngrams = set(get_row_ngrams(c.gain, infer=True, lower=False))
    if set(['h','FE']).issubset(row_ngrams):
        return 1
    return 0
LFs.append(LF_h_fe_row)

def LF_not_relevant_aligned(c):
    """Return -1 when none of 'dc', 'current', 'gain', 'h', 'fe' is among the gain's aligned ngrams, else 0."""
    relevant = set(['dc','current','gain','h','fe'])
    aligned = set(get_aligned_ngrams(c.gain))
    if relevant.isdisjoint(aligned):
        return -1
    return 0
LFs.append(LF_not_relevant_aligned)

def LF_start_range(c):
    """Return 1 when ``overlap`` reports '~' or '-' among the gain's right ngrams, else 0."""
    separators = ['~','-']
    if overlap(separators, get_right_ngrams(c.gain)):
        return 1
    return 0
LFs.append(LF_start_range)

def LF_end_range(c):
    """Return -1 when ``overlap`` reports '~' or '-' among the gain's left ngrams, else 0."""
    separators = ['~','-']
    if overlap(separators, get_left_ngrams(c.gain)):
        return -1
    return 0
LFs.append(LF_end_range)

def LF_min_aligned(c):
    """Return 1 when 'min' is among the gain's column ngrams, else 0."""
    if 'min' in get_col_ngrams(c.gain):
        return 1
    return 0
LFs.append(LF_min_aligned)

# TODO: LF where if gain aligned with A and part ends with A, return 1

### Apply LFs

In [4]:
# Create the label matrix for the training set from the LFs list, under the name 'LF Labels'.
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
# Bare expression: displays the matrix summary as the cell output.
L_train

Bulk upserting 2599408 annotations...
Done.
Loading sparse Label matrix...
CPU times: user 12min 45s, sys: 1min 35s, total: 14min 20s
Wall time: 14min 18s


<371344x7 sparse matrix of type '<type 'numpy.float64'>'
	with 756684 stored elements in Compressed Sparse Row format>

### Assess LF accuracy

In [5]:
# Fetch the gold candidate set for the training split; .one() raises unless exactly one row matches.
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()
# Per-LF statistics computed against the gold set (timed).
%time L_train.lf_stats(train_gold)

CPU times: user 912 ms, sys: 116 ms, total: 1.03 s
Wall time: 941 ms


Unnamed: 0,accuracy,conflicts,coverage,j,overlaps
LF_same_row,0.808361,0.51068,1.0,0,0.608748
LF_dc_current_gain_row,0.103034,0.336769,0.418911,1,0.418911
LF_h_fe_row,0.146085,0.328426,0.390929,2,0.390929
LF_not_relevant_aligned,0.959196,0.065788,0.076886,3,0.076886
LF_start_range,0.466179,0.025486,0.025758,4,0.025758
LF_end_range,0.982394,0.03671,0.03671,5,0.03671
LF_min_aligned,0.18571,0.085805,0.088497,6,0.088497


In [6]:
# If necessary: snapshot the database as 'snorkel.db features'.
import os
import shutil
# shutil.copy raises if the copy fails, whereas the exit status of
# os.system('cp ...') was silently discarded.
shutil.copy('snorkel.db', 'snorkel.db features');

## Learn and Evaluate

In [9]:
# If necessary:
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ features snorkel.db');

# from snorkel import SnorkelSession
# session = SnorkelSession()

# import sys
# sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

# from snorkel.models import candidate_subclass
# Part_Gain = candidate_subclass('Part_Gain', ['part','gain'])

from snorkel.models import CandidateSet
# Fetch the training and development candidate sets by name.
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()
dev = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates').one()

# NOTE(review): the managers here come from snorkel.annotations, while the
# "Extract Features" and "Define LFs" cells use snorkel.fast_annotations --
# confirm the two are interchangeable for loading.
from snorkel.annotations import FeatureManager, LabelManager
feature_manager = FeatureManager()
# Load both feature matrices under the shared name 'Train Features' (timed).
%time F_train = feature_manager.load(session, train, 'Train Features')
%time F_dev = feature_manager.load(session, dev, 'Train Features')

# label_manager = LabelManager()
# %time L_train = label_manager.load(session, train, 'LF Labels')

CPU times: user 17min 23s, sys: 2min 31s, total: 19min 54s
Wall time: 19min 46s
CPU times: user 9min 26s, sys: 2min 19s, total: 11min 46s
Wall time: 11min 44s


In [7]:
%%time
from snorkel.learning import NaiveBayes

# NOTE(review): L_train is not defined in this section (its load call is
# commented out in the preceding cell), so this cell relies on L_train still
# being in the kernel from the "Apply LFs" cell. On a fresh kernel, load or
# recreate L_train before running this.
gen_model = NaiveBayes()
gen_model.train(L_train, n_iter=15000, rate=1e-2)
gen_model.save(session, 'Generative Params')
# Marginals over the same label matrix; used below as training targets for LogReg.
train_marginals = gen_model.marginals(L_train)

Training marginals (!= 0.5):	371344
Features:			7
Begin training for rate=0.01, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.292334
	Learning epoch = 250	Gradient mag. = 0.276798
	Learning epoch = 500	Gradient mag. = 0.346478
	Learning epoch = 750	Gradient mag. = 0.420799
	Learning epoch = 1000	Gradient mag. = 0.465599
	Learning epoch = 1250	Gradient mag. = 0.489273
	Learning epoch = 1500	Gradient mag. = 0.504379
	Learning epoch = 1750	Gradient mag. = 0.517095
	Learning epoch = 2000	Gradient mag. = 0.528635
	Learning epoch = 2250	Gradient mag. = 0.483685
	Learning epoch = 2500	Gradient mag. = 0.375717
	Learning epoch = 2750	Gradient mag. = 0.321232
	Learning epoch = 3000	Gradient mag. = 0.296672
	Learning epoch = 3250	Gradient mag. = 0.286089
	Learning epoch = 3500	Gradient mag. = 0.281712
	Learning epoch = 3750	Gradient mag. = 0.280131
	Learning epoch = 4000	Gradient mag. = 0.227506
	Learning epoch = 4250	Gradient mag. = 0.197818
	Learning epoch = 4500	Gradient mag. = 0.186956
	Lear

In [10]:
from snorkel.learning import LogReg

# Train the discriminative model on the training feature matrix, with the
# generative model's marginals as targets.
disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=2000, rate=1e-5)
# Saving is currently disabled:
# %time disc_model.save(session, "Discriminative Params")

Training marginals (!= 0.5):	371344
Features:			34059
Using gradient descent...
	Learning epoch = 0	Step size = 0.0001
	Loss = 257396.046618	Gradient magnitude = 1104114.928628
	Learning epoch = 100	Step size = 9.04792147114e-05
	Loss = 712353.206978	Gradient magnitude = 239436.200121
	Learning epoch = 200	Step size = 8.18648829479e-05
	Loss = 2011944.117405	Gradient magnitude = 768963.472663
	Learning epoch = 300	Step size = 7.40707032156e-05
	Loss = 672070.454878	Gradient magnitude = 252215.671752
	Learning epoch = 400	Step size = 6.70185906007e-05
	Loss = 589733.507116	Gradient magnitude = 206410.920716
	Learning epoch = 500	Step size = 6.06378944861e-05
	Loss = 1756079.197614	Gradient magnitude = 752907.157809
	Learning epoch = 600	Step size = 5.48646907485e-05
	Loss = 1774683.819749	Gradient magnitude = 619520.367906
	Learning epoch = 700	Step size = 4.96411413431e-05
	Loss = 642508.921991	Gradient magnitude = 218669.300936
	Learning epoch = 800	Step size = 4.4914914861e-05
	Loss 

In [11]:
# Gold candidate set for the training split (same query as in "Assess LF accuracy").
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()

In [12]:
# Gold candidate set for the development split; passed to disc_model.score below.
dev_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates -- Gold').one()

In [13]:
from snorkel.models import CandidateSet
from snorkel.annotations import LabelManager
# Rebinds label_manager to a snorkel.annotations.LabelManager (imported just above).
label_manager = LabelManager()
# Load the dev labels stored under the gold annotation key name written in "Gold Labels".
L_dev = label_manager.load(session, dev, 'Hardware Development Labels -- Gold')

In [14]:
# NOTE: this rebinds tp, fp and fn, which the "Assess Recall" cell earlier
# bound to the results of entity_level_total_recall.
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, dev_gold)

Calibration plot:
Recall-corrected Noise-aware Model
Pos. class accuracy: 0.927626137304
Neg. class accuracy: 0.101528384279
Corpus Precision 0.0949
Corpus Recall    0.928
Corpus F1        0.172
----------------------------------------
TP: 4486 | FP: 42796 | TN: 4836 | FN: 350

Recall-corrected Noise-aware Model
Pos. class accuracy: 0.927626137304
Neg. class accuracy: 0.101528384279
Corpus Precision 0.0949
Corpus Recall    0.928
Corpus F1        0.172
----------------------------------------
TP: 4486 | FP: 42796 | TN: 4836 | FN: 350





In [15]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# Fetch the development corpus by name; .one() raises unless exactly one row matches.
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Development').one()
# Pass the candidate-level tp/fp/tn/fn from disc_model.score to the entity-level
# scorer; upper-case TP/FP/FN hold its three return values.
(TP, FP, FN) = entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'dc_gain_min')

Scoring on Entity-Level Gold Data
Corpus Precision 0.1
Corpus Recall    0.682
Corpus F1        0.175
----------------------------------------
TP: 137 | FP: 1229 | FN: 64



In [None]:
from pprint import pprint
# Sorted list of the entries in FN; FN_list is indexed by the next cell.
FN_list = sorted(FN)
pprint(FN_list)

In [None]:
from hardware_utils import entity_to_candidates

# Inspect the first entry of FN_list; raises IndexError if FN_list is empty.
entity = FN_list[0]
print entity
print

# Look up the entries of `fn` that correspond to this entity.
matches = entity_to_candidates(entity, fn)
print "# Matches: %d" % len(matches)
# Takes the first match only; presumably fails if there are no matches -- verify.
candidate = matches[0]
print candidate
print

# Model score for this candidate, followed by its per-feature weights.
print disc_model.get_candidate_score(candidate, F_dev)
print
pprint(disc_model.get_candidate_feature_weights(candidate, F_dev))

In [None]:
from hardware_utils import part_error_analysis
part_error_analysis(candidate)

In [None]:
import os
import shutil
# Snapshot the database as 'snorkel.db final'. shutil.copy raises if the copy
# fails, whereas the exit status of os.system('cp ...') was silently discarded.
shutil.copy('snorkel.db', 'snorkel.db final');