# STG_TEMP_MAX

## Setup

In [None]:
# If necessary:
import os
os.remove('snorkel.db')

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
# import os
# zip_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html.zip'
# os.system('unzip %s' % zip_file)

## Parsing

In [20]:
from snorkel import SnorkelSession
session = SnorkelSession()

### Define Parser

In [21]:
import os
from snorkel.parser import CorpusParser, HTMLParser, OmniParser
from snorkel.utils import get_ORM_instance
from snorkel.queries import split_corpus

docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html/'
doc_parser = HTMLParser(path=docs_path)
context_parser = OmniParser()
cp = CorpusParser(doc_parser, context_parser, max_docs=100) 

### Run Parser

In [22]:
%time corpus = cp.parse_corpus(name='Hardware', session=session)

session.add(corpus)
session.commit()

CPU times: user 4min 32s, sys: 18.4 s, total: 4min 50s
Wall time: 37min 19s


### Split Corpus

In [23]:
from snorkel.models import Corpus

corpus = get_ORM_instance(Corpus, session, 'Hardware')
split_corpus(session, corpus, train=0.8, development=0.2, test=0, seed=3)

78 Documents added to corpus Hardware Training
20 Documents added to corpus Hardware Development


In [24]:
# If necessary
import os
os.system('cp snorkel.db snorkel.db\ corpus');

## Extraction

In [None]:
# If necessary:
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ corpus snorkel.db');

# import sys
# sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

# from snorkel import SnorkelSession
# session = SnorkelSession()

In [25]:
from snorkel.models import candidate_subclass

Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Define Matchers

In [26]:
from snorkel.matchers import RegexMatchSpan, Union

eeca_matcher = RegexMatchSpan(rgx='([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z\/]{0,5}[0-9]?[A-Z]?([-][A-Z0-9]{1,7})?([-][A-Z0-9]{1,2})?)')
jedec_matcher = RegexMatchSpan(rgx='([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)')
jis_matcher = RegexMatchSpan(rgx='(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})')
others_matcher = RegexMatchSpan(rgx='((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)')
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

temp_matcher = RegexMatchSpan(rgx=r'1[4-6]0', longest_match_only=False)
# temp_matcher = RegexMatchSpan(rgx=r'-[5-7][05]', longest_match_only=False)

### Define ContextSpaces

In [27]:
import os
from collections import defaultdict
from hardware_utils import OmniNgramsPart, OmniNgramsTemp, get_gold_dict

# Make parts list
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
gold_parts = get_gold_dict(gold_file, doc_on=True, part_on=True, val_on=False)
parts_by_doc = defaultdict(set)
for part in gold_parts:
    parts_by_doc[part[0]].add(part[1])
    
part_ngrams = OmniNgramsPart(parts_by_doc=parts_by_doc, n_max=3)
temp_ngrams = OmniNgramsTemp(n_max=2)

### Define Candidate Throttler

In [28]:
# from hardware_utils import PartThrottler
# part_throttler = PartThrottler()
part_throttler = None

### Run CandidateExtractor

In [29]:
from snorkel.models import Corpus
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance
ce = CandidateExtractor(Part_Temp, [part_ngrams, temp_ngrams], [parts_matcher, temp_matcher], throttler=part_throttler)

for corpus_name in ['Hardware Training', 'Hardware Development']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = ce.extract(\
        corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()

Extracting Candidates from Corpus (Hardware Training)
CPU times: user 2min 40s, sys: 0 ns, total: 2min 40s
Wall time: 2min 36s
Candidate Set (Hardware Training Candidates) contains 125608 Candidates
Extracting Candidates from Corpus (Hardware Development)
CPU times: user 1min 4s, sys: 0 ns, total: 1min 4s
Wall time: 1min 3s
Candidate Set (Hardware Development Candidates) contains 54827 Candidates


### Assess Recall

In [30]:
from snorkel.models import Candidate
from hardware_utils import entity_level_total_recall

all_candidates = session.query(Candidate).all()
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
(tp, fp, fn) = entity_level_total_recall(
    all_candidates, gold_file, 'stg_temp_max', relation=True)

Preparing candidates...
Scoring on Entity-Level Total Recall
Entity-level Candidates extracted: 1168 
Entity-level Gold: 831
Intersection Candidates: 831
----------------------------------------
Overlap with Gold:  1.0000



In [31]:
# If necessary
import os
os.system('cp snorkel.db snorkel.db\ candidates');

## Gold Labels

In [None]:
# If necessary
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ candidates snorkel.db');

# import sys
# sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

# from snorkel import SnorkelSession
# session = SnorkelSession()

# from snorkel.models import candidate_subclass
# Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

In [32]:
import os
from snorkel.models import CandidateSet
from hardware_utils import load_hardware_labels

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
for set_name in ['Training', 'Development']:
    candidate_set_name = 'Hardware %s Candidates' % set_name
    candidates = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name).one()
    label_set_name = 'Hardware %s Candidates -- Gold' % set_name
    annotation_key_name = 'Hardware %s Labels -- Gold' % set_name
    %time gold_candidates, annotation_key = load_hardware_labels(session,\
                           label_set_name, \
                           annotation_key_name, \
                           candidates, \
                           gold_file, \
                           'stg_temp_max')
    candidates_gold = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name + ' -- Gold').one()
    print "%d/%d Candidates in %s have positive Labels" % (
        len(candidates_gold), len(candidates), candidates)

Loading 125608 candidate labels
CPU times: user 4min 40s, sys: 1.47 s, total: 4min 41s
Wall time: 4min 41s
117700/125608 Candidates in Candidate Set (Hardware Training Candidates) have positive Labels
Loading 54827 candidate labels
CPU times: user 1min 53s, sys: 1.55 s, total: 1min 54s
Wall time: 1min 54s
42091/54827 Candidates in Candidate Set (Hardware Development Candidates) have positive Labels


In [None]:
# If necessary
import os
os.system('cp snorkel.db snorkel.db\ labels');

## Features

In [1]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ labels snorkel.db');

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Extract Features

In [2]:
from snorkel.models import CandidateSet
from snorkel.fast_annotations import FeatureManager
from snorkel.utils import get_ORM_instance

train = get_ORM_instance(CandidateSet, session, 'Hardware Training Candidates')
dev   = get_ORM_instance(CandidateSet, session, 'Hardware Development Candidates')

feature_manager = FeatureManager()
%time F_train = feature_manager.create(session, train, 'Train Features')
%time F_dev = feature_manager.update(session, dev, 'Train Features', expand_key_set=False)

Bulk upserting 23216491 annotations...
Done.
Loading sparse Feature matrix...
CPU times: user 24min 36s, sys: 3min 21s, total: 27min 58s
Wall time: 27min 55s
Bulk upserting 6975862 annotations...
Done.
Loading sparse Feature matrix...
CPU times: user 11min 29s, sys: 1min 25s, total: 12min 55s
Wall time: 12min 53s


In [3]:
# If necessary:
import os
os.system('cp snorkel.db snorkel.db\ featurized');

## LFs

In [None]:
# If necessary
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ featurized snorkel.db');

# from snorkel import SnorkelSession
# session = SnorkelSession()

# import sys
# sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

# from snorkel.models import candidate_subclass
# Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

# from snorkel.models import CandidateSet
# train = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Training Candidates').one()

### Define LFs

In [4]:
from snorkel.fast_annotations import LabelManager
from snorkel.lf_helpers import *
import re
label_manager = LabelManager()

LFs = []

# POSITIVE

def LF_to_left(c):
    return 1 if 'to' in get_left_ngrams(c.temp, window=2) else 0
LFs.append(LF_to_left)

def LF_negative_left(c):
    return 1 if any([re.match(r'-\s*\d+', ngram) for ngram in get_left_ngrams(c.temp, window=4)]) else 0
LFs.append(LF_negative_left)

def LF_storage_row(c):
    return 1 if 'storage' in get_row_ngrams(c.temp) else 0
LFs.append(LF_storage_row)

def LF_operating_row(c):
    return 1 if 'operating' in get_row_ngrams(c.temp) else 0
LFs.append(LF_operating_row)

def LF_temperature_row(c):
    return 1 if 'temperature' in get_row_ngrams(c.temp) else 0
LFs.append(LF_temperature_row)

def LF_tstg_row(c):
    return 1 if overlap(
        ['tstg','stg','ts'], 
        list(get_row_ngrams(c.temp))) else 0
LFs.append(LF_tstg_row)

# NEGATIVE

def LF_not_temp_relevant(c):
    return -1 if not overlap(
        ['storage','temperature','tstg','ts','stg'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_not_temp_relevant)

def LF_temp_outside_table(c):
    return -1 if c.temp.parent.row is None else 0
LFs.append(LF_temp_outside_table)

def LF_too_many_numbers_row(c):
    num_numbers = list(get_row_ngrams(c.temp, attrib="ner_tags")).count('number')
    return -1 if num_numbers >= 3 else 0
LFs.append(LF_too_many_numbers_row)

def LF_collector_aligned(c):
    return -1 if overlap(
        ['collector', 'collector-current', 'collector-base', 'collector-emitter'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_collector_aligned)

def LF_current_aligned(c):
    ngrams = get_aligned_ngrams(c.temp)
    return -1 if overlap(
        ['current', 'dc', 'ic'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_current_aligned)

def LF_voltage_row_temp(c):
    ngrams = get_aligned_ngrams(c.temp)
    return -1 if overlap(
        ['voltage', 'cbo', 'ceo', 'ebo', 'v'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_voltage_row_temp)

def LF_voltage_row_part(c):
    ngrams = get_aligned_ngrams(c.part)
    return -1 if overlap(
        ['voltage', 'cbo', 'ceo', 'ebo', 'v'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_voltage_row_part)

def LF_typ_row(c):
    return -1 if overlap(
        ['typ', 'typ.'],
        list(get_row_ngrams(c.temp))) else 0
LFs.append(LF_typ_row)

def LF_test_condition_aligned(c):
    return -1 if overlap(
        ['test', 'condition'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_test_condition_aligned)

def LF_complement_left_row(c):
    return -1 if 'complement' in get_row_ngrams(c.part) else 0
LFs.append(LF_complement_left_row)

### Apply LFs

In [5]:
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
L_train

Bulk upserting 2009728 annotations...
Done.
Loading sparse Label matrix...
CPU times: user 7min 53s, sys: 1min 49s, total: 9min 43s
Wall time: 9min 41s


<125608x16 sparse matrix of type '<type 'numpy.float64'>'
	with 369250 stored elements in Compressed Sparse Row format>

### Assess LF accuracy

In [6]:
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()
%time L_train.lf_stats(train_gold)

CPU times: user 2.67 s, sys: 344 ms, total: 3.02 s
Wall time: 2.82 s


Unnamed: 0,accuracy,conflicts,coverage,j,overlaps
LF_to_left,1.0,0.024943,0.099309,0,0.099309
LF_negative_left,1.0,0.047386,0.123121,1,0.123121
LF_storage_row,1.0,0.040595,0.150572,2,0.150572
LF_operating_row,1.0,0.017929,0.061214,3,0.061214
LF_temperature_row,1.0,0.06619,0.257794,4,0.179033
LF_tstg_row,1.0,0.025986,0.134028,5,0.134028
LF_not_temp_relevant,0.08533,3.2e-05,0.737811,6,0.662728
LF_temp_outside_table,0.038574,0.025683,0.446421,7,0.446421
LF_too_many_numbers_row,0.145155,0.002524,0.217741,8,0.217741
LF_collector_aligned,0.0,0.0,0.053675,9,0.053675


In [7]:
# If necessary
import os
os.system('cp snorkel.db snorkel.db\ features');

## Learn and Evaluate

In [8]:
# If necessary:
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ features snorkel.db');

# from snorkel import SnorkelSession
# session = SnorkelSession()

# import sys
# sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

# from snorkel.models import candidate_subclass
# Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

# from snorkel.models import CandidateSet
# train = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Training Candidates').one()
# dev = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Development Candidates').one()

# from snorkel.annotations import FeatureManager, LabelManager
# feature_manager = FeatureManager()
# %time F_train = feature_manager.load(session, train, 'Train Features')
# %time F_dev = feature_manager.load(session, dev, 'Train Features')

# label_manager = LabelManager()
# %time L_train = label_manager.load(session, train, 'LF Labels')

In [9]:
from snorkel.learning import NaiveBayes

gen_model = NaiveBayes()
gen_model.train(L_train, n_iter=15000, rate=1e-2)
%time gen_model.save(session, 'Generative Params')
train_marginals = gen_model.marginals(L_train)

Training marginals (!= 0.5):	125608
Features:			16
Begin training for rate=0.01, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.452869
	Learning epoch = 250	Gradient mag. = 0.564128
	Learning epoch = 500	Gradient mag. = 0.630443
	Learning epoch = 750	Gradient mag. = 0.672689
	Learning epoch = 1000	Gradient mag. = 0.693796
	Learning epoch = 1250	Gradient mag. = 0.685008
	Learning epoch = 1500	Gradient mag. = 0.677330
	Learning epoch = 1750	Gradient mag. = 0.591285
	Learning epoch = 2000	Gradient mag. = 0.469344
	Learning epoch = 2250	Gradient mag. = 0.409590
	Learning epoch = 2500	Gradient mag. = 0.374805
	Learning epoch = 2750	Gradient mag. = 0.350799
	Learning epoch = 3000	Gradient mag. = 0.331325
	Learning epoch = 3250	Gradient mag. = 0.314159
	Learning epoch = 3500	Gradient mag. = 0.299941
	Learning epoch = 3750	Gradient mag. = 0.288118
	Learning epoch = 4000	Gradient mag. = 0.268550
	Learning epoch = 4250	Gradient mag. = 0.247818
	Learning epoch = 4500	Gradient mag. = 0.229963
	Lea

In [10]:
from snorkel.learning import LogReg

disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=2000, rate=1e-4)
%time disc_model.save(session, "Discriminative Params")

Training marginals (!= 0.5):	125608
Features:			22868
Using gradient descent...
	Learning epoch = 0	Step size = 0.0001
	Loss = 87064.831056	Gradient magnitude = 196215.375928
	Learning epoch = 100	Step size = 9.04792147114e-05
	Loss = 4768.075273	Gradient magnitude = 6981.204399
	Learning epoch = 200	Step size = 8.18648829479e-05
	Loss = 4400.244714	Gradient magnitude = 5537.460408
	Learning epoch = 300	Step size = 7.40707032156e-05
	Loss = 4241.282188	Gradient magnitude = 5031.927436
	Learning epoch = 400	Step size = 6.70185906007e-05
	Loss = 4114.580209	Gradient magnitude = 4236.232545
	Learning epoch = 500	Step size = 6.06378944861e-05
	Loss = 3997.399195	Gradient magnitude = 3067.108084
	Learning epoch = 600	Step size = 5.48646907485e-05
	Loss = 3896.164824	Gradient magnitude = 1020.319800
	Learning epoch = 700	Step size = 4.96411413431e-05
	Loss = 3879.900478	Gradient magnitude = 25.331688
	Learning epoch = 800	Step size = 4.4914914861e-05
	Loss = 3877.257707	Gradient magnitude = 

In [11]:
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()

In [12]:
dev_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates -- Gold').one()

In [13]:
from snorkel.models import CandidateSet
from snorkel.annotations import LabelManager
label_manager = LabelManager()
L_dev = label_manager.load(session, dev, 'Hardware Development Labels -- Gold')

In [14]:
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, dev_gold)

Calibration plot:
Recall-corrected Noise-aware Model
Pos. class accuracy: 0.254567484735
Neg. class accuracy: 1.0
Corpus Precision 1.0
Corpus Recall    0.255
Corpus F1        0.406
----------------------------------------
TP: 10715 | FP: 0 | TN: 12736 | FN: 31376

Recall-corrected Noise-aware Model
Pos. class accuracy: 0.254567484735
Neg. class accuracy: 1.0
Corpus Precision 1.0
Corpus Recall    0.255
Corpus F1        0.406
----------------------------------------
TP: 10715 | FP: 0 | TN: 12736 | FN: 31376





In [16]:
import os
import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

In [17]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Development').one()
(TP, FP, FN) = entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'stg_temp_max')

Scoring on Entity-Level Gold Data
Corpus Precision 1.0
Corpus Recall    1.0
Corpus F1        1.0
----------------------------------------
TP: 185 | FP: 0 | FN: 0



In [None]:
from pprint import pprint
FN_list = sorted(list(FN))
pprint(FN_list)

In [None]:
from hardware_utils import entity_to_candidates

entity = FN_list[0]
print entity
print

matches = entity_to_candidates(entity, fn)
print "# Matches: %d" % len(matches)
candidate = matches[0]
print candidate
print

print disc_model.get_candidate_score(candidate, F_dev)
print
pprint(disc_model.get_candidate_feature_weights(candidate, F_dev))

In [None]:
from hardware_utils import part_error_analysis
part_error_analysis(candidate)

In [None]:
import os
os.system('cp snorkel.db snorkel.db\ final');