# dissipation baselines

## Extraction

In [None]:
# If necessary: reset the working database from the 'snorkel.db corpus' snapshot.
import os
import shutil

CORPUS_SNAPSHOT = 'snorkel.db corpus'
# Check the snapshot first so a missing file never costs us the working db.
# The previous `cp` through os.system discarded its exit status, so a failed
# copy went unnoticed and the session below opened whatever was left.
if not os.path.exists(CORPUS_SNAPSHOT):
    raise IOError("snapshot '%s' not found; snorkel.db left untouched" % CORPUS_SNAPSHOT)
# os.remove raises OSError on a missing file, so only remove what exists.
if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')
shutil.copyfile(CORPUS_SNAPSHOT, 'snorkel.db')

# Put the tutorial directory on the import path
# (presumably where hardware_utils lives — confirm).
import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

In [None]:
from snorkel.models import candidate_subclass

# Candidate type used throughout this notebook: a two-argument relation whose
# arguments are named 'part' and 'dissipation' (the LFs read c.part / c.dissipation).
Part_Dissipation = candidate_subclass('Part_Dissipation', ['part','dissipation'])

### Define Matchers

In [None]:
from snorkel.matchers import RegexMatchSpan, Union

# Part-number patterns. The matcher names suggest the EECA / JEDEC / JIS naming
# schemes (presumably — confirm against the tutorial docs).
# NOTE(review): the leading character classes are lowercase while the suffix
# classes are uppercase; parts such as 'BC546' only match if RegexMatchSpan
# ignores case by default — confirm.

# 'b' + one letter from the set + optional whitespace/w/x/y/z + 3-5 digits,
# then optional letter/slash/digit suffixes and up to two hyphenated groups.
eeca_matcher = RegexMatchSpan(rgx='([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z\/]{0,5}[0-9]?[A-Z]?([-][A-Z0-9]{1,7})?([-][A-Z0-9]{1,2})?)')
# 1/2/3 + 'N' + 3-4 digits, then up to five letters, an optional digit and an optional letter.
jedec_matcher = RegexMatchSpan(rgx='([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)')
# '2S' + one letter from the set + 2-4 digits.
jis_matcher = RegexMatchSpan(rgx='(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})')
# One of the listed manufacturer-style prefixes + 2-4 digits + up to three letters,
# followed by up to two optional hyphenated groups.
others_matcher = RegexMatchSpan(rgx='((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)')
# A span counts as a part if any of the four patterns accepts it.
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

# Dissipation values: two digits followed by 0 or 5 (e.g. 250, 625).
# NOTE(review): whether the pattern must cover the whole span depends on
# RegexMatchSpan's defaults — confirm.
dissipation_matcher = RegexMatchSpan(rgx=r'\d\d[05]')

### Define ContextSpaces

In [None]:
import os
from collections import defaultdict
from snorkel.candidates import OmniNgrams
from snorkel.lf_helpers import *
from hardware_utils import OmniNgramsPart, get_gold_dict

# Collect, per document, the set of part numbers listed in the gold CSV.
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
gold_parts = get_gold_dict(gold_file, doc_on=True, part_on=True, val_on=False)
parts_by_doc = defaultdict(set)
for gold_entry in gold_parts:
    doc_name, part_name = gold_entry[0], gold_entry[1]
    parts_by_doc[doc_name].add(part_name)

# Alternative configurations, kept for reference. Uncomment exactly one group
# (and comment out the active one at the bottom) to reproduce that setting.
#
# Baseline 1: all candidates, no splitting/linking, majority label
# part_ngrams        = OmniNgrams(n_max=1)
# dissipation_ngrams = OmniNgrams(n_max=1)
# throttler          = None
#
# Baseline 2: candidates w/ throttler, no splitting/linking, majority label
# part_ngrams        = OmniNgrams(n_max=1)
# dissipation_ngrams = OmniNgrams(n_max=1)
# throttler = lambda x: overlap(['power','dissipation','mw'], get_aligned_ngrams(x[1]))
#
# Current 1: candidates (w/ throttler?), splitting, no linking, supervised learning
# part_ngrams        = OmniNgramsPart(n_max=2)
# dissipation_ngrams = OmniNgrams(n_max=1)
# throttler = lambda x: overlap(['power','dissipation','mw'], get_aligned_ngrams(x[1]))

# Active configuration.
# Oracle 1: candidates (w/ throttler?), splitting/linking, supervised learning
dissipation_ngrams = OmniNgrams(n_max=1)
part_ngrams        = OmniNgramsPart(n_max=2, parts_by_doc=parts_by_doc)
# Keep a candidate only when the n-grams aligned with its second argument
# (x[1]) include one of these words.
throttler = lambda x: overlap(
    ['power','dissipation','mw'],
    get_aligned_ngrams(x[1]))

### Run CandidateExtractor

In [None]:
from snorkel.models import Corpus
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance
ce = CandidateExtractor(Part_Dissipation, 
                        [part_ngrams, dissipation_ngrams], 
                        [parts_matcher, dissipation_matcher], 
                        throttler=throttler)

for corpus_name in ['Hardware Training', 'Hardware Development']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = ce.extract(\
        corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()

In [None]:
# If necessary: snapshot the database after candidate extraction.
import os
import shutil
# shutil.copyfile raises if the copy fails; the exit status of the previous
# `cp` through os.system was discarded, so a failed snapshot went unnoticed.
shutil.copyfile('snorkel.db', 'snorkel.db candidates')

### Assess Baseline Performance

In [None]:
# If necessary
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ candidates snorkel.db');

# import sys
# sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

# from snorkel import SnorkelSession
# session = SnorkelSession()

# from snorkel.models import candidate_subclass
# Part_Dissipation = candidate_subclass('Part_Dissipation', ['part','dissipation'])

If we need to determine the majority label:

In [None]:
# from snorkel.models import CandidateSet
# from snorkel.utils import get_ORM_instance
# from hardware_utils import candidates_to_entities, count_labels

# # map train candidates to entities
# train_candidates = get_ORM_instance(CandidateSet, session, 'Hardware Training Candidates')
# train_entities   = candidates_to_entities(train_candidates)

# # get majority label for training set
# gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# gold_entities = get_gold_dict(gold_file, attrib='dev_dissipation')
# T, F = count_labels()
# print "True: %d, False: %d" % (T, F)

In [None]:
# from snorkel.models import Corpus, CandidateSet
# from hardware_utils import entity_level_f1

# dev_corpus = get_ORM_instance(Corpus, session, 'Hardware Development')
# dev_candidates = get_ORM_instance(CandidateSet, session, 'Hardware Development Candidates')
# # dev_entities   = candidates_to_entities(dev_candidates)

# # consider all candidates as True
# tp = set([c for c in dev_candidates])
# fp = set()
# tn = set()
# fn = set()
# TP, FP, FN = entity_level_f1(tp, fp, tn, fn, gold_file, dev_corpus, 'dev_dissipation')

## Gold Labels

In [None]:
# If necessary
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ candidates snorkel.db');

# import sys
# sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

# from snorkel import SnorkelSession
# session = SnorkelSession()

# from snorkel.models import candidate_subclass
# Part_Dissipation = candidate_subclass('Part_Dissipation', ['part','dissipation'])

In [None]:
import os
from snorkel.models import CandidateSet
from hardware_utils import load_hardware_labels

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
for set_name in ['Training', 'Development']:
    candidate_set_name = 'Hardware %s Candidates' % set_name
    candidates = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name).one()
    label_set_name = 'Hardware %s Candidates -- Gold' % set_name
    annotation_key_name = 'Hardware %s Labels -- Gold' % set_name
    %time gold_candidates, annotation_key = load_hardware_labels(session,\
                           label_set_name, \
                           annotation_key_name, \
                           candidates, \
                           gold_file, \
                           'dev_dissipation')
    candidates_gold = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name + ' -- Gold').one()
    print "%d/%d Candidates in %s have positive Labels" % (
        len(candidates_gold), len(candidates), candidates)

In [None]:
# If necessary: snapshot the database after the gold labels were loaded.
import os
import shutil
# shutil.copyfile raises if the copy fails; the exit status of the previous
# `cp` through os.system was discarded, so a failed snapshot went unnoticed.
shutil.copyfile('snorkel.db', 'snorkel.db labels')

## Features

In [None]:
# If necessary:
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ labels snorkel.db');

# import sys
# sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

# from snorkel import SnorkelSession
# session = SnorkelSession()

# from snorkel.models import candidate_subclass
# Part_Dissipation = candidate_subclass('Part_Dissipation', ['part','dissipation'])

### Extract Features

In [None]:
from snorkel.models import CandidateSet
from snorkel.fast_annotations import FeatureManager
from snorkel.utils import get_ORM_instance

# Candidate sets created by the extraction step, looked up by name.
train = get_ORM_instance(CandidateSet, session, 'Hardware Training Candidates')
dev   = get_ORM_instance(CandidateSet, session, 'Hardware Development Candidates')

feature_manager = FeatureManager()
# Featurize the training candidates under the name 'Train Features'.
%time F_train = feature_manager.create(session, train, 'Train Features')
# Featurize the dev candidates against the same 'Train Features' name.
# expand_key_set=False presumably stops dev-only features from being added
# to the key set — TODO confirm against FeatureManager.update.
%time F_dev = feature_manager.update(session, dev, 'Train Features', expand_key_set=False)

In [None]:
# If necessary: snapshot the database after featurization.
import os
import shutil
# shutil.copyfile raises if the copy fails; the exit status of the previous
# `cp` through os.system was discarded, so a failed snapshot went unnoticed.
shutil.copyfile('snorkel.db', 'snorkel.db featurized')

## LFs

In [None]:
# If necessary
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ featurized snorkel.db');

# from snorkel import SnorkelSession
# session = SnorkelSession()

# import sys
# sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

# from snorkel.models import candidate_subclass
# Part_Dissipation = candidate_subclass('Part_Dissipation', ['part','dissipation'])

# from snorkel.models import CandidateSet
# train = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Training Candidates').one()

### Define LFs

In [None]:
from snorkel.fast_annotations import LabelManager
from snorkel.lf_helpers import *
import re
label_manager = LabelManager()

LFs = []

def LF_complement_phrase_part(c):
    """Return -1 when 'complement'/'complementary' overlaps get_phrase_ngrams(c.part), else 0."""
    if overlap(['complement','complementary'], get_phrase_ngrams(c.part)):
        return -1
    return 0
LFs.append(LF_complement_phrase_part)

def LF_complement_phrase_dissipation(c):
    """Return -1 when 'complement'/'complementary' overlaps get_phrase_ngrams(c.dissipation), else 0."""
    if overlap(['complement','complementary'], get_phrase_ngrams(c.dissipation)):
        return -1
    return 0
LFs.append(LF_complement_phrase_dissipation)

def LF_complement_neighbor_dissipation(c):
    """Return -1 when 'complement'/'complementary' overlaps get_neighbor_phrase_ngrams(c.dissipation), else 0."""
    neighbor_ngrams = get_neighbor_phrase_ngrams(c.dissipation)
    if overlap(['complement','complementary'], neighbor_ngrams):
        return -1
    return 0
LFs.append(LF_complement_neighbor_dissipation)

def LF_top_mark_col_part(c):
    """Return -1 when 'top' or 'mark' overlaps get_col_ngrams(c.part), else 0."""
    if overlap(['top','mark'], get_col_ngrams(c.part)):
        return -1
    return 0
LFs.append(LF_top_mark_col_part)

def LF_endswith_D_part(c):
    """Return -1 when the part's span text ends with an uppercase 'D', else 0."""
    if c.part.get_span().endswith('D'):
        return -1
    return 0
LFs.append(LF_endswith_D_part)

def LF_default_positive(c):
    """Return 1 unless 'complement'/'complementary' appears near the candidate, else 0.

    The words are looked for in three n-gram sources: the part's phrase, the
    dissipation's phrase, and the dissipation's neighbor phrases — the same
    sources the three LF_complement_* functions inspect individually.
    """
    # Imported here so this LF does not depend on `chain` leaking into the
    # namespace through `from snorkel.lf_helpers import *`.
    from itertools import chain
    nearby_ngrams = chain(
        get_phrase_ngrams(c.part),
        get_phrase_ngrams(c.dissipation),
        get_neighbor_phrase_ngrams(c.dissipation))
    if overlap(['complement','complementary'], nearby_ngrams):
        return 0
    return 1
LFs.append(LF_default_positive)

In [None]:
from snorkel.models import Candidate
from hardware_utils import entity_to_candidates

candidates = session.query(Candidate).all()
print len(candidates)
# matches = entity_to_candidates((u'SIEMS01215-1', u'BC856', u'NPN'), candidates)
# c = matches[0]
c = candidates[0]
print candidates[0]

print LF_complement_phrase_part(c)
print LF_complement_phrase_dissipation(c)
print LF_complement_neighbor_dissipation(c)
print LF_top_mark_col_part(c)
print LF_endswith_D_part(c)
print LF_default_positive(c)

### Apply LFs

In [None]:
# Apply every function in LFs to the training candidates under the name
# 'LF Labels' (the same name the "Learn and Evaluate" section loads later).
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
# Bare expression: display the resulting label matrix.
L_train

### Assess LF accuracy

In [None]:
# Gold candidate set for training, created by the "Gold Labels" section.
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()
# Per-LF statistics computed against the gold set.
%time L_train.lf_stats(train_gold)

In [None]:
# If necessary: snapshot the database after the LFs were applied.
# This snapshot ('snorkel.db features') is the one restored at the top of
# "Learn and Evaluate"; it is distinct from 'snorkel.db featurized'.
import os
import shutil
# shutil.copyfile raises if the copy fails; the exit status of the previous
# `cp` through os.system was discarded, so a failed snapshot went unnoticed.
shutil.copyfile('snorkel.db', 'snorkel.db features')

## Learn and Evaluate

In [1]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ features snorkel.db');

from snorkel import SnorkelSession
session = SnorkelSession()

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel.models import candidate_subclass
Part_Dissipation = candidate_subclass('Part_Dissipation', ['part','dissipation'])

from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()
dev = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates').one()

from snorkel.annotations import FeatureManager, LabelManager
feature_manager = FeatureManager()
%time F_train = feature_manager.load(session, train, 'Train Features')
%time F_dev = feature_manager.load(session, dev, 'Train Features')

label_manager = LabelManager()
%time L_train = label_manager.load(session, train, 'LF Labels')

CPU times: user 10 s, sys: 399 ms, total: 10.4 s
Wall time: 10.4 s
CPU times: user 5.81 s, sys: 239 ms, total: 6.05 s
Wall time: 6.07 s
CPU times: user 374 ms, sys: 13.8 ms, total: 388 ms
Wall time: 404 ms


In [2]:
from snorkel.learning import NaiveBayes

# Fit the NaiveBayes model to the LF label matrix.
gen_model = NaiveBayes()
# NOTE(review): n_iter / rate are hand-picked; the logged gradient magnitude is
# still decreasing slowly in the captured output — revisit if results matter.
gen_model.train(L_train, n_iter=15000, rate=1e-2)
%time gen_model.save(session, 'Generative Params')
# Marginals over the training candidates; passed to disc_model.train in the next cell.
train_marginals = gen_model.marginals(L_train)

Training marginals (!= 0.5):	6147
Features:			6
Begin training for rate=0.01, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.010293
	Learning epoch = 250	Gradient mag. = 0.010638
	Learning epoch = 500	Gradient mag. = 0.010447
	Learning epoch = 750	Gradient mag. = 0.010262
	Learning epoch = 1000	Gradient mag. = 0.010082
	Learning epoch = 1250	Gradient mag. = 0.009907
	Learning epoch = 1500	Gradient mag. = 0.009737
	Learning epoch = 1750	Gradient mag. = 0.009571
	Learning epoch = 2000	Gradient mag. = 0.009410
	Learning epoch = 2250	Gradient mag. = 0.009253
	Learning epoch = 2500	Gradient mag. = 0.009101
	Learning epoch = 2750	Gradient mag. = 0.008953
	Learning epoch = 3000	Gradient mag. = 0.008809
	Learning epoch = 3250	Gradient mag. = 0.008669
	Learning epoch = 3500	Gradient mag. = 0.008532
	Learning epoch = 3750	Gradient mag. = 0.008400
	Learning epoch = 4000	Gradient mag. = 0.008271
	Learning epoch = 4250	Gradient mag. = 0.008145
	Learning epoch = 4500	Gradient mag. = 0.008023
	Learni

In [3]:
from snorkel.learning import LogReg

# Train LogReg on the training feature matrix, using the marginals from
# gen_model (not the gold labels) as targets.
disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=2000, rate=1e-5)
%time disc_model.save(session, "Discriminative Params")

Training marginals (!= 0.5):	6147
Features:			16102
Using gradient descent...
	Learning epoch = 0	Step size = 1e-05
	Loss = 4260.775719	Gradient magnitude = 298.721469
	Learning epoch = 100	Step size = 9.04792147114e-06
	Loss = 4254.700548	Gradient magnitude = 52.696830
	Learning epoch = 200	Step size = 8.18648829479e-06
	Loss = 4253.166191	Gradient magnitude = 37.789619
	Learning epoch = 300	Step size = 7.40707032156e-06
	Loss = 4252.403175	Gradient magnitude = 30.802047
	Learning epoch = 400	Step size = 6.70185906007e-06
	Loss = 4251.941239	Gradient magnitude = 26.911635
	Learning epoch = 500	Step size = 6.06378944861e-06
	Loss = 4251.629708	Gradient magnitude = 24.501080
	Learning epoch = 600	Step size = 5.48646907485e-06
	Loss = 4251.406324	Gradient magnitude = 22.897253
	Learning epoch = 700	Step size = 4.96411413431e-06
	Loss = 4251.239887	Gradient magnitude = 21.795142
	Learning epoch = 800	Step size = 4.4914914861e-06
	Loss = 4251.112795	Gradient magnitude = 21.018045
	Learning

In [4]:
# Gold candidate set for the training split; .one() raises unless exactly one set has this name.
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()

In [5]:
# Gold candidate set for the development split; passed to disc_model.score below.
dev_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates -- Gold').one()

In [6]:
from snorkel.models import CandidateSet
from snorkel.annotations import LabelManager
label_manager = LabelManager()
# Load the dev gold labels; the name matches the annotation key
# 'Hardware %s Labels -- Gold' written in the "Gold Labels" section.
L_dev = label_manager.load(session, dev, 'Hardware Development Labels -- Gold')

In [7]:
# Candidate-level evaluation on the dev split. The four returned collections
# are reused by the entity-level scoring and error-analysis cells below.
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, dev_gold)

Calibration plot:
Recall-corrected Noise-aware Model
Pos. class accuracy: 0.147843942505
Neg. class accuracy: 0.899094437257
Corpus Precision 0.48
Corpus Recall    0.148
Corpus F1        0.226
----------------------------------------
TP: 72 | FP: 78 | TN: 695 | FN: 415

Recall-corrected Noise-aware Model
Pos. class accuracy: 0.147843942505
Neg. class accuracy: 0.899094437257
Corpus Precision 0.48
Corpus Recall    0.148
Corpus F1        0.226
----------------------------------------
TP: 72 | FP: 78 | TN: 695 | FN: 415





In [8]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

# Score the candidate-level results from the previous cell against the gold CSV
# at entity level, for the 'dev_dissipation' attribute on the dev corpus.
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Development').one()
# FP is sorted and printed in the next cell for error analysis.
(TP, FP, FN) = entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'dev_dissipation')

Scoring on Entity-Level Gold Data
Corpus Precision 0.406
Corpus Recall    0.195
Corpus F1        0.264
----------------------------------------
TP: 26 | FP: 38 | FN: 107



In [9]:
from pprint import pprint
# sorted() accepts any iterable and returns a new list, so no intermediate copy is needed.
FP_list = sorted(FP)
pprint(FP_list)

[(u'BC546_DIOTEC', u'BC546', u'100'),
 (u'BC546_DIOTEC', u'BC546', u'200'),
 (u'BC546_DIOTEC', u'BC547', u'100'),
 (u'BC546_DIOTEC', u'BC547', u'200'),
 (u'BC550', u'BC546', u'550'),
 (u'BC550', u'BC549', u'550'),
 (u'BC550', u'BC550', u'550'),
 (u'CSEMS05382-1', u'BC546', u'250'),
 (u'CSEMS05382-1', u'BC546A', u'250'),
 (u'CSEMS05382-1', u'BC546B', u'250'),
 (u'CSEMS05382-1', u'BC547', u'250'),
 (u'CSEMS05382-1', u'BC547A', u'250'),
 (u'CSEMS05382-1', u'BC547B', u'250'),
 (u'CSEMS05382-1', u'BC547C', u'250'),
 (u'CSEMS05382-1', u'BC548', u'250'),
 (u'CSEMS05382-1', u'BC548A', u'250'),
 (u'CSEMS05382-1', u'BC548B', u'250'),
 (u'CSEMS05382-1', u'BC548C', u'250'),
 (u'DISES00189-1', u'BC546', u'100'),
 (u'DISES00189-1', u'BC546', u'200'),
 (u'DISES00189-1', u'BC546XBK', u'100'),
 (u'DISES00189-1', u'BC546XBK', u'200'),
 (u'DISES00189-1', u'BC546XBK', u'500'),
 (u'DISES00189-1', u'BC547', u'100'),
 (u'DISES00189-1', u'BC547', u'200'),
 (u'DISES00192-1', u'BC807', u'200'),
 (u'DISES00192-1

In [10]:
from hardware_utils import entity_to_candidates

for i in range(1):
    entity = FP_list[i]
    print entity
    print

    matches = entity_to_candidates(entity, fp)
    # print "# Matches: %d" % len(matches)
    candidate = matches[0]
    # print candidate
    print

    print disc_model.get_candidate_score(candidate, F_dev)
    print
    pprint(disc_model.get_candidate_feature_weights(candidate, F_dev))

(u'BC546_DIOTEC', u'BC546', u'100')



IndexError: list index out of range

In [None]:
for ngram in candidate.part.get_attrib_tokens():
    print "CONTAINS_%s_[%s]" % ('words'.upper(), ngram)

In [None]:
from hardware_utils import part_error_analysis
# Relies on `candidate` having been assigned by the error-analysis loop above.
part_error_analysis(candidate)

In [None]:
import os
import shutil
# Snapshot the final database. shutil.copyfile raises if the copy fails; the
# exit status of the previous `cp` through os.system was discarded.
shutil.copyfile('snorkel.db', 'snorkel.db final')

The End.