# C_CURRENT_MAX

## Setup

In [None]:
# If necessary:
import os
os.remove('snorkel.db')

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

## Parsing

In [None]:
from snorkel import SnorkelSession
session = SnorkelSession()

### Define Parser

In [None]:
import os
from snorkel.parser import CorpusParser, HTMLParser, OmniParser
from snorkel.utils import get_ORM_instance
from snorkel.queries import split_corpus

docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html/'
doc_parser = HTMLParser(path=docs_path)
context_parser = OmniParser()
cp = CorpusParser(doc_parser, context_parser, max_docs=100) 

### Run Parser

In [None]:
%time corpus = cp.parse_corpus(name='Hardware', session=session)

session.add(corpus)
session.commit()

### Split Corpus

In [None]:
from snorkel.models import Corpus

corpus = get_ORM_instance(Corpus, session, 'Hardware')
split_corpus(session, corpus, train=0.8, development=0.2, test=0, seed=4)

In [None]:
# If necessary
import os
os.system('cp snorkel.db snorkel.db\ corpus');

## Extraction

In [None]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ corpus snorkel.db');

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

In [None]:
from snorkel.models import candidate_subclass

Part_Current = candidate_subclass('Part_Current', ['part','current'])

### Define Matchers

In [None]:
from snorkel.matchers import RegexMatchSpan, Union

eeca_matcher = RegexMatchSpan(rgx='([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z\/]{0,5}[0-9]?[A-Z]?([-][A-Z0-9]{1,7})?([-][A-Z0-9]{1,2})?)')
jedec_matcher = RegexMatchSpan(rgx='([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)')
jis_matcher = RegexMatchSpan(rgx='(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})')
others_matcher = RegexMatchSpan(rgx='((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)')
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

#TODO: This is missing values represented as AMPs rather than milliamps
current_matcher = RegexMatchSpan(rgx=r'\-?([2-9]|1[0-2]?)00', longest_match_only=False)

### Define ContextSpaces

In [None]:
import os
from collections import defaultdict
from hardware_utils import OmniNgramsPart, OmniNgramsTemp, get_gold_dict

# Make parts list
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
gold_parts = get_gold_dict(gold_file, doc_on=True, part_on=True, val_on=False)
parts_by_doc = defaultdict(set)
for part in gold_parts:
    parts_by_doc[part[0]].add(part[1])
    
part_ngrams = OmniNgramsPart(parts_by_doc=parts_by_doc, n_max=3)

# TODO: This is missing the current represented as an Amp rather than a milliamp
current_ngrams = OmniNgramsTemp(n_max=3)

### Run CandidateExtractor

In [None]:
from snorkel.models import Corpus
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance
from hardware_utils import PartCurrentThrottler

part_throttler = PartCurrentThrottler()

ce = CandidateExtractor(Part_Current, 
                        [part_ngrams, current_ngrams], 
                        [parts_matcher, current_matcher],
                        part_throttler)

for corpus_name in ['Hardware Training', 'Hardware Development']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = ce.extract(\
        corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()

### Assess Recall

In [None]:
from snorkel.models import CandidateSet
from hardware_utils import entity_level_total_recall, most_common_document
from snorkel.models import Candidate

all_candidates = session.query(Candidate).all()
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
(tp, fp, fn) = entity_level_total_recall(
    all_candidates, gold_file, 'c_current_max', relation=True)
print len(tp)
print len(fp)
print len(fn)

In [None]:
from hardware_utils import entity_level_total_recall, most_common_document_candidates
max_doc = most_common_document_candidates(all_candidates)
print(max_doc)

In [None]:
from pprint import pprint
fns = list(fn)
pprint(sorted(fns[:50]))

In [None]:
# If necessary
import os
os.system('cp snorkel.db snorkel.db\ candidates');

## Gold Labels

In [None]:
# If necessary
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ candidates snorkel.db');

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Current = candidate_subclass('Part_Current', ['part','current'])

In [None]:
import os
from snorkel.models import CandidateSet
from hardware_utils import load_hardware_labels

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
for set_name in ['Training', 'Development']:
    candidate_set_name = 'Hardware %s Candidates' % set_name
    candidates = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name).one()
    label_set_name = 'Hardware %s Candidates -- Gold' % set_name
    annotation_key_name = 'Hardware %s Labels -- Gold' % set_name
    %time gold_candidates, annotation_key = load_hardware_labels(session,\
                           label_set_name, \
                           annotation_key_name, \
                           candidates, \
                           gold_file, \
                           attrib='c_current_max')
    candidates_gold = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name + ' -- Gold').one()
    print "%d/%d Candidates in %s have positive Labels" % (
        len(candidates_gold), len(candidates), candidates)

In [None]:
# If necessary
import os
os.system('cp snorkel.db snorkel.db\ labels');

## Features

In [None]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ labels snorkel.db');

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Current = candidate_subclass('Part_Current', ['part','current'])

### Extract Features

In [None]:
from snorkel.models import CandidateSet
from snorkel.annotations import FeatureManager
from snorkel.utils import get_ORM_instance

train = get_ORM_instance(CandidateSet, session, 'Hardware Training Candidates')
dev   = get_ORM_instance(CandidateSet, session, 'Hardware Development Candidates')

feature_manager = FeatureManager()
%time F_train = feature_manager.create(session, train, 'Train Features')
%time F_dev = feature_manager.update(session, dev, 'Train Features', expand_key_set=False)

In [None]:
# If necessary:
import os
os.system('cp snorkel.db snorkel.db\ featurized');

## LFs

In [None]:
# If necessary
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ featurized snorkel.db');

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Current = candidate_subclass('Part_Current', ['part','current'])

from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()

### Define LFs

In [None]:
from snorkel.annotations import LabelManager
from snorkel.lf_helpers import *
label_manager = LabelManager()

LFs = []

# TODO: Get rid of generators

# POSITIVE

def LF_head_is_IC(c):
    head_ngrams = set(get_head_ngrams(c.current, axis='row'))
    return 1 if ('ic' in head_ngrams or
                 'i c' in head_ngrams) else 0
LFs.append(LF_head_is_IC)

def LF_current_inside_table(c):
    return 1 if c.current.parent.row is not None else 0
LFs.append(LF_current_inside_table)

def LF_continuous_left(c):
    return 1 if 'continuous' in get_left_ngrams(c.current, window=5) else 0
LFs.append(LF_continuous_left)

def LF_current_row(c):
    return 1 if 'current' in get_row_ngrams(c.current) else 0
LFs.append(LF_current_row)

def LF_collector_row(c):
    return 1 if 'collector' in get_row_ngrams(c.current) else 0
LFs.append(LF_collector_row)
    
def LF_continuous_row(c):
    return 1 if 'continuous' in get_row_ngrams(c.current) else 0
LFs.append(LF_continuous_row)
    
# def LF_collector_current_row(c):
#     row_ngrams = set(get_row_ngrams(c.current))
#     return 1 if ('collector current' in row_ngrams or 
#                  'continuous' in row_ngrams) else 0
# LFs.append(LF_collector_current_row)

def LF_current_unit_row(c):
    row_ngrams = set(get_row_ngrams(c.current))
    return 1 if (
        'ma'                in row_ngrams or
        'a'                 in row_ngrams) else 0
LFs.append(LF_current_unit_row)

def LF_one_number_row(c):
    num_numbers = list(get_row_ngrams(c.current, attrib="ner_tags")).count('number')
    return 1 if num_numbers == 1 else 0
LFs.append(LF_one_number_row)

# NEGATIVE

def LF_not_cc_relevant(c):
    ngrams = get_aligned_ngrams(c.current)
    return -1 if not ('collector' in ngrams or
                      'current' in ngrams) else 0
LFs.append(LF_not_cc_relevant)

def LF_too_many_numbers_row(c):
    num_numbers = list(get_row_ngrams(c.current, attrib="ner_tags")).count('number')
    return -1 if num_numbers >= 3 else 0
LFs.append(LF_too_many_numbers_row)

def LF_thermal_row(c):
    return -1 if 'thermal' in get_row_ngrams(c.current) else 0
LFs.append(LF_collector_row)

def LF_junction_row(c):
    return -1 if 'junction' in get_row_ngrams(c.current) else 0
LFs.append(LF_junction_row)

def LF_resistance_row(c):
    return -1 if 'resistance' in get_row_ngrams(c.current) else 0
LFs.append(LF_resistance_row)

def LF_small_signal_row(c):
    return -1 if 'small-signal' in get_row_ngrams(c.current) else 0
LFs.append(LF_small_signal_row)

def LF_ambient_row(c):
    return -1 if 'ambient' in get_row_ngrams(c.current) else 0
LFs.append(LF_ambient_row)

def LF_part_in_same_row(c):
    return -1 if c.part.parent.row == c.current.parent.row else 0
LFs.append(LF_part_in_same_row)

# def LF_collector_aligned(c):
#     ngrams = set(get_aligned_ngrams(c.current))
#     return -1 if (
#         'collector-base'    in ngrams or
#         'cut-off'           in ngrams or
#         'collector-emitter' in ngrams) else 0
# LFs.append(LF_collector_aligned)

# def LF_voltage_row_temp(c):
#     ngrams = set(get_aligned_ngrams(c.current))
#     return -1 if (
#         'voltage' in ngrams or
#         'cbo'     in ngrams or
#         'ceo'     in ngrams or
#         'ebo'     in ngrams or
#         'v'       in ngrams) else 0
# LFs.append(LF_voltage_row_temp)

# def LF_voltage_row_part(c):
#     ngrams = set(get_aligned_ngrams(c.current))
#     return -1 if (
#         'voltage' in ngrams or
#         'cbo'     in ngrams or
#         'ceo'     in ngrams or
#         'ebo'     in ngrams or
#         'v'       in ngrams) else 0
# LFs.append(LF_voltage_row_part)

# def LF_test_condition_row(c):
#     ngrams = set(get_row_ngrams(c.current))
#     return -1 if ('test'      in ngrams and
#                   'condition' in ngrams) else 0
# LFs.append(LF_test_condition_row)


### Apply LFs

In [None]:
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
L_train

### Assess LF accuracy

In [None]:
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()
%time L_train.lf_stats(train_gold)

In [None]:
# If necessary
import os
os.system('cp snorkel.db snorkel.db\ features');

## Learn and Evaluate

In [1]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ features snorkel.db');

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Current = candidate_subclass('Part_Current', ['part','current'])

from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()
dev = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates').one()

In [2]:
from snorkel.annotations import FeatureManager, LabelManager
feature_manager = FeatureManager()
%time F_train = feature_manager.load(session, train, 'Train Features')
%time F_dev = feature_manager.load(session, dev, 'Train Features')

label_manager = LabelManager()
%time L_train = label_manager.load(session, train, 'LF Labels')

CPU times: user 2min 20s, sys: 3.31 s, total: 2min 23s
Wall time: 2min 23s
CPU times: user 1min 21s, sys: 2.09 s, total: 1min 23s
Wall time: 1min 23s
CPU times: user 1.44 s, sys: 36 ms, total: 1.48 s
Wall time: 1.47 s


In [3]:
from snorkel.learning import NaiveBayes

gen_model = NaiveBayes()
gen_model.train(L_train, n_iter=15000, rate=1e-2)
%time gen_model.save(session, 'Generative Params')
train_marginals = gen_model.marginals(L_train)

Training marginals (!= 0.5):	68987
Features:			15
Begin training for rate=0.01, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.377300
	Learning epoch = 250	Gradient mag. = 0.797984
	Learning epoch = 500	Gradient mag. = 1.727767
	Learning epoch = 750	Gradient mag. = 2.146065
	Learning epoch = 1000	Gradient mag. = 1.515142
	Learning epoch = 1250	Gradient mag. = 1.041579
	Learning epoch = 1500	Gradient mag. = 0.744828
	Learning epoch = 1750	Gradient mag. = 0.553926
	Learning epoch = 2000	Gradient mag. = 0.428847
	Learning epoch = 2250	Gradient mag. = 0.345825
	Learning epoch = 2500	Gradient mag. = 0.289947
	Learning epoch = 2750	Gradient mag. = 0.251502
	Learning epoch = 3000	Gradient mag. = 0.224127
	Learning epoch = 3250	Gradient mag. = 0.203728
	Learning epoch = 3500	Gradient mag. = 0.187743
	Learning epoch = 3750	Gradient mag. = 0.174604
	Learning epoch = 4000	Gradient mag. = 0.163366
	Learning epoch = 4250	Gradient mag. = 0.153462
	Learning epoch = 4500	Gradient mag. = 0.144546
	Lear

In [4]:
from snorkel.learning import LogReg

disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=2000, rate=1e-4)
%time disc_model.save(session, "Discriminative Params")

Training marginals (!= 0.5):	68987
Features:			22013
Using gradient descent...
	Learning epoch = 0	Step size = 0.0001
	Loss = 47818.144545	Gradient magnitude = 268851.843848
	Learning epoch = 100	Step size = 9.04792147114e-05
	Loss = 0.091928	Gradient magnitude = 0.028110
	Learning epoch = 200	Step size = 8.18648829479e-05
	Loss = 0.091927	Gradient magnitude = 0.028110
	Learning epoch = 300	Step size = 7.40707032156e-05
	Loss = 0.091926	Gradient magnitude = 0.028110
	Learning epoch = 400	Step size = 6.70185906007e-05
	Loss = 0.091925	Gradient magnitude = 0.028110
	Learning epoch = 500	Step size = 6.06378944861e-05
	Loss = 0.091924	Gradient magnitude = 0.028110
	Learning epoch = 600	Step size = 5.48646907485e-05
	Loss = 0.091923	Gradient magnitude = 0.028110
	Learning epoch = 700	Step size = 4.96411413431e-05
	Loss = 0.091922	Gradient magnitude = 0.028110
	Learning epoch = 800	Step size = 4.4914914861e-05
	Loss = 0.091921	Gradient magnitude = 0.028110
	Learning epoch = 900	Step size = 4

In [5]:
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()

dev_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates -- Gold').one()

from snorkel.models import CandidateSet
from snorkel.annotations import LabelManager
label_manager = LabelManager()
L_dev = label_manager.load(session, dev, 'Hardware Development Labels -- Gold')

tp, fp, tn, fn = disc_model.score(F_dev, L_dev, dev_gold)

Calibration plot:
Recall-corrected Noise-aware Model
Pos. class accuracy: 1.0
Neg. class accuracy: 0.0
Corpus Precision 0.388
Corpus Recall    1.0
Corpus F1        0.559
----------------------------------------
TP: 8268 | FP: 13034 | TN: 0 | FN: 0

Recall-corrected Noise-aware Model
Pos. class accuracy: 1.0
Neg. class accuracy: 0.0
Corpus Precision 0.388
Corpus Recall    1.0
Corpus F1        0.559
----------------------------------------
TP: 8268 | FP: 13034 | TN: 0 | FN: 0





In [6]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Development').one()
(TP, FP, FN) = entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'c_current_max')


Scoring on Entity-Level Gold Data
Corpus Precision 0.175
Corpus Recall    0.93
Corpus F1        0.295
----------------------------------------
TP: 186 | FP: 874 | FN: 14



In [7]:
from pprint import pprint
FN_list = sorted(list(FN))
FP_list = sorted(list(FP))
TP_list = sorted(list(TP))
pprint(FN_list)

[('MCCCS08818-1', 'DTC114YE', '100'),
 ('MCCCS08984-1', 'MMBT3904', '200'),
 ('MCCCS09741-1', 'BC337', '800'),
 ('MCCCS09741-1', 'BC337-16', '800'),
 ('MCCCS09741-1', 'BC337-25', '800'),
 ('MCCCS09741-1', 'BC337-40', '800'),
 ('MCCCS09741-1', 'BC338', '800'),
 ('MCCCS09741-1', 'BC338-16', '800'),
 ('MCCCS09741-1', 'BC338-25', '800'),
 ('MCCCS09741-1', 'BC338-40', '800'),
 ('MMMCS17742-1', 'BC807-16W', '500'),
 ('MMMCS17742-1', 'BC807-25W', '500'),
 ('MMMCS17742-1', 'BC807-40W', '500'),
 ('RECTS01214-1', 'DTC114EUA', '100')]


In [8]:
from hardware_utils import current_entity_to_candidates

entity = FN_list[2]
print entity
print

matches = current_entity_to_candidates(entity, fn)
print "# Matches: %d" % len(matches)
candidate = matches[0]
print candidate
print

print disc_model.get_candidate_score(candidate, F_dev)
print
pprint(disc_model.get_candidate_feature_weights(candidate, F_dev))

('MCCCS09741-1', 'BC337', '800')

# Matches: 0


IndexError: list index out of range

In [None]:

import os
os.system('cp snorkel.db snorkel.db\ final');