# polarity

## Setup

In [None]:
# If necessary: start from a clean slate by deleting any existing database.
import os
# Guarded so the cell is safe on a fresh checkout and safe to re-run:
# os.remove raises when the file is absent.
if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')

In [None]:
# Reload edited modules automatically and render plots inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
# Make the tutorial's helper modules importable; raises KeyError if the
# SNORKELHOME environment variable is not set.
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

## Parsing

In [None]:
from snorkel import SnorkelSession
# Session object passed to the parsing and corpus-splitting calls below.
session = SnorkelSession()

### Define Parser

In [None]:
import os
from snorkel.parser import CorpusParser, HTMLParser, OmniParser
from snorkel.utils import get_ORM_instance
from snorkel.queries import split_corpus

# Input HTML documents live under $SNORKELHOME (KeyError if it is unset).
docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html/'
doc_parser = HTMLParser(path=docs_path)
context_parser = OmniParser()
# max_docs=100 presumably caps how many documents are parsed -- confirm in CorpusParser.
cp = CorpusParser(doc_parser, context_parser, max_docs=100) 

### Run Parser

In [None]:
# Parse the documents into a corpus named 'Hardware' (timed), then persist it.
%time corpus = cp.parse_corpus(name='Hardware', session=session)

session.add(corpus)
session.commit()

### Split Corpus

In [None]:
from snorkel.models import Corpus

# Look the corpus up again by name and split it 80% / 20% into training and
# development portions; test=0 requests no test portion. A fixed seed is
# passed (presumably to make the split repeatable -- confirm in split_corpus).
corpus = get_ORM_instance(Corpus, session, 'Hardware')
split_corpus(session, corpus, train=0.8, development=0.2, test=0, seed=3)

In [None]:
# If necessary: snapshot the database as 'snorkel.db corpus' so the
# Extraction section can be restarted from this point.
import os
import shutil
# shutil.copy raises if the copy fails, whereas the exit status of a shell
# `cp` run through os.system was silently discarded.
shutil.copy('snorkel.db', 'snorkel.db corpus');

## Extraction

In [1]:
# If necessary: restore the database from the 'snorkel.db corpus' snapshot
# and open a fresh session on it.
import os
import shutil

snapshot_db = 'snorkel.db corpus'
# Verify the snapshot exists BEFORE deleting the live database, so a missing
# backup never costs us the current data.
if not os.path.exists(snapshot_db):
    raise IOError("Snapshot %r not found; snorkel.db left untouched" % snapshot_db)
# The live database may legitimately be absent (fresh kernel / fresh checkout).
if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')
shutil.copy(snapshot_db, 'snorkel.db');

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

In [2]:
from snorkel.models import candidate_subclass

# Candidate type for this notebook: a pair with fields 'part' and 'polarity'.
# The same definition is repeated in later "restore" cells so each section
# can be run from a fresh kernel.
Part_Polarity = candidate_subclass('Part_Polarity', ['part','polarity'])

### Define Matchers

In [3]:
from snorkel.matchers import RegexMatchSpan, Union

# Part-number patterns, one matcher per naming scheme; a span matching any of
# them is accepted by parts_matcher. The patterns are raw strings so that
# regex escapes such as \s, \d and \/ reach the regex engine verbatim instead
# of relying on Python leaving unrecognised string escapes untouched.
eeca_matcher = RegexMatchSpan(rgx=r'([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z\/]{0,5}[0-9]?[A-Z]?([-][A-Z0-9]{1,7})?([-][A-Z0-9]{1,2})?)')
jedec_matcher = RegexMatchSpan(rgx=r'([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)')
jis_matcher = RegexMatchSpan(rgx=r'(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})')
others_matcher = RegexMatchSpan(rgx=r'((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)')
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

# Polarity mentions: "pnp" or "npn" in any letter case.
polarity_matcher = RegexMatchSpan(rgx=r'(pnp|npn)', ignore_case=True)

### Define ContextSpaces

In [4]:
from snorkel.candidates import OmniNgrams
from hardware_utils import OmniNgramsPart, get_gold_dict
from collections import defaultdict
import os

# Make parts list
# Groups the gold entries by their first element (presumably the document
# name, given doc_on=True/part_on=True -- confirm in get_gold_dict).
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
gold_parts = get_gold_dict(gold_file, doc_on=True, part_on=True, val_on=False)
parts_by_doc = defaultdict(set)
for part in gold_parts:
    parts_by_doc[part[0]].add(part[1])

# parts_by_doc is built above but deliberately not passed in here
# (parts_by_doc=None), so it currently has no effect on extraction.
part_ngrams = OmniNgramsPart(n_max=3, parts_by_doc=None) # NOTE: linking is turned off
polarity_ngrams = OmniNgrams(n_max=1)

### Run CandidateExtractor

In [5]:
from snorkel.models import Corpus
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance
# Pair the part/polarity context spaces with their matchers; no throttler is used.
ce = CandidateExtractor(Part_Polarity, 
                        [part_ngrams, polarity_ngrams], 
                        [parts_matcher, polarity_matcher], 
                        throttler=None)

# Extract one candidate set per corpus split; each set is named
# '<corpus name> Candidates'. `candidates` is rebound on every iteration, and
# the session is committed once after both sets have been added.
for corpus_name in ['Hardware Training', 'Hardware Development']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = ce.extract(\
        corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()

Extracting Candidates from Corpus (Hardware Training)
CPU times: user 37.4 s, sys: 407 ms, total: 37.8 s
Wall time: 38 s
Candidate Set (Hardware Training Candidates) contains 15680 Candidates
Extracting Candidates from Corpus (Hardware Development)
CPU times: user 7.64 s, sys: 46.1 ms, total: 7.69 s
Wall time: 7.72 s
Candidate Set (Hardware Development Candidates) contains 1757 Candidates


### Assess Recall

In [6]:
from snorkel.models import Candidate
from hardware_utils import entity_level_total_recall

# Score every candidate in the database (training and development alike)
# against the gold file for the 'polarity' attribute.
# NOTE: `os` is not imported in this cell; it relies on the import made in an
# earlier cell of this section.
all_candidates = session.query(Candidate).all()
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
(tp, fp, fn) = entity_level_total_recall(
    all_candidates, gold_file, 'polarity', relation=True)

Preparing candidates...
Scoring on Entity-Level Total Recall
Entity-level Candidates extracted: 1483 
Entity-level Gold: 831
Intersection Candidates: 699
----------------------------------------
Overlap with Gold:  0.8412



In [7]:
from pprint import pprint
# Inspect the `fn` result of the recall computation in the previous cell
# (presumably the gold entries that no candidate covers), then its size.
pprint(fn)
print len(fn)

set([('BC337', 'BC337-16', 'NPN'),
     ('BC337', 'BC337-25', 'NPN'),
     ('BC337', 'BC337-40', 'NPN'),
     ('BC337', 'BC338-25', 'NPN'),
     ('BC337-D', 'BC337-025G', 'NPN'),
     ('BC337-D', 'BC337-040G', 'NPN'),
     ('BC337-D', 'BC337-25RL1G', 'NPN'),
     ('BC337-D', 'BC337-25RLRAG', 'NPN'),
     ('BC337-D', 'BC337-25ZL1G', 'NPN'),
     ('BC337-D', 'BC337-40RL1G', 'NPN'),
     ('BC337-D', 'BC337-40ZL1G', 'NPN'),
     ('BC547', 'BC546BCTA', 'NPN'),
     ('BC547', 'BC548BBU', 'NPN'),
     ('BC550', 'BC546A', 'NPN'),
     ('BC550', 'BC546B', 'NPN'),
     ('BC550', 'BC546C', 'NPN'),
     ('BC550', 'BC547A', 'NPN'),
     ('BC550', 'BC547B', 'NPN'),
     ('BC550', 'BC547C', 'NPN'),
     ('BC550', 'BC548A', 'NPN'),
     ('BC550', 'BC548B', 'NPN'),
     ('BC550', 'BC548C', 'NPN'),
     ('BC550', 'BC549A', 'NPN'),
     ('BC550', 'BC549B', 'NPN'),
     ('BC550', 'BC549C', 'NPN'),
     ('BC550', 'BC550A', 'NPN'),
     ('BC550', 'BC550B', 'NPN'),
     ('BC550', 'BC550C', 'NPN'),
     ('BC8

In [8]:
# If necessary: snapshot the database as 'snorkel.db candidates' so the
# Gold Labels section can be restarted from this point.
import os
import shutil
# shutil.copy raises if the copy fails, whereas the exit status of a shell
# `cp` run through os.system was silently discarded.
shutil.copy('snorkel.db', 'snorkel.db candidates');

## Gold Labels

In [None]:
# If necessary: restore the database from the 'snorkel.db candidates'
# snapshot, open a fresh session, and re-declare the candidate type.
import os
import shutil

snapshot_db = 'snorkel.db candidates'
# Verify the snapshot exists BEFORE deleting the live database, so a missing
# backup never costs us the current data.
if not os.path.exists(snapshot_db):
    raise IOError("Snapshot %r not found; snorkel.db left untouched" % snapshot_db)
# The live database may legitimately be absent (fresh kernel / fresh checkout).
if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')
shutil.copy(snapshot_db, 'snorkel.db');

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Polarity = candidate_subclass('Part_Polarity', ['part','polarity'])

In [9]:
import os
from snorkel.models import CandidateSet
from hardware_utils import load_hardware_labels

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# For each split: fetch its candidate set by name, load gold 'polarity'
# labels for it (timed), then report how many candidates ended up in the
# '<candidate set name> -- Gold' set.
for set_name in ['Training', 'Development']:
    candidate_set_name = 'Hardware %s Candidates' % set_name
    candidates = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name).one()
    label_set_name = 'Hardware %s Candidates -- Gold' % set_name
    annotation_key_name = 'Hardware %s Labels -- Gold' % set_name
    %time gold_candidates, annotation_key = load_hardware_labels(session,\
                           label_set_name, \
                           annotation_key_name, \
                           candidates, \
                           gold_file, \
                           'polarity')
    # Same name as label_set_name above; re-queried from the database
    # rather than reusing the returned gold_candidates.
    candidates_gold = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name + ' -- Gold').one()
    print "%d/%d Candidates in %s have positive Labels" % (
        len(candidates_gold), len(candidates), candidates)

Loading 15680 candidate labels
CPU times: user 39.1 s, sys: 161 ms, total: 39.3 s
Wall time: 39.4 s
12222/15680 Candidates in Candidate Set (Hardware Training Candidates) have positive Labels
Loading 1757 candidate labels
CPU times: user 8.39 s, sys: 58 ms, total: 8.45 s
Wall time: 8.51 s
1311/1757 Candidates in Candidate Set (Hardware Development Candidates) have positive Labels


In [10]:
# If necessary: snapshot the database as 'snorkel.db labels' so the
# Features section can be restarted from this point.
import os
import shutil
# shutil.copy raises if the copy fails, whereas the exit status of a shell
# `cp` run through os.system was silently discarded.
shutil.copy('snorkel.db', 'snorkel.db labels');

### Assess Baseline Performance

In [14]:
# from snorkel.models import Corpus, Candidate
# from snorkel.utils import get_ORM_instance
# from snorkel.lf_helpers import *
# from hardware_utils import entity_level_f1
# from itertools import chain

# def LF_all_true(c):
#     return True

# def LF_complement_nearby(c):
#     return 1 if set(['complement','complementary']).isdisjoint(
#         chain.from_iterable(
#             [get_phrase_ngrams(c.part), 
#              get_phrase_ngrams(c.polarity),
#              get_neighbor_phrase_ngrams(c.polarity)])) else 0

# candidates = session.query(Candidate).all()
# (tp, fp, tn, fn) = baseline(candidates, LF_complement_nearby)

# corpus = get_ORM_instance(Corpus, session, 'Hardware')
# gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# (TP, FP, FN) = entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'polarity')

In [15]:
# from pprint import pprint
# pprint(FP)

In [None]:
# pprint(FN)

In [None]:
# from hardware_utils import entity_to_candidates

# print len(candidates)
# matches = entity_to_candidates((u'SIEMS01215-1', u'BC856', u'NPN'), candidates)
# c = matches[0]
# print c

In [None]:
# from snorkel.models import ImplicitSpan
# from snorkel.lf_helpers import *
# phrases = []
# print list(get_neighbor_phrase_ngrams(c.polarity))
# print c.polarity.parent
# print phrases
# print "SPAN_TYPE_[%s]" % ('IMPLICIT' if isinstance(c.polarity, ImplicitSpan) else 'EXPLICIT')
# # list(get_phrase_ngrams(c.polarity))

## Features

In [None]:
# If necessary: restore the database from the 'snorkel.db labels' snapshot,
# open a fresh session, and re-declare the candidate type.
import os
import shutil

snapshot_db = 'snorkel.db labels'
# Verify the snapshot exists BEFORE deleting the live database, so a missing
# backup never costs us the current data.
if not os.path.exists(snapshot_db):
    raise IOError("Snapshot %r not found; snorkel.db left untouched" % snapshot_db)
# The live database may legitimately be absent (fresh kernel / fresh checkout).
if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')
shutil.copy(snapshot_db, 'snorkel.db');

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Polarity = candidate_subclass('Part_Polarity', ['part','polarity'])

### Extract Features

In [16]:
from snorkel.models import CandidateSet
from snorkel.fast_annotations import FeatureManager
from snorkel.utils import get_ORM_instance

train = get_ORM_instance(CandidateSet, session, 'Hardware Training Candidates')
dev   = get_ORM_instance(CandidateSet, session, 'Hardware Development Candidates')

feature_manager = FeatureManager()
# Features are created for the training set under the key set 'Train Features'.
# The development set is then featurized against that same key set with
# expand_key_set=False (presumably so no new feature keys are added from dev
# data -- confirm in FeatureManager.update).
%time F_train = feature_manager.create(session, train, 'Train Features')
%time F_dev = feature_manager.update(session, dev, 'Train Features', expand_key_set=False)

Bulk upserting 1629750 annotations...
Done.
Loading sparse Feature matrix...
CPU times: user 4min 47s, sys: 1min 15s, total: 6min 2s
Wall time: 6min 7s
Bulk upserting 148318 annotations...
Done.
Loading sparse Feature matrix...
CPU times: user 1min, sys: 20.3 s, total: 1min 20s
Wall time: 1min 22s


In [17]:
# If necessary: snapshot the database as 'snorkel.db featurized' so the
# LFs section can be restarted from this point.
import os
import shutil
# shutil.copy raises if the copy fails, whereas the exit status of a shell
# `cp` run through os.system was silently discarded.
shutil.copy('snorkel.db', 'snorkel.db featurized');

## LFs

In [None]:
# If necessary
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ featurized snorkel.db');

# from snorkel import SnorkelSession
# session = SnorkelSession()

# import sys
# sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

# from snorkel.models import candidate_subclass
# Part_Polarity = candidate_subclass('Part_Polarity', ['part','polarity'])

# from snorkel.models import CandidateSet
# train = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Training Candidates').one()

### Define LFs

In [18]:
from snorkel.fast_annotations import LabelManager
# The wildcard import supplies the helper names used by the labeling
# functions below (overlap, get_phrase_ngrams, get_neighbor_phrase_ngrams,
# get_col_ngrams).
from snorkel.lf_helpers import *
import re
label_manager = LabelManager()

# Labeling functions are appended to this list as they are defined.
LFs = []

def LF_complement_phrase_part(c):
    """Return -1 when 'complement'/'complementary' overlaps the ngrams of the
    part's phrase (per `overlap`), otherwise 0."""
    part_phrase_ngrams = get_phrase_ngrams(c.part)
    if overlap(['complement','complementary'], part_phrase_ngrams):
        return -1
    return 0
LFs.append(LF_complement_phrase_part)

def LF_complement_phrase_polarity(c):
    """Return -1 when 'complement'/'complementary' overlaps the ngrams of the
    polarity's phrase (per `overlap`), otherwise 0."""
    polarity_phrase_ngrams = get_phrase_ngrams(c.polarity)
    if overlap(['complement','complementary'], polarity_phrase_ngrams):
        return -1
    return 0
LFs.append(LF_complement_phrase_polarity)

def LF_complement_neighbor_polarity(c):
    """Return -1 when 'complement'/'complementary' overlaps the ngrams from
    get_neighbor_phrase_ngrams(c.polarity) (per `overlap`), otherwise 0."""
    neighbor_ngrams = get_neighbor_phrase_ngrams(c.polarity)
    if overlap(['complement','complementary'], neighbor_ngrams):
        return -1
    return 0
LFs.append(LF_complement_neighbor_polarity)

def LF_top_mark_col_part(c):
    """Return -1 when 'top' or 'mark' overlaps the ngrams from
    get_col_ngrams(c.part) (per `overlap`), otherwise 0."""
    column_ngrams = get_col_ngrams(c.part)
    if overlap(['top','mark'], column_ngrams):
        return -1
    return 0
LFs.append(LF_top_mark_col_part)

def LF_endswith_D_part(c):
    """Return -1 when the part span's text ends with an uppercase 'D',
    otherwise 0."""
    part_text = c.part.get_span()
    if part_text.endswith('D'):
        return -1
    return 0
LFs.append(LF_endswith_D_part)

def LF_default_positive(c):
    """Return 1 when 'complement'/'complementary' overlaps none of the ngrams
    from the part's phrase, the polarity's phrase, or the polarity's
    neighboring phrases (per `overlap`); otherwise 0."""
    # Imported locally so this LF does not depend on `chain` happening to leak
    # into the namespace through `from snorkel.lf_helpers import *`.
    from itertools import chain
    nearby_ngrams = chain.from_iterable([
        get_phrase_ngrams(c.part),
        get_phrase_ngrams(c.polarity),
        get_neighbor_phrase_ngrams(c.polarity)])
    return 1 if not overlap(['complement','complementary'], nearby_ngrams) else 0
LFs.append(LF_default_positive)

In [19]:
from snorkel.models import Candidate
from hardware_utils import entity_to_candidates

# Smoke test: run every labeling function on the first candidate returned by
# the query and print each result.
candidates = session.query(Candidate).all()
print len(candidates)
# matches = entity_to_candidates((u'SIEMS01215-1', u'BC856', u'NPN'), candidates)
# c = matches[0]
c = candidates[0]
print candidates[0]

print LF_complement_phrase_part(c)
print LF_complement_phrase_polarity(c)
print LF_complement_neighbor_polarity(c)
print LF_top_mark_col_part(c)
print LF_endswith_D_part(c)
print LF_default_positive(c)

17437
Part_Polarity(Span("BC846W-REV", parent=8214, chars=[17,26], words=[3,3]), Span("NPN", parent=73798, chars=[0,2], words=[0,0]))
0
0
0
0
0
1


### Apply LFs

In [20]:
# Apply all labeling functions in LFs to the training set (timed), storing
# the result under the key 'LF Labels'; the bare name displays the matrix.
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
L_train

Bulk upserting 94080 annotations...
Done.
Loading sparse Label matrix...
CPU times: user 2min 25s, sys: 39.4 s, total: 3min 4s
Wall time: 3min 5s


<15680x6 sparse matrix of type '<type 'numpy.float64'>'
	with 15801 stored elements in Compressed Sparse Row format>

### Assess LF accuracy

In [21]:
# Fetch the gold-labelled training set by name and report per-LF statistics
# against it (timed).
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()
%time L_train.lf_stats(train_gold)

CPU times: user 379 ms, sys: 17.8 ms, total: 397 ms
Wall time: 422 ms


Unnamed: 0,accuracy,conflicts,coverage,j,overlaps
LF_complement_phrase_part,0.993289,0.0,0.028508,0,0.002742
LF_complement_phrase_polarity,1.0,0.0,0.055421,1,0.002551
LF_complement_neighbor_polarity,0.921986,0.0,0.008992,2,0.000191
LF_top_mark_col_part,0.970588,0.004337,0.004337,3,0.004337
LF_endswith_D_part,1.0,0.000638,0.000638,4,0.000638
LF_default_positive,0.855741,0.004974,0.909821,5,0.004974


In [22]:
# If necessary: snapshot the database as 'snorkel.db features' so the
# Learn and Evaluate section can be restarted from this point.
import os
import shutil
# shutil.copy raises if the copy fails, whereas the exit status of a shell
# `cp` run through os.system was silently discarded.
shutil.copy('snorkel.db', 'snorkel.db features');

## Learn and Evaluate

In [None]:
# If necessary:
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ features snorkel.db');

# from snorkel import SnorkelSession
# session = SnorkelSession()

# import sys
# sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

# from snorkel.models import candidate_subclass
# Part_Polarity = candidate_subclass('Part_Polarity', ['part','polarity'])

# from snorkel.models import CandidateSet
# train = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Training Candidates').one()
# dev = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Development Candidates').one()

# from snorkel.annotations import FeatureManager, LabelManager
# feature_manager = FeatureManager()
# %time F_train = feature_manager.load(session, train, 'Train Features')
# %time F_dev = feature_manager.load(session, dev, 'Train Features')

# label_manager = LabelManager()
# %time L_train = label_manager.load(session, train, 'LF Labels')

In [23]:
from snorkel.learning import NaiveBayes

# Fit the generative model on the LF label matrix, save its parameters
# (timed) under 'Generative Params', and compute training marginals that the
# discriminative model is trained on in the next cell.
gen_model = NaiveBayes()
gen_model.train(L_train, n_iter=15000, rate=1e-2)
%time gen_model.save(session, 'Generative Params')
train_marginals = gen_model.marginals(L_train)

Training marginals (!= 0.5):	15680
Features:			6
Begin training for rate=0.01, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.007403
	Learning epoch = 250	Gradient mag. = 0.007699
	Learning epoch = 500	Gradient mag. = 0.007608
	Learning epoch = 750	Gradient mag. = 0.007519
	Learning epoch = 1000	Gradient mag. = 0.007432
	Learning epoch = 1250	Gradient mag. = 0.007348
	Learning epoch = 1500	Gradient mag. = 0.007265
	Learning epoch = 1750	Gradient mag. = 0.007184
	Learning epoch = 2000	Gradient mag. = 0.007104
	Learning epoch = 2250	Gradient mag. = 0.007027
	Learning epoch = 2500	Gradient mag. = 0.006951
	Learning epoch = 2750	Gradient mag. = 0.006878
	Learning epoch = 3000	Gradient mag. = 0.006805
	Learning epoch = 3250	Gradient mag. = 0.006735
	Learning epoch = 3500	Gradient mag. = 0.006666
	Learning epoch = 3750	Gradient mag. = 0.006598
	Learning epoch = 4000	Gradient mag. = 0.006532
	Learning epoch = 4250	Gradient mag. = 0.006468
	Learning epoch = 4500	Gradient mag. = 0.006405
	Learn

In [24]:
from snorkel.learning import LogReg

# Fit the discriminative model on the training feature matrix, using the
# generative model's marginals as targets, then save its parameters (timed).
disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=2000, rate=1e-5)
%time disc_model.save(session, "Discriminative Params")

Training marginals (!= 0.5):	15680
Features:			18400
Using gradient descent...
	Learning epoch = 0	Step size = 1e-05
	Loss = 10868.547791	Gradient magnitude = 4199.468201
	Learning epoch = 100	Step size = 9.04792147114e-06
	Loss = 10477.036441	Gradient magnitude = 255.533295
	Learning epoch = 200	Step size = 8.18648829479e-06
	Loss = 10445.299212	Gradient magnitude = 148.660859
	Learning epoch = 300	Step size = 7.40707032156e-06
	Loss = 10433.086879	Gradient magnitude = 110.415884
	Learning epoch = 400	Step size = 6.70185906007e-06
	Loss = 10426.461924	Gradient magnitude = 90.703526
	Learning epoch = 500	Step size = 6.06378944861e-06
	Loss = 10422.276785	Gradient magnitude = 78.691106
	Learning epoch = 600	Step size = 5.48646907485e-06
	Loss = 10419.394211	Gradient magnitude = 70.656404
	Learning epoch = 700	Step size = 4.96411413431e-06
	Loss = 10417.296789	Gradient magnitude = 64.933033
	Learning epoch = 800	Step size = 4.4914914861e-06
	Loss = 10415.713135	Gradient magnitude = 60.70

In [25]:
# Gold-labelled training candidate set (the same query is also run in the
# "Assess LF accuracy" cell). Relies on CandidateSet having been imported in
# an earlier cell.
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()

In [26]:
# Gold-labelled development candidate set, used when scoring the model below.
dev_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates -- Gold').one()

In [27]:
from snorkel.models import CandidateSet
# NOTE(review): this rebinds LabelManager/label_manager to the class from
# snorkel.annotations, whereas the LF cell imported LabelManager from
# snorkel.fast_annotations -- confirm the switch is intentional.
from snorkel.annotations import LabelManager
label_manager = LabelManager()
# Load the gold labels stored for the development set under this key name.
L_dev = label_manager.load(session, dev, 'Hardware Development Labels -- Gold')

In [28]:
# Score the discriminative model on the development set. This rebinds the
# tp/fp/fn names assigned earlier in the "Assess Recall" cell.
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, dev_gold)

Calibration plot:
Recall-corrected Noise-aware Model
Pos. class accuracy: 0.906178489703
Neg. class accuracy: 0.630044843049
Corpus Precision 0.878
Corpus Recall    0.906
Corpus F1        0.892
----------------------------------------
TP: 1188 | FP: 165 | TN: 281 | FN: 123

Recall-corrected Noise-aware Model
Pos. class accuracy: 0.906178489703
Neg. class accuracy: 0.630044843049
Corpus Precision 0.878
Corpus Recall    0.906
Corpus F1        0.892
----------------------------------------
TP: 1188 | FP: 165 | TN: 281 | FN: 123





In [29]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

# Re-score the candidate-level results from the previous cell (tp, fp, tn, fn)
# at the entity level against the gold file, for the development corpus.
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Development').one()
(TP, FP, FN) = entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'polarity')

Scoring on Entity-Level Gold Data
Corpus Precision 0.702
Corpus Recall    0.827
Corpus F1        0.759
----------------------------------------
TP: 134 | FP: 57 | FN: 28



In [31]:
from pprint import pprint
# Put the entity-level false positives in a stable sorted order and show the
# first ten.
FP_list = sorted(FP)
pprint(FP_list[0:10])

[(u'2N3906-D', u'1N916', u'PNP'),
 (u'2N3906-D', u'2N390D', u'PNP'),
 (u'2N4124', u'1N916', u'NPN'),
 (u'BC546-BC548C(TO-92)', u'BC546 THRU', u'NPN'),
 (u'BC546-BC548C(TO-92)', u'BC546-BC548C', u'NPN'),
 (u'BC546-D', u'BC548A', u'NPN'),
 (u'CSEMS05383-1', u'BC817 AND', u'NPN'),
 (u'DIODS13249-1', u'BC808', u'NPN'),
 (u'DIODS13249-1', u'BC81840', u'NPN'),
 (u'INFINEON-BC857SERIES_BC858SERIES_BC859SERIES_BC860SERIES-DS-V01_01-EN',
  u'BC850',
  u'PNP')]


In [34]:
from hardware_utils import entity_to_candidates

# For the first N false-positive entities: find the candidate-level false
# positives behind the entity and print the first one with its model score
# and feature weights.
# NOTE: `candidate` stays bound after the loop and is used by the next cell.
# NOTE(review): matches[0] would fail if entity_to_candidates returned an
# empty result -- presumably every FP entity has at least one match; confirm.
N = 1
for i in range(N):
    entity = FP_list[i]
    matches = entity_to_candidates(entity, fp)
    print "Entity: (%d matches)" % len(matches)
    print entity

    candidate = matches[0]
    print "\nCandidate:"
    print candidate

    print "\nScore:"
    print disc_model.get_candidate_score(candidate, F_dev)

    print "\nFeatures:"
    pprint(disc_model.get_candidate_feature_weights(candidate, F_dev))

Entity: (2 matches)
(u'2N3906-D', u'1N916', u'PNP')

Candidate:
Part_Polarity(Span("1N916", parent=525, chars=[0,4], words=[0,0]), Span("PNP", parent=461, chars=[38,40], words=[6,6]))

Score:
0.190055360188

Features:
[(AnnotationKey (TAB_e2_RIGHT_WORDS_[silicon]), 0.08343734205289861),
 (AnnotationKey (TAB_e2_HTML_TAG_html), 0.08047433345965031),
 (AnnotationKey (TAB_e2_CONTAINS_WORDS_[PNP]), -0.04038806465249167),
 (AnnotationKey (TAB_e2_LEFT_WORDS_[,]), -0.03235906763952302),
 (AnnotationKey (TDL_LEMMA:PARENTS-OF-BETWEEN-MENTION-and-MENTION[None]),
  -0.030354728294559805),
 (AnnotationKey (TDL_BETWEEN-MENTION-and-MENTION[LEN:0-2]),
  -0.028146930059021587),
 (AnnotationKey (TAB_e2_LEFT_WORDS_[general purpose]), 0.02406298007207436),
 (AnnotationKey (TAB_e2_LEFT_WORDS_[general]), 0.02406298007207436),
 (AnnotationKey (TAB_e2_LEFT_WORDS_[purpose]), 0.02406298007207436),
 (AnnotationKey (TAB_e2_LEFT_WORDS_[transistors]), 0.017114837857083884),
 (AnnotationKey (TAB_e2_LEFT_WORDS_[2n390

In [35]:
from hardware_utils import part_error_analysis
# Uses the `candidate` left bound by the loop in the previous cell.
part_error_analysis(candidate)

Doc: Document 2N3906-D
------------
Part:
Span("1N916", parent=525, chars=[0,4], words=[0,0])
------------
Table: None
Phrase: Phrase(Doc: 2N3906-D, Table: X, Row: (None, None), Col: (None, None), Position: 0, Text: 1N916                             C S   < 4 pF*)
------------
Attr:
Span("PNP", parent=461, chars=[38,40], words=[6,6])
------------
Table: None
Phrase: Phrase(Doc: 2N3906-D, Table: X, Row: (None, None), Col: (None, None), Position: 0, Text: 2N3906 - General Purpose Transistors, PNP Silicon)
------------


In [None]:
# Snapshot the finished database as 'snorkel.db final'.
import os
import shutil
# shutil.copy raises if the copy fails, whereas the exit status of a shell
# `cp` run through os.system was silently discarded.
shutil.copy('snorkel.db', 'snorkel.db final');