# polarity

## Setup

In [None]:
# If necessary: start from a clean slate by deleting any existing database.
import os

# Guard the delete so the cell can be re-run (and run on a fresh checkout):
# os.remove raises OSError when the file does not exist.
if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')

In [None]:
# Enable the autoreload extension (mode 2) and inline matplotlib rendering.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
# Add the tutorial directory to the import path. SNORKELHOME must be set in
# the environment; os.environ[...] raises KeyError otherwise.
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

## Parsing

In [None]:
from snorkel import SnorkelSession
# Session object handed to the parsing, splitting and query calls that follow.
session = SnorkelSession()

### Define Parser

In [None]:
import os
from snorkel.parser import CorpusParser, HTMLParser, OmniParser
from snorkel.utils import get_ORM_instance
from snorkel.queries import split_corpus

# Location of the HTML documents, relative to SNORKELHOME.
docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html/'
# HTMLParser reads documents from docs_path; OmniParser is the context parser.
doc_parser = HTMLParser(path=docs_path)
context_parser = OmniParser()
# Combine both parsers; max_docs=100 caps the number of documents.
cp = CorpusParser(doc_parser, context_parser, max_docs=100) 

### Run Parser

In [None]:
# Parse the documents into a corpus named 'Hardware'; %time reports the cost.
%time corpus = cp.parse_corpus(name='Hardware', session=session)

# Add the parsed corpus to the session and commit it.
session.add(corpus)
session.commit()

### Split Corpus

In [None]:
from snorkel.models import Corpus

# Fetch the stored 'Hardware' corpus by name, then split it 80% training /
# 20% development with no test portion, using a fixed seed.
# NOTE(review): presumably this creates the 'Hardware Training' and
# 'Hardware Development' corpora looked up during extraction -- confirm.
corpus = get_ORM_instance(Corpus, session, 'Hardware')
split_corpus(session, corpus, train=0.8, development=0.2, test=0, seed=3)

In [None]:
# If necessary: snapshot the database after parsing/splitting so the
# Extraction section can restore it.
import os
import shutil

# shutil.copyfile raises if the copy fails, whereas os.system('cp ...')
# silently discarded a non-zero exit status. The destination name (with a
# space) is the same file the old shell command produced.
shutil.copyfile('snorkel.db', 'snorkel.db corpus')

## Extraction

In [None]:
# If necessary: restore the post-parsing snapshot over the working database.
import os
import shutil

# copyfile replaces snorkel.db in place, so the separate os.remove (which
# raised OSError when snorkel.db was absent) is no longer needed. The source
# is opened first, so a missing snapshot raises before snorkel.db is touched;
# previously the database was deleted and the failed `cp` went unnoticed.
shutil.copyfile('snorkel.db corpus', 'snorkel.db')

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

In [None]:
from snorkel.models import candidate_subclass

# Candidate type with two fields, 'part' and 'polarity'.
Part_Polarity = candidate_subclass('Part_Polarity', ['part','polarity'])

### Define Matchers

In [None]:
from snorkel.matchers import RegexMatchSpan, Union

# All patterns are raw strings so that regex escapes such as \s, \d and \/
# are never interpreted as Python string escapes (and to stay consistent with
# polarity_matcher). The pattern text itself is unchanged.
# The matcher names suggest part-numbering schemes (presumably EECA, JEDEC
# and JIS) -- confirm against the datasheets.
eeca_matcher = RegexMatchSpan(rgx=r'([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z\/]{0,5}[0-9]?[A-Z]?([-][A-Z0-9]{1,7})?([-][A-Z0-9]{1,2})?)')
jedec_matcher = RegexMatchSpan(rgx=r'([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)')
jis_matcher = RegexMatchSpan(rgx=r'(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})')
others_matcher = RegexMatchSpan(rgx=r'((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)')
# A span counts as a part if any of the four part matchers accepts it.
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

# Polarity is the literal 'pnp' or 'npn', matched case-insensitively.
polarity_matcher = RegexMatchSpan(rgx=r'(pnp|npn)', ignore_case=True)

### Define ContextSpaces

In [None]:
from snorkel.candidates import OmniNgrams
from hardware_utils import OmniNgramsPart, get_gold_dict
from collections import defaultdict
import os

# Make parts list
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
gold_parts = get_gold_dict(gold_file, doc_on=True, part_on=True, val_on=False)
# Group gold entries: element 0 is used as the key, element 1 is added to the
# set stored under that key (presumably document name -> part numbers, given
# doc_on/part_on above -- confirm against get_gold_dict).
parts_by_doc = defaultdict(set)
for part in gold_parts:
    parts_by_doc[part[0]].add(part[1])

# Part spans: n-grams up to length 3, given the per-document parts mapping.
part_ngrams = OmniNgramsPart(n_max=3, parts_by_doc=parts_by_doc)
# Polarity spans: single tokens only.
polarity_ngrams = OmniNgrams(n_max=1)

### Run CandidateExtractor

In [None]:
from snorkel.models import Corpus
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance
# Pair each context space with its matcher, in (part, polarity) order; no
# throttler is applied.
ce = CandidateExtractor(Part_Polarity, 
                        [part_ngrams, polarity_ngrams], 
                        [parts_matcher, polarity_matcher], 
                        throttler=None)

# Extract candidates per split; each result is stored under the name
# '<corpus name> Candidates'.
for corpus_name in ['Hardware Training', 'Hardware Development']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = ce.extract(\
        corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
# Single commit after both splits have been processed.
session.commit()

### Assess Recall

In [None]:
from snorkel.models import Candidate
from hardware_utils import entity_level_total_recall

# Score every stored candidate against the gold file for the 'polarity'
# attribute. `os` is not imported in this cell; it must already be in the
# kernel namespace from an earlier cell.
all_candidates = session.query(Candidate).all()
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
(tp, fp, fn) = entity_level_total_recall(
    all_candidates, gold_file, 'polarity', relation=True)

In [None]:
from pprint import pprint

# Dump the `fn` result from the recall check, followed by its size.
pprint(fn)
print(len(fn))

In [None]:
# If necessary: snapshot the database after candidate extraction so the
# Gold Labels section can restore it.
import os
import shutil

# shutil.copyfile raises if the copy fails, whereas os.system('cp ...')
# silently discarded a non-zero exit status. The destination name (with a
# space) is the same file the old shell command produced.
shutil.copyfile('snorkel.db', 'snorkel.db candidates')

## Gold Labels

In [1]:
# If necessary: restore the post-extraction snapshot over the working database.
import os
import shutil

# copyfile replaces snorkel.db in place, so the separate os.remove (which
# raised OSError when snorkel.db was absent) is no longer needed. The source
# is opened first, so a missing snapshot raises before snorkel.db is touched;
# previously the database was deleted and the failed `cp` went unnoticed.
shutil.copyfile('snorkel.db candidates', 'snorkel.db')

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

# Re-declare the candidate type after the kernel restart.
from snorkel.models import candidate_subclass
Part_Polarity = candidate_subclass('Part_Polarity', ['part','polarity'])

In [2]:
import os
from snorkel.models import CandidateSet
from hardware_utils import load_hardware_labels

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# For each split: fetch its candidate set, load gold labels for the
# 'polarity' attribute, then report the size of the resulting '-- Gold' set.
for set_name in ['Training', 'Development']:
    candidate_set_name = 'Hardware %s Candidates' % set_name
    candidates = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name).one()
    label_set_name = 'Hardware %s Candidates -- Gold' % set_name
    annotation_key_name = 'Hardware %s Labels -- Gold' % set_name
    %time gold_candidates, annotation_key = load_hardware_labels(session,\
                           label_set_name, \
                           annotation_key_name, \
                           candidates, \
                           gold_file, \
                           'polarity')
    # This name equals label_set_name; the set is re-queried from the session
    # rather than taken from the return value above.
    candidates_gold = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name + ' -- Gold').one()
    print "%d/%d Candidates in %s have positive Labels" % (
        len(candidates_gold), len(candidates), candidates)

Loading 92047 candidate labels
CPU times: user 3min 41s, sys: 1.92 s, total: 3min 43s
Wall time: 3min 48s
86466/92047 Candidates in Candidate Set (Hardware Training Candidates) have positive Labels
Loading 22020 candidate labels
CPU times: user 55.2 s, sys: 397 ms, total: 55.6 s
Wall time: 56.2 s
19634/22020 Candidates in Candidate Set (Hardware Development Candidates) have positive Labels


In [3]:
# If necessary: snapshot the database after loading gold labels so the
# Features section can restore it.
import os
import shutil

# shutil.copyfile raises if the copy fails, whereas os.system('cp ...')
# silently discarded a non-zero exit status. The destination name (with a
# space) is the same file the old shell command produced.
shutil.copyfile('snorkel.db', 'snorkel.db labels')

### Assess Baseline Performance

In [17]:
from snorkel.models import Corpus, Candidate
from snorkel.utils import get_ORM_instance
from snorkel.lf_helpers import *
from hardware_utils import baseline, entity_level_f1
from itertools import chain

def LF_all_true(c):
    """Trivial labeling function: return True for every candidate `c`.

    Not passed to `baseline` in this cell, which uses LF_complement_nearby.
    """
    return True

def LF_complement_nearby(c):
    """Return 1 when neither 'complement' nor 'complementary' occurs nearby.

    "Nearby" is the union of the part's phrase n-grams, the polarity's phrase
    n-grams and the polarity's neighbor-phrase n-grams. Returns 0 when either
    keyword is present.
    """
    keywords = set(['complement','complementary'])
    nearby_ngrams = chain.from_iterable(
        [get_phrase_ngrams(c.part),
         get_phrase_ngrams(c.polarity),
         get_neighbor_phrase_ngrams(c.polarity)])
    if keywords.isdisjoint(nearby_ngrams):
        return 1
    return 0

# Apply the single heuristic to every stored candidate.
candidates = session.query(Candidate).all()
(tp, fp, tn, fn) = baseline(candidates, LF_complement_nearby)

# Score at entity level against the gold file, over the full 'Hardware'
# corpus. `os` is not imported in this cell; it must already be in the
# kernel namespace from an earlier cell.
corpus = get_ORM_instance(Corpus, session, 'Hardware')
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
(TP, FP, FN) = entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'polarity')

Scoring on Entity-Level Gold Data
Corpus Precision 1.0
Corpus Recall    0.964
Corpus F1        0.982
----------------------------------------
TP: 801 | FP: 0 | FN: 30



In [18]:
from pprint import pprint
# Entity-level false positives returned by entity_level_f1.
pprint(FP)

set([])


In [19]:
# Entity-level false negatives returned by entity_level_f1.
pprint(FN)

set([('CSEMS03485-1', '2N3905', 'PNP'),
     ('CSEMS03485-1', '2N3906', 'PNP'),
     ('SAMSD00049-518', 'BC546', 'NPN'),
     ('SAMSD00049-518', 'BC546A', 'NPN'),
     ('SAMSD00049-518', 'BC546B', 'NPN'),
     ('SAMSD00049-518', 'BC546C', 'NPN'),
     ('SAMSD00049-518', 'BC547', 'NPN'),
     ('SAMSD00049-518', 'BC547A', 'NPN'),
     ('SAMSD00049-518', 'BC547B', 'NPN'),
     ('SAMSD00049-518', 'BC547C', 'NPN'),
     ('SAMSD00049-518', 'BC548', 'NPN'),
     ('SAMSD00049-518', 'BC548A', 'NPN'),
     ('SAMSD00049-518', 'BC548B', 'NPN'),
     ('SAMSD00049-518', 'BC548C', 'NPN'),
     ('SAMSD00049-518', 'BC549', 'NPN'),
     ('SAMSD00049-518', 'BC549A', 'NPN'),
     ('SAMSD00049-518', 'BC549B', 'NPN'),
     ('SAMSD00049-518', 'BC549C', 'NPN'),
     ('SAMSD00049-518', 'BC550', 'NPN'),
     ('SAMSD00049-518', 'BC550A', 'NPN'),
     ('SAMSD00049-518', 'BC550B', 'NPN'),
     ('SAMSD00049-518', 'BC550C', 'NPN'),
     ('UTCLS02155-1', 'BC807', 'PNP'),
     ('UTCLS02155-1', 'BC807-16', 'PNP'),
    

In [7]:
from hardware_utils import entity_to_candidates

# Report the candidate count, then pull up the first candidate that
# entity_to_candidates returns for one (doc, part, polarity) tuple.
print(len(candidates))
matches = entity_to_candidates((u'SIEMS01215-1', u'BC856', u'NPN'), candidates)
c = matches[0]
print(c)

114067
Part_Polarity(ImplicitSpan("BC856", parent=104141, words=[0,0], position=[2]), Span("NPN", parent=22719, chars=[16,18], words=[6,6]))


In [22]:
from snorkel.models import ImplicitSpan
from snorkel.lf_helpers import *
# Scratch inspection of the candidate `c` chosen in the previous cell.
# `phrases` is never filled in, so it always prints as an empty list.
phrases = []
print(list(get_neighbor_phrase_ngrams(c.polarity)))
print(c.polarity.parent)
print(phrases)
if isinstance(c.polarity, ImplicitSpan):
    span_kind = 'IMPLICIT'
else:
    span_kind = 'EXPLICIT'
print("SPAN_TYPE_[%s]" % span_kind)
# list(get_phrase_ngrams(c.polarity))

[u'complementary', u'types', u':', u'bc', u'846', u',', u'bc', u'847', u',', u'type']
Phrase(Doc: SIEMS01215-1, Table: X, Row: X, Col: X, Position: 0, Text: BC 849, BC 850 (NPN))
[]
SPAN_TYPE_[EXPLICIT]


## Features

In [1]:
# If necessary: restore the post-labeling snapshot over the working database.
import os
import shutil

# copyfile replaces snorkel.db in place, so the separate os.remove (which
# raised OSError when snorkel.db was absent) is no longer needed. The source
# is opened first, so a missing snapshot raises before snorkel.db is touched;
# previously the database was deleted and the failed `cp` went unnoticed.
shutil.copyfile('snorkel.db labels', 'snorkel.db')

from snorkel import SnorkelSession
session = SnorkelSession()

# Re-declare the candidate type after the kernel restart.
from snorkel.models import candidate_subclass
Part_Polarity = candidate_subclass('Part_Polarity', ['part','polarity'])

### Extract Features

In [2]:
from snorkel.models import CandidateSet
from snorkel.fast_annotations import FeatureManager
from snorkel.utils import get_ORM_instance

train = get_ORM_instance(CandidateSet, session, 'Hardware Training Candidates')
dev   = get_ORM_instance(CandidateSet, session, 'Hardware Development Candidates')

feature_manager = FeatureManager()
# Create the 'Train Features' key set from the training candidates, then
# featurize the development candidates against that same key set without
# expanding it (expand_key_set=False).
%time F_train = feature_manager.create(session, train, 'Train Features')
%time F_dev = feature_manager.update(session, dev, 'Train Features', expand_key_set=False)

Bulk upserting 18469929 annotations...
Done.
Loading sparse Feature matrix...
CPU times: user 38min 50s, sys: 15min 51s, total: 54min 42s
Wall time: 59min 49s
Bulk upserting 3671296 annotations...
Done.
Loading sparse Feature matrix...
CPU times: user 9min 21s, sys: 3min 24s, total: 12min 45s
Wall time: 13min 45s


In [3]:
# If necessary: snapshot the database after feature extraction so the LFs
# section can restore it.
import os
import shutil

# shutil.copyfile raises if the copy fails, whereas os.system('cp ...')
# silently discarded a non-zero exit status. The destination name (with a
# space) is the same file the old shell command produced.
shutil.copyfile('snorkel.db', 'snorkel.db featurized')

## Labeling Functions (LFs)

In [1]:
# If necessary: restore the post-featurization snapshot over the working
# database.
import os
import shutil

# copyfile replaces snorkel.db in place, so the separate os.remove (which
# raised OSError when snorkel.db was absent) is no longer needed. The source
# is opened first, so a missing snapshot raises before snorkel.db is touched;
# previously the database was deleted and the failed `cp` went unnoticed.
shutil.copyfile('snorkel.db featurized', 'snorkel.db')

from snorkel import SnorkelSession
session = SnorkelSession()

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

# Re-declare the candidate type after the kernel restart.
from snorkel.models import candidate_subclass
Part_Polarity = candidate_subclass('Part_Polarity', ['part','polarity'])

# Training candidate set that the labeling functions are applied to.
from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()

### Define LFs

In [2]:
from snorkel.fast_annotations import LabelManager
# Star import: the helpers used by the labeling functions in this cell
# (overlap, get_phrase_ngrams, ...) are expected to come from here.
from snorkel.lf_helpers import *
import re
label_manager = LabelManager()

# Registry of labeling functions; each definition in this cell appends itself.
LFs = []

def LF_complement_phrase_part(c):
    """Return -1 when the part's phrase n-grams overlap the complement keywords, else 0."""
    part_phrase_ngrams = get_phrase_ngrams(c.part)
    if overlap(['complement','complementary'], part_phrase_ngrams):
        return -1
    return 0
LFs.append(LF_complement_phrase_part)

def LF_complement_phrase_polarity(c):
    """Return -1 when the polarity's phrase n-grams overlap the complement keywords, else 0."""
    polarity_phrase_ngrams = get_phrase_ngrams(c.polarity)
    if overlap(['complement','complementary'], polarity_phrase_ngrams):
        return -1
    return 0
LFs.append(LF_complement_phrase_polarity)

def LF_complement_neighbor_polarity(c):
    """Return -1 when the polarity's neighbor-phrase n-grams overlap the complement keywords, else 0."""
    neighbor_ngrams = get_neighbor_phrase_ngrams(c.polarity)
    if overlap(['complement','complementary'], neighbor_ngrams):
        return -1
    return 0
LFs.append(LF_complement_neighbor_polarity)

def LF_top_mark_col_part(c):
    """Return -1 when the part's column n-grams overlap 'top' or 'mark', else 0."""
    column_ngrams = get_col_ngrams(c.part)
    if overlap(['top','mark'], column_ngrams):
        return -1
    return 0
LFs.append(LF_top_mark_col_part)

def LF_endswith_D_part(c):
    """Return -1 when the text of the part span ends with 'D', else 0."""
    part_text = c.part.get_span()
    if part_text.endswith('D'):
        return -1
    return 0
LFs.append(LF_endswith_D_part)

# Imported explicitly: no cell in this section imports `chain`, so the
# function only worked if the star import from snorkel.lf_helpers happened to
# re-export it.
from itertools import chain

def LF_default_positive(c):
    """Return 1 unless a complement keyword appears near the candidate.

    Checks the part's phrase n-grams, the polarity's phrase n-grams and the
    polarity's neighbor-phrase n-grams for 'complement'/'complementary'.
    Returns 0 when `overlap` reports a hit.
    """
    nearby_ngrams = chain.from_iterable([
        get_phrase_ngrams(c.part),
        get_phrase_ngrams(c.polarity),
        get_neighbor_phrase_ngrams(c.polarity)])
    return 1 if not overlap(['complement','complementary'],
                            nearby_ngrams) else 0
LFs.append(LF_default_positive)

In [6]:
from snorkel.models import Candidate
from hardware_utils import entity_to_candidates

candidates = session.query(Candidate).all()
print(len(candidates))
# To inspect a specific entity instead of the first candidate:
# matches = entity_to_candidates((u'SIEMS01215-1', u'BC856', u'NPN'), candidates)
# c = matches[0]
c = candidates[0]
print(c)

# Run each labeling function on the sample candidate, in definition order.
for lf in (LF_complement_phrase_part,
           LF_complement_phrase_polarity,
           LF_complement_neighbor_polarity,
           LF_top_mark_col_part,
           LF_endswith_D_part,
           LF_default_positive):
    print(lf(c))

114067
Part_Polarity(ImplicitSpan("BC807-25", parent=6783, words=[0,0], position=[1]), Span("NPN", parent=6804, chars=[26,28], words=[2,2]))
0
-1
0
0
0
0


### Apply LFs

In [7]:
# Apply every function in LFs to the training candidates and store the
# result under the 'LF Labels' key set.
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
# Bare expression so the notebook displays the matrix summary.
L_train

Bulk upserting 552282 annotations...
Done.
Loading sparse Label matrix...
CPU times: user 15min 32s, sys: 2min, total: 17min 32s
Wall time: 17min 50s


<92047x6 sparse matrix of type '<type 'numpy.float64'>'
	with 92282 stored elements in Compressed Sparse Row format>

### Assess LF accuracy

In [8]:
# Fetch the gold candidate set for the training split and report per-LF
# statistics against it.
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()
%time L_train.lf_stats(train_gold)

CPU times: user 1.89 s, sys: 54.6 ms, total: 1.94 s
Wall time: 1.95 s


Unnamed: 0,accuracy,conflicts,coverage,j,overlaps
LF_complement_phrase_part,0.0,0.0,4.3e-05,0,0.0
LF_complement_phrase_polarity,1.0,0.0,0.060632,1,0.0
LF_complement_neighbor_polarity,0.0,0.0,0.0005,2,0.0
LF_top_mark_col_part,0.0,0.002553,0.002553,3,0.002553
LF_endswith_D_part,,0.0,0.0,4,0.0
LF_default_positive,1.0,0.002553,0.938825,5,0.002553


In [9]:
# If necessary: snapshot the database after applying the labeling functions.
import os
import shutil

# shutil.copyfile raises if the copy fails, whereas os.system('cp ...')
# silently discarded a non-zero exit status. The snapshot is named
# 'features' even though it is taken after labeling; the name is kept
# because the Learn and Evaluate section restores from exactly this file.
shutil.copyfile('snorkel.db', 'snorkel.db features')

## Learn and Evaluate

In [1]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ features snorkel.db');

from snorkel import SnorkelSession
session = SnorkelSession()

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel.models import candidate_subclass
Part_Polarity = candidate_subclass('Part_Polarity', ['part','polarity'])

from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()
dev = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates').one()

from snorkel.annotations import FeatureManager, LabelManager
feature_manager = FeatureManager()
%time F_train = feature_manager.load(session, train, 'Train Features')
%time F_dev = feature_manager.load(session, dev, 'Train Features')

label_manager = LabelManager()
%time L_train = label_manager.load(session, train, 'LF Labels')

CPU times: user 4min 35s, sys: 1min 35s, total: 6min 11s
Wall time: 7min 44s
CPU times: user 2min 29s, sys: 55.6 s, total: 3min 24s
Wall time: 3min 46s
CPU times: user 6.99 s, sys: 395 ms, total: 7.38 s
Wall time: 7.7 s


In [2]:
from snorkel.learning import NaiveBayes

# Fit the generative model on the LF label matrix, save its parameters, then
# compute marginals over the training candidates for the next model.
gen_model = NaiveBayes()
gen_model.train(L_train, n_iter=15000, rate=1e-2)
%time gen_model.save(session, 'Generative Params')
train_marginals = gen_model.marginals(L_train)

Training marginals (!= 0.5):	92047
Features:			6
Begin training for rate=0.01, mu=1e-06
	Learning epoch = 0	Gradient mag. = 0.003731
	Learning epoch = 250	Gradient mag. = 0.003894
	Learning epoch = 500	Gradient mag. = 0.003861
	Learning epoch = 750	Gradient mag. = 0.003829
	Learning epoch = 1000	Gradient mag. = 0.003796
	Learning epoch = 1250	Gradient mag. = 0.003764
	Learning epoch = 1500	Gradient mag. = 0.003733
	Learning epoch = 1750	Gradient mag. = 0.003701
	Learning epoch = 2000	Gradient mag. = 0.003671
	Learning epoch = 2250	Gradient mag. = 0.003640
	Learning epoch = 2500	Gradient mag. = 0.003610
	Learning epoch = 2750	Gradient mag. = 0.003580
	Learning epoch = 3000	Gradient mag. = 0.003550
	Learning epoch = 3250	Gradient mag. = 0.003521
	Learning epoch = 3500	Gradient mag. = 0.003492
	Learning epoch = 3750	Gradient mag. = 0.003464
	Learning epoch = 4000	Gradient mag. = 0.003435
	Learning epoch = 4250	Gradient mag. = 0.003407
	Learning epoch = 4500	Gradient mag. = 0.003380
	Learn

In [3]:
from snorkel.learning import LogReg

# Fit logistic regression on the training features, using the generative
# model's marginals as targets, then save the parameters.
# NOTE(review): the captured log shows the loss rising from ~6.4e4 at epoch 0
# to ~1.4e6 at epoch 100 and oscillating afterwards -- the step size may be
# too large; confirm convergence.
disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=2000, rate=1e-4)
%time disc_model.save(session, "Discriminative Params")

Training marginals (!= 0.5):	92047
Features:			27309
Using gradient descent...
	Learning epoch = 0	Step size = 0.0001
	Loss = 63802.118529	Gradient magnitude = 77319.261662
	Learning epoch = 100	Step size = 9.04792147114e-05
	Loss = 1445390.672085	Gradient magnitude = 359287.573540
	Learning epoch = 200	Step size = 8.18648829479e-05
	Loss = 837342.429578	Gradient magnitude = 205369.696101
	Learning epoch = 300	Step size = 7.40707032156e-05
	Loss = 174509.924880	Gradient magnitude = 131531.128523
	Learning epoch = 400	Step size = 6.70185906007e-05
	Loss = 784196.078070	Gradient magnitude = 199216.396786
	Learning epoch = 500	Step size = 6.06378944861e-05
	Loss = 436488.799274	Gradient magnitude = 138010.218687
	Learning epoch = 600	Step size = 5.48646907485e-05
	Loss = 457751.683677	Gradient magnitude = 173094.616450
	Learning epoch = 700	Step size = 4.96411413431e-05
	Loss = 400722.000411	Gradient magnitude = 174526.499424
	Learning epoch = 800	Step size = 4.4914914861e-05
	Loss = 8127

In [4]:
# Gold candidate set for the training split.
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()

In [5]:
# Gold candidate set for the development split; passed to disc_model.score.
dev_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates -- Gold').one()

In [6]:
from snorkel.models import CandidateSet
from snorkel.annotations import LabelManager
# Rebinds label_manager to a fresh LabelManager, then loads the gold labels
# for the development candidates.
label_manager = LabelManager()
L_dev = label_manager.load(session, dev, 'Hardware Development Labels -- Gold')

In [7]:
# Score the discriminative model on the development split; the four returned
# values are reused for the entity-level scoring and error analysis below.
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, dev_gold)

Calibration plot:
Recall-corrected Noise-aware Model
Pos. class accuracy: 0.979474381176
Neg. class accuracy: 0.622799664711
Corpus Precision 0.955
Corpus Recall    0.979
Corpus F1        0.967
----------------------------------------
TP: 19231 | FP: 900 | TN: 1486 | FN: 403

Recall-corrected Noise-aware Model
Pos. class accuracy: 0.979474381176
Neg. class accuracy: 0.622799664711
Corpus Precision 0.955
Corpus Recall    0.979
Corpus F1        0.967
----------------------------------------
TP: 19231 | FP: 900 | TN: 1486 | FN: 403





In [9]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# Entity-level scoring uses the development corpus only, matching the split
# that produced tp/fp/tn/fn.
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Development').one()
(TP, FP, FN) = entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'polarity')

Scoring on Entity-Level Gold Data
Corpus Precision 0.902
Corpus Recall    1.0
Corpus F1        0.949
----------------------------------------
TP: 166 | FP: 18 | FN: 0



In [10]:
from pprint import pprint
# sorted() takes any iterable and returns a new list, so FP_list is an
# ordered, indexable copy of the false-positive entities.
FP_list = sorted(FP)
pprint(FP_list)

[(u'SIEMS01215-1', u'BC856', u'NPN'),
 (u'SIEMS01215-1', u'BC856A', u'NPN'),
 (u'SIEMS01215-1', u'BC856B', u'NPN'),
 (u'SIEMS01215-1', u'BC857', u'NPN'),
 (u'SIEMS01215-1', u'BC857A', u'NPN'),
 (u'SIEMS01215-1', u'BC857B', u'NPN'),
 (u'SIEMS01215-1', u'BC857C', u'NPN'),
 (u'SIEMS01215-1', u'BC858', u'NPN'),
 (u'SIEMS01215-1', u'BC858A', u'NPN'),
 (u'SIEMS01215-1', u'BC858B', u'NPN'),
 (u'SIEMS01215-1', u'BC858C', u'NPN'),
 (u'SIEMS01215-1', u'BC859', u'NPN'),
 (u'SIEMS01215-1', u'BC859A', u'NPN'),
 (u'SIEMS01215-1', u'BC859B', u'NPN'),
 (u'SIEMS01215-1', u'BC859C', u'NPN'),
 (u'SIEMS01215-1', u'BC860', u'NPN'),
 (u'SIEMS01215-1', u'BC860B', u'NPN'),
 (u'SIEMS01215-1', u'BC860C', u'NPN')]


In [11]:
from hardware_utils import entity_to_candidates

# Drill into the first false-positive entity.
entity = FP_list[0]
print(entity)
print('')

# Candidate-level false positives behind that entity; inspect the first one.
matches = entity_to_candidates(entity, fp)
print("# Matches: %d" % len(matches))
candidate = matches[0]
print(candidate)
print('')

# Model score for the candidate, followed by its per-feature weights.
print(disc_model.get_candidate_score(candidate, F_dev))
print('')
pprint(disc_model.get_candidate_feature_weights(candidate, F_dev))

(u'SIEMS01215-1', u'BC856', u'NPN')

# Matches: 50
Part_Polarity(ImplicitSpan("BC856", parent=103821, words=[0,0], position=[2]), Span("NPN", parent=22719, chars=[16,18], words=[6,6]))

1.24856084328

[(AnnotationKey (TDL_SEQ-BETWEEN[LEN:0-4]), 0.3365735297766512),
 (AnnotationKey (TAB_e2_RIGHT_WORDS_[)]), -0.31749907934314336),
 (AnnotationKey (TAB_e2_NEIGHBOR_PHRASE_WORDS_[:]), -0.28240687125558395),
 (AnnotationKey (TAB_e2_NEIGHBOR_PHRASE_WORDS_[types]), -0.2063248517192647),
 (AnnotationKey (TAB_e1_CONTAINS_WORDS_[C]), 0.1848952365227314),
 (AnnotationKey (TAB_e1_CONTAINS_WORDS_[B]), 0.17607145312710265),
 (AnnotationKey (TAB_e2_CONTAINS_WORDS_[N]), 0.16468676088145986),
 (AnnotationKey (TAB_e1_HTML_ANC_TAG_[html]), 0.16468676088145986),
 (AnnotationKey (TAB_e2_CONTAINS_WORDS_[P]), 0.16468676088145986),
 (AnnotationKey (TAB_e2_HTML_ANC_TAG_[html]), 0.16468676088145986),
 (AnnotationKey (TAB_e2_SPAN_TYPE_[EXPLICIT]), 0.16468676088145986),
 (AnnotationKey (TAB_e1_COL_WORDS_[1]), -0.1

In [18]:
# Print one CONTAINS_<ATTRIB> feature string per token of the part span.
attrib_label = 'words'.upper()
for ngram in candidate.part.get_attrib_tokens():
    print("CONTAINS_%s_[%s]" % (attrib_label, ngram))

CONTAINS_WORDS_[BC856]


In [None]:
from hardware_utils import part_error_analysis
# Run the helper's error analysis on the false-positive candidate selected
# above.
part_error_analysis(candidate)

In [None]:
# Snapshot the final state of the database.
import os
import shutil

# shutil.copyfile raises if the copy fails, whereas os.system('cp ...')
# silently discarded a non-zero exit status. The destination name (with a
# space) is the same file the old shell command produced.
shutil.copyfile('snorkel.db', 'snorkel.db final')