# Extraction + Learning : Hardware Spec Sheets

This notebook demonstrates the full entity extraction process on transistor data sheets, extracting minimum storage temperatures.

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

### Extraction

In [1]:
from snorkel.parser import HTMLParser

# Location of the hardware documents handed to the HTML parser.
docs_path = 'data/hardware/hardware_docs/'
html_parser = HTMLParser(path=docs_path)

In [4]:
from snorkel.parser import TableParser
# Built with default arguments; it is passed to CorpusParser together with
# html_parser in the following cell.
table_parser = TableParser()

In [5]:
from snorkel.parser import CorpusParser
cp = CorpusParser(html_parser, table_parser, max_docs=5)
%time corpus = cp.parse_corpus(name='Hardware Corpus')

0 documents parsed
1 documents parsed
2 documents parsed
3 documents parsed
4 documents parsed
CPU times: user 53.3 s, sys: 2.4 s, total: 55.7 s
Wall time: 1min 20s


In [7]:
from snorkel.candidates import TableNgrams
from snorkel.matchers import NumberMatcher, RangeMatcher

# Candidate space: table n-grams with n_max=3 (presumably up to three words
# per candidate -- TODO confirm against TableNgrams).
ngrams = TableNgrams(n_max=3)

# Matcher: keep candidates that RangeMatcher accepts for the bounds below.
# The recorded output contains "-40" candidates, so the upper bound appears
# to be inclusive.
temp_low, temp_high = -80, -40
number_matcher = RangeMatcher(low=temp_low, high=temp_high)

In [11]:
# from snorkel.candidates import Candidates
# %time candidates = Candidates(ngrams, number_matcher, corpus.get_contexts())
# for c in candidates.get_candidates()[:5]: print c
    
from snorkel.candidates import CandidateExtractor
ce = CandidateExtractor(ngrams, number_matcher)
%time candidates = ce.extract(corpus.get_tables(), name='all')
for cand in candidates[:10]: print cand

CPU times: user 9.75 s, sys: 354 ms, total: 10.1 s
Wall time: 10 s
Ngram("-40", context=None, chars=[0,2], words=[0,0])
Ngram("-40", context=None, chars=[0,2], words=[0,0])
Ngram("-55", context=None, chars=[0,2], words=[0,0])
Ngram("-40", context=None, chars=[0,2], words=[0,0])
Ngram("-40", context=None, chars=[0,2], words=[0,0])
Ngram("-50", context=None, chars=[0,2], words=[0,0])
Ngram("-50", context=None, chars=[0,2], words=[0,0])
Ngram("-50", context=None, chars=[4,6], words=[2,2])
Ngram("-50", context=None, chars=[4,6], words=[2,2])
Ngram("-50", context=None, chars=[4,6], words=[2,2])


### Learning

First, load the gold annotations from CSV and use them to label each candidate.

In [16]:
# read from csv
import csv
with open('data/hardware/gold_all.csv', 'rb') as csvfile:
    gold_reader = csv.reader(csvfile)
    gold = []
    for row in gold_reader:
        (doc, part, temp, label) = row
        if label=='stg_temp_min':
            gold.append((doc,temp))
gold = set(gold)
print "%s gold annotations" % len(gold)

# match with candidates
gt_dict = {}
for c in candidates:
    filename = (c.context.document.file).split('.')[0]
    temp = c.get_attrib_span('words')
    label = 1 if (filename, temp) in gold else -1
    gt_dict[c.id] = label
print "%s out of %s candidates are positive" % (gt_dict.values().count(1), len(gt_dict))

98 gold annotations
1 out of 1 candidates are positive


Then confirm that features work:

In [17]:
from snorkel.features import TableNgramFeaturizer

featurizer = TableNgramFeaturizer(candidates, corpus)

# Sanity check: display the first ten features of the first candidate.
first_candidate = candidates[0]
featurizer.get_features_by_id(first_candidate.id)[:10]

Extracting features...
Extracted 904 features for each of 101 candidates


[u'BASIC_NGRAM_-40',
 'TABLE_ROW_NUM_9',
 'TABLE_COL_NUM_2',
 'TABLE_HTML_TAG_td',
 'TABLE_HTML_ATTR_style=width:108pt;border-top-style:solid;border-top-width:1pt;border-left-style:solid;border-left-width:1pt;border-bottom-style:solid;border-bottom-width:1pt;border-right-style:solid;border-right-width:1pt',
 'TABLE_HTML_ANC_TAG_tr',
 'TABLE_HTML_ANC_TAG_table',
 'TABLE_HTML_ANC_TAG_body',
 'TABLE_HTML_ANC_ATTR_style=height:16pt',
 'TABLE_HTML_ANC_ATTR_style=border-collapse:collapse;margin-left:23.7102pt']

We now build a legacy DDLiteModel, which we will use for learning.

In [18]:
from snorkel.snorkel import DDLiteModel
# NOTE(review): the output recorded for this cell is
# "AttributeError: 'Ngram' object has no attribute 'uid'", and every later
# cell is unexecuted. Which statement raised is not shown -- confirm and fix
# before relying on anything below.
DDL = DDLiteModel(candidates, featurizer.get_features(), gt_dict)

# Hold out indices 0..49. validation_frac=0.5 presumably splits that holdout
# evenly between validation and test -- TODO confirm against set_holdout.
DDL.set_holdout(idxs=range(50), validation_frac=0.5)
# Report how many labeled items each subset contains.
print "%s training data" % len(DDL.gt.get_labeled_ground_truth('training')[0])
print "%s test data" % len(DDL.gt.get_labeled_ground_truth('test')[0])
print "%s validation data" % len(DDL.gt.get_labeled_ground_truth('validation')[0])

AttributeError: 'Ngram' object has no attribute 'uid'

Define labeling functions:

In [None]:
def LF_to(m):
    """Labeling function keyed on the token 'to'.

    Returns 1 when 'to' is found in m.post_window('words'), and 0 otherwise.
    Membership is tested with `in`, so the exact semantics depend on the
    type that post_window returns.
    """
    following_words = m.post_window('words')
    if 'to' in following_words:
        return 1
    return 0
def LF_storage(m):
    """Labeling function keyed on the token 'storage'.

    Returns 1 when 'storage' is found in m.aligned('words'), and -1
    otherwise. Membership is tested with `in`, so the exact semantics depend
    on the type that aligned returns.
    """
    aligned_words = m.aligned('words')
    if 'storage' in aligned_words:
        return 1
    return -1
def LF_tstg(m):
    """Labeling function keyed on the token 'tstg'.

    Returns 1 when 'tstg' is found in m.aligned('words'), and -1 otherwise.
    Membership is tested with `in`, so the exact semantics depend on the
    type that aligned returns.
    """
    aligned_words = m.aligned('words')
    if 'tstg' in aligned_words:
        return 1
    return -1
def LF_temperature(m):
    """Labeling function keyed on the token 'temperature'.

    Returns 1 when 'temperature' is found in m.aligned('words'), and -1
    otherwise. Membership is tested with `in`, so the exact semantics depend
    on the type that aligned returns.
    """
    aligned_words = m.aligned('words')
    if 'temperature' in aligned_words:
        return 1
    return -1

In [None]:
# Labeling functions to apply, listed in the order they were defined.
LFs = [
    LF_to,
    LF_storage,
    LF_tstg,
    LF_temperature,
]
# clear=True presumably discards previously applied LFs first -- TODO confirm.
DDL.apply_lfs(LFs, clear=True)

In [None]:
# Print the model's lf_names attribute, then call its LF statistics report.
print DDL.lf_names
DDL.print_lf_stats()

In [None]:
# Display the result of top_conflict_lfs for n=3 (presumably the three LFs
# with the most conflicts -- TODO confirm).
DDL.top_conflict_lfs(n=3)

In [None]:
# Display the result of lowest_coverage_lfs for n=3 (presumably the three LFs
# with the least coverage -- TODO confirm).
DDL.lowest_coverage_lfs(n=3)

Now learn, baby, learn!

In [None]:
import matplotlib
import numpy as np
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)
mu_seq = np.ravel([1e-9, 1e-5, 1e-3, 1e-1])
lf_opts = {'sample': False, 'verbose': True}
model_opts = {'sample': False, 'n_iter': 3000, 'alpha': 0.5, 'mu': mu_seq, 'bias': False, 'verbose': True}
%time DDL.train_model(method='lr', lf_opts=lf_opts, model_opts=model_opts)

In [None]:
DDL.show_log()
idxs, gt = DDL.get_labeled_ground_truth(subset=DDL.holdout())
acc_feats = np.mean(DDL.get_predicted(subset=DDL.holdout()) == gt)
print "Full model accuracy: {:.3f}".format(acc_feats)

Tune in next time for relation extraction!