# Episode 2: The [Table] Clone Wars

This notebook demonstrates the full entity extraction process on transistor data sheets, extracting min/max storage temperatures.

In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [100]:
from snorkel.parser import HTMLParser
# HTML parser pointed at the data/hardware_docs/ path (relative to the
# notebook's working directory); it is handed to Corpus below.
html_parser = HTMLParser(path='data/hardware_docs/')

In [101]:
from snorkel.parser import TableParser
# Table parser built with its default arguments; it is passed to Corpus
# together with html_parser.
table_parser = TableParser()

In [102]:
from snorkel.parser import Corpus
# Build the corpus from both parsers, capped at 5 documents via max_docs.
# %time reports CPU and wall-clock time for this single statement.
%time corpus = Corpus(html_parser, table_parser, max_docs=5)

Parsing documents...
Parsing contexts...
Parsed 5 documents and 54 contexts
CPU times: user 4.5 s, sys: 229 ms, total: 4.73 s
Wall time: 7.21 s


In [103]:
from snorkel.candidates import TableNgrams
from snorkel.matchers import NumberMatcher, RangeMatcher

# Candidate space: table n-grams generated with n_max=3.
table_ngrams = TableNgrams(n_max=3)

# Matcher: a RangeMatcher bounded by low=-80 and high=0.
# NOTE(review): presumably these bounds target the minimum storage
# temperature values mentioned in the introduction -- confirm.
# Alternative kept for reference: NumberMatcher() can be assigned to
# number_matcher instead of the range-restricted matcher.
number_matcher = RangeMatcher(low=-80, high=0)

In [104]:
from snorkel.candidates import Candidates
%time candidates = Candidates(table_ngrams, number_matcher, corpus.get_contexts())
for c in candidates.get_candidates()[:10]: print c

Extracting candidates...
CPU times: user 402 ms, sys: 13 ms, total: 415 ms
Wall time: 413 ms
<TableNgram("0", id=3-6-6-0:36-36, chars=[36,36], (row,col)=(1,0), tag=td)
<TableNgram("0", id=4-3-26-0:16-16, chars=[16,16], (row,col)=(4,2), tag=td)
<TableNgram("0", id=4-3-8-0:19-19, chars=[19,19], (row,col)=(1,2), tag=td)
<TableNgram("-0.5", id=2-3-102-0:18-21, chars=[18,21], (row,col)=(23,2), tag=td)
<TableNgram("0", id=1-3-13-0:20-20, chars=[20,20], (row,col)=(2,1), tag=td)
<TableNgram("-1.0", id=2-3-9-0:4-7, chars=[4,7], (row,col)=(2,2), tag=td)
<TableNgram("-1.0", id=2-3-44-0:4-7, chars=[4,7], (row,col)=(9,0), tag=td)
<TableNgram("-10", id=2-3-113-0:17-19, chars=[17,19], (row,col)=(25,2), tag=td)
<TableNgram("-10", id=2-3-67-0:4-6, chars=[4,6], (row,col)=(15,2), tag=td)
<TableNgram("-50", id=2-3-35-0:0-2, chars=[0,2], (row,col)=(6,4), tag=td)


In [105]:
from snorkel.features import TableNgramFeaturizer
# Build the featurizer from the extracted candidates and the parsed corpus.
# Constructing it runs feature extraction (it prints "Extracting features..."
# followed by a summary of feature and mention counts).
featurizer = TableNgramFeaturizer(candidates, corpus)

Extracting features...
Extracted 1277 features for each of 83 mentions


In [106]:
# Display the list of feature names recorded for the first candidate,
# looked up by that candidate's id.
featurizer.get_features_by_id(candidates.get_candidates()[0].id)

['Ngram_features_to_come',
 'TABLE_ROW_NUM_1',
 'TABLE_COL_NUM_0',
 'TABLE_HTML_TAG_td',
 'TABLE_HTML_ATTR_colspan=2',
 'TABLE_HTML_ATTR_style=width:325pt;border-top-style:solid;border-top-width:1pt;border-left-style:solid;border-left-width:1pt;border-bottom-style:solid;border-bottom-width:1pt;border-right-style:solid;border-right-width:1pt',
 'TABLE_HTML_ANC_TAG_tr',
 'TABLE_HTML_ANC_TAG_table',
 'TABLE_HTML_ANC_TAG_body',
 'TABLE_HTML_ANC_ATTR_style=height:25pt',
 'TABLE_HTML_ANC_ATTR_style=border-collapse:collapse;margin-left:5.71pt',
 'TABLE_HTML_ANC_ATTR_cellspacing=0',
 u'TABLE_ROW_NGRAM_Cibo',
 u'TABLE_ROW_NGRAM_8.0',
 u'TABLE_ROW_NGRAM_Input',
 u'TABLE_ROW_NGRAM_Input_Capacitance',
 u'TABLE_ROW_NGRAM_Input_Capacitance_-LRB-',
 u'TABLE_ROW_NGRAM_Capacitance',
 u'TABLE_ROW_NGRAM_Capacitance_-LRB-',
 u'TABLE_ROW_NGRAM_Capacitance_-LRB-_VEB',
 u'TABLE_ROW_NGRAM_-LRB-',
 u'TABLE_ROW_NGRAM_-LRB-_VEB',
 u'TABLE_ROW_NGRAM_-LRB-_VEB_=',
 u'TABLE_ROW_NGRAM_VEB',
 u'TABLE_ROW_NGRAM_VEB_='