# Episode 2: The [Table] Clone Wars

This notebook demonstrates the full entity extraction process on transistor data sheets, extracting min/max storage temperatures.

In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [86]:
from snorkel.parser import HTMLParser

# Location handed to the HTML parser as its `path` argument.
hardware_docs_path = 'data/hardware_large/'
html_parser = HTMLParser(path=hardware_docs_path)

In [87]:
from snorkel.parser import TableParser
# Parser created with its default arguments; a later cell passes it to Corpus
# together with html_parser.
table_parser = TableParser()

In [88]:
from snorkel.parser import Corpus
# Build the corpus from the two parsers defined in the previous cells.
# %time reports the cost of this step; the recorded run below took about
# 1min 50s of wall time, so expect a wait when re-running.
%time corpus = Corpus(html_parser, table_parser)

Parsing documents...
Parsing contexts...
Parsed 98 documents and 630 contexts
CPU times: user 1min 10s, sys: 3.21 s, total: 1min 13s
Wall time: 1min 50s


In [89]:
from snorkel.candidates import TableNgrams
from snorkel.matchers import NumberMatcher, RangeMatcher

# Candidate space: table n-grams with n_max set to 3.
table_ngrams = TableNgrams(n_max=3)

# Matcher: a RangeMatcher configured with low=-80 and high=0. The variable is
# still called number_matcher because later cells refer to it by that name.
# NOTE(review): these bounds presumably target negative temperatures -- confirm.
matcher_low, matcher_high = -80, 0
number_matcher = RangeMatcher(low=matcher_low, high=matcher_high)
# Alternative matcher kept for experimentation:
# number_matcher = NumberMatcher()

In [90]:
from snorkel.candidates import Candidates
%time candidates = Candidates(table_ngrams, number_matcher, corpus.get_contexts())
for c in candidates.get_candidates()[:10]: print c

Extracting candidates...
CPU times: user 5.59 s, sys: 72.9 ms, total: 5.66 s
Wall time: 5.63 s
<TableNgram("0", id=55-4-245-0:12-12, chars=[12,12], (row,col)=(14,17), tag=td)
<TableNgram("-10", id=79-0-11-0:41-43, chars=[41,43], (row,col)=(3,0), tag=td)
<TableNgram("-1.0", id=2-3-44-0:4-7, chars=[4,7], (row,col)=(9,0), tag=td)
<TableNgram("-5", id=51-2-43-0:4-5, chars=[4,5], (row,col)=(6,1), tag=td)
<TableNgram("0", id=84-3-82-0:4-4, chars=[4,4], (row,col)=(14,2), tag=td)
<TableNgram("0", id=91-2-22-0:14-14, chars=[14,14], (row,col)=(3,2), tag=td)
<TableNgram("-0.01", id=97-4-27-0:3-7, chars=[3,7], (row,col)=(2,8), tag=td)
<TableNgram("0", id=61-3-37-0:16-16, chars=[16,16], (row,col)=(8,0), tag=td)
<TableNgram("-10", id=43-3-70-0:13-15, chars=[13,15], (row,col)=(11,2), tag=td)
<TableNgram("0", id=9-2-23-0:15-15, chars=[15,15], (row,col)=(3,2), tag=td)


In [91]:
from snorkel.features import TableNgramFeaturizer
# Featurize the extracted candidates against the corpus. The recorded run
# below reports 8669 features for each of 1074 mentions.
featurizer = TableNgramFeaturizer(candidates, corpus)

Extracting features...
Extracted 8669 features for each of 1074 mentions


In [92]:
# Show the features recorded for the first extracted candidate, looked up by
# its id; the bare last expression is rendered as the cell output.
first_candidate = candidates.get_candidates()[0]
featurizer.get_features_by_id(first_candidate.id)

['Ngram_features_to_come',
 'TABLE_ROW_NUM_14',
 'TABLE_COL_NUM_17',
 'TABLE_HTML_TAG_td',
 'TABLE_HTML_ATTR_style=border-right: #000000 1px solid;border-bottom: #000000 1px solid;padding: 0px;margin: 0px;width: 180px;vertical-align: bottom;height: 15px;',
 'TABLE_HTML_ATTR_class=tr10',
 'TABLE_HTML_ATTR_class=td65',
 'TABLE_HTML_ANC_TAG_tr',
 'TABLE_HTML_ANC_TAG_table',
 'TABLE_HTML_ANC_TAG_div',
 'TABLE_HTML_ANC_TAG_div',
 'TABLE_HTML_ANC_TAG_div',
 'TABLE_HTML_ANC_TAG_body',
 'TABLE_HTML_ANC_ATTR_cellpadding=0',
 "TABLE_HTML_ANC_ATTR_style=width: 661px;margin-left: 37px;margin-top: 1px;font: 12px 'Helvetica';",
 'TABLE_HTML_ANC_ATTR_cellspacing=0',
 'TABLE_HTML_ANC_ATTR_class=t4',
 'TABLE_HTML_ANC_ATTR_style=position: absolute;top: 0px;left: 0px;z-index: -1;width: 734px;height: 1046px;',
 'TABLE_HTML_ANC_ATTR_id=dimg1',
 'TABLE_HTML_ANC_ATTR_style=position: relative;overflow: hidden;margin: 35px 0px 27px 29px;padding: 0px;border: none;width: 764px;',
 'TABLE_HTML_ANC_ATTR_id=page_2'