# Episode 2: The [Table] Clone Wars

This notebook demonstrates the full entity extraction process on transistor data sheets, extracting min/max storage temperatures.

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to the snorkel sources are picked up live.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [100]:
from snorkel.parser import HTMLParser
# Point the HTML parser at the hardware data-sheet location (relative to the
# notebook's working directory).
html_parser = HTMLParser(path='data/hardware_docs/')

In [101]:
from snorkel.parser import TableParser
# Context parser with default settings; it is handed to Corpus together with
# html_parser in the next cell.
table_parser = TableParser()

In [102]:
from snorkel.parser import Corpus
# Build the corpus from the two parsers, capped at 5 documents to keep the
# demo fast; %time reports how long parsing takes.
%time corpus = Corpus(html_parser, table_parser, max_docs=5)

Parsing documents...
Parsing contexts...
Parsed 5 documents and 54 contexts
CPU times: user 4.5 s, sys: 229 ms, total: 4.73 s
Wall time: 7.21 s


In [148]:
from snorkel.candidates import TableNgrams
from snorkel.matchers import NumberMatcher, RangeMatcher

# Define a candidate space
# (presumably n-grams of at most n_max=3 tokens taken from table cells -- TODO confirm)
table_ngrams = TableNgrams(n_max=3)

# Define a matcher: only keep candidates whose value falls in the range -80..-40.
# NOTE(review): the bounds look like a plausible window for a minimum storage
# temperature -- confirm. The extracted "-40" candidates suggest the upper
# bound is inclusive.
number_matcher = RangeMatcher(low=-80,high=-40)
# Alternative matcher kept for experimentation (no range restriction):
# number_matcher = NumberMatcher()

In [149]:
from snorkel.candidates import Candidates
%time candidates = Candidates(table_ngrams, number_matcher, corpus.get_contexts())
for c in candidates.get_candidates()[:10]: print c

Extracting candidates...
CPU times: user 404 ms, sys: 53.1 ms, total: 457 ms
Wall time: 421 ms
<TableNgram("-40", id=2-3-10-0:0-2, chars=[0,2], (row,col)=(2,3), tag=td)
<TableNgram("-50", id=2-3-29-0:0-2, chars=[0,2], (row,col)=(5,4), tag=td)
<TableNgram("-55", id=4-0-22-0:0-2, chars=[0,2], (row,col)=(5,2), tag=td)
<TableNgram("-40", id=2-1-10-0:0-2, chars=[0,2], (row,col)=(2,2), tag=td)
<TableNgram("-50", id=2-3-62-0:4-6, chars=[4,6], (row,col)=(14,0), tag=td)
<TableNgram("-50", id=2-3-71-0:4-6, chars=[4,6], (row,col)=(16,0), tag=td)
<TableNgram("-40", id=2-1-6-0:0-2, chars=[0,2], (row,col)=(1,2), tag=td)
<TableNgram("-40", id=2-3-16-0:0-2, chars=[0,2], (row,col)=(3,3), tag=td)
<TableNgram("-50", id=2-3-35-0:0-2, chars=[0,2], (row,col)=(6,4), tag=td)
<TableNgram("-55", id=2-1-22-0:0-2, chars=[0,2], (row,col)=(5,2), tag=td)


In [150]:
# from snorkel.snorkel import Entities
# from snorkel.matchers import CandidateExtractor
# tables = corpus.get_contexts()
# CE = CandidateExtractor(table_ngrams, number_matcher)
# E = Entities(tables, CE)

In [151]:
from snorkel.features import TableNgramFeaturizer

# Constructing the featurizer runs feature extraction for the candidates
# (it prints "Extracting features..." and a summary when executed).
featurizer = TableNgramFeaturizer(candidates, corpus)

Extracting features...
Extracted 317 features for each of 11 mentions


In [153]:
# Inspect the features generated for the third candidate (index 2), looked up by its id.
featurizer.get_features_by_id(candidates.get_candidates()[2].id)

[u'BASIC_NGRAM_-55',
 'TABLE_ROW_NUM_5',
 'TABLE_COL_NUM_2',
 'TABLE_HTML_TAG_td',
 'TABLE_HTML_ATTR_style=width:72pt;border-top-style:solid;border-top-width:1pt;border-left-style:solid;border-left-width:1pt;border-bottom-style:solid;border-bottom-width:2pt;border-right-style:solid;border-right-width:1pt',
 'TABLE_HTML_ANC_TAG_tr',
 'TABLE_HTML_ANC_TAG_table',
 'TABLE_HTML_ANC_TAG_body',
 'TABLE_HTML_ANC_ATTR_style=height:14pt',
 'TABLE_HTML_ANC_ATTR_style=border-collapse:collapse;margin-left:39.55pt',
 'TABLE_HTML_ANC_ATTR_cellspacing=0',
 u'TABLE_ROW_NGRAM_\\',
 u'TABLE_ROW_NGRAM_\\_uf0b0C',
 u'TABLE_ROW_NGRAM_uf0b0C',
 u'TABLE_ROW_NGRAM_Operating',
 u'TABLE_ROW_NGRAM_Operating_and',
 u'TABLE_ROW_NGRAM_Operating_and_Storage',
 u'TABLE_ROW_NGRAM_and',
 u'TABLE_ROW_NGRAM_and_Storage',
 u'TABLE_ROW_NGRAM_and_Storage_Junction',
 u'TABLE_ROW_NGRAM_Storage',
 u'TABLE_ROW_NGRAM_Storage_Junction',
 u'TABLE_ROW_NGRAM_Storage_Junction_Temperature',
 u'TABLE_ROW_NGRAM_Junction',
 u'TABLE_ROW_NG

We now build a legacy `DDLiteModel`, which we will use for the learning stages.

In [142]:
from snorkel.snorkel import DDLiteModel
# Bundle the extracted candidates with their features in the model object that
# the labeling-function cells below operate on.
DDL = DDLiteModel(candidates.get_candidates(), featurizer.get_features())

In [155]:
# in line with storage, tstg, temp, temperature
# followed by 'to' 

def LF_to(m):
    """Labeling function: vote 1 if 'to' occurs after the mention, else 0.

    m: a candidate mention that exposes ``post_window('lemmas')`` --
       presumably the lemmas in the window following the mention.

    Returns 1 when the string 'to' is among those lemmas, otherwise 0.

    NOTE(review): the recorded run of this notebook fails with
    "'TableNgram' object has no attribute 'post_window'", so table
    candidates do not currently provide this accessor.
    """
    following_lemmas = m.post_window('lemmas')
    if 'to' in following_lemmas:
        return 1
    return 0

# def LF_storage(m):
#     return 1 if ('storage' in m.)

# def LF_gene(m):
#     return 1 if ('gene' in m.post_window('lemmas')) or ('gene' in m.pre_window('lemmas')) else 0
# def LF_gene_dp(m):
#     return 1 if 'gene' in [m.lemmas[m.dep_parents[i] - 1] for i in m.idxs] else 0
# def LF_genotype_dp(m):
#     return 1 if 'genotype' in [m.lemmas[m.dep_parents[i] - 1] for i in m.idxs] else 0
# def LF_mutant(m):
#     return 1 if ('mutant' in m.post_window('lemmas')) or ('mutant' in m.pre_window('lemmas')) else 0
# def LF_variant(m):
#     return 1 if ('variant' in m.post_window('lemmas')) or ('variant' in m.pre_window('lemmas')) else 0
# def LF_express(m):
#     return 1 if ('express' in m.post_window('lemmas')) or ('express' in m.pre_window('lemmas')) else 0
# def LF_mutation(m):
#     return 1 if 'mutation' in [m.lemmas[m.dep_parents[i] - 1] for i in m.idxs] else 0
# def LF_JJ(m):
#     return 1 if 'JJ' in m.post_window('poses') else 0
# def LF_IN(m):
#     return 1 if 'IN' in m.pre_window('poses', 1) else 0

In [156]:
# Collect the labeling functions and apply them to the model's candidates.
# NOTE(review): clear=False presumably keeps any previously applied LFs --
# confirm against DDLiteModel.apply_lfs.
LFs = [LF_to]
DDL.apply_lfs(LFs, clear=False)

AttributeError: 'TableNgram' object has no attribute 'post_window'