# Entity Extraction from Tables

This notebook demonstrates the full extraction and learning process for _entities_ in tables, using a data set of transistor spec sheets from which we extract minimum storage temperatures.

In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# from snorkel import SnorkelSession
# session = SnorkelSession()

### Extraction

In [3]:
# Parser pointed at the directory holding the HTML versions of the spec sheets.
from snorkel.parser import HTMLParser
html_parser = HTMLParser(path='data/hardware/hardware_html/')

In [4]:
# Table parser with default settings; it is passed to CorpusParser together with
# html_parser when the corpus has to be parsed from scratch.
from snorkel.parser import TableParser
table_parser = TableParser()

In [5]:
try:
    import cPickle
    with open("data/hardware/hardware_corpus.pkl","r") as pkl:
        %time corpus = cPickle.load(pkl)
    print "Corpus has been loaded."
except:
    from snorkel.parser import CorpusParser
    cp = CorpusParser(html_parser, table_parser, max_docs=200)
    %time corpus = cp.parse_corpus(name='Hardware Corpus')
    print "Corpus has been parsed."

CPU times: user 35.3 s, sys: 1.41 s, total: 36.7 s
Wall time: 36.9 s
Corpus has been loaded.


In [6]:
# import cPickle
# with open("data/hardware/hardware_corpus.pkl",'wb') as pkl:
#     %time cPickle.dump(corpus, pkl)
# print "Corpus has been pickled."

CPU times: user 27.9 s, sys: 1.87 s, total: 29.8 s
Wall time: 30.3 s
Corpus has been pickled.


In [7]:
# session.add(corpus)
# session.commit()

In [8]:
# from snorkel.models import Corpus
# corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Corpus').one()
# corpus

In [9]:
from snorkel.candidates import TableNgrams
from snorkel.matchers import NumberMatcher, RangeMatcher

# Candidate space: n-grams drawn from tables, with n_max=3
# (presumably spans of at most three tokens -- TODO confirm against TableNgrams).
ngrams = TableNgrams(n_max=3)

# Matcher: despite the variable name this is a RangeMatcher (NumberMatcher is
# imported but not used in this cell). It is configured with the bounds
# low=-70 and high=-50, the range in which minimum storage temperatures are
# expected; the extraction output includes "-50", so the upper bound appears
# to be inclusive.
number_matcher = RangeMatcher(low=-70,high=-50)

In [10]:
# Disabled alternative that builds candidates through the Candidates class:
# from snorkel.candidates import Candidates
# %time candidates = Candidates(ngrams, number_matcher, corpus.get_contexts())
# for c in candidates.get_candidates()[:5]: print c
    
# Extract candidates from the corpus tables using the n-gram space and matcher
# defined earlier; the resulting set is given the name 'all'.
# %time reports how long the extraction takes.
from snorkel.candidates import EntityExtractor
ce = EntityExtractor(ngrams, number_matcher)
%time candidates = ce.extract(corpus.get_tables(), name='all')
# Sanity check: show the first ten candidates, then the total count.
for cand in candidates[:10]: 
    print cand
print "%s candidates extracted" % len(candidates)

CPU times: user 25.2 s, sys: 408 ms, total: 25.6 s
Wall time: 25.5 s
Ngram("-55", context=None, chars=[297,299], words=[0,0])
Ngram("-50", context=None, chars=[414,416], words=[0,0])
Ngram("-50", context=None, chars=[494,496], words=[0,0])
Ngram("-50", context=None, chars=[683,685], words=[2,2])
Ngram("-50", context=None, chars=[854,856], words=[2,2])
Ngram("-50", context=None, chars=[991,993], words=[2,2])
Ngram("-55", context=None, chars=[294,296], words=[0,0])
Ngram("-55", context=None, chars=[292,294], words=[0,0])
Ngram("-55", context=None, chars=[355,357], words=[0,0])
Ngram("-50", context=None, chars=[410,412], words=[0,0])
108 candidates extracted


In [11]:
# session.add(candidates)
# session.commit()

In [12]:
# from snorkel.models import CandidateSet
# candidates = session.query(CandidateSet).filter(CandidateSet.name == 'all').one()
# print len(candidates)

### Learning

First, load the gold (ground-truth) labels and match them to the extracted candidates.

In [13]:
from utils import collect_hardware_entity_gold
filename='data/hardware/gold_all.csv'
gt_dict = collect_hardware_entity_gold(filename, 'stg_temp_min', candidates)
print "%s out of %s candidates have gold labels" % (len(gt_dict),len(candidates))
print "%s out of %s labeled candidates have positive label" \
        % (gt_dict.values().count(1), len(gt_dict))

98 gold annotations
59 out of 108 candidates have gold labels
51 out of 59 labeled candidates have positive label


Then confirm that features work:

In [None]:
from snorkel.features import TableNgramFeaturizer
featurizer = TableNgramFeaturizer()
featurizer.fit_transform(candidates)
for f in featurizer.get_features_by_candidate(candidates[0])[:]: print f

DDLIB_WORD_SEQ_[-55]
DDLIB_LEMMA_SEQ_[-55]
DDLIB_POS_SEQ_[CD]
DDLIB_DEP_SEQ_[ROOT]
DDLIB_W_LEFT_1_[_NUMBER]
DDLIB_W_LEFT_POS_1_[CD]
DDLIB_W_LEFT_2_[to _NUMBER]
DDLIB_W_LEFT_POS_2_[TO CD]
DDLIB_W_LEFT_3_[_NUMBER to _NUMBER]
DDLIB_W_LEFT_POS_3_[CD TO CD]
DDLIB_W_RIGHT_1_[to]
DDLIB_W_RIGHT_POS_1_[TO]
DDLIB_W_RIGHT_2_[to _NUMBER]
DDLIB_W_RIGHT_POS_2_[TO CD]
DDLIB_W_LEMMA_L_1_R_1_[_NUMBER]_[to]
DDLIB_W_POS_L_1_R_1_[CD]_[TO]
DDLIB_W_LEMMA_L_1_R_2_[_NUMBER]_[to _NUMBER]
DDLIB_W_POS_L_1_R_2_[CD]_[TO CD]
DDLIB_W_LEMMA_L_2_R_1_[to _NUMBER]_[to]
DDLIB_W_POS_L_2_R_1_[TO CD]_[TO]
DDLIB_W_LEMMA_L_2_R_2_[to _NUMBER]_[to _NUMBER]
DDLIB_W_POS_L_2_R_2_[TO CD]_[TO CD]
DDLIB_W_LEMMA_L_3_R_1_[_NUMBER to _NUMBER]_[to]
DDLIB_W_POS_L_3_R_1_[CD TO CD]_[TO]
DDLIB_W_LEMMA_L_3_R_2_[_NUMBER to _NUMBER]_[to _NUMBER]
DDLIB_W_POS_L_3_R_2_[CD TO CD]_[TO CD]
DDLIB_NUM_WORDS_1
TABLE_ROW_NUM_5
TABLE_COL_NUM_2
TABLE_HTML_TAG_td
TABLE_HTML_ATTR_style=w
TABLE_HTML_ATTR_style=i
TABLE_HTML_ATTR_style=d
TABLE_HTML_ATTR_style=t

We now build a legacy `DDLiteModel`, which we will use for learning.

In [None]:
from snorkel.snorkel import DDLiteModel
DDL = DDLiteModel(candidates, featurizer.fit_transform(candidates), gt_dict)

DDL.set_holdout(idxs=range(int(len(gt_dict)*.8)), validation_frac=0.5)
print "%s training data" % len(DDL.gt.get_labeled_ground_truth('training')[0])
print "%s test data" % len(DDL.gt.get_labeled_ground_truth('test')[0])
print "%s validation data" % len(DDL.gt.get_labeled_ground_truth('validation')[0])

Define labeling functions:

In [None]:
def LF_to(m):
    """Labeling function: return 1 if 'to' is in m.post_window('words'), else 0.

    Unlike the other labeling functions in this notebook, a miss yields 0
    rather than -1 (presumably an abstain rather than a negative vote --
    confirm against DDLiteModel).
    """
    if 'to' in m.post_window('words'):
        return 1
    return 0
def LF_storage(m):
    """Labeling function: return 1 if 'storage' is in m.aligned_ngrams('words'), else -1."""
    if 'storage' in m.aligned_ngrams('words'):
        return 1
    return -1
def LF_tstg(m):
    """Labeling function: return 1 if 'tstg' is in m.aligned_ngrams('words'), else -1."""
    if 'tstg' in m.aligned_ngrams('words'):
        return 1
    return -1
def LF_temperature(m):
    """Labeling function: return 1 if 'temperature' is in m.aligned_ngrams('words'), else -1."""
    if 'temperature' in m.aligned_ngrams('words'):
        return 1
    return -1

In [None]:
# Collect the four labeling functions and apply them through the model.
# clear=True presumably discards previously applied LFs so that re-running
# this cell does not accumulate duplicates -- TODO confirm.
LFs = [LF_to, LF_storage, LF_tstg, LF_temperature]
DDL.apply_lfs(LFs, clear=True)

In [None]:
# Show the names of the applied labeling functions, then print their statistics.
print DDL.lf_names
DDL.print_lf_stats()

In [None]:
# Display up to 3 labeling functions ranked by conflict
# (presumably those that disagree most with the other LFs -- TODO confirm).
DDL.top_conflict_lfs(n=3)

In [None]:
# Display up to 3 labeling functions ranked by lowest coverage
# (presumably those that label the fewest candidates -- TODO confirm).
DDL.lowest_coverage_lfs(n=3)

Now learn, baby, learn!

In [None]:
import matplotlib
import numpy as np
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,4)
mu_seq = np.ravel([1e-9, 1e-5, 1e-3, 1e-1])
lf_opts = {'sample': False, 'verbose': True}
model_opts = {'sample': False, 'n_iter': 3000, 'alpha': 0.5, 'mu': mu_seq, 'bias': False, 'verbose': True}
%time DDL.train_model(method='lr', lf_opts=lf_opts, model_opts=model_opts)

In [None]:
DDL.show_log()
idxs, gt = DDL.get_labeled_ground_truth(subset=DDL.holdout())
acc_feats = np.mean(DDL.get_predicted(subset=DDL.holdout()) == gt)
print "Full model accuracy: {:.3f}".format(acc_feats)

Tune in next time for relation extraction!