# Entity Extraction from Tables

This notebook demonstrates the full extraction and learning process for _entities_ in tables with a data set of transistor spec sheets, extracting minimum storage temperatures.

In [6]:
%load_ext autoreload
%autoreload 2

import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[autoreload of snorkel.parser failed: Traceback (most recent call last):
  File "/Users/bradenhancock/anaconda/lib/python2.7/site-packages/IPython/extensions/autoreload.py", line 247, in check
    superreload(m, reload, self.old_objects)
NameError: name 'HTMLOmniParser' is not defined
]


### Extraction

In [7]:
load_pickle = False # with pickle ~15s; without pickle ~75s
save_pickle = False

corpus_loaded = False
import cPickle
if load_pickle:
    try:
        with open("data/hardware/hardware_corpus.pkl","r") as pkl:
            %time corpus = cPickle.load(pkl)
        corpus_loaded = True
        print "Corpus has been loaded."
    except:
        print "Corpus could not be loaded."
        print "Corpus will be parsed instead..."
if not corpus_loaded:
    from snorkel.parser import CorpusParser
    from snorkel.parser import HTMLParser
    from snorkel.parser import TableParser

    html_parser = HTMLParser(path='data/hardware/hardware_html/')
    table_parser = TableParser()

    cp = CorpusParser(html_parser, table_parser, max_docs=5)
    %time corpus = cp.parse_corpus(name='Hardware Corpus')
    print "Corpus has been parsed."

    if save_pickle:
        with open("data/hardware/hardware_corpus.pkl","w") as pkl:
            %time cPickle.dump(corpus, pkl)
            print "Corpus has been pickled."

CPU times: user 3 s, sys: 321 ms, total: 3.32 s
Wall time: 4.96 s
Corpus has been parsed.


In [8]:
# from snorkel import SnorkelSession
# session = SnorkelSession()
# session.add(corpus)
# session.commit()
# from snorkel.models import Corpus
# corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Corpus').one()
# corpus

In [9]:
from snorkel.candidates import TableNgrams
from snorkel.matchers import NumberMatcher, RangeMatcher

# Define a candidate space
ngrams = TableNgrams(n_max=2)

# Define a matcher
number_matcher = RangeMatcher(low=-70,high=-50)

In [10]:
from snorkel.candidates import EntityExtractor
ce = EntityExtractor(ngrams, number_matcher)
%time candidates = ce.extract(corpus.get_tables(), name='all')
for cand in candidates[:10]: 
    print cand
print "%s candidates extracted" % len(candidates)

CPU times: user 123 ms, sys: 33.5 ms, total: 157 ms
Wall time: 134 ms
Span("-55", context=None, chars=[0,2], words=[0,0])
Span("-50", context=None, chars=[0,2], words=[0,0])
Span("-50", context=None, chars=[0,2], words=[0,0])
Span("-50", context=None, chars=[4,6], words=[2,2])
Span("-50", context=None, chars=[4,6], words=[2,2])
Span("-50", context=None, chars=[4,6], words=[2,2])
Span("-55", context=None, chars=[0,2], words=[0,0])
7 candidates extracted


### Learning

First, generate gold data.

In [11]:
from utils import collect_hardware_entity_gold
filename='data/hardware/gold_all.csv'
(gold_candidates, gold_labels) = collect_hardware_entity_gold(filename, 'stg_temp_min', candidates)
print "%s out of %s candidates have gold labels" % (len(gold_candidates), len(candidates))
print "%s out of %s labeled candidates have positive label" % (gold_labels.count(1), len(gold_candidates))

# Split into train and test set
training_candidates = []
n_half = len(candidates)/2
for idx, c in enumerate(candidates[:n_half]):
    if c in gold_candidates:
        gold_candidates.append(c)
        gold_labels.append(gold_labels[idx])
    else:
        training_candidates.append(c)
training_candidates.extend(candidates[n_half:])
gold_labels = np.array(gold_labels)
# print "Training set size: %s" % len(training_candidates)
# print "Gold set size: %s" % len(gold_candidates)
print "Positive labels in gold set: %s" % sum(gold_labels==1)
print "Negative labels in gold set: %s" % sum(gold_labels==-1)

98 gold annotations
2 out of 7 candidates have gold labels
2 out of 2 labeled candidates have positive label
Positive labels in gold set: 3
Negative labels in gold set: 0


Let's take a quick peek at the features:

In [None]:
from snorkel.features import TableNgramFeaturizer
featurizer = TableNgramFeaturizer()
featurizer.fit_transform(candidates)
for f in featurizer.get_features_by_candidate(candidates[0])[:10]: print f

Define labeling functions:

In [None]:
def LF_to_range(m):
    return 1 if 'to' in m.post_window('words') else 0
def LF_tilde_range(m):
    return 1 if '~' in m.post_window('words') else 0
def LF_storage(m):
    return 1 if 'storage' in m.aligned_ngrams('words') else -1
def LF_tstg(m):
    return 1 if 'tstg' in m.aligned_ngrams('words') else -1
def LF_tj(m):
    return 1 if 'tj' in m.aligned_ngrams('words') else -1
def LF_temperature(m):
    return 1 if 'temperature' in m.aligned_ngrams('words') else -1
def LF_celsius(m):
    return 1 if 'c' in m.aligned_ngrams('words') else -1
def LF_max(m):
    return 1 if 'max' in m.aligned_ngrams('words') else 0
def LF_min(m):
    return 1 if 'min' in m.aligned_ngrams('words') else 0

In [None]:
LFs = [LF_to_range, LF_tilde_range, LF_storage, LF_tstg, LF_tj, LF_temperature,
      LF_celsius, LF_max, LF_min]

In [None]:
from snorkel.snorkel import TrainingSet
from snorkel.features import NgramFeaturizer

training_set = TrainingSet(training_candidates, LFs, featurizer=TableNgramFeaturizer())

In [None]:
lf_stats = training_set.lf_stats()
lf_stats[:5]

Now learn, baby, learn!

In [None]:
from snorkel.snorkel import Learner
from snorkel.learning import LogReg

learner = Learner(training_set, model=LogReg(bias_term=True))

In [None]:
# Splitting into CV and test set
n_half = len(gold_candidates)/2
test_candidates = gold_candidates[:n_half]
test_labels     = gold_labels[:n_half]
cv_candidates   = gold_candidates[n_half:]
cv_labels       = gold_labels[n_half:]

In [None]:
from snorkel.learning_utils import GridSearch

gs       = GridSearch(learner, ['mu', 'lf_w0'], [[1e-5, 1e-7],[1.0,2.0]])
gs_stats = gs.fit(cv_candidates, cv_labels)

In [None]:
gs_stats

In [None]:
learner.test(test_candidates, test_labels)

In [None]:
learner.feature_stats(n_max=10)

Tune in next time for relation extraction!