# Relation Extraction from Tables

This notebook demonstrates the full extraction and learning process for _relations_ in tables with a data set of transistor spec sheets, extracting (temperature label, min storage temperature) pairs. (Eventually, this tutorial will be updated to extract (part number, min storage temperature) pairs.)

In [None]:
# Enable the autoreload extension; with mode 2, edits to imported project
# modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Extraction

Use this option if you would like to unpickle a previously parsed corpus (i.e., the corpus from the Entity Extraction notebook).

In [1]:
load_pickle = False
corpus_loaded = False

if load_pickle:
    try:
        import cPickle
        with open("data/hardware/hardware_corpus.pkl","r") as pkl:
            %time corpus = cPickle.load(pkl)
        corpus_loaded = True
        print "Corpus has been loaded."
    except:
        print "Corpus could not be loaded."
        print "Corpus will be parsed instead..."
if not corpus_loaded:
    from snorkel.parser import CorpusParser
    from snorkel.parser import HTMLParser
    from snorkel.parser import TableParser

    html_parser = HTMLParser(path='data/hardware/hardware_html/')
    table_parser = TableParser()

    cp = CorpusParser(html_parser, table_parser, max_docs=15)
    %time corpus = cp.parse_corpus(name='Hardware Corpus')
    print "Corpus has been parsed."

CPU times: user 6.23 s, sys: 100 ms, total: 6.33 s
Wall time: 9.72 s
Corpus has been parsed.


We now create two EntityExtractor objects, one for each component in the relation.

In [2]:
from snorkel.candidates import TableNgrams, EntityExtractor, RelationExtractor
from snorkel.matchers import RegexMatchEach, DictionaryMatch, RangeMatcher
from load_dictionaries import load_hardware_dictionary

# Select a CandidateSpace
# TableNgrams is configured with n_max=2 (presumably spans of at most two
# tokens -- TODO confirm against the snorkel.candidates documentation).
table_ngrams = TableNgrams(n_max=2)

# Entity Extractor 1: Part numbers
# The dictionary of known part numbers is loaded from a project helper.
parts = load_hardware_dictionary()
print "Loaded %s part numbers." % len(parts)

# NOTE: this will not catch items in ranges (e.g., BC546-BC548) or lists (e.g., BC546/547/548)
part_matcher = DictionaryMatch(d=parts, longest_match_only=False)
part_extractor = EntityExtractor(table_ngrams, part_matcher)

# Entity Extractor 2: Temperatures (min storage temperature)
# NOTE(review): the gold labels loaded later in this notebook use the
# 'stg_temp_max' column -- confirm whether "min" here should read "max".
# temp_matcher = RangeMatcher(low=140,high=160)
# The regex 1[3-7][05] covers the strings 130, 135, 140, ..., 175; whether
# RegexMatchEach requires a full-token match is not visible here -- TODO confirm.
temp_matcher = RegexMatchEach(rgx=r'1[3-7][05]', attrib='words')
temp_extractor = EntityExtractor(table_ngrams, temp_matcher)

# Relation Extractor:
# Pairs the part extractor (first component) with the temperature extractor
# (second component).
relation_extractor = RelationExtractor(part_extractor, temp_extractor)

Loaded 179 part numbers.


In [3]:
# Alternative kept for reference: extract only the temperature entities.
# %time candidates = temp_extractor.extract(corpus.get_tables(), name='all')
# Run the relation extractor over all tables in the corpus and time the call.
%time candidates = relation_extractor.extract(corpus.get_tables(), name='all')

# Preview the first ten candidates, then report how many were extracted.
for cand in candidates[:10]: 
    print cand
print "%s candidates extracted" % len(candidates)

CPU times: user 1.54 s, sys: 18.9 ms, total: 1.56 s
Wall time: 1.56 s
SpanPair(Span("BC546", context=None, chars=[122,126], words=[24,24]), Span("150", context=None, chars=[0,2], words=[0,0]))
SpanPair(Span("BC546", context=None, chars=[122,126], words=[24,24]), Span("150", context=None, chars=[0,2], words=[0,0]))
SpanPair(Span("BC546", context=None, chars=[122,126], words=[24,24]), Span("150", context=None, chars=[0,2], words=[0,0]))
SpanPair(Span("BC547", context=None, chars=[0,4], words=[0,0]), Span("150", context=None, chars=[0,2], words=[0,0]))
SpanPair(Span("BC547", context=None, chars=[0,4], words=[0,0]), Span("150", context=None, chars=[0,2], words=[0,0]))
SpanPair(Span("BC547", context=None, chars=[0,4], words=[0,0]), Span("150", context=None, chars=[0,2], words=[0,0]))
SpanPair(Span("BC548", context=None, chars=[0,4], words=[0,0]), Span("150", context=None, chars=[0,2], words=[0,0]))
SpanPair(Span("BC548", context=None, chars=[0,4], words=[0,0]), Span("150", context=None, cha

In [4]:
# Inspect the first span of the first candidate pair (the part-number side,
# since the part extractor is the relation's first component).
candidates[0].span0

Span("BC546", context=None, chars=[122,126], words=[24,24])

In [None]:
from utils import collect_hardware_relation_gold
filename = 'data/hardware/gold_all.csv'
gt_dict = collect_hardware_relation_gold(filename, 'stg_temp_max', candidates)
print "%s out of %s candidates have gold labels" % (len(gt_dict),len(candidates))
print "%s out of %s labeled candidates have positive label" \
        % (gt_dict.values().count(1), len(gt_dict))

In [None]:
from snorkel.features import TableNgramPairFeaturizer
featurizer = TableNgramPairFeaturizer()
featurizer.fit_transform(candidates)
for f in featurizer.get_features_by_candidate(candidates[0])[:10]: print f
for in featurizer.get_features_by_candidate(candidates[0])[-10:]: print f

In [None]:
from snorkel.snorkel import DDLiteModel
# Build the model from the candidates, the output of featurizer.fit_transform,
# and the gold label dictionary.
# NOTE(review): fit_transform(candidates) is also called in the featurizer cell;
# confirm whether recomputing it here is required.
DDL = DDLiteModel(candidates, featurizer.fit_transform(candidates), gt_dict)

# Hold out indices 0 .. int(len(gt_dict) * 0.5) - 1, with validation_frac=0.5
# (presumably splitting the holdout evenly into validation and test -- TODO confirm).
# NOTE(review): idxs is sized by len(gt_dict); if the indices refer to positions
# in `candidates`, verify that the labeled candidates occupy those positions.
DDL.set_holdout(idxs=range(int(len(gt_dict)*.5)), validation_frac=0.5)
# Report the number of labeled examples in each split.
print "%s training data" % len(DDL.gt.get_labeled_ground_truth('training')[0])
print "%s test data" % len(DDL.gt.get_labeled_ground_truth('test')[0])
print "%s validation data" % len(DDL.gt.get_labeled_ground_truth('validation')[0])

In [None]:
# Part number LFs
# TODO

# Temperature LFs
def LF_to_range(m):
    """Return 1 if 'to' is in m.ngram1.pre_window('words'), else 0."""
    # NOTE(review): the candidate inspected earlier in this notebook exposes
    # `span0`; confirm that candidates also provide `ngram0`/`ngram1`.
    if 'to' in m.ngram1.pre_window('words'):
        return 1
    return 0
def LF_tilde_range(m):
    """Return 1 if '~' is in m.ngram1.pre_window('words'), else 0."""
    preceding = m.ngram1.pre_window('words')
    return int('~' in preceding)
def LF_storage(m):
    """Return 1 if 'storage' is in m.ngram1.aligned_ngrams('words'), else -1."""
    if 'storage' in m.ngram1.aligned_ngrams('words'):
        return 1
    return -1
def LF_tstg(m):
    """Return 1 if 'tstg' is in m.ngram1.aligned_ngrams('words'), else -1."""
    aligned = m.ngram1.aligned_ngrams('words')
    if 'tstg' not in aligned:
        return -1
    return 1
def LF_tj(m):
    """Return 1 if 'tj' is in m.ngram1.aligned_ngrams('words'), else -1."""
    # NOTE(review): 'tj' conventionally abbreviates junction temperature rather
    # than storage temperature -- confirm that a positive vote is intended here.
    if 'tj' in m.ngram1.aligned_ngrams('words'):
        return 1
    return -1
def LF_temperature(m):
    """Return 1 if 'temperature' is in m.ngram1.aligned_ngrams('words'), else -1."""
    aligned = m.ngram1.aligned_ngrams('words')
    if 'temperature' in aligned:
        vote = 1
    else:
        vote = -1
    return vote
def LF_celsius(m):
    """Return 1 if 'c' is in m.ngram1.aligned_ngrams('words'), else 0."""
    if 'c' in m.ngram1.aligned_ngrams('words'):
        return 1
    return 0
def LF_max(m):
    """Return 1 if 'max' is in m.ngram1.aligned_ngrams('lemmas'), else 0."""
    aligned_lemmas = m.ngram1.aligned_ngrams('lemmas')
    return int('max' in aligned_lemmas)
def LF_min(m):
    """Return 1 if 'min' is in m.ngram1.aligned_ngrams('lemmas'), else 0."""
    if 'min' in m.ngram1.aligned_ngrams('lemmas'):
        return 1
    return 0

# Combined LFs
def LF_same_row(m):
    """Return 1 if both spans' contexts report the same row_num, else 0."""
    # NOTE(review): the candidate inspected earlier in this notebook exposes
    # `span0`; confirm that candidates also provide `ngram0`/`ngram1`.
    first_row = m.ngram0.context.row_num
    second_row = m.ngram1.context.row_num
    if first_row == second_row:
        return 1
    return 0
def LF_same_col(m):
    """Return 1 if both spans' contexts report the same col_num, else 0."""
    first_col = m.ngram0.context.col_num
    second_col = m.ngram1.context.col_num
    if first_col == second_col:
        return 1
    return 0

# NOTE: don't forget to add LF name to LFs below

In [None]:
# Labeling functions to apply; each function defined in the LF cell has to be
# listed here to take effect. LF_celsius was defined but missing from this
# list, so it was never applied -- it is now included.
LFs = [LF_to_range, LF_tilde_range, LF_storage, LF_tstg, LF_tj, LF_temperature,
       LF_celsius, LF_max, LF_min, LF_same_row, LF_same_col]
# clear=True is passed through to apply_lfs (presumably discarding previously
# applied LFs so re-running this cell does not accumulate them -- TODO confirm).
DDL.apply_lfs(LFs, clear=True)

In [None]:
# Print the model's lf_names attribute, then call its LF statistics report.
print DDL.lf_names
DDL.print_lf_stats()

In [None]:
# Display the lowest-coverage labeling functions (n=10), per the method name.
DDL.lowest_coverage_lfs(n=10)

In [None]:
import matplotlib
import numpy as np
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Default figure size, (width, height) in inches.
matplotlib.rcParams['figure.figsize'] = (12,4)
# 1-D array of values passed as the 'mu' model option
# (presumably regularization strengths to be compared -- TODO confirm).
mu_seq = np.ravel([1e-9, 1e-5, 1e-3, 1e-1])
# Option dictionaries passed to train_model as lf_opts and model_opts.
lf_opts = {'sample': False, 'verbose': True}
model_opts = {'sample': False, 'n_iter': 3000, 'alpha': 0.5, 'mu': mu_seq, 'bias': False, 'verbose': True}
# Train with method='lr' and time the call.
%time DDL.train_model(method='lr', lf_opts=lf_opts, model_opts=model_opts)