## Part IV:  Features and Labeling Functions

## Generating Features

In [None]:
# If necessary: replace the working database with the 'snorkel.db labels' copy.
import os
import shutil

# shutil.copy overwrites the destination, so no prior os.remove is needed
# (os.remove raised OSError when snorkel.db did not exist yet), and a missing
# source file now raises instead of being silently ignored by os.system.
shutil.copy('snorkel.db labels', 'snorkel.db');

In [None]:
# Reload edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

# Session object passed to the queries and managers in the cells below.
from snorkel import SnorkelSession
session = SnorkelSession()

In [None]:
from snorkel.models import candidate_subclass

# Declare the Part_Temp candidate class with the two fields 'part' and 'temp'.
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Loading the `CandidateSet`

In [None]:
from snorkel.models import CandidateSet

# Both lookups start from the same base query and differ only in the set name.
candidate_sets = session.query(CandidateSet)
train = candidate_sets.filter(
    CandidateSet.name == 'Hardware Training Candidates'
).one()
dev = candidate_sets.filter(
    CandidateSet.name == 'Hardware Development Candidates'
).one()

### Creating feature matrix

In [None]:
from snorkel.annotations import FeatureManager

# Manager used in the cells below to create, update and load feature matrices.
feature_manager = FeatureManager()

In [None]:
# Create the 'Train Features' matrix for the training candidates (timed).
%time F_train = feature_manager.create(session, train, 'Train Features')

In [None]:
# Featurize the development candidates under the same 'Train Features' name (timed).
# NOTE(review): expand_key_set=False presumably keeps the existing feature keys
# fixed instead of adding new ones — confirm against FeatureManager.update.
%time F_dev = feature_manager.update(session, dev, 'Train Features', expand_key_set=False)

### Reloading feature matrix

In [None]:
# Load the stored 'Train Features' matrix for the training candidates (timed).
%time F_train = feature_manager.load(session, train, 'Train Features')

In [None]:
# Display the loaded feature matrix.
F_train

In [None]:
# Look up the candidate at index 0 of the feature matrix (presumably its first row).
F_train.get_candidate(0)

In [None]:
# Look up the key at index 0 of the feature matrix (presumably its first column).
F_train.get_key(0)

In [None]:
from snorkel.utils import get_keys_by_candidate

# Print the first ten keys returned for the candidate at index 1.
# The parenthesised single-argument print behaves the same on Python 2 and
# is also valid Python 3.
for f in get_keys_by_candidate(F_train, F_train.get_candidate(1))[:10]:
    print(f)

### Saving feature matrices

In [None]:
# If necessary: keep a copy of the database as 'snorkel.db featurized'.
import os
import shutil

# shutil.copy raises if the copy fails; os.system('cp ...') only returned a
# non-zero status that this cell discarded.
shutil.copy('snorkel.db', 'snorkel.db featurized');

## Applying LFs

In [1]:
# If necessary: replace the working database with the 'snorkel.db featurized' copy.
import os
import shutil

# shutil.copy overwrites the destination, so no prior os.remove is needed
# (os.remove raised OSError when snorkel.db did not exist yet), and a missing
# source file now raises instead of being silently ignored by os.system.
# The copy must happen before the session below is created.
shutil.copy('snorkel.db featurized', 'snorkel.db')

from snorkel import SnorkelSession
session = SnorkelSession()

# Declare the Part_Temp candidate class with the two fields 'part' and 'temp'.
from snorkel.models import candidate_subclass
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

# Fetch the training candidate set by name; .one() expects exactly one match.
from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()

In [2]:
from snorkel.annotations import LabelManager

# Manager used below to create the label matrix from the labeling functions.
label_manager = LabelManager()

### Defining Labeling Functions

In [3]:
# NOTE(review): the wildcard import hides which helpers the labeling functions
# use; consider importing the needed names explicitly once they are settled.
from snorkel.lf_helpers import *

# Registry of labeling functions: each LF defined below is appended here and
# the whole list is later passed to label_manager.create(..., f=LFs).
LFs = []

# gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# def LF_training_gold(c):
#     training_gold_dict = get_gold_dict(gold_file, 'stg_temp_min')
#     return 1 if ((c[0].parent.document.name).upper(), 
#                  (c[0].get_span()).upper(), 
#                  (''.join(c[1].get_span().split())).upper()) in training_gold_dict else 0
# LFs.append(LF_training_gold)

# from hardware_utils import load_extended_parts_dict
# gold_file ='data/hardware/hardware_gold.csv'
# parts_dict = load_extended_parts_dict(gold_file)
# print "Loaded %d part numbers." % len(parts_dict)

# def LF_parts_dict(c):
#     return 1 if (c.temp.parent.words)[0] in parts_dict else -1
# LFs.append(LF_parts_dict)

# POSITIVE

def LF_to_right(c):
    """Return 1 if 'to' is among get_right_ngrams(c.temp, window=1, n_max=1), else 0."""
    right_ngrams = get_right_ngrams(c.temp, window=1, n_max=1)
    if 'to' in right_ngrams:
        return 1
    return 0
LFs.append(LF_to_right)

# def LF_tilde_right(c):
#     return 1 if '~' in get_right_ngrams(c.temp, window=1) else 0
# LFs.append(LF_tilde_right)

# def LF_through_right(c):
#     return 1 if len(set(['through','thru']).intersection(
#         set(get_right_ngrams(c.temp, window=1)))) > 0 else 0
# LFs.append(LF_through_right)

def LF_150_right(c):
    """Return 1 if '150' is among get_right_ngrams(c.temp, window=4, n_max=1), else 0."""
    right_ngrams = get_right_ngrams(c.temp, window=4, n_max=1)
    if '150' in right_ngrams:
        return 1
    return 0
LFs.append(LF_150_right)

def LF_storage_row(c):
    """Return 1 if 'storage' is among get_row_ngrams(c.temp, n_max=2), else 0."""
    row_ngrams = get_row_ngrams(c.temp, n_max=2)
    if 'storage' in row_ngrams:
        return 1
    return 0
LFs.append(LF_storage_row)

# def LF_tj_row(c):
#     return 1 if 'tj' in get_row_ngrams(c.temp) else 0
# LFs.append(LF_tj_row)

def LF_c_row(c):
    """Return 1 if '%C' is among the case-sensitive 1- and 2-gram row n-grams of c.temp, else 0."""
    # NOTE(review): the recorded lf_accuracy output in this notebook lists this
    # LF with coverage 0.0, so the literal '%C' may never occur in the row
    # n-grams — confirm the token that is actually intended.
    row_ngrams = get_row_ngrams(c.temp, n_min=1, n_max=2, case_sensitive=True)
    if '%C' in row_ngrams:
        return 1
    return 0
LFs.append(LF_c_row)

# def LF_same_table(c):
#     return 1 if same_table(c) else 0
# LFs.append(LF_same_table)

# BOTH

def LF_tstg_row(c):
    """Return 1 if 'tstg' or 'ts' is among get_row_ngrams(c.temp, n_max=1), else -1.

    The helper's result is materialised into a set before testing membership:
    it is checked twice, and if get_row_ngrams hands back a one-shot iterator
    the first `in` test would exhaust it and the 'ts' test could never match.
    """
    row_ngrams = set(get_row_ngrams(c.temp, n_max=1))
    if 'tstg' in row_ngrams or 'ts' in row_ngrams:
        return 1
    return -1
LFs.append(LF_tstg_row)

def LF_temperature_row(c):
    """Return 1 if 'temperature' is among get_row_ngrams(c.temp), else -1."""
    row_ngrams = get_row_ngrams(c.temp)
    if 'temperature' in row_ngrams:
        return 1
    return -1
LFs.append(LF_temperature_row)

# NEGATIVE

def LF_BC_with_suffix(c):
    """Return -1 for a part name that starts with 'BC' but ends with none of the listed suffixes.

    Returns 0 for every other part name: one that does not start with 'BC',
    or one that starts with 'BC' and ends with a listed suffix.
    """
    part_name = c.part.get_span()
    if not part_name.startswith('BC'):
        return 0
    listed_suffixes = ('A', 'B', 'C', '-16', '-25', '-40')
    if part_name.endswith(listed_suffixes):
        return 0
    return -1
LFs.append(LF_BC_with_suffix)

def LF_temp_outside_table(c):
    """Return -1 when the temperature span's parent has no table (table is None), else 0."""
    enclosing_table = c.temp.parent.table
    if enclosing_table is None:
        return -1
    return 0
LFs.append(LF_temp_outside_table)

def LF_complement_left(c):
    """Return -1 if 'complement' is among get_left_ngrams(c.part), else 0."""
    left_ngrams = get_left_ngrams(c.part)
    if 'complement' in left_ngrams:
        return -1
    return 0
LFs.append(LF_complement_left)

def LF_voltage_row(c):
    """Return -1 if 'voltage' is among get_row_ngrams(c.temp), else 0."""
    row_ngrams = get_row_ngrams(c.temp)
    if 'voltage' in row_ngrams:
        return -1
    return 0
LFs.append(LF_voltage_row)

Loaded 1286 part numbers.


### Testing LFs and LF_helpers

In [4]:
from snorkel.lf_helpers import get_right_ngrams, get_left_ngrams
from snorkel.lf_helpers import contains_token, contains_regex
from snorkel.lf_helpers import get_phrase_ngrams, get_cell_ngrams, get_neighbor_cell_ngrams
from snorkel.lf_helpers import get_row_ngrams, get_col_ngrams, get_aligned_ngrams
from snorkel.lf_helpers import same_document, same_table, same_cell, same_phrase
# Spot-check a single training candidate against a few helpers and two of the
# labeling functions. Each print takes one argument, so the parenthesised form
# prints the same text on Python 2 and is also valid Python 3.
c = train[12]
print(c.part)
print(LF_BC_with_suffix(c))
print(c.temp)
print(c.temp.parent)
print(c.temp.parent.table)
print(same_document(c))
print(same_table(c))
print(contains_regex(c, rgx=r'849'))
print(LF_temp_outside_table(c))

Span("BC817K", parent=87254, chars=[0,5], words=[0,0])
-1
ImplicitSpan("-55", parent=87287, words=[0,0], position=[0])
Phrase(Document DISES00645-1, 2173, 27817, 0, u'-55 - +150 - C')
Table(Document DISES00645-1, 0)
True
True
False
0


In [5]:
# Create the 'LF Labels' label matrix for the training candidates from the
# functions registered in LFs (timed).
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
# Display the resulting matrix.
L_train


Loading sparse Label matrix...
CPU times: user 3min 9s, sys: 16.2 s, total: 3min 25s
Wall time: 3min 23s


<11549x11 sparse matrix of type '<type 'numpy.float64'>'
	with 48767 stored elements in Compressed Sparse Row format>

In [6]:
# Fetch the gold-labelled training candidate set; .one() expects exactly one match.
gold_set_name = 'Hardware Training Candidates -- Gold'
train_gold = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == gold_set_name)
    .one()
)

In [7]:
# print type(L_train)
# print L_train.lf_stats()
# Report per-labeling-function statistics against the gold candidate set (timed).
%time L_train.lf_accuracy(train_gold)


CPU times: user 2.73 s, sys: 546 ms, total: 3.28 s
Wall time: 3.21 s


Unnamed: 0,accuracy,conflicts,coverage,j,overlaps
LF_foo,0.756429,0.299247,1.0,0,1.0
LF_to_right,0.668076,0.122868,0.122868,1,0.122868
LF_150_right,0.380531,0.078275,0.078275,2,0.078275
LF_storage_row,0.50639,0.250671,0.250671,3,0.250671
LF_c_row,,0.0,0.0,4,0.0
LF_tstg_row,0.775305,0.299247,1.0,5,1.0
LF_temperature_row,0.801108,0.299247,1.0,6,1.0
LF_BC_with_suffix,0.898068,0.157156,0.586977,7,0.586977
LF_temp_outside_table,0.755769,0.000173,0.045026,8,0.045026
LF_complement_left,1.0,0.000606,0.001212,9,0.001212


In [None]:
# If necessary: keep a copy of the database as 'snorkel.db features'.
import os
import shutil

# shutil.copy raises if the copy fails; os.system('cp ...') only returned a
# non-zero status that this cell discarded.
shutil.copy('snorkel.db', 'snorkel.db features');

Next, in Part V, we will test our model on the development `CandidateSet`.

# TEMPORARY

In [None]:
from snorkel.models import Corpus
from snorkel.utils import get_ORM_instance
corpus = get_ORM_instance(Corpus, session, 'Hardware')

# Start from None so that a missing document shows up as None instead of a
# NameError on a fresh kernel, or a stale `doc` left over from an earlier run.
doc = None
for d in corpus:
    if d.name == 'PHGLS18216-1':
        doc = d
print(doc)

In [None]:
# Debugging aid: scan the n-grams produced by tokens_to_ngrams (n_min=2, n_max=2)
# for every phrase in `doc` and stop in the debugger whenever one equals '- C'.
# NOTE(review): pdb.set_trace() blocks a non-interactive "Run All"; remove this
# cell once the investigation is finished.
for phrase in doc.phrases:
    for ngram in tokens_to_ngrams(phrase.words, n_max=2, n_min=2):
        if ngram == '- C':
            import pdb; pdb.set_trace()