## Part IV:  Features and Labeling Functions

## Generating Features

In [1]:
# If necessary: restore the working database from the saved snapshot file
# named 'snorkel.db labels'.
import os
import shutil

# shutil.copy overwrites an existing destination, so no separate os.remove is
# needed (the old os.remove raised when snorkel.db did not exist yet).  It also
# raises when the snapshot is missing, instead of deleting snorkel.db first and
# then ignoring a failed `cp`, which left the notebook without any database.
shutil.copy('snorkel.db labels', 'snorkel.db');

In [2]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

from snorkel import SnorkelSession
# Session object passed to every query and manager call in this notebook.
session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from snorkel.models import candidate_subclass

# Candidate type with two arguments, later accessed as `c.part` and `c.temp`.
Part_Temp = candidate_subclass(
    'Part_Temp',
    ['part', 'temp'],
)

### Loading the `CandidateSet`

In [4]:
from snorkel.models import CandidateSet
# Look up the two candidate sets by name.  Each query ends in .one(), i.e. it
# is expected to match a single CandidateSet row.
TRAIN_SET_NAME = 'Hardware Training Candidates'
DEV_SET_NAME = 'Hardware Development Candidates'

train = session.query(CandidateSet).filter(CandidateSet.name == TRAIN_SET_NAME).one()
dev = session.query(CandidateSet).filter(CandidateSet.name == DEV_SET_NAME).one()

### Creating feature matrix

In [5]:
from snorkel.annotations import FeatureManager

# Manager used below to create, update and load the feature matrices.
feature_manager = FeatureManager()

In [6]:
# Build the feature matrix for the training candidates under the key-set name
# 'Train Features'.  Slow step: the recorded run took about 3m45s wall time.
%time F_train = feature_manager.create(session, train, 'Train Features')


Loading sparse Feature matrix...
CPU times: user 3min 24s, sys: 16.5 s, total: 3min 40s
Wall time: 3min 45s


In [7]:
# Featurize the development candidates against the existing 'Train Features'
# key set.  NOTE(review): expand_key_set=False presumably keeps the key set
# fixed to what was built from the training set -- confirm against
# FeatureManager.update.
%time F_dev = feature_manager.update(session, dev, 'Train Features', expand_key_set=False)


Loading sparse Feature matrix...
CPU times: user 1min 51s, sys: 5.5 s, total: 1min 57s
Wall time: 1min 57s


### Reloading feature matrix

In [8]:
# Reload the already-computed 'Train Features' matrix instead of recreating it
# (recorded run: ~1.7 s here versus minutes for feature_manager.create above).
%time F_train = feature_manager.load(session, train, 'Train Features')

CPU times: user 1.7 s, sys: 20.7 ms, total: 1.72 s
Wall time: 1.72 s


In [9]:
# Bare expression: the notebook displays the sparse matrix summary
# (shape and number of stored elements).
F_train

<319x11085 sparse matrix of type '<type 'numpy.float64'>'
	with 100311 stored elements in Compressed Sparse Row format>

In [10]:
# Show the candidate associated with index 0 (presumably the row index of the
# feature matrix -- confirm against the matrix class).
F_train.get_candidate(0)

Part_Temp(Span("BC337-16", parent=3898, chars=[0,7], words=[0,0]), Span("-55", parent=3927, chars=[22,24], words=[3,3]))

In [11]:
# Show the annotation key associated with index 0 (presumably the column index
# of the feature matrix -- confirm against the matrix class).
F_train.get_key(0)

AnnotationKey (TDL_LEMMA:PARENTS-OF-BETWEEN-MENTION-and-MENTION[None])

In [12]:
from snorkel.utils import get_keys_by_candidate

# Print every feature key recorded for candidate 1.  The [:] slice selects all
# keys; narrow it (e.g. [:20]) to shorten the output.
candidate = F_train.get_candidate(1)
for key in get_keys_by_candidate(F_train, candidate)[:]:
    print(key)

AnnotationKey (TDL_LEMMA:PARENTS-OF-BETWEEN-MENTION-and-MENTION[None])
AnnotationKey (TDL_LEMMA:BETWEEN-MENTION-and-MENTION[bc337-16])
AnnotationKey (TDL_DEP_LABEL:BETWEEN-MENTION-and-MENTION[ROOT])
AnnotationKey (TDL_DEP_LABEL|LEMMA:BETWEEN-MENTION-and-MENTION[ROOT|bc337-16])
AnnotationKey (TDL_LEMMA:SEQ-BETWEEN[/])
AnnotationKey (TDL_LEMMA:SEQ-BETWEEN[25/40])
AnnotationKey (TDL_LEMMA:SEQ-BETWEEN[/ 25/40])
AnnotationKey (TDL_LEMMA:RIGHT-OF-MENTION[25/40])
AnnotationKey (TDL_SEQ-BETWEEN[LEN:0-4])
AnnotationKey (TDL_BETWEEN-MENTION-and-MENTION[LEN:0-2])
AnnotationKey (TAB_e1_WORD_SEQ_[BC337-16])
AnnotationKey (TAB_e1_LEMMA_SEQ_[bc337-16])
AnnotationKey (TAB_e1_POS_SEQ_[JJ])
AnnotationKey (TAB_e1_DEP_SEQ_[ROOT])
AnnotationKey (TAB_e1_W_LEFT_1_[25/40])
AnnotationKey (TAB_e1_W_LEFT_POS_1_[CD])
AnnotationKey (TAB_e1_W_LEFT_2_[/ 25/40])
AnnotationKey (TAB_e1_W_LEFT_POS_2_[: CD])
AnnotationKey (TAB_e1_W_LEFT_3_[bc338-16 / 25/40])
AnnotationKey (TAB_e1_W_LEFT_POS_3_[NN : CD])
AnnotationKey (TA

## Applying LFs

In [13]:
from snorkel.annotations import LabelManager

# Manager used below to build the label matrix from the labeling functions.
label_manager = LabelManager()

### Defining Labeling Functions

In [14]:
from snorkel.lf_helpers import *

# Registry of labeling functions; each function below is appended to it and the
# list is later passed as f=LFs to label_manager.create.  Re-running this cell
# resets the list.
LFs = []

# def LF_training_gold(c):
#     corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Training').one()
#     training_gold_dict = get_gold_dict(filename, 'stg_temp_min', docs=corpus.documents)
#     return 1 if ((c[0].parent.document.name).upper(), 
#                  (c[0].get_span()).upper(), 
#                  (''.join(c[1].get_span().split())).upper()) in training_gold_dict else 0
# LFs.append(LF_training_gold)

# POSITIVE

def LF_to_right(c):
    """Return 1 (positive) when 'to' is among the n-grams that
    get_right_ngrams yields for the temperature span with window=1;
    otherwise return 0."""
    right_ngrams = get_right_ngrams(c.temp, window=1)
    if 'to' in right_ngrams:
        return 1
    return 0
LFs.append(LF_to_right)

def LF_tilde_right(c):
    """Return 1 (positive) when '~' is among the n-grams that
    get_right_ngrams yields for the temperature span with window=1;
    otherwise return 0."""
    right_ngrams = get_right_ngrams(c.temp, window=1)
    if '~' in right_ngrams:
        return 1
    return 0
LFs.append(LF_tilde_right)

def LF_through_right(c):
    """Return 1 (positive) when 'through' or 'thru' is among the n-grams that
    get_right_ngrams yields for the temperature span with window=1;
    otherwise return 0."""
    range_words = set(['through', 'thru'])
    right_ngrams = set(get_right_ngrams(c.temp, window=1))
    return 1 if range_words & right_ngrams else 0
LFs.append(LF_through_right)

def LF_150_right(c):
    """Return 1 (positive) when '150' is among the n-grams that
    get_right_ngrams yields for the temperature span with window=3;
    otherwise return 0."""
    right_ngrams = get_right_ngrams(c.temp, window=3)
    if '150' in right_ngrams:
        return 1
    return 0
LFs.append(LF_150_right)

def LF_storage_row(c):
    """Return 1 (positive) when 'storage' is among the n-grams that
    get_row_ngrams yields for the temperature span; otherwise return 0."""
    row_ngrams = get_row_ngrams(c.temp)
    if 'storage' in row_ngrams:
        return 1
    return 0
LFs.append(LF_storage_row)

def LF_tj_row(c):
    """Return 1 (positive) when 'tj' is among the n-grams that
    get_row_ngrams yields for the temperature span; otherwise return 0."""
    row_ngrams = get_row_ngrams(c.temp)
    if 'tj' in row_ngrams:
        return 1
    return 0
LFs.append(LF_tj_row)

def LF_c_row(c):
    """Return 1 (positive) when 'c' is among the n-grams that
    get_row_ngrams yields for the temperature span; otherwise return 0."""
    row_ngrams = get_row_ngrams(c.temp)
    if 'c' in row_ngrams:
        return 1
    return 0
LFs.append(LF_c_row)

def LF_same_table(c):
    """Return 1 (positive) when same_table(c) is truthy; otherwise return 0."""
    if same_table(c):
        return 1
    return 0
LFs.append(LF_same_table)

# BOTH

def LF_tstg_row(c):
    """Return 1 when 'tstg' is among the n-grams that get_row_ngrams yields
    for the temperature span, and -1 otherwise (this function never
    returns 0)."""
    row_ngrams = get_row_ngrams(c.temp)
    if 'tstg' in row_ngrams:
        return 1
    return -1
LFs.append(LF_tstg_row)

def LF_temperature_row(c):
    """Return 1 when 'temperature' is among the n-grams that get_row_ngrams
    yields for the temperature span, and -1 otherwise (this function never
    returns 0)."""
    row_ngrams = get_row_ngrams(c.temp)
    if 'temperature' in row_ngrams:
        return 1
    return -1
LFs.append(LF_temperature_row)

# NEGATIVE

def LF_temp_outside_table(c):
    """Return -1 (negative) when the temperature span's parent has no table
    (c.temp.parent.table is None); otherwise return 0."""
    table = c.temp.parent.table
    if table is None:
        return -1
    return 0
LFs.append(LF_temp_outside_table)

def LF_complement_left(c):
    """Return -1 (negative) when 'complement' is among the n-grams that
    get_left_ngrams yields for the part span; otherwise return 0."""
    left_ngrams = get_left_ngrams(c.part)
    if 'complement' in left_ngrams:
        return -1
    return 0
LFs.append(LF_complement_left)

def LF_voltage_row(c):
    """Return -1 (negative) when 'voltage' is among the n-grams that
    get_row_ngrams yields for the temperature span; otherwise return 0."""
    return -1 if 'voltage' in get_row_ngrams(c.temp) else 0
# Register the function like every other LF in this cell; without this line it
# is defined but never included in the list passed as f=LFs to
# label_manager.create, so it would never be applied.
LFs.append(LF_voltage_row)

### Testing LFs and LF_helpers

In [15]:
from snorkel.lf_helpers import get_right_ngrams, get_left_ngrams
from snorkel.lf_helpers import contains_token, contains_regex
from snorkel.lf_helpers import get_phrase_ngrams, get_cell_ngrams, get_neighbor_cell_ngrams
from snorkel.lf_helpers import get_row_ngrams, get_col_ngrams, get_aligned_ngrams
from snorkel.lf_helpers import same_document, same_table, same_cell, same_phrase
# Spot-check one training candidate: show its two spans and their context,
# then try a few helpers and one labeling function on it.
c = train[19]

# The candidate's spans, the temperature span's parent, and that parent's table.
print(c.part)
print(c.temp)
print(c.temp.parent)
print(c.temp.parent.table)

# Helper predicates evaluated on the whole candidate.
print(same_document(c))
print(same_table(c))
print(contains_regex(c, rgx=r'849'))

# One labeling function applied directly.
print(LF_temp_outside_table(c))

Span("BC548", parent=1417, chars=[14,18], words=[4,4])
Span("-65", parent=95441, chars=[0,2], words=[0,0])
Phrase(Document BC547, 0, u'-65 to +150')
Table(Document BC547, 1)
True
False
False
0


In [17]:
# Build the label matrix for the training candidates from the functions
# collected in LFs, using the name 'LF Labels'.
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
# Bare expression: the notebook displays the sparse matrix summary.
L_train


Loading sparse Label matrix...
CPU times: user 11.9 s, sys: 2.94 s, total: 14.9 s
Wall time: 14.9 s


<319x13 sparse matrix of type '<type 'numpy.float64'>'
	with 1316 stored elements in Compressed Sparse Row format>

In [18]:
# Per-labeling-function statistics (conflicts, coverage, overlaps).
# NOTE(review): the saved output lists LF_minus_regex, which is not defined in
# this notebook's labeling-function cell -- the output looks stale; re-run
# after changing LFs.
L_train.lf_stats()

Unnamed: 0,conflicts,coverage,j,overlaps
LF_to_right,0.38558,0.38558,0,0.38558
LF_tilde_right,0.081505,0.081505,1,0.081505
LF_through_right,0.0,0.0,2,0.0
LF_150_right,0.250784,0.250784,3,0.250784
LF_minus_regex,1.0,1.0,4,1.0
LF_storage_row,0.0,0.0,5,0.0
LF_tj_row,0.0,0.0,6,0.0
LF_c_row,0.031348,0.031348,7,0.031348
LF_same_table,0.122257,0.122257,8,0.122257
LF_tstg_row,1.0,1.0,9,1.0


In [19]:
# If necessary: save a snapshot of the current database as 'snorkel.db features'
# import os
# os.system('cp snorkel.db snorkel.db\ features');

Next, in Part V, we will test our model on the development `CandidateSet`.