## Part IV:  Features and Labeling Functions

## Generating Features

In [1]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ labels snorkel.db');

In [2]:
%load_ext autoreload
%autoreload 2

from snorkel import SnorkelSession
session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from snorkel.models import candidate_subclass

Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Loading the `CandidateSet`

In [4]:
from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()

### Creating feature matrix

In [5]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

In [6]:
%time F_train = feature_manager.create(session, train, 'Train Features')


Loading sparse Feature matrix...
CPU times: user 1min 22s, sys: 10.6 s, total: 1min 32s
Wall time: 1min 33s


### Reloading feature matrix

In [7]:
%time F_train = feature_manager.load(session, train, 'Train Features')

CPU times: user 446 ms, sys: 6.99 ms, total: 453 ms
Wall time: 454 ms


In [8]:
F_train

<147x5173 sparse matrix of type '<type 'numpy.float64'>'
	with 32326 stored elements in Compressed Sparse Row format>

In [9]:
F_train.get_candidate(0)

Part_Temp(Span("MMBT3904", parent=32533, chars=[100,107], words=[3,3]), Span("-55", parent=103465, chars=[0,2], words=[0,0]))

In [10]:
F_train.get_key(0)

AnnotationKey (TDL_INV_LEMMA:PARENTS-OF-BETWEEN-MENTION-and-MENTION[None])

In [11]:
from snorkel.utils import get_keys_by_candidate

for f in get_keys_by_candidate(F_train, F_train.get_candidate(1))[-10:]: print f

AnnotationKey (TAB_e1_ROW_POS_TAGS_: nnp)
AnnotationKey (TAB_e1_ROW_POS_TAGS_: nnp nnp)
AnnotationKey (TAB_e1_ROW_POS_TAGS_nnp)
AnnotationKey (TAB_e1_ROW_POS_TAGS_nnp nnp)
AnnotationKey (TAB_e1_ROW_POS_TAGS_nnp nnp nn)
AnnotationKey (TAB_e1_ROW_POS_TAGS_nnp nn)
AnnotationKey (TAB_e1_ROW_POS_TAGS_nnp nn :)
AnnotationKey (TAB_e1_COL_POS_TAGS_ls)
AnnotationKey (TAB_e1_COL_POS_TAGS_ls dt)
AnnotationKey (TAB_e1_COL_POS_TAGS_dt)


## Applying LFs

In [12]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

### Testing LF helpers

In [13]:
from snorkel.lf_helpers import get_right_ngrams, get_left_ngrams, contains_token
from snorkel.lf_helpers import get_phrase_ngrams, get_cell_ngrams, get_neighbor_cell_ngrams
from snorkel.lf_helpers import get_row_ngrams, get_col_ngrams, get_aligned_ngrams
cand = train[0]
print cand
print cand[1].parent.cell.text
print get_left_ngrams(cand[1])
print get_right_ngrams(cand[1])
print get_phrase_ngrams(cand[1])
print get_cell_ngrams(cand[1])
print get_neighbor_cell_ngrams(cand[1])
print get_row_ngrams(cand[1])
print get_col_ngrams(cand[1])
print get_aligned_ngrams(cand[1])

Part_Temp(Span("MMBT3904", parent=32533, chars=[100,107], words=[3,3]), Span("-55", parent=103465, chars=[0,2], words=[0,0]))
-55...+150?C
[]
[u'...', u'+150', u'?']
[u'...', u'+150', u'?']
[u'-55', u'...', u'+150', u'?', u'c']
[u'200', u'ma', u't', u'j', u'-55', u'?', u'+150', u'?', u'c']
[u'junction', u'temperature', u'-', u'sperrschichttemperatur', u't', u'j']
[u'v', u'ceo', u'v', u'cbo', u'v', u'ebo', u'350', u'mw', u'1', ')', u'200', u'ma', u'-55', u'?', u'+150', u'?', u'c']
[u'junction', u'temperature', u'-', u'sperrschichttemperatur', u't', u'j', u'v', u'ceo', u'v', u'cbo', u'v', u'ebo', u'350', u'mw', u'1', ')', u'200', u'ma', u'-55', u'?', u'+150', u'?', u'c']


In [14]:
from snorkel.lf_helpers import *

LFs = []

# def LF_training_gold(c):
#     corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Training').one()
#     training_gold_dict = get_gold_dict(filename, 'stg_temp_min', docs=corpus.documents)
#     return 1 if ((c[0].parent.document.name).upper(), 
#                  (c[0].get_span()).upper(), 
#                  (''.join(c[1].get_span().split())).upper()) in training_gold_dict else 0
# LFs.append(LF_training_gold)

def LF_to_range(c):
    return 1 if 'to' in get_right_ngrams(c) else 0
LFs.append(LF_to_range)

def LF_tilde_range(c):
    return 1 if '~' in get_right_ngrams(c) else 0
LFs.append(LF_tilde_range)

def LF_through_range(c):
    return 1 if set(['through','thru']).intersection(set(get_right_ngrams(c))) > 0 else 0
LFs.append(LF_through_range)

def LF_contains_minus(c):
    return 1 if contains_token(c, '-') or contains_token(c,'-50') else -1
LFs.append(LF_contains_minus)

def LF_storage(c):
    return 1 if 'storage' in get_row_ngrams(c, attrib='words') else -1
LFs.append(LF_storage)

def LF_tstg(c):
    return 1 if 'tstg' in get_row_ngrams(c, attrib='words') else -1
LFs.append(LF_tstg)

def LF_tj(c):
    return 1 if 'tj' in get_row_ngrams(c, attrib='words') else -1
LFs.append(LF_tj)

def LF_temperature(c):
    return 1 if 'temperature' in get_row_ngrams(c, attrib='words') else -1
LFs.append(LF_temperature)

def LF_celsius(c):
    return 1 if 'c' in get_row_ngrams(c, attrib='words') else -1
LFs.append(LF_celsius)

def LF_max(c):
    return 1 if 'max' in get_aligned_ngrams(c, attrib='words') else 0
LFs.append(LF_max)

def LF_min(c):
    return 1 if 'min' in get_aligned_ngrams(c, attrib='words') else 0
LFs.append(LF_min)

In [15]:
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
L_train


Loading sparse Label matrix...
CPU times: user 3.13 s, sys: 138 ms, total: 3.27 s
Wall time: 3.22 s


<147x11 sparse matrix of type '<type 'numpy.float64'>'
	with 1104 stored elements in Compressed Sparse Row format>

In [16]:
L_train.lf_stats()

Unnamed: 0,conflicts,coverage,j,overlaps
LF_to_range,0.428571,0.428571,0,0.428571
LF_tilde_range,0.081633,0.081633,1,0.081633
LF_through_range,1.0,1.0,2,1.0
LF_contains_minus,1.0,1.0,3,1.0
LF_storage,1.0,1.0,4,1.0
LF_tstg,1.0,1.0,5,1.0
LF_tj,1.0,1.0,6,1.0
LF_temperature,1.0,1.0,7,1.0
LF_celsius,1.0,1.0,8,1.0
LF_max,0.0,0.0,9,0.0


In [17]:
# If necessary
import os
os.system('cp snorkel.db snorkel.db\ features');

Next, in Part 5, we will test our model on the development `CandidateSet`.