# Tables in Snorkel: Extracting Attributes from Spec Sheets

## Part IV:  Features and Labeling Functions

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to library code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

from snorkel import SnorkelSession
# Session handle that is passed to the query, feature and label calls in the cells below.
session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from snorkel.models import candidate_subclass

# Candidate class with two named arguments; the labeling functions later in this
# notebook read them as `c.part` and `c.temp`.
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

## Generating Features

### Loading the `CandidateSets`

In [3]:
from snorkel.models import CandidateSet
# Look up the training and development candidate sets by their stored names.
# NOTE(review): `.one()` presumably follows SQLAlchemy semantics (raises unless
# exactly one row matches) -- confirm; these sets must already exist.
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()
dev = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates').one()

### Automatically creating features

In [4]:
from snorkel.annotations import FeatureManager

# Manager used in the following cells to create, update and load feature matrices.
feature_manager = FeatureManager()

We first generate features for the Training `CandidateSet`.

In [5]:
# Generate features for the training candidates under the name 'Training Features'.
# Slow: the recorded run took about 22 minutes of wall time. The same name is
# used by the later `update` and `load` calls.
%time F_train = feature_manager.create(session, train, 'Training Features')


Loading sparse Feature matrix...
CPU times: user 21min 44s, sys: 11.7 s, total: 21min 56s
Wall time: 22min 10s


We then calculate features for the Development `CandidateSet`, ignoring any features that were not seen in the Training set, since their effect on the accuracy of our classifier is unknown.

In [6]:
# Featurize the development candidates against the existing 'Training Features' keys.
# expand_key_set=False: features not already seen in training are ignored, as the
# surrounding prose explains.
%time F_dev = feature_manager.update(session, dev, 'Training Features', expand_key_set=False)


Loading sparse Feature matrix...
CPU times: user 11.3 s, sys: 351 ms, total: 11.7 s
Wall time: 11.8 s


### Reloading feature matrix

In [7]:
# Reload the stored training feature matrix instead of recomputing it
# (about 12 s here versus about 22 min for `create` in the recorded run).
%time F_train = feature_manager.load(session, train, 'Training Features')

CPU times: user 12.2 s, sys: 422 ms, total: 12.7 s
Wall time: 12.6 s


### Exploring the generated `Feature` set

In [8]:
# Show the matrix summary (shape, dtype, number of stored elements).
F_train

<6571x3820 sparse matrix of type '<type 'numpy.float64'>'
	with 1028662 stored elements in Compressed Sparse Row format>

In [9]:
# Candidate object associated with index 0 of the feature matrix.
F_train.get_candidate(0)

Part_Temp(ImplicitSpan("2N3906", parent=653, words=[0,0], position=[0]), ImplicitSpan("-50", parent=11554, words=[0,0], position=[0]))

In [10]:
# Feature key (an AnnotationKey) associated with index 0 of the feature matrix.
F_train.get_key(0)

AnnotationKey (TDL_LEMMA:RIGHT-OF-MENTION[color])

In [11]:
from snorkel.utils import get_keys_by_candidate

# Print every feature key that get_keys_by_candidate reports for the first
# training candidate. `print(f)` with a single argument behaves the same on
# Python 2 as the old `print f` statement and also parses on Python 3.
for f in get_keys_by_candidate(F_train, F_train.get_candidate(0)):
    print(f)

AnnotationKey (TDL_LEMMA:RIGHT-OF-MENTION[color])
AnnotationKey (TAB_e1_WORD_SEQ_[s25])
AnnotationKey (TAB_e1_LEMMA_SEQ_[s25])
AnnotationKey (TAB_e1_POS_SEQ_[NN])
AnnotationKey (TAB_e1_DEP_SEQ_[compound])
AnnotationKey (TAB_e1_W_LEFT_1_[''])
AnnotationKey (TAB_e1_W_LEFT_POS_1_[''])
AnnotationKey (TAB_e1_W_LEFT_2_[. ''])
AnnotationKey (TAB_e1_W_LEFT_POS_2_[. ''])
AnnotationKey (TAB_e1_W_LEFT_3_['' . ''])
AnnotationKey (TAB_e1_W_LEFT_POS_3_['' . ''])
AnnotationKey (TAB_e1_W_RIGHT_1_[{])
AnnotationKey (TAB_e1_W_RIGHT_POS_1_[-LRB-])
AnnotationKey (TAB_e1_W_RIGHT_2_[{ color])
AnnotationKey (TAB_e1_W_RIGHT_POS_2_[-LRB- NN])
AnnotationKey (TAB_e1_W_RIGHT_3_[{ color :])
AnnotationKey (TAB_e1_W_RIGHT_POS_3_[-LRB- NN :])
AnnotationKey (TAB_e1_W_LEMMA_L_1_R_1_['']_[{])
AnnotationKey (TAB_e1_W_POS_L_1_R_1_['']_[-LRB-])
AnnotationKey (TAB_e1_W_LEMMA_L_1_R_2_['']_[{ color])
AnnotationKey (TAB_e1_W_POS_L_1_R_2_['']_[-LRB- NN])
AnnotationKey (TAB_e1_W_LEMMA_L_1_R_3_['']_[{ color :])
AnnotationKey (TAB

## Creating Labeling Functions

In [12]:
from snorkel.annotations import LabelManager

# Manager used below to apply the labeling functions and build the label matrix.
label_manager = LabelManager()

### Defining LFs

In [13]:
from snorkel.lf_helpers import *

# Registry of labeling functions: each LF is appended right after its definition,
# and the whole list is later passed to `label_manager.create(..., f=LFs)`.
LFs = []

# POSITIVE

def LF_to_right(c):
    """Positive LF keyed on the word 'to' to the right of the temperature span.

    Asks get_right_ngrams for the unigrams (n_max=1) within a window of 1
    after `c.temp`.

    Returns 1 if 'to' is among them, 0 otherwise.
    """
    right_unigrams = get_right_ngrams(c.temp, window=1, n_max=1)
    if 'to' in right_unigrams:
        return 1
    return 0
LFs.append(LF_to_right)

def LF_150_right(c):
    """Positive LF keyed on the value '150' appearing shortly after the temperature span.

    Asks get_right_ngrams for the unigrams (n_max=1) within a window of 4
    after `c.temp`.

    Returns 1 if '150' is among them, 0 otherwise.
    """
    has_150 = '150' in get_right_ngrams(c.temp, window=4, n_max=1)
    return int(has_150)
LFs.append(LF_150_right)

def LF_storage_row(c):
    """Positive LF keyed on the word 'storage' among the row n-grams of the temperature span.

    Uses the n-grams (up to n_max=2) that get_row_ngrams yields for `c.temp`
    (presumably the text of the table row containing the span).

    Returns 1 if 'storage' is among them, 0 otherwise.
    """
    row_ngrams = get_row_ngrams(c.temp, n_max=2)
    if 'storage' not in row_ngrams:
        return 0
    return 1
LFs.append(LF_storage_row)

def LF_c_row(c):
    """Positive LF keyed on the literal token '%C' among the row n-grams of the temperature span.

    Uses the case-sensitive n-grams of length 1 to 2 that get_row_ngrams
    yields for `c.temp`.

    Returns 1 if '%C' is among them, 0 otherwise.
    """
    # NOTE(review): the recorded lf_stats output shows zero coverage for this LF,
    # so '%C' may not be the token that actually occurs in the data (possibly a
    # mangled degree sign) -- confirm before relying on it.
    row_ngrams = get_row_ngrams(c.temp, n_min=1, n_max=2, case_sensitive=True)
    if '%C' in row_ngrams:
        label = 1
    else:
        label = 0
    return label
LFs.append(LF_c_row)

def LF_same_table(c):
    """Positive LF that fires when same_table(c) is truthy.

    Returns 1 if same_table reports a truthy value for the candidate, 0 otherwise.
    """
    if same_table(c):
        return 1
    return 0
LFs.append(LF_same_table)

# BOTH

def LF_tstg_row(c):
    """Two-sided LF keyed on storage-temperature symbols among the row n-grams.

    Looks at the unigrams (n_max=1) that get_row_ngrams yields for `c.temp`.

    Returns 1 if 'tstg' or 'ts' is among them, and -1 otherwise; this LF never
    returns 0.
    """
    # Scan the n-grams in a single pass. Writing `'tstg' in it or 'ts' in it`
    # against a one-shot iterator (e.g. a generator) exhausts it on the first
    # membership test, so the 'ts' test could then never succeed. `any` works
    # the same for lists and for generators.
    targets = ('tstg', 'ts')
    if any(ngram in targets for ngram in get_row_ngrams(c.temp, n_max=1)):
        return 1
    return -1
LFs.append(LF_tstg_row)

def LF_temperature_row(c):
    """Two-sided LF keyed on the word 'temperature' among the row n-grams.

    Uses whatever get_row_ngrams yields for `c.temp` with its default settings.

    Returns 1 if 'temperature' is among them, and -1 otherwise; this LF never
    returns 0.
    """
    mentions_temperature = 'temperature' in get_row_ngrams(c.temp)
    if mentions_temperature:
        return 1
    return -1
LFs.append(LF_temperature_row)

# NEGATIVE

def LF_temp_outside_table(c):
    """Negative LF that fires when the temperature span's parent has no table.

    Returns -1 if `c.temp.parent.table` is None, 0 otherwise.
    """
    enclosing_table = c.temp.parent.table
    if enclosing_table is None:
        return -1
    return 0
LFs.append(LF_temp_outside_table)

def LF_complement_left(c):
    """Negative LF keyed on the word 'complement' to the left of the part span.

    Uses whatever get_left_ngrams yields for `c.part` with its default settings.

    Returns -1 if 'complement' is among them, 0 otherwise.
    """
    left_ngrams = get_left_ngrams(c.part)
    if 'complement' in left_ngrams:
        return -1
    return 0
LFs.append(LF_complement_left)

def LF_voltage_row(c):
    """Negative LF keyed on the word 'voltage' among the row n-grams of the temperature span.

    Uses whatever get_row_ngrams yields for `c.temp` with its default settings.

    Returns -1 if 'voltage' is among them, 0 otherwise.
    """
    row_ngrams = get_row_ngrams(c.temp)
    if 'voltage' not in row_ngrams:
        return 0
    return -1
LFs.append(LF_voltage_row)

### Applying LFs

In [14]:
# Apply every labeling function in `LFs` to the training candidates and store the
# result under the name 'LF Labels' (about 1 min in the recorded run).
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
# Show the matrix summary; the recorded output has one column per labeling function (10).
L_train


Loading sparse Label matrix...
CPU times: user 1min 6s, sys: 1.04 s, total: 1min 7s
Wall time: 1min 7s


<6571x10 sparse matrix of type '<type 'numpy.float64'>'
	with 17484 stored elements in Compressed Sparse Row format>

### Evaluating LFs

Because we have gold labels for our Training `CandidateSet`, we are able to see the accuracy of our LFs on the training set as a development tool. Where gold labels are not available, the other three stat categories (coverage, overlaps, conflicts) are still available.

In [15]:
# Fetch the gold-labeled training candidate set by its stored name; it is used
# below to score the labeling functions.
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()

In [16]:
# Per-LF statistics table (accuracy, conflicts, coverage, overlaps), computed against the gold set.
# In the recorded output, accuracy is empty for LFs whose coverage is 0.
%time L_train.lf_stats(train_gold)


CPU times: user 3.04 s, sys: 459 ms, total: 3.5 s
Wall time: 3.79 s


Unnamed: 0,accuracy,conflicts,coverage,j,overlaps
LF_to_right,1.0,0.243494,0.243494,0,0.243494
LF_150_right,1.0,0.002435,0.002435,1,0.002435
LF_storage_row,1.0,0.291128,0.291128,2,0.291128
LF_c_row,,0.0,0.0,3,0.0
LF_same_table,0.997519,0.12266,0.12266,4,0.12266
LF_tstg_row,0.161924,0.392786,1.0,5,1.0
LF_temperature_row,0.453051,0.392786,1.0,6,1.0
LF_temp_outside_table,,0.0,0.0,7,0.0
LF_complement_left,,0.0,0.0,8,0.0
LF_voltage_row,1.0,0.000152,0.001065,9,0.001065


Next, in Part V, we will test our model on the development `CandidateSet`.