## Part IV:  Features and Labeling Functions

## Generating Features

In [None]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ labels snorkel.db');

In [1]:
%load_ext autoreload
%autoreload 2

from snorkel import SnorkelSession
session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from snorkel.models import candidate_subclass

Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Loading the `CandidateSet`

In [3]:
from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()
dev = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates').one()

### Creating feature matrix

In [None]:
from snorkel.annotations import FeatureManager

feature_manager = FeatureManager()

In [None]:
%time F_train = feature_manager.create(session, train, 'Train Features')

In [None]:
%time F_dev = feature_manager.update(session, dev, 'Train Features', expand_key_set=False)

### Reloading feature matrix

In [None]:
# %time F_train = feature_manager.load(session, train, 'Train Features')

In [None]:
F_train

In [None]:
F_dev

In [None]:
F_train.get_candidate(0)

In [None]:
F_train.get_key(0)

In [7]:
from snorkel.utils import get_keys_by_candidate

for f in get_keys_by_candidate(F_train, F_train.get_candidate(1))[:10]: print f

NameError: name 'F_train' is not defined

### Saving feature matrices

In [None]:
# If necessary:
import os
os.system('cp snorkel.db snorkel.db\ featurized');

## Applying LFs

In [None]:
# If necessary:
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ featurized snorkel.db');

# from snorkel import SnorkelSession
# session = SnorkelSession()

# from snorkel.models import candidate_subclass
# Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

# from snorkel.models import CandidateSet
# train = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Training Candidates').one()

In [None]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

### Defining Labeling Functions

In [None]:
from snorkel.lf_helpers import *
from itertools import chain

LFs = []

# POSITIVE

def LF_to_right(c):
    return 1 if 'to' in get_right_ngrams(c.temp, window=1, n_max=1) else 0
LFs.append(LF_to_right)

def LF_150_right(c):
    return 1 if '150' in get_right_ngrams(c.temp, window=4, n_max=1) else 0
LFs.append(LF_150_right)

def LF_storage_row(c):
    return 1 if 'storage' in get_row_ngrams(c.temp, n_max=2) else 0
LFs.append(LF_storage_row)

def LF_operating_row(c):
    return 1 if 'operating' in get_row_ngrams(c.temp, n_max=2) else 0
LFs.append(LF_operating_row)

def LF_C_phrase(c):
    ngrams =  get_phrase_ngrams(c.temp, n_min=2, n_max=2, lower=False)
    return 1 if ('% C' in ngrams or '%C' in ngrams) else 0
LFs.append(LF_C_phrase)

def LF_temperature_row(c):
    return 1 if 'temperature' in get_row_ngrams(c.temp) else 0
LFs.append(LF_temperature_row)

def LF_tstg_row(c):
    row_ngrams = get_row_ngrams(c.temp, n_max=1)
    return 1 if ('tstg' in row_ngrams or 
                 'ts' in row_ngrams or 
                 't stg' in row_ngrams or
                 'stg' in row_ngrams) else 0
LFs.append(LF_tstg_row)

# NEGATIVE

def LF_temp_outside_table(c):
    return -1 if c.temp.parent.row is None else 0
LFs.append(LF_temp_outside_table)

def LF_collector_row(c):
    ngrams = get_row_ngrams(c.temp, n_max=1)
    return -1 if ('collector' in ngrams or
                  'v ceo' in ngrams or
                  'collector-emitter' in ngrams) else 0
LFs.append(LF_collector_row)

def LF_typ_cell(c):
    cell_ngrams = get_cell_ngrams(c.temp, n_max=1)
    row_ngrams = get_row_ngrams(c.temp, n_max=1)
    return -1 if ('typ' in cell_ngrams or
                  'typ.' in cell_ngrams or
                  'typ' in row_ngrams or
                  'typ.' in row_ngrams) else 0
LFs.append(LF_typ_cell)

def LF_test_phrase(c):
    return -1 if 'test' in get_row_ngrams(c.temp, n_max=1) else 0
LFs.append(LF_test_phrase)

def LF_condition_phrase(c):
    return -1 if 'condition' in get_row_ngrams(c.temp, n_max=1) else 0
LFs.append(LF_condition_phrase)

def LF_complement_left_row(c):
    return -1 if 'complement' in get_row_ngrams(c.part, n_max=1) else 0
LFs.append(LF_complement_left_row)

def LF_voltage_row_temp(c):
    return -1 if ('voltage' in get_row_ngrams(c.temp, n_max=1, infer=True)) else 0
LFs.append(LF_voltage_row_temp)

def LF_voltage_row_part(c):
    return -1 if ('voltage' in get_row_ngrams(c.part, n_max=1)) else 0
LFs.append(LF_voltage_row_part)

def LF_v_row(c):
    return -1 if 'v' in get_row_ngrams(c.temp, infer=True) else 0
LFs.append(LF_v_row)

def LF_vceo_cell(c):
    return -1 if 'vceo' in get_cell_ngrams(c.temp) else 0
LFs.append(LF_vceo_cell)

def LF_too_many_numbers_row(c):
    num_numbers = list(get_row_ngrams(c.temp, attrib="ner_tags")).count('number')
    return -1 if num_numbers >= 3 else 0
LFs.append(LF_too_many_numbers_row)

### Testing LFs and LF_helpers

In [None]:
from snorkel.lf_helpers import get_right_ngrams, get_left_ngrams
from snorkel.lf_helpers import contains_token, contains_regex
from snorkel.lf_helpers import get_phrase_ngrams, get_cell_ngrams, get_neighbor_cell_ngrams
from snorkel.lf_helpers import get_row_ngrams, get_col_ngrams, get_aligned_ngrams
from snorkel.lf_helpers import same_document, same_table, same_cell, same_phrase
c = train[12]
print c.part
print c.temp
print c.temp.parent
print c.temp.parent.table
print same_document(c)
print same_table(c)
print contains_regex(c, rgx=r'849')


In [None]:
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
L_train

In [4]:
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()

In [6]:
# L_train.lf_stats()
%time L_train.lf_stats(train_gold)


CPU times: user 1 s, sys: 111 ms, total: 1.12 s
Wall time: 1.13 s


Unnamed: 0,accuracy,conflicts,coverage,j,overlaps
LF_to_right,1.0,0.039015,0.149806,0,0.149806
LF_150_right,1.0,0.029317,0.094674,1,0.094674
LF_storage_row,1.0,0.058529,0.256697,2,0.256697
LF_operating_row,1.0,0.037926,0.108015,3,0.108015
LF_C_phrase,0.335379,0.245851,0.265317,4,0.265317
LF_temperature_row,1.0,0.070113,0.302833,5,0.274453
LF_tstg_row,1.0,0.008902,0.035326,6,0.035326
LF_temp_outside_table,0.671078,0.277545,0.384155,7,0.353011
LF_collector_row,0.500729,0.0,0.05621,8,0.05621
LF_typ_cell,1.0,0.0,0.020907,9,0.020907


In [None]:
# If necessary
import os
os.system('cp snorkel.db snorkel.db\ features');

Next, in Part 5, we will test our model on the development `CandidateSet`.

# TEMPORARY

In [None]:
# print c.temp
# print c.temp.parent

In [None]:
# print list(get_row_ngrams(c.temp, attrib="ner_tags"))
# print list(get_row_ngrams(c.temp, attrib="words"))

In [None]:
# from pprint import pprint
# print(list(get_row_ngrams(c.temp, attrib="ner_tags")).count('o'))

In [None]:
# print LF_v_row(c)