## Part IV: Features and Labeling Functions

## Generating Features

In [None]:
# If necessary: restore the working database from the 'snorkel.db labels' snapshot.
import os
import shutil

# shutil.copy overwrites 'snorkel.db' in place and raises if the snapshot is
# missing. The previous os.remove + os.system('cp ...') pair deleted the
# working database first and ignored the copy's exit status, so a failed copy
# silently left no database behind (and the command relied on the invalid
# '\ ' string escape to quote the space in the file name).
shutil.copy('snorkel.db labels', 'snorkel.db');

In [None]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
import sys
# Make the tutorial's helper modules importable; raises KeyError if the
# SNORKELHOME environment variable is not set.
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
# Session object used for every database query in this section.
session = SnorkelSession()

In [None]:
from snorkel.models import candidate_subclass

# Declare the 'Part_Temp' candidate class with the two fields 'part' and 'temp'
# (presumably the attributes the labeling functions below read as c.part / c.temp).
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Loading the `CandidateSet`

In [None]:
from snorkel.models import CandidateSet
# Look up the training and development candidate sets by their stored names.
train_set_name = 'Hardware Training Candidates'
dev_set_name = 'Hardware Development Candidates'

train = session.query(CandidateSet).filter(CandidateSet.name == train_set_name).one()
dev = session.query(CandidateSet).filter(CandidateSet.name == dev_set_name).one()

### Creating feature matrix

In [None]:
from snorkel.fast_annotations import FeatureManager

# Manager object whose create/update calls below build the feature matrices.
feature_manager = FeatureManager()

In [None]:
# Build the feature matrix for the training candidates ('Train Features' is
# presumably the name of the feature key set); %time reports how long it takes.
%time F_train = feature_manager.create(session, train, 'Train Features')

In [None]:
# Featurize the development candidates against the same 'Train Features' name.
# NOTE(review): expand_key_set=False presumably keeps the key set fixed to the
# training features -- confirm against FeatureManager.update.
%time F_dev = feature_manager.update(session, dev, 'Train Features', expand_key_set=False)

### Investigating feature matrix

In [None]:
# Display the training feature matrix.
F_train

In [None]:
# Display the development feature matrix.
F_dev

In [None]:
# Show what get_candidate returns for index 0 (presumably the candidate behind
# the first row of the matrix -- confirm).
F_train.get_candidate(0)

In [None]:
# Show what get_key returns for index 0 (presumably the feature key behind the
# first column of the matrix -- confirm).
F_train.get_key(0)

In [None]:
from snorkel.utils import get_keys_by_candidate

# Print the first ten keys that get_keys_by_candidate returns for the
# candidate at index 1 of the training feature matrix.
second_candidate = F_train.get_candidate(1)
candidate_keys = get_keys_by_candidate(F_train, second_candidate)
for f in candidate_keys[:10]:
    print(f)

### Saving feature matrices

In [None]:
# If necessary: snapshot the working database as 'snorkel.db featurized'.
import os
import shutil

# shutil.copy raises on failure, whereas os.system('cp ...') only returned an
# exit status that this cell discarded; it also avoids the invalid '\ ' string
# escape that was used to quote the space in the file name.
shutil.copy('snorkel.db', 'snorkel.db featurized');

## Applying LFs

In [1]:
# If necessary: restore the working database from the 'snorkel.db featurized' snapshot.
import os
import shutil

# shutil.copy overwrites 'snorkel.db' in place and raises if the snapshot is
# missing. The previous os.remove + os.system('cp ...') pair deleted the
# working database first and ignored the copy's exit status, so a failed copy
# silently left no database behind.
shutil.copy('snorkel.db featurized', 'snorkel.db');

# Re-create the session so this section can be run on its own.
from snorkel import SnorkelSession
session = SnorkelSession()

import sys
# Make the tutorial's helper modules importable; raises KeyError if the
# SNORKELHOME environment variable is not set.
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel.models import candidate_subclass
# Declare the 'Part_Temp' candidate class with the fields 'part' and 'temp'.
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

from snorkel.models import CandidateSet
# Look up the training candidate set by its stored name.
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()

In [None]:
from snorkel.annotations import LabelManager

# Manager object used further down to apply the labeling functions to `train`.
label_manager = LabelManager()

### Defining Labeling Functions

In [None]:
from snorkel.lf_helpers import *

LFs = []

# POSITIVE

def LF_to_right(c):
    """Return 1 when 'to' is among the ngrams that `get_right_ngrams` yields
    for `c.temp` with `window=1`; otherwise return 0."""
    right_ngrams = get_right_ngrams(c.temp, window=1)
    if 'to' in right_ngrams:
        return 1
    return 0
LFs.append(LF_to_right)

def LF_150_right(c):
    """Return 1 when '150' is among the ngrams that `get_right_ngrams` yields
    for `c.temp` with `window=4`; otherwise return 0."""
    label = 0
    if '150' in get_right_ngrams(c.temp, window=4):
        label = 1
    return label
LFs.append(LF_150_right)

def LF_storage_row(c):
    """Return 1 when 'storage' is among the ngrams that `get_row_ngrams`
    yields for `c.temp`; otherwise return 0."""
    found_storage = 'storage' in get_row_ngrams(c.temp)
    return int(found_storage)
LFs.append(LF_storage_row)

def LF_operating_row(c):
    """Return 1 when 'operating' is among the ngrams that `get_row_ngrams`
    yields for `c.temp`; otherwise return 0."""
    if 'operating' in get_row_ngrams(c.temp):
        return 1
    else:
        return 0
LFs.append(LF_operating_row)

def LF_temperature_row(c):
    """Return 1 when 'temperature' is among the ngrams that `get_row_ngrams`
    yields for `c.temp`; otherwise return 0."""
    row_ngrams = get_row_ngrams(c.temp)
    if 'temperature' not in row_ngrams:
        return 0
    return 1
LFs.append(LF_temperature_row)

def LF_tstg_row(c):
    """Return 1 when `overlap` reports a match between the symbols
    'tstg' / 'stg' / 'ts' and the ngrams that `get_row_ngrams` yields for
    `c.temp`; otherwise return 0."""
    storage_symbols = ['tstg','stg','ts']
    row_ngrams = list(get_row_ngrams(c.temp))
    if overlap(storage_symbols, row_ngrams):
        return 1
    return 0
LFs.append(LF_tstg_row)

# NEGATIVE

def LF_not_temp_relevant(c):
    """Return -1 when `overlap` finds none of the temperature-related keywords
    among the ngrams that `get_aligned_ngrams` yields for `c.temp`;
    otherwise return 0."""
    temp_keywords = ['storage','temperature','tstg','ts','stg']
    aligned_ngrams = list(get_aligned_ngrams(c.temp))
    if overlap(temp_keywords, aligned_ngrams):
        return 0
    return -1
LFs.append(LF_not_temp_relevant)

def LF_temp_outside_table(c):
    """Return -1 when `c.temp.parent.row` is None (going by the function name,
    a temp span whose parent is not inside a table); otherwise return 0."""
    parent = c.temp.parent
    if parent.row is None:
        return -1
    return 0
LFs.append(LF_temp_outside_table)

def LF_too_many_numbers_row(c):
    """Return -1 when at least three of the `ner_tags` values that
    `get_row_ngrams` yields for `c.temp` equal 'number'; otherwise return 0."""
    ner_tags = get_row_ngrams(c.temp, attrib="ner_tags")
    number_count = sum(1 for tag in ner_tags if tag == 'number')
    if number_count < 3:
        return 0
    return -1
LFs.append(LF_too_many_numbers_row)

def LF_collector_aligned(c):
    """Return -1 when `overlap` reports a match between the collector-related
    keywords and the ngrams that `get_aligned_ngrams` yields for `c.temp`;
    otherwise return 0."""
    collector_keywords = ['collector', 'collector-current', 'collector-base', 'collector-emitter']
    aligned_ngrams = list(get_aligned_ngrams(c.temp))
    if overlap(collector_keywords, aligned_ngrams):
        return -1
    return 0
LFs.append(LF_collector_aligned)

def LF_current_aligned(c):
    """Return -1 when `overlap` reports a match between the current-related
    keywords ('current', 'dc', 'ic') and the ngrams that `get_aligned_ngrams`
    yields for `c.temp`; otherwise return 0.

    The aligned ngrams are requested once; the earlier version also made an
    extra call whose result was never used.
    """
    return -1 if overlap(
        ['current', 'dc', 'ic'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_current_aligned)

def LF_voltage_row_temp(c):
    """Return -1 when `overlap` reports a match between the voltage-related
    keywords and the ngrams that `get_aligned_ngrams` yields for `c.temp`;
    otherwise return 0.

    The aligned ngrams are requested once; the earlier version also made an
    extra call whose result was never used.
    """
    return -1 if overlap(
        ['voltage', 'cbo', 'ceo', 'ebo', 'v'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_voltage_row_temp)

def LF_voltage_row_part(c):
    """Return -1 when `overlap` reports a match between the voltage-related
    keywords and the ngrams that `get_aligned_ngrams` yields for `c.part`;
    otherwise return 0.

    The earlier version built the ngrams for `c.part` but then tested those of
    `c.temp`, which made this function an exact duplicate of
    LF_voltage_row_temp.
    """
    return -1 if overlap(
        ['voltage', 'cbo', 'ceo', 'ebo', 'v'],
        list(get_aligned_ngrams(c.part))) else 0
LFs.append(LF_voltage_row_part)

def LF_typ_row(c):
    """Return -1 when `overlap` reports a match between 'typ' / 'typ.' and the
    ngrams that `get_row_ngrams` yields for `c.temp`; otherwise return 0."""
    typ_markers = ['typ', 'typ.']
    row_ngrams = list(get_row_ngrams(c.temp))
    if overlap(typ_markers, row_ngrams):
        return -1
    return 0
LFs.append(LF_typ_row)

def LF_test_condition_aligned(c):
    """Return -1 when `overlap` reports a match between 'test' / 'condition'
    and the ngrams that `get_aligned_ngrams` yields for `c.temp`;
    otherwise return 0."""
    condition_keywords = ['test', 'condition']
    aligned_ngrams = list(get_aligned_ngrams(c.temp))
    if overlap(condition_keywords, aligned_ngrams):
        return -1
    return 0
LFs.append(LF_test_condition_aligned)

def LF_complement_left_row(c):
    """Return -1 when 'complement' is among the ngrams that `get_row_ngrams`
    yields for `c.part`; otherwise return 0."""
    part_row_ngrams = get_row_ngrams(c.part)
    if 'complement' in part_row_ngrams:
        return -1
    return 0
LFs.append(LF_complement_left_row)

In [None]:
# LF2s = []
# from snorkel.models import AnnotationKey
# from snorkel.utils import get_ORM_instance

# %time F_train = feature_manager.load(session, train, 'Train Features')

# def feature_lookup(candidate, entity, ngram, helper, attrib='words'):
#     if helper == 'get_row_ngrams':
#         feature = entity + "_ROW_%s_[%s]" % (attrib.upper(), ngram)
#         key = get_ORM_instance(AnnotationKey, session, feature)
#     return F_train[get_row_index(candidate), get_col_index(key)]
    
# def LF2_storage_row(c):
#     return 1 if feature_lookup(c, 'e1', 'storage', 'get_row_ngrams') else 0
# LF2s.append(LF2_storage_row)

# def LF2_operating_row(c):
#     return 1 if 'operating' in get_row_ngrams(c.temp) else 0
# LF2s.append(LF2_operating_row)


### Testing LFs and LF_helpers

In [9]:
from snorkel.lf_helpers import *
from hardware_utils import part_error_analysis

# Spot-check the lf_helpers on one training candidate (index 12).
c = train[12]
part_error_analysis(c)

# same_* helpers, one result per line.
print(same_document(c))
print(same_table(c))
print(same_row(c))
print(same_col(c))
print(same_cell(c))
print(same_phrase(c))

# get_*_ngrams helpers, each materialized with list() so the contents print.
print(list(get_phrase_ngrams(c)))
print(list(get_phrase_ngrams(c.temp)))
print(list(get_neighbor_phrase_ngrams(c)))
print(list(get_cell_ngrams(c)))
print(list(get_neighbor_cell_ngrams(c)))
print(list(get_row_ngrams(c)))
print(list(get_col_ngrams(c)))
print(list(get_aligned_ngrams(c)))
print(list(get_head_ngrams(c, 'north')))

# Regex helper.
print(contains_regex(c, rgx=r'849'))

Doc: Document PHGLS20126-1
------------
Part:
ImplicitSpan("BC817W", parent=99496, words=[4,4], position=[7])
Table: Table(Doc: PHGLS20126-1, Position: 31)
Row: 16
Col: 1
Phrase: Phrase(Doc: PHGLS20126-1, Table: 31, Row: 16, Col: 1, Position: 0, Text: I C /I B  = 10)
------------
Attr:
ImplicitSpan("-65", parent=99136, words=[0,1], position=[0])
Table: Table(Doc: PHGLS20126-1, Position: 13)
Row: 13
Col: 6
Phrase: Phrase(Doc: PHGLS20126-1, Table: 13, Row: 13, Col: 6, Position: 0, Text: - 65)
------------
True
False
False
False
False
False
[u'i', u'c', u'/', u'i', u'=', u'10']
[]
[u'i', u'c', '(', u'ma', ')', u'%', u'c', u'storage', u'temperature', u'+150']
[u'i', u'c', u'/', u'i', u'=', u'10']
[u'i', u'c', u'/', u'i', u'=', u'10', u'%', u'c', u't', u'amb', u'=', u'150', u'+150', u'-', u'-']
[u'i', u'c', u'/', u'i', u'=', u'10', u'%', u'c', u'i', u'c', u'/', u'i', u'b', u'=', u'10', u'%', u'c', u't', u'stg', u'storage', u'temperature', u'+150', u'%', u'c']
[u'i', u'c', u'/', u'i', u'=', 

NameError: global name 'c' is not defined

In [None]:
# Run label_manager.create over the training candidates with the labeling
# functions collected in LFs (passed as f=LFs); %time reports how long it takes.
# NOTE(review): 'LF Labels' is presumably the name the labels are stored under
# (a commented-out cell below loads them by that name) -- confirm.
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
# Display the result.
L_train

In [None]:
# Look up the gold-labeled training candidate set by its stored name.
gold_set_name = 'Hardware Training Candidates -- Gold'
gold_query = session.query(CandidateSet).filter(CandidateSet.name == gold_set_name)
train_gold = gold_query.one()

In [None]:
# TEMP
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ features snorkel.db');

# from snorkel import SnorkelSession
# session = SnorkelSession()

# from snorkel.models import candidate_subclass
# Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

# from snorkel.models import CandidateSet
# train = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Training Candidates').one()

# train_gold = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Training Candidates -- Gold').one()

# from snorkel.annotations import LabelManager
# label_manager = LabelManager()
# %time L_train = label_manager.load(session, train, 'LF Labels')
# L_train
# TEMP

In [None]:
# L_train.lf_stats()
# Call lf_stats with the gold candidate set and time it.
# NOTE(review): presumably this scores each labeling function against the gold
# labels -- confirm against the lf_stats implementation.
%time L_train.lf_stats(train_gold)

In [None]:
# If necessary: snapshot the working database as 'snorkel.db features'.
import os
import shutil

# shutil.copy raises on failure, whereas os.system('cp ...') only returned an
# exit status that this cell discarded; it also avoids the invalid '\ ' string
# escape that was used to quote the space in the file name.
shutil.copy('snorkel.db', 'snorkel.db features');

Next, in Part V, we will test our model on the development `CandidateSet`.

# TEMPORARY

In [None]:
# print c.temp
# print c.temp.parent

In [None]:
# print list(get_row_ngrams(c.temp, attrib="ner_tags"))
# print list(get_row_ngrams(c.temp, attrib="words"))

In [None]:
# from pprint import pprint
# print(list(get_row_ngrams(c.temp, attrib="ner_tags")).count('o'))

In [None]:
# print LF_v_row(c)