## Part IV:  Features and Labeling Functions

## Generating Features

In [1]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ labels snorkel.db');

In [2]:
%load_ext autoreload
%autoreload 2

import os
import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from snorkel.models import candidate_subclass

Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Loading the `CandidateSet`

In [4]:
from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()
dev = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates').one()

### Creating feature matrix

In [5]:
from snorkel.fast_annotations import FeatureManager

feature_manager = FeatureManager()

In [6]:
%time F_train = feature_manager.create(session, train, 'Train Features')


Bulk upserting 16144847 annotations...
Done.
Loading sparse Feature matrix...
CPU times: user 32min 55s, sys: 15min 2s, total: 47min 57s
Wall time: 52min 59s


In [7]:
%time F_dev = feature_manager.update(session, dev, 'Train Features', expand_key_set=False)


Bulk upserting 889374 annotations...
Done.
Loading sparse Feature matrix...
CPU times: user 7min 40s, sys: 1min 53s, total: 9min 33s
Wall time: 10min 10s


### Investigating feature matrix

In [8]:
F_train

<85377x21487 sparse matrix of type '<type 'numpy.float64'>'
	with 16144847 stored elements in Compressed Sparse Row format>

In [9]:
F_dev

<9502x21487 sparse matrix of type '<type 'numpy.float64'>'
	with 889374 stored elements in Compressed Sparse Row format>

In [10]:
F_train.get_candidate(0)

Part_Temp(ImplicitSpan("MMBT3904", parent=92260, words=[0,0], position=[0]), ImplicitSpan("-55", parent=10795, words=[7,7], position=[0]))

In [11]:
F_train.get_key(0)

AnnotationKey (TAB_e1_HTML_TAG_p)

In [12]:
from snorkel.utils import get_keys_by_candidate

for f in get_keys_by_candidate(F_train, F_train.get_candidate(1))[:10]: print f

AnnotationKey (TAB_e1_HTML_TAG_p)
AnnotationKey (TAB_e1_HTML_ANC_TAG_[html])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[body])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[table])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[tr])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[td])
AnnotationKey (TAB_e1_HTML_ANC_TAG_[p])
AnnotationKey (TAB_e2_HTML_TAG_p)
AnnotationKey (TAB_e2_HTML_ANC_TAG_[html])
AnnotationKey (TAB_e2_HTML_ANC_TAG_[body])


### Saving feature matrices

In [13]:
# If necessary:
import os
os.system('cp snorkel.db snorkel.db\ featurized');

## Applying LFs

In [None]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ featurized snorkel.db');

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()

In [31]:
from snorkel.annotations import LabelManager

label_manager = LabelManager()

### Defining Labeling Functions

In [32]:
from snorkel.lf_helpers import *

LFs = []

# POSITIVE

def LF_to_right(c):
    return 1 if 'to' in get_right_ngrams(c.temp, window=1) else 0
LFs.append(LF_to_right)

def LF_150_right(c):
    return 1 if '150' in get_right_ngrams(c.temp, window=4) else 0
LFs.append(LF_150_right)

def LF_storage_row(c):
    return 1 if 'storage' in get_row_ngrams(c.temp) else 0
LFs.append(LF_storage_row)

def LF_operating_row(c):
    return 1 if 'operating' in get_row_ngrams(c.temp) else 0
LFs.append(LF_operating_row)

def LF_temperature_row(c):
    return 1 if 'temperature' in get_row_ngrams(c.temp) else 0
LFs.append(LF_temperature_row)

def LF_tstg_row(c):
    return 1 if overlap(
        ['tstg','stg','ts'], 
        list(get_row_ngrams(c.temp))) else 0
LFs.append(LF_tstg_row)

# NEGATIVE

def LF_not_temp_relevant(c):
    return -1 if not overlap(
        ['storage','temperature','tstg','ts','stg'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_not_temp_relevant)

def LF_temp_outside_table(c):
    return -1 if c.temp.parent.row is None else 0
LFs.append(LF_temp_outside_table)

def LF_too_many_numbers_row(c):
    num_numbers = list(get_row_ngrams(c.temp, attrib="ner_tags")).count('number')
    return -1 if num_numbers >= 3 else 0
LFs.append(LF_too_many_numbers_row)

def LF_collector_aligned(c):
    return -1 if overlap(
        ['collector', 'collector-current', 'collector-base', 'collector-emitter'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_collector_aligned)

def LF_current_aligned(c):
    ngrams = get_aligned_ngrams(c.temp)
    return -1 if overlap(
        ['current', 'dc', 'ic'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_current_aligned)

def LF_voltage_row_temp(c):
    ngrams = get_aligned_ngrams(c.temp)
    return -1 if overlap(
        ['voltage', 'cbo', 'ceo', 'ebo', 'v'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_voltage_row_temp)

def LF_voltage_row_part(c):
    ngrams = get_aligned_ngrams(c.part)
    return -1 if overlap(
        ['voltage', 'cbo', 'ceo', 'ebo', 'v'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_voltage_row_part)

def LF_typ_row(c):
    return -1 if overlap(
        ['typ', 'typ.'],
        list(get_row_ngrams(c.temp))) else 0
LFs.append(LF_typ_row)

def LF_test_condition_aligned(c):
    return -1 if overlap(
        ['test', 'condition'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_test_condition_aligned)

def LF_complement_left_row(c):
    return -1 if 'complement' in get_row_ngrams(c.part) else 0
LFs.append(LF_complement_left_row)

In [33]:
# LF2s = []
# from snorkel.models import AnnotationKey
# from snorkel.utils import get_ORM_instance

# %time F_train = feature_manager.load(session, train, 'Train Features')

# def feature_lookup(candidate, entity, ngram, helper, attrib='words'):
#     if helper == 'get_row_ngrams':
#         feature = entity + "_ROW_%s_[%s]" % (attrib.upper(), ngram)
#         key = get_ORM_instance(AnnotationKey, session, feature)
#     return F_train[get_row_index(candidate), get_col_index(key)]
    
# def LF2_storage_row(c):
#     return 1 if feauture_lookup(c, 'e1', 'storage', 'get_row_ngrams') else 0
# LF2s.append(LF_storage_row)

# def LF2_operating_row(c):
#     return 1 if 'operating' in get_row_ngrams(c.temp) else 0
# LF2s.append(LF_operating_row)


### Testing LFs and LF_helpers

In [34]:
from snorkel.lf_helpers import get_right_ngrams, get_left_ngrams
from snorkel.lf_helpers import contains_token, contains_regex
from snorkel.lf_helpers import get_phrase_ngrams, get_cell_ngrams, get_neighbor_cell_ngrams
from snorkel.lf_helpers import get_row_ngrams, get_col_ngrams, get_aligned_ngrams
from snorkel.lf_helpers import same_document, same_table, same_cell, same_phrase
c = train[12]
print c.part
print c.temp
print c.temp.parent
print c.temp.parent.table
print same_document(c)
print same_table(c)
print contains_regex(c, rgx=r'849')

ImplicitSpan("BC817W", parent=99496, words=[4,4], position=[7])
ImplicitSpan("-65", parent=99136, words=[0,1], position=[0])
Phrase(Doc: PHGLS20126-1, Table: 13, Row: 13, Col: 6, Position: 0, Text: - 65)
Table(Doc: PHGLS20126-1, Position: 13)
True
False
False


In [35]:
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
L_train

Generating annotations for 85377 candidates...

Loading sparse Label matrix...
CPU times: user 30min 33s, sys: 4min 49s, total: 35min 23s
Wall time: 36min 45s


<85377x16 sparse matrix of type '<type 'numpy.float64'>'
	with 305479 stored elements in Compressed Sparse Row format>

In [36]:
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()

In [37]:
# TEMP
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ features snorkel.db');

# from snorkel import SnorkelSession
# session = SnorkelSession()

# from snorkel.models import candidate_subclass
# Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

# from snorkel.models import CandidateSet
# train = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Training Candidates').one()

# train_gold = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Training Candidates -- Gold').one()

# from snorkel.annotations import LabelManager
# label_manager = LabelManager()
# %time L_train = label_manager.load(session, train, 'LF Labels')
# L_train
# TEMP

In [38]:
# L_train.lf_stats()
%time L_train.lf_stats(train_gold)


CPU times: user 1.06 s, sys: 73.4 ms, total: 1.13 s
Wall time: 1.15 s


Unnamed: 0,accuracy,conflicts,coverage,j,overlaps
LF_to_right,1.0,0.02551,0.149806,0,0.149806
LF_150_right,1.0,0.020778,0.094674,1,0.094674
LF_storage_row,1.0,0.072689,0.256697,2,0.256697
LF_operating_row,1.0,0.026729,0.108015,3,0.108015
LF_temperature_row,1.0,0.104431,0.302833,4,0.29132
LF_tstg_row,1.0,0.045984,0.233213,5,0.233213
LF_not_temp_relevant,0.843985,4.7e-05,0.694438,6,0.677466
LF_temp_outside_table,0.671078,0.032093,0.384155,7,0.384155
LF_too_many_numbers_row,0.891062,0.003713,0.162889,8,0.162889
LF_collector_aligned,0.875247,0.0,0.224955,9,0.224955


In [39]:
# If necessary
import os
os.system('cp snorkel.db snorkel.db\ features');

Next, in Part 5, we will test our model on the development `CandidateSet`.

# TEMPORARY

In [40]:
# print c.temp
# print c.temp.parent

In [41]:
# print list(get_row_ngrams(c.temp, attrib="ner_tags"))
# print list(get_row_ngrams(c.temp, attrib="words"))

In [42]:
# from pprint import pprint
# print(list(get_row_ngrams(c.temp, attrib="ner_tags")).count('o'))

In [43]:
# print LF_v_row(c)