# polarity

## Setup

In [1]:
# Reload edited modules automatically before each cell runs, and render
# matplotlib figures inline in the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
# Put $SNORKELHOME/tutorials/tables/ on the import path (presumably so that
# `hardware_utils`, imported by later cells, resolves). Raises KeyError if the
# SNORKELHOME environment variable is not set.
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

## Clearing db

In [2]:
from snorkel.models.meta import clear_database
# NOTE(review): presumably wipes everything stored by earlier runs so the
# notebook starts from an empty database — destructive; confirm before running.
clear_database()

## Parsing

In [3]:
from snorkel import SnorkelSession
# Session handle passed to the parsing, extraction and query calls in the
# cells below.
session = SnorkelSession()

### Define Parser

In [4]:
import os
#from snorkel.parser import CorpusParser, HTMLParser, OmniParser
from snorkel.async_parser import parse_corpus, HTMLParser, AsyncOmniParser
from snorkel.utils import get_ORM_instance
from snorkel.queries import split_corpus

# Input directories, both resolved under $SNORKELHOME: the HTML documents to
# parse and the matching PDFs handed to the context parser.
docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html/'
pdf_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_pdf/'
doc_parser = HTMLParser()
context_parser = AsyncOmniParser(pdf_path=pdf_path, visual=True)
# Parse into a corpus named 'Hardware' and time the call. max_docs=10 caps the
# number of documents; parallel=3 is presumably the worker count — TODO confirm.
%time corpus = parse_corpus(session, 'Hardware', docs_path, doc_parser, context_parser, max_docs=10, parallel=3)

CPU times: user 48 ms, sys: 8 ms, total: 56 ms
Wall time: 2.26 s


### Run Parser

In [5]:
# raw_input()
# %time corpus = cp.parse_corpus(name='Hardware', session=session)

# session.add(corpus)
# session.commit()

### Split Corpus

In [6]:
from snorkel.models import Corpus

# Fetch the 'Hardware' corpus by name and split it 60% / 40% into the
# 'Hardware Training' and 'Hardware Development' corpora (no test split).
# seed=3 presumably fixes the document shuffle — TODO confirm.
corpus = get_ORM_instance(Corpus, session, 'Hardware')
split_corpus(session, corpus, train=0.6, development=0.4, test=0, seed=3)

6 Documents added to corpus Hardware Training
4 Documents added to corpus Hardware Development


## Extraction

In [7]:
from snorkel.models import candidate_subclass

# Candidate class with two members, accessed later as `c.part` and
# `c.polarity` by the labeling functions.
Part_Polarity = candidate_subclass('Part_Polarity', ['part','polarity'])

### Define Matchers

In [8]:
from snorkel.matchers import RegexMatchSpan, Union

# Part-number patterns, one matcher per naming family. Every pattern is a raw
# string so that the regex escapes (\s, \d, \/) reach the regex engine verbatim
# instead of relying on Python leaving unrecognised string escapes untouched
# (deprecated behaviour in Python 3). The pattern text itself is unchanged.
eeca_matcher = RegexMatchSpan(rgx=r'([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z\/]{0,5}[0-9]?[A-Z]?([-][A-Z0-9]{1,7})?([-][A-Z0-9]{1,2})?)')
jedec_matcher = RegexMatchSpan(rgx=r'([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)')
jis_matcher = RegexMatchSpan(rgx=r'(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})')
others_matcher = RegexMatchSpan(rgx=r'((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)')
# Combined part matcher built from the four family matchers above.
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

# Polarity mentions: "pnp" or "npn", matched case-insensitively.
polarity_matcher = RegexMatchSpan(rgx=r'(pnp|npn)', ignore_case=True)

### Define ContextSpaces

In [None]:
from snorkel.candidates import OmniNgrams
from hardware_utils import OmniNgramsPart, get_gold_dict
from collections import defaultdict
import os

# Make parts list
# gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# gold_parts = get_gold_dict(gold_file, doc_on=True, part_on=True, val_on=False)
# parts_by_doc = defaultdict(set)
# for part in gold_parts:
#     parts_by_doc[part[0]].add(part[1])

# Candidate spaces handed to the extractor below, in (part, polarity) order.
# n_max is presumably the maximum n-gram length (3 for parts, 1 for polarity)
# — TODO confirm. parts_by_doc=None because the gold-part lookup above is
# commented out.
part_ngrams = OmniNgramsPart(n_max=3, parts_by_doc=None) # NOTE: linking is turned off
polarity_ngrams = OmniNgrams(n_max=1)

### Run CandidateExtractor

In [None]:
from snorkel.models import Corpus
from snorkel.candidates import CandidateExtractor
from snorkel.async_candidates import parallel_extract
from snorkel.utils import get_ORM_instance
# Extractor for Part_Polarity candidates: the candidate spaces and matchers are
# given as parallel lists in (part, polarity) order; no throttler is used.
ce = CandidateExtractor(Part_Polarity, 
                        [part_ngrams, polarity_ngrams], 
                        [parts_matcher, polarity_matcher], 
                        throttler=None)

# For each split, extract candidates into a set named '<corpus name> Candidates'
# (the names the feature-extraction cell looks up later) and report the count.
for corpus_name in ['Hardware Training', 'Hardware Development']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    
    print "Extracting Candidates from %s" % corpus
    print corpus.documents.all()
    %time candidates=parallel_extract(session, ce, corpus, corpus_name + ' Candidates', parallel=2)
#     %time candidates = ce.extract(\
#         corpus, corpus_name + ' Candidates', session)
#     session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()
# NOTE(review): raw_input() blocks until Enter is pressed, so an unattended
# "Run All" stalls here. Looks like a manual pause — confirm it is still wanted.
raw_input()

Extracting Candidates from Corpus (Hardware Training)
[Document MCCCS08818-1, Document MCCCS09741-1, Document RECTS01214-1, Document FAIRS19194-1, Document DISES00645-1, Document BC182-D]
CPU times: user 8 ms, sys: 4 ms, total: 12 ms
Wall time: 593 ms
Candidate Set (Hardware Training Candidates) contains 261 Candidates
Extracting Candidates from Corpus (Hardware Development)
[Document MMBT3904, Document DIODS13249-1, Document UTCLS01324-1, Document KECCS05435-1]
CPU times: user 8 ms, sys: 8 ms, total: 16 ms
Wall time: 303 ms
Candidate Set (Hardware Development Candidates) contains 213 Candidates


### Assess Recall

In [None]:
# from snorkel.models import Candidate
# from hardware_utils import entity_level_total_recall

# all_candidates = session.query(Candidate).all()
# gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# (tp, fp, fn) = entity_level_total_recall(
#     all_candidates, gold_file, 'polarity', relation=True)

In [None]:
# from pprint import pprint
# pprint(fn)
# print len(fn)

## Gold Labels

In [None]:
# import os
# from snorkel.models import CandidateSet
# from hardware_utils import load_hardware_labels

# gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# for set_name in ['Training', 'Development']:
#     candidate_set_name = 'Hardware %s Candidates' % set_name
#     candidates = session.query(CandidateSet).filter(
#         CandidateSet.name == candidate_set_name).one()
#     label_set_name = 'Hardware %s Candidates -- Gold' % set_name
#     annotation_key_name = 'Hardware %s Labels -- Gold' % set_name
#     %time gold_candidates, annotation_key = load_hardware_labels(session,\
#                            label_set_name, \
#                            annotation_key_name, \
#                            candidates, \
#                            gold_file, \
#                            'polarity')
#     candidates_gold = session.query(CandidateSet).filter(
#         CandidateSet.name == candidate_set_name + ' -- Gold').one()
#     print "%d/%d Candidates in %s have positive Labels" % (
#         len(candidates_gold), len(candidates), candidates)

### Assess Baseline Performance

In [None]:
# from snorkel.models import Corpus, Candidate
# from snorkel.utils import get_ORM_instance
# from snorkel.lf_helpers import *
# from hardware_utils import entity_level_f1
# from itertools import chain

# def LF_all_true(c):
#     return True

# def LF_complement_nearby(c):
#     return 1 if set(['complement','complementary']).isdisjoint(
#         chain.from_iterable(
#             [get_phrase_ngrams(c.part), 
#              get_phrase_ngrams(c.polarity),
#              get_neighbor_phrase_ngrams(c.polarity)])) else 0

# candidates = session.query(Candidate).all()
# (tp, fp, tn, fn) = baseline(candidates, LF_complement_nearby)

# corpus = get_ORM_instance(Corpus, session, 'Hardware')
# gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# (TP, FP, FN) = entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'polarity')

In [None]:
# from pprint import pprint
# pprint(FP)

In [None]:
# pprint(FN)

In [None]:
# from hardware_utils import entity_to_candidates

# print len(candidates)
# matches = entity_to_candidates((u'SIEMS01215-1', u'BC856', u'NPN'), candidates)
# c = matches[0]
# print c

In [None]:
# from snorkel.models import ImplicitSpan
# from snorkel.lf_helpers import *
# phrases = []
# print list(get_neighbor_phrase_ngrams(c.polarity))
# print c.polarity.parent
# print phrases
# print "SPAN_TYPE_[%s]" % ('IMPLICIT' if isinstance(c.polarity, ImplicitSpan) else 'EXPLICIT')
# # list(get_phrase_ngrams(c.polarity))

## Features

### Extract Features

In [None]:
from snorkel.models import CandidateSet
from snorkel.async_annotations import annotate
from snorkel.utils import get_ORM_instance

# Run batch extraction jobs
# Look up the candidate sets created by the extraction cell, by name.
train = get_ORM_instance(CandidateSet, session, 'Hardware Training Candidates')
dev   = get_ORM_instance(CandidateSet, session, 'Hardware Development Candidates')

# Featurize both sets (timed). The dev call passes the training set's name as
# `keyset` — presumably so dev reuses the training feature keys; TODO confirm.
%time F_train = annotate(train, parallel=4)
%time F_dev = annotate(dev, parallel=4, keyset = 'Hardware Training Candidates')

# Print a hash of the training feature-key set together with its size.
print 'Training features hash:', hash(frozenset(F_train.keys)),'of length', len(F_train.keys)

In [None]:
# from snorkel.models import CandidateSet
# from snorkel.fast_annotations import FeatureManager
# from snorkel.utils import get_ORM_instance

# train = get_ORM_instance(CandidateSet, session, 'Hardware Training Candidates')
# dev   = get_ORM_instance(CandidateSet, session, 'Hardware Development Candidates')

# feature_manager = FeatureManager()
# %time F_train_sync = feature_manager.create(session, train, 'Train Features')
# %time F_dev_sync = feature_manager.update(session, dev, 'Train Features', expand_key_set=False)
# print F_train_sync.shape
# # Integrity check for async against original feature extractor
# from snorkel.models.annotation import AnnotationKeySet
# sync_keyset = session.query(AnnotationKeySet).filter(AnnotationKeySet.name=='Train Features').one()
# print 'Async extractor extracted the same set of features as the original impl:',set(key.name for key in sync_keyset) == set(F_train.keys)

In [None]:
# NOTE(review): manual pause — blocks until Enter is pressed, which stalls an
# unattended "Run All". Confirm it is still wanted.
raw_input()

## LFs

In [None]:
# If necessary
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ featurized snorkel.db');

# from snorkel import SnorkelSession
# session = SnorkelSession()

# import sys
# sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

# from snorkel.models import candidate_subclass
# Part_Polarity = candidate_subclass('Part_Polarity', ['part','polarity'])

# from snorkel.models import CandidateSet
# train = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Training Candidates').one()

### Define LFs

In [None]:
from snorkel.fast_annotations import LabelManager
# NOTE(review): the star import presumably supplies overlap, get_phrase_ngrams,
# get_neighbor_phrase_ngrams and get_col_ngrams used by the LFs below — confirm.
from snorkel.lf_helpers import *
import re
label_manager = LabelManager()

# Registry of labeling functions; each LF below is appended right after its
# definition, and the list is later passed to annotate(..., lfs=LFs).
LFs = []

def LF_complement_phrase_part(c):
    """Return -1 if 'complement'/'complementary' overlaps the n-grams that
    get_phrase_ngrams yields for the candidate's part; otherwise return 0."""
    keywords = ['complement','complementary']
    if overlap(keywords, get_phrase_ngrams(c.part)):
        return -1
    return 0
LFs.append(LF_complement_phrase_part)

def LF_complement_phrase_polarity(c):
    """Return -1 if 'complement'/'complementary' overlaps the n-grams that
    get_phrase_ngrams yields for the candidate's polarity; otherwise return 0."""
    keywords = ['complement','complementary']
    if overlap(keywords, get_phrase_ngrams(c.polarity)):
        return -1
    return 0
LFs.append(LF_complement_phrase_polarity)

def LF_complement_neighbor_polarity(c):
    """Return -1 if 'complement'/'complementary' overlaps the n-grams that
    get_neighbor_phrase_ngrams yields for the candidate's polarity; otherwise
    return 0."""
    keywords = ['complement','complementary']
    if overlap(keywords, get_neighbor_phrase_ngrams(c.polarity)):
        return -1
    return 0
LFs.append(LF_complement_neighbor_polarity)

def LF_top_mark_col_part(c):
    """Return -1 if 'top' or 'mark' overlaps the n-grams that get_col_ngrams
    yields for the candidate's part; otherwise return 0."""
    keywords = ['top','mark']
    if overlap(keywords, get_col_ngrams(c.part)):
        return -1
    return 0
LFs.append(LF_top_mark_col_part)

def LF_endswith_D_part(c):
    """Return -1 if the text of the candidate's part span ends with 'D';
    otherwise return 0."""
    part_text = c.part.get_span()
    if part_text.endswith('D'):
        return -1
    return 0
LFs.append(LF_endswith_D_part)

def LF_default_positive(c):
    """Return 1 unless a complement keyword appears around the candidate.

    The keywords 'complement'/'complementary' are checked against the combined
    n-grams from get_phrase_ngrams(c.part), get_phrase_ngrams(c.polarity) and
    get_neighbor_phrase_ngrams(c.polarity). Returns 0 when any of them
    overlaps, 1 otherwise.
    """
    # Imported locally so this LF does not depend on `chain` happening to be
    # re-exported by `from snorkel.lf_helpers import *`; no active cell in the
    # notebook imports it explicitly.
    from itertools import chain

    context_ngrams = chain.from_iterable([
        get_phrase_ngrams(c.part),
        get_phrase_ngrams(c.polarity),
        get_neighbor_phrase_ngrams(c.polarity)])
    return 1 if not overlap(['complement','complementary'], context_ngrams) else 0
LFs.append(LF_default_positive)

In [None]:
from snorkel.models import Candidate
from hardware_utils import entity_to_candidates

candidates = session.query(Candidate).all()
print len(candidates)
# matches = entity_to_candidates((u'SIEMS01215-1', u'BC856', u'NPN'), candidates)
# c = matches[0]
c = candidates[0]
print candidates[0]

print LF_complement_phrase_part(c)
print LF_complement_phrase_polarity(c)
print LF_complement_neighbor_polarity(c)
print LF_top_mark_col_part(c)
print LF_endswith_D_part(c)
print LF_default_positive(c)

### Apply LFs

In [None]:
from snorkel.async_annotations import annotate
# Apply every registered LF to the training candidate set (timed). The bare
# `L_train` on the last line makes the notebook display the result.
%time L_train = annotate(train, lfs=LFs)
L_train

### Assess LF accuracy

In [None]:
# NOTE(review): the "Gold Labels" cell that would create the '... -- Gold'
# candidate sets is commented out above, so on a freshly cleared database this
# query presumably finds no row and .one() fails — confirm the gold sets exist.
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()
# Per-LF statistics measured against the gold training set (timed).
%time L_train.lf_stats(train_gold)

## Learn and Evaluate

In [None]:
# If necessary:
# import os
# os.remove('snorkel.db');
# os.system('cp snorkel.db\ features snorkel.db');

# from snorkel import SnorkelSession
# session = SnorkelSession()

# import sys
# sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

# from snorkel.models import candidate_subclass
# Part_Polarity = candidate_subclass('Part_Polarity', ['part','polarity'])

# from snorkel.models import CandidateSet
# train = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Training Candidates').one()
# dev = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Development Candidates').one()

# from snorkel.annotations import FeatureManager, LabelManager
# feature_manager = FeatureManager()
# %time F_train = feature_manager.load(session, train, 'Train Features')
# %time F_dev = feature_manager.load(session, dev, 'Train Features')

# label_manager = LabelManager()
# %time L_train = label_manager.load(session, train, 'LF Labels')

In [None]:
from snorkel.learning import NaiveBayes

# Train the generative model on the LF output, then compute the training
# marginals that the discriminative model is trained against in the next cell.
gen_model = NaiveBayes()
gen_model.train(L_train, n_iter=15000, rate=1e-2)

#%time gen_model.save(session, 'Generative Params')
train_marginals = gen_model.marginals(L_train)

In [None]:
from snorkel.learning import LogReg

# Train the discriminative model on the training features, using the
# generative model's marginals as targets.
disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=2000, rate=1e-5)
#%time disc_model.save(session, "Discriminative Params")

In [None]:
# Same query as in the "Assess LF accuracy" cell; train_gold is not referenced
# by any later cell in this notebook.
# NOTE(review): requires the gold candidate set to exist already — the cell
# that would create it is commented out above.
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()

In [None]:
# Gold development candidate set, passed to disc_model.score() below.
# NOTE(review): requires the gold candidate set to exist already — the cell
# that would create it is commented out above.
dev_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates -- Gold').one()

In [None]:
from snorkel.models import CandidateSet
# NOTE(review): this LabelManager comes from snorkel.annotations, whereas the
# LF cell imported it from snorkel.fast_annotations; label_manager is rebound
# here. Confirm the two are interchangeable.
from snorkel.annotations import LabelManager
label_manager = LabelManager()
# L_dev_sync is loaded but not referenced by any later cell in this notebook.
L_dev_sync = label_manager.load(session, dev, 'Hardware Development Labels -- Gold')

# Apply the registered LFs to the development set; `annotate` is the name
# imported from snorkel.async_annotations in an earlier cell.
L_dev = annotate(dev, lfs=LFs)

In [None]:
# Show the shape of the training marginals produced by the generative model.
print train_marginals.shape

In [None]:
# Score the discriminative model on the development set against dev_gold; the
# four results are passed to entity_level_f1 below, and `fp` is reused for the
# false-positive drill-down.
tp, fp, tn, fn = disc_model.score(F_dev, L_dev, dev_gold)

In [None]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

# Gold CSV under $SNORKELHOME and the development corpus, both passed to
# entity_level_f1 together with the candidate-level results for the
# 'polarity' attribute. FP is sorted and inspected in the following cells.
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Development').one()
(TP, FP, FN) = entity_level_f1(tp, fp, tn, fn, gold_file, corpus, 'polarity')

In [None]:
from pprint import pprint

# Sort the false-positive entities and pretty-print the first ten.
FP_list = sorted(FP)
first_ten = FP_list[:10]
pprint(first_ten)

In [None]:
# Print the first element of every entry returned by get_feature_weights, one
# per line (presumably the feature name — TODO confirm).
print '\n'.join(unicode(x[0]) for x in disc_model.get_feature_weights(F_dev))

In [None]:
from hardware_utils import entity_to_candidates

# Drill into the first false-positive entity: look up the false-positive
# candidates behind it, then show one candidate's score and feature weights.
# NOTE(review): FP_list[0] fails when there are no false positives, and
# matches[0] presumably fails when no candidate maps to the entity.
entity = FP_list[0]
matches = entity_to_candidates(entity, fp)
print "Entity: (%d matches)" % len(matches)
print entity

candidate = matches[0]
print "\nCandidate:"
print candidate

print "\nScore:"
print disc_model.get_candidate_score(candidate, F_dev)

print "\nFeatures:"
pprint(disc_model.get_candidate_feature_weights(candidate, F_dev))

In [None]:
from hardware_utils import part_error_analysis
# Run the helper on the candidate selected in the previous cell.
part_error_analysis(candidate)