## Part II: `Candidate` Extraction

In [1]:
# If necessary: restore the working database from the 'snorkel.db corpus'
# snapshot (presumably the copy saved at the end of Part I -- confirm).
import os
import shutil

corpus_snapshot = 'snorkel.db corpus'
if os.path.exists(corpus_snapshot):
    # Copying overwrites snorkel.db in place, so no separate delete is needed
    # and the current database is never removed without a replacement.
    shutil.copy(corpus_snapshot, 'snorkel.db')
else:
    print("Snapshot %r not found; leaving snorkel.db untouched" % corpus_snapshot)

In [2]:
# Reload edited modules automatically so changes to helper code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

from snorkel import SnorkelSession
# Session handle used by every lookup, extraction and commit later in this notebook.
session = SnorkelSession()

## Loading the Training `Corpus`

First, we will load the `Corpus` that we preprocessed in Part I:

In [3]:
from snorkel.models import Corpus
from snorkel.utils import get_ORM_instance

# Look the training corpus up by name and report how many documents it holds.
training_corpus_name = 'Hardware Training'
corpus = get_ORM_instance(Corpus, session, training_corpus_name)
document_count = len(corpus)
print("%s contains %d Documents" % (corpus, document_count))

Corpus (Hardware Training) contains 80 Documents


## Defining a `Candidate` Schema

In [4]:
from snorkel.models import candidate_subclass

# Candidate type with two arguments: a 'part' span and a 'temp' span.
relation_name = 'Part_Temp'
argument_names = ['part', 'temp']
Part_Temp = candidate_subclass(relation_name, argument_names)

## Writing a basic `CandidateExtractor`

In [5]:
from hardware_utils import OmniNgramsPart, OmniNgramsTemp

# Both candidate spaces are built with the same n-gram settings.
ngram_settings = dict(n_max=3, split_tokens=None)
part_ngrams = OmniNgramsPart(**ngram_settings)
temp_ngrams = OmniNgramsTemp(**ngram_settings)

In [6]:
# from snorkel.matchers import DictionaryMatch

# from hardware_utils import load_extended_parts_dict
# gold_file ='data/hardware/hardware_gold.csv'
# parts_dict = load_extended_parts_dict(gold_file) # NOTE: this includes A/B/C/-16/-25/-40
# print "Loaded %d part numbers." % len(parts_dict)
# parts_matcher = DictionaryMatch(d=parts_dict)

In [7]:
from snorkel.matchers import RegexMatchSpan, Union

# One regex per part-numbering scheme (the variable names suggest EECA, JEDEC and
# JIS conventions plus a list of other vendor prefixes -- confirm against the data).
# Raw strings keep \s and \d intact for the regex engine instead of relying on
# Python passing unknown string escapes through unchanged.
eeca_matcher = RegexMatchSpan(rgx=r'([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,4}[\s]?[A-Z]{0,2}[0-9]?([-][A-Z0-9]{1,3})?)')
jedec_matcher = RegexMatchSpan(rgx=r'([123]N\d{3,4}[A-Z]?)')
jis_matcher = RegexMatchSpan(rgx=r'(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})')
others_matcher = RegexMatchSpan(rgx=r'((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,3})?)')

# A span counts as a part if any one of the individual matchers accepts it.
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

In [8]:
from snorkel.matchers import RegexMatchSpan

# A minus sign, a digit 5-7, then 0 or 5: i.e. -50 through -75 in steps of five.
temp_pattern = r'-[5-7][05]'
temp_matcher = RegexMatchSpan(rgx=temp_pattern)

In [None]:
from snorkel.candidates import CandidateExtractor

# Spaces and matchers are listed in the same order as the Part_Temp arguments
# ('part' first, 'temp' second).
candidate_spaces = [part_ngrams, temp_ngrams]
candidate_matchers = [parts_matcher, temp_matcher]
ce = CandidateExtractor(Part_Temp, candidate_spaces, candidate_matchers)

## Running the `CandidateExtractor`

In [None]:
# Run the extractor over the training documents (timed with %time). The set is
# named 'Hardware Training Candidates', which is the name later cells look up.
%time train = ce.extract(corpus.documents, 'Hardware Training Candidates', session)
print "%s contains %d Candidates" % (train, len(train))

-> for tc in self.matchers[i].apply([ngram]):
(Pdb) print ngram
TemporaryImplicitSpan("BC546BRL1G", parent=101, words=[0,0], position=[4])
(Pdb) print i
0
(Pdb) print self.matchers[i]
<snorkel.matchers.Union object at 0x113387e10>


In [None]:
# Show the first three extracted candidates as a quick sanity check.
preview = train[:3]
for candidate in preview:
    print(candidate)

### Saving the extracted candidates

In [None]:
# Register the extracted candidate set with the session, then commit so it can be
# reloaded by name in later cells.
session.add(train)
session.commit()

### Reloading the candidates

In [None]:
from snorkel.models import CandidateSet
from snorkel.utils import get_ORM_instance

# Fetch the saved candidate set by name and report its size.
training_set_name = 'Hardware Training Candidates'
train = get_ORM_instance(CandidateSet, session, training_set_name)
candidate_count = len(train)
print("%s contains %d Candidates" % (train, candidate_count))

### Repeating for development and test corpora

In [None]:
# Repeat extraction for each additional corpus, naming each result
# '<corpus name> Candidates'.
# NOTE(review): the section heading mentions development and test corpora, but
# only 'Hardware Development' is listed here -- confirm whether 'Hardware Test'
# should be added.
for corpus_name in ['Hardware Development']:
    # Rebinds the notebook-level `corpus` variable used by the training cells above.
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = ce.extract(corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
# Single commit after the loop covers every candidate set added above.
session.commit()

In [None]:
# train = get_ORM_instance(Corpus, session, 'Hardware Training')
# dev = get_ORM_instance(Corpus, session, 'Hardware Development')
# test = get_ORM_instance(Corpus, session, 'Hardware Test')
# trainies = [d.name for d in train.documents]
# len(trainies)
# for d in test.documents:
#     if d.name in trainies:
#         print 'YES!'
# # for d in test.documents[:10]: print d

## TEMPORARY - Assessing Total Recall

In [None]:
from hardware_utils import entity_level_total_recall
from snorkel.utils import get_ORM_instance

# Load the training and development candidate sets by name; .one() requires
# exactly one matching row for each.
candidate_sets = []
for set_name in ('Hardware Training Candidates', 'Hardware Development Candidates'):
    named_set_query = session.query(CandidateSet).filter(CandidateSet.name == set_name)
    candidate_sets.append(named_set_query.one())
train, dev = candidate_sets

# Pool every candidate from both sets into a single set.
total_set = set()
for candidate_set in candidate_sets:
    total_set.update(candidate_set)

In [None]:
# Number of distinct candidates pooled from the training and development sets.
print(len(total_set))

In [None]:
# Imported here so the cell does not depend on the optional "If necessary" cell
# at the top of the notebook having been run.
import os

# The gold labels are located relative to the SNORKELHOME environment variable,
# which must be set (a missing variable raises KeyError).
gold_file = os.path.join(
    os.environ['SNORKELHOME'],
    'tutorials', 'tables', 'data', 'hardware', 'hardware_gold.csv')
# The call returns three collections, unpacked as (tp, fp, fn).
(tp, fp, fn) = entity_level_total_recall(total_set, gold_file, relation=True)

In [None]:
# from snorkel.utils import ProgressBar
# target = sorted(list(fn))[-1]
# print target
# print "-------------------------------"
# pb = ProgressBar(len(candidates))
# for i, c in enumerate(list(candidates)[:]):
#     pb.bar(i)
#     if (c.part.parent.document.name.upper() == target[0].upper())
#         and c.part.get_span().upper() == target[1].upper()):
#         print c
# pb.close()
# First 15 entries (in sorted order) of the tp collection, then of the fn
# collection, separated by a rule.
tp_sample = sorted(tp)[:15]
fn_sample = sorted(fn)[:15]
for entry in tp_sample:
    print(entry)
print("-------------------------------")
for entry in fn_sample:
    print(entry)

In [None]:
# List every pooled candidate whose part span is the part number under
# investigation. Printing (rather than pdb.set_trace()) keeps the notebook
# runnable end-to-end without blocking on an interactive debugger.
target_part = 'BC546BZL1G'
matching_candidates = [c for c in total_set if c.part.get_span() == target_part]
print("%d candidates with part span %s" % (len(matching_candidates), target_part))
for c in matching_candidates:
    print(c)

In [None]:
# corpus = session.query(Corpus).filter(Corpus.name == 'Hardware').one()

# for doc in corpus.documents:
#     if doc.name == 'PNJIS00254-1':
#         d = doc
#         break
# print d

In [None]:
# for phrase in d.phrases:
#     if '55' in phrase.words:
#         p = phrase
#         print p.cell
#         import pdb; pdb.set_trace()

In [None]:
# candies = sorted(candidates, key=lambda x: x[0])

In [None]:
# from hardware_utils import count_hardware_labels

# filename = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# %time count_hardware_labels(candidates, filename, attrib='stg_temp_min', attrib_class='temp')

## TEMPORARY - Return to Normalcy

In [None]:
# If necessary: snapshot the database, now containing the extracted candidates,
# to 'snorkel.db candidates'.
import os
import shutil

candidates_snapshot = 'snorkel.db candidates'
if os.path.exists('snorkel.db'):
    shutil.copy('snorkel.db', candidates_snapshot)
else:
    # Report the problem instead of failing silently as a shell `cp` would.
    print("snorkel.db not found; no snapshot written to %r" % candidates_snapshot)

Next, in Part III, we will load `Labels` for each of our `Candidates` so that we can evaluate performance.