## Part II: `Candidate` Extraction

In [1]:
# If necessary: reset the working database from the 'snorkel.db corpus' snapshot
# (presumably the copy saved after corpus preprocessing in Part I -- confirm).
import os
import shutil

corpus_snapshot = 'snorkel.db corpus'
if os.path.exists(corpus_snapshot):
    # shutil.copy overwrites an existing destination, so no separate os.remove is
    # needed, and the current database is only replaced when the snapshot exists.
    shutil.copy(corpus_snapshot, 'snorkel.db')
else:
    # Keep whatever snorkel.db is present rather than deleting it with no
    # snapshot to restore from.
    print("Snapshot %r not found; leaving snorkel.db unchanged." % corpus_snapshot)

In [2]:
# Enable IPython's autoreload extension (mode 2) so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Create the session object that the rest of this notebook uses for queries,
# candidate extraction, and commits.
from snorkel import SnorkelSession
session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Loading the `Corpus`

First, we will load the `Corpus` that we preprocessed in Part I:

In [3]:
from snorkel.models import Corpus

# Look up the corpus named 'Hardware Training' and report how many documents it holds.
corpus = (
    session.query(Corpus)
    .filter(Corpus.name == 'Hardware Training')
    .one()
)
print("%s contains %d Documents" % (corpus, len(corpus)))

Corpus (Hardware Training) contains 50 Documents


## Defining a `Candidate` Schema

In [4]:
from snorkel.models import candidate_subclass

# Candidate type named 'Part_Temp' with two arguments, 'part' and 'temp'.
Part_Temp = candidate_subclass(
    'Part_Temp',
    ['part', 'temp'],
)

## Writing a basic `CandidateExtractor`

In [5]:
from snorkel.candidates import OmniNgrams

# Shared candidate space used for both arguments of the extractor below.
# NOTE(review): n_max=3 presumably caps n-grams at three tokens -- confirm.
omni_ngrams = OmniNgrams(
    n_max=3,
    split_tokens=None,
)

In [6]:
from hardware_utils import load_extended_parts_dict

# Build the part-number dictionary from the gold CSV and report its size.
filename = 'data/hardware/hardware_gold.csv'
parts_dict = load_extended_parts_dict(filename)
print("Loaded %d part numbers." % (len(parts_dict),))

Loaded 5331 part numbers.


In [7]:
from snorkel.matchers import DictionaryMatch, RegexMatchSpan

parts_matcher = DictionaryMatch(d=parts_dict, ignore_case=True)
temp_matcher = RegexMatchSpan(rgx=ur'-\s?[5-7][05]')

In [8]:
from snorkel.candidates import CandidateExtractor

# One candidate space and one matcher per argument of Part_Temp, in order:
# first the part, then the temperature.
ce = CandidateExtractor(
    Part_Temp,
    [omni_ngrams, omni_ngrams],
    [parts_matcher, temp_matcher],
)

## Running the `CandidateExtractor`

In [9]:
# Run the extractor over the training corpus documents, timing the call with %time,
# and name the resulting set 'Hardware Training Candidates'.
%time candidates = ce.extract(corpus.documents, 'Hardware Training Candidates', session)
# Report the size of the extracted set.
print "%s contains %d Candidates" % (candidates, len(candidates))


CPU times: user 14.2 s, sys: 554 ms, total: 14.7 s
Wall time: 14.6 s
Candidate Set (Hardware Training Candidates) contains 147 Candidates


In [10]:
# Preview the first five extracted candidates.
for c in candidates[:5]:
    print(c)

Part_Temp(Span("MMBT3904", parent=32533, chars=[100,107], words=[3,3]), Span("-55", parent=103465, chars=[0,2], words=[0,0]))
Part_Temp(Span("BC818K", parent=2350, chars=[9,14], words=[2,2]), Span("-55", parent=103833, chars=[0,2], words=[0,0]))
Part_Temp(Span("BC817K-25", parent=33355, chars=[22,30], words=[5,5]), Span("-55", parent=103833, chars=[0,2], words=[0,0]))
Part_Temp(Span("BC817K-40", parent=33355, chars=[44,52], words=[10,10]), Span("-55", parent=103833, chars=[0,2], words=[0,0]))
Part_Temp(Span("BC547C", parent=83138, chars=[9,14], words=[2,2]), Span("-65", parent=124987, chars=[0,2], words=[0,0]))


### Saving the extracted candidates

In [11]:
# Register the training candidate set with the session, then commit so it can be
# queried back by name (as the next cell does).
session.add(candidates)
session.commit()

### Reloading the candidates

In [12]:
from snorkel.models import CandidateSet

# Fetch the saved candidate set back by its name and confirm its size.
candidates = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Training Candidates')
    .one()
)
print("%s contains %d Candidates" % (candidates, len(candidates)))

Candidate Set (Hardware Training Candidates) contains 147 Candidates


### Repeating for development and test corpora

In [13]:
for corpus_name in ['Hardware Development', 'Hardware Test']:
    corpus = session.query(Corpus).filter(Corpus.name == corpus_name).one()
    %time candidates = ce.extract(corpus.documents, corpus_name + ' Candidates', session)
    session.add(c)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()


CPU times: user 7.44 s, sys: 229 ms, total: 7.67 s
Wall time: 7.66 s
Candidate Set (Hardware Development Candidates) contains 405 Candidates

CPU times: user 7.13 s, sys: 161 ms, total: 7.29 s
Wall time: 7.24 s
Candidate Set (Hardware Test Candidates) contains 80 Candidates


In [14]:
# If necessary: uncomment to copy snorkel.db to a snapshot file named
# 'snorkel.db candidates' after candidate extraction.
# import os
# os.system('cp snorkel.db snorkel.db\ candidates');

Next, in Part III, we will load `Labels` for each of our `Candidates` so that we can evaluate performance.