## Part II: `Candidate` Extraction

In [None]:
# If necessary: roll snorkel.db back to the "snorkel.db corpus" snapshot
# (presumably saved at the end of Part I -- confirm).
#
# shutil.copy overwrites the destination in place, so the live database is
# only touched once the snapshot has been opened successfully. A missing
# snapshot raises IOError instead of silently leaving the notebook with no
# database at all, and a missing snorkel.db is no longer an error.
import shutil
shutil.copy('snorkel.db corpus', 'snorkel.db');

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to helper modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# `session` is the database session used by every query, add and commit in
# the rest of this notebook.
from snorkel import SnorkelSession
session = SnorkelSession()

## Loading the `Corpus`

First, we will load the `Corpus` that we preprocessed in Part I:

In [2]:
from snorkel.models import Corpus

corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Training').one()
print "%s contains %d Documents" % (corpus, len(corpus))

Corpus (Hardware Training) contains 75 Documents


## Defining a `Candidate` Schema

In [3]:
from snorkel.models import candidate_subclass

# Declare the candidate type extracted in this notebook: a relation named
# 'Part_Temp' with two arguments, `part` and `temp` (later cells read
# `c.part` on the resulting candidates).
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

## Writing a basic `CandidateExtractor`

In [None]:
from snorkel.candidates import OmniNgrams
from hardware_utils import OmniNgramsHardware

# Two span generators configured identically (n_max=3, split_tokens=None).
# NOTE(review): presumably n_max caps the n-gram length at three tokens and
# OmniNgramsHardware is a hardware-specific variant of OmniNgrams -- confirm
# against hardware_utils.
omni_ngrams = OmniNgrams(n_max=3, split_tokens=None)
hardware_ngrams = OmniNgramsHardware(n_max=3, split_tokens=None)

In [None]:
from hardware_utils import load_extended_parts_dict

# Build the part-number dictionary from the gold CSV. The path is relative,
# so this cell must run from a working directory that contains data/hardware/.
filename='data/hardware/hardware_gold.csv'
parts_dict = load_extended_parts_dict(filename)
print "Loaded %d part numbers." % len(parts_dict)

In [None]:
from snorkel.matchers import DictionaryMatch, RegexMatchSpan

# Part matcher: dictionary lookup against parts_dict, with ignore_case=True.
parts_matcher = DictionaryMatch(d=parts_dict, ignore_case=True)
# Temperature matcher: a minus sign, at most one whitespace character, a
# digit 5-7, then 0 or 5 -- i.e. the values -50, -55, -60, -65, -70, -75.
temp_matcher = RegexMatchSpan(rgx=ur'-\s?[5-7][05]')

In [None]:
from snorkel.candidates import CandidateExtractor

# Build the extractor for Part_Temp candidates. The two lists are given in
# the same order as the schema arguments ['part', 'temp'].
# NOTE(review): presumably they are paired positionally (hardware_ngrams with
# parts_matcher for `part`, omni_ngrams with temp_matcher for `temp`) --
# confirm against CandidateExtractor.
ce = CandidateExtractor(Part_Temp, [hardware_ngrams, omni_ngrams], [parts_matcher, temp_matcher])

## Running the `CandidateExtractor`

In [None]:
# Run extraction over the training documents, timed with %time. The result is
# given the name 'Hardware Training Candidates', which is the name the reload
# cell below filters on.
%time candidates = ce.extract(corpus.documents, 'Hardware Training Candidates', session)
print "%s contains %d Candidates" % (candidates, len(candidates))

In [None]:
# Print the first five extracted candidates as a quick sanity check.
# Note: the loop variable `c` stays bound in the kernel after this cell runs.
for c in candidates[:5]:
    print c

### Saving the extracted candidates

In [None]:
# Stage the extracted candidate set on the session and commit it to the
# database so it can be queried again by name.
session.add(candidates)
session.commit()

### Reloading the candidates

In [4]:
from snorkel.models import CandidateSet

candidates = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()
print "%s contains %d Candidates" % (candidates, len(candidates))

Candidate Set (Hardware Training Candidates) contains 26359 Candidates


### Repeating for development and test corpora

In [None]:
for corpus_name in ['Hardware Development', 'Hardware Test']:
    corpus = session.query(Corpus).filter(Corpus.name == corpus_name).one()
    %time candidates = ce.extract(corpus.documents, corpus_name + ' Candidates', session)
    session.add(c)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()

## TEMPORARY - Assessing Total Recall

In [5]:
# TEMP
from hardware_utils import entity_level_total_recall

# Reload the saved training and development candidate sets by name.
train = (session.query(CandidateSet)
                .filter(CandidateSet.name == 'Hardware Training Candidates')
                .one())
dev = (session.query(CandidateSet)
              .filter(CandidateSet.name == 'Hardware Development Candidates')
              .one())

# Union of both sets, used below to measure recall over train + dev together.
total_set = set()
for candidate_set in (train, dev):
    for c in candidate_set:
        total_set.add(c)

In [6]:
# Size of the combined train + dev candidate set, followed by one example
# candidate to show what a Part_Temp looks like.
print len(total_set)
print train[0]

34488
Part_Temp(ImplicitSpan("BC846AWA", parent=108606, words=[0,0], position=[4]), Span("-55", parent=108455, chars=[0,2], words=[0,0]))


In [9]:
import os
# Gold CSV located under the directory named by the SNORKELHOME environment
# variable; os.environ[...] raises KeyError if that variable is not set.
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# Score the combined candidates against the gold file for 'stg_temp_min'.
# NOTE(review): presumably 'stg_temp_min' selects the gold attribute to score
# and (tp, fp, fn) are true positives, false positives and false negatives --
# confirm against hardware_utils.entity_level_total_recall.
(tp, fp, fn) = entity_level_total_recall(total_set, gold_file, 'stg_temp_min')

Preparing gold set...
Preparing candidates...

Scoring on Entity-Level Total Recall
Entity-level Candidates extracted: 2758 
Entity-level Gold: 542
Intersection Candidates: 488
----------------------------------------
Overlap with Gold:  0.90



In [34]:
target = sorted(list(fn))[3]
print target
print "-------------------------------"
for c in list(total_set)[:]:
# #     print c.part.parent.document.name.upper()
# #     print target[0].upper()
# #     print c.part.get_span().upper()
# #     print target[1].upper()
    if (c.part.parent.document.name.upper() == 'BC337-D'): #target[0].upper()):
# #         and c.part.get_span().upper() == target[1].upper()):
        print c

# for c in sorted(list(fn))[:10]:
#     print c

('BC818-40LT1-D', 'BC818-40L', '-55')
-------------------------------


## TEMPORARY - Return to Normalcy

In [None]:
# If necessary: snapshot the database, now containing the extracted
# candidates, as "snorkel.db candidates".
#
# shutil.copy raises IOError if the copy fails, whereas the previous
# os.system('cp ...') call discarded the exit status, so a failed backup
# went unnoticed.
import shutil
shutil.copy('snorkel.db', 'snorkel.db candidates');

Next, in Part III, we will load `Labels` for each of our `Candidates` so that we can evaluate performance.