## Part II: `Candidate` Extraction

In [1]:
# If necessary: restore the working database 'snorkel.db' from the snapshot
# file named 'snorkel.db corpus'.
import os
import shutil

# shutil.copy overwrites an existing 'snorkel.db' and raises if the snapshot
# cannot be read. The previous os.remove + shell `cp` sequence crashed when
# 'snorkel.db' was absent, and deleted it *before* an unchecked copy, so a
# missing snapshot silently left no database at all.
shutil.copy('snorkel.db corpus', 'snorkel.db');

In [2]:
# Enable the autoreload extension in mode 2 so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

from snorkel import SnorkelSession
# `session` is the handle the later cells use for queries, adds and commits.
session = SnorkelSession()

## Loading the `Corpus`

First, we will load the `Corpus` that we preprocessed in Part I:

In [3]:
from snorkel.models import Corpus

# Fetch the corpus whose name is 'Hardware' and report its size.
corpus = (
    session.query(Corpus)
    .filter(Corpus.name == 'Hardware')
    .one()
)
# Single-argument parenthesised print behaves the same as the print statement.
print("%s contains %d Documents" % (corpus, len(corpus)))

Corpus (Hardware) contains 100 Documents


## Defining a `Candidate` Schema

In [4]:
from snorkel.models import candidate_subclass

# Create the candidate class used throughout this notebook. The list
# presumably names the candidate's fields ('part' and 'temp') -- confirm
# against snorkel.models.candidate_subclass.
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])
# Disabled alternative: a single-field 'Part' schema.
# Part = candidate_subclass('Part', ['part'])

## Writing a basic `CandidateExtractor`

In [5]:
# Disabled: the generic snorkel n-gram class; the hardware-specific ones below
# are used instead.
# from snorkel.candidates import OmniNgrams
from hardware_utils import OmniNgramsPart, OmniNgramsTemp

# One n-gram generator per candidate field, both built with n_max=3 and no
# split tokens (n_max presumably caps the n-gram length -- confirm in
# hardware_utils).
part_ngrams = OmniNgramsPart(n_max=3, split_tokens=None)
temp_ngrams = OmniNgramsTemp(n_max=3, split_tokens=None)

In [6]:
from hardware_utils import load_extended_parts_dict

# Hand the gold CSV path to the loader and report how many entries came back.
filename = 'data/hardware/hardware_gold.csv'
parts_dict = load_extended_parts_dict(filename)
print("Loaded %d part numbers." % len(parts_dict))

Loaded 1286 part numbers.


In [7]:
from snorkel.matchers import DictionaryMatch, RegexMatchSpan

# Part matcher: dictionary lookup against parts_dict, ignoring case.
parts_matcher = DictionaryMatch(d=parts_dict, ignore_case=True)
# Temperature matcher: the pattern accepts '-' followed by one digit in 5-7
# and then 0 or 5, e.g. -55 or -65.
temp_matcher = RegexMatchSpan(rgx=r'-[5-7][05]')

In [8]:
from snorkel.candidates import CandidateExtractor

# The n-gram generators and the matchers are passed in the same order:
# part first, then temp.
ce = CandidateExtractor(Part_Temp, [part_ngrams, temp_ngrams], [parts_matcher, temp_matcher])
# Disabled alternative for the single-field 'Part' schema.
# ce = CandidateExtractor(Part, [hardware_ngrams], [parts_matcher])

## Running the `CandidateExtractor`

In [9]:
%time candidates = ce.extract(corpus.documents, 'Hardware Candidates', session)
print "%s contains %d Candidates" % (candidates, len(candidates))


CPU times: user 1min 11s, sys: 1.67 s, total: 1min 13s
Wall time: 1min 14s
Candidate Set (Hardware Candidates) contains 12617 Candidates


In [10]:
# Preview the first five extracted candidates.
for c in candidates[:5]:
    print(c)

Part_Temp(Span("BC846W", parent=34437, chars=[115,120], words=[3,3]), ImplicitSpan("-55", parent=90183, words=[0,0], position=[0]))
Part_Temp(Span("BC846AW", parent=90266, chars=[0,6], words=[0,0]), ImplicitSpan("-55", parent=90183, words=[0,0], position=[0]))
Part_Temp(Span("BC846BW", parent=90269, chars=[0,6], words=[0,0]), ImplicitSpan("-55", parent=90183, words=[0,0], position=[0]))
Part_Temp(Span("BC846W", parent=3123, chars=[0,5], words=[0,0]), ImplicitSpan("-55", parent=90183, words=[0,0], position=[0]))
Part_Temp(Span("BC807", parent=1920, chars=[0,4], words=[0,0]), ImplicitSpan("-55", parent=87050, words=[0,0], position=[0]))


### Saving the extracted candidates

In [11]:
# Register the candidate set with the session, then commit.
session.add(candidates)
session.commit()

### Reloading the candidates

In [12]:
from snorkel.models import CandidateSet

# Read the candidate set back by its name and report its size.
candidates = (
    session.query(CandidateSet)
    .filter(CandidateSet.name == 'Hardware Candidates')
    .one()
)
print("%s contains %d Candidates" % (candidates, len(candidates)))

Candidate Set (Hardware Candidates) contains 12617 Candidates


### Repeating for development and test corpora

In [13]:
# for corpus_name in ['Hardware Development', 'Hardware Test']:
#     corpus = session.query(Corpus).filter(Corpus.name == corpus_name).one()
#     %time candidates = ce.extract(corpus.documents, corpus_name + ' Candidates', session)
#     session.add(candidates)
#     print "%s contains %d Candidates" % (candidates, len(candidates))
# session.commit()

## TEMPORARY - Assessing Total Recall

In [14]:
# TEMP
# from hardware_utils import entity_level_total_recall
# from snorkel.utils import get_ORM_instance

# candidates = get_ORM_instance(CandidateSet, session, 'Hardware Candidates')
# train = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Training Candidates').one()
# dev = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Development Candidates').one()
# total_set = set([])
# for c in train:
#     total_set.add(c)
# for c in dev:
#     total_set.add(c)

In [15]:
import os

# Imported in this cell because the only other import of this helper in the
# notebook is commented out; without it the call below raises NameError on a
# fresh kernel (Restart & Run All).
from hardware_utils import entity_level_total_recall

# The gold CSV is located relative to the directory named by $SNORKELHOME;
# a missing variable raises KeyError here.
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# The helper returns a triple, presumably true positives, false positives and
# false negatives -- confirm against hardware_utils. `fn` is listed in a later
# cell.
(tp, fp, fn) = entity_level_total_recall(candidates, gold_file, relation=True)

Preparing gold set...
Preparing candidates...

Scoring on Entity-Level Total Recall
Entity-level Candidates extracted: 1041 
Entity-level Gold: 542
Intersection Candidates: 425
----------------------------------------
Overlap with Gold:  0.78



In [16]:
# from snorkel.utils import ProgressBar
# target = sorted(list(fn))[-1]
# print target
# print "-------------------------------"
# pb = ProgressBar(len(candidates))
# for i, c in enumerate(list(candidates)[:]):
#     pb.bar(i)
#     if (c.part.parent.document.name.upper() == target[0].upper()
#         and c.part.get_span().upper() == target[1].upper()):
#         print c
# pb.close()
# print "-------------------------------"
# List the entries of `fn` in sorted order; sorted() already returns a fresh
# list, so it is iterated directly.
for c in sorted(fn):
    print(c)

('BC546', 'BC548A', '-65')
('BC547', 'BC548A', '-65')
('BC547', 'BC549A', '-65')
('BC547', 'BC550A', '-65')
('BC547', 'BC550B', '-65')
('BC550', 'BC546A', '-65')
('BC550', 'BC546B', '-65')
('BC550', 'BC546C', '-65')
('BC550', 'BC547A', '-65')
('BC550', 'BC547B', '-65')
('BC550', 'BC547C', '-65')
('BC550', 'BC548A', '-65')
('BC550', 'BC548B', '-65')
('BC550', 'BC548C', '-65')
('BC550', 'BC549A', '-65')
('BC550', 'BC549B', '-65')
('BC550', 'BC549C', '-65')
('BC818', 'BC817-16', '-65')
('BC818', 'BC817-25', '-65')
('BC818', 'BC817-40', '-65')
('BC818', 'BC818-16', '-65')
('BC818', 'BC818-25', '-65')
('BC818', 'BC818-40', '-65')
('CSEMS05382-1', 'BC546A', '-65')
('FAIRS19194-1', 'BC856A', '-65')
('FAIRS19194-1', 'BC856B', '-65')
('FAIRS19194-1', 'BC856C', '-65')
('FAIRS19194-1', 'BC857A', '-65')
('FAIRS19194-1', 'BC857B', '-65')
('FAIRS19194-1', 'BC857C', '-65')
('FAIRS19194-1', 'BC858A', '-65')
('FAIRS19194-1', 'BC858B', '-65')
('FAIRS19194-1', 'BC858C', '-65')
('FAIRS19194-1', 'BC859A', 

In [17]:
# corpus = session.query(Corpus).filter(Corpus.name == 'Hardware').one()

# for doc in corpus.documents:
#     if doc.name == 'PNJIS00254-1':
#         d = doc
#         break
# print d

In [18]:
# for phrase in d.phrases:
#     if '55' in phrase.words:
#         p = phrase
#         print p.cell
#         import pdb; pdb.set_trace()

In [19]:
# candies = sorted(candidates, key=lambda x: x[0])

In [20]:
# from hardware_utils import count_hardware_labels

# filename = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# %time count_hardware_labels(candidates, filename, attrib='stg_temp_min', attrib_class='temp')

## TEMPORARY - Return to Normalcy

In [21]:
# If necessary: snapshot the working database 'snorkel.db' to a file named
# 'snorkel.db candidates'.
import os
import shutil

# shutil.copy raises if the copy fails, whereas `cp` run through os.system
# only returned an exit status that was discarded.
shutil.copy('snorkel.db', 'snorkel.db candidates');

Next, in Part III, we will load `Labels` for each of our `Candidates` so that we can evaluate performance.