## Part II: `Candidate` Extraction

In [1]:
# If necessary: replace snorkel.db with the saved 'snorkel.db corpus' copy.
import os
import shutil

snapshot_path = 'snorkel.db corpus'
if not os.path.exists(snapshot_path):
    # Fail before touching snorkel.db, so a missing snapshot can never leave
    # the notebook without any database at all.
    raise IOError("Snapshot %r not found; snorkel.db left untouched" % snapshot_path)
# copyfile overwrites the destination, so no separate os.remove (which raised
# when snorkel.db did not exist yet) is needed, and no shell is involved.
shutil.copyfile(snapshot_path, 'snorkel.db');

In [2]:
# Enable IPython's autoreload extension; mode 2 picks up edits to imported modules live.
%load_ext autoreload
%autoreload 2

# `session` is the handle the cells below use for every lookup, add and commit.
from snorkel import SnorkelSession
session = SnorkelSession()

## Loading the Training `Corpus`

First, we will load the `Corpus` that we preprocessed in Part I:

In [3]:
from snorkel.models import Corpus
from snorkel.utils import get_ORM_instance

# Look up the corpus by name and report how many documents it holds.
corpus = get_ORM_instance(Corpus, session, 'Hardware Development')
print("%s contains %d Documents" % (corpus, len(corpus)))

Corpus (Hardware Development) contains 50 Documents


## Defining a `Candidate` Schema

In [4]:
from snorkel.models import candidate_subclass

# Candidate type for this task, declared with two named arguments: 'part' and 'temp'.
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

## Writing a basic `CandidateExtractor`

In [5]:
from hardware_utils import OmniNgramsPart, OmniNgramsTemp

# One span generator per candidate argument (part, then temp).
# NOTE(review): n_max=3 presumably caps n-grams at three tokens and split_tokens=None
# presumably disables token splitting -- confirm against hardware_utils.
part_ngrams = OmniNgramsPart(n_max=3, split_tokens=None)
temp_ngrams = OmniNgramsTemp(n_max=3, split_tokens=None)

In [6]:
from hardware_utils import load_extended_parts_dict

# Build the part-number dictionary from the gold CSV and report its size.
gold_file = 'data/hardware/hardware_gold.csv'
parts_dict = load_extended_parts_dict(gold_file)
print("Loaded %d part numbers." % len(parts_dict))

Loaded 1286 part numbers.


In [13]:
# Peek at (at most) the first 100 entries of the parts dictionary.
print(list(parts_dict)[:100])

['', 'BC857', 'BC856', 'BC857CA', 'BC850', 'BC857CC', 'BC857CB', 'BC 860 C', 'BC 860 B', 'BC 860 A', 'PZT3904A', 'BC859', 'BC858', 'BC184A', 'BC184B', 'BC184C', 'BC860CWA', 'BC860CWC', 'BC860CWB', 'BC239A', 'SMBT3904B', 'BC239C', 'BC239B', 'BC182BC', 'DTC143TE-25', 'BC549AC', 'BC549AB', 'BC549AA', 'MMBT3904-16', 'BC807W-25', 'BC550B-16', 'BC817-25W-25', 'BC817-40A', 'BC859A-16', 'BC237B-16', 'BC847CW', 'BC856C-16', 'BC846AW-16', 'BC807-16W-25', 'BC850B', 'BC850C', 'BC338-25-25', 'DTC114EEB', 'BC808-25-25', 'BC847BW-25', 'BC817-40WC', 'BC817-40WB', 'BC817-40WA', 'BC858C-3L', 'BC548A-40', 'BC846BTC', 'BC846BTB', 'BC859A-Z4A-25', 'MMBT3906B', 'BC856A-16', 'BC858CWC', 'DTC143ZEC', 'BC547B-40', 'BC857B-40', 'NSVBC818-40L-16', 'BC327-40', 'MMBT4124', 'BC546B-16', 'BC547CC', 'BC327-40-16', 'BC859C-25', 'BC808-40', 'BC857CW', 'BC857B-3FB', 'BC848BWA', 'BC 807 W', 'MMBT6427-16', 'BC860A-16', 'BC858C-40', 'BC549C-16', 'BC849BWA', 'BC550C-40', 'BC849BWC', 'BC849BWB', 'DTC114EUA-16', 'BC860C-4GZB'

In [7]:
from snorkel.matchers import DictionaryMatch, RegexMatchSpan

# Parts: dictionary matcher over parts_dict, with case ignored.
parts_matcher = DictionaryMatch(d=parts_dict, ignore_case=True)
# Temperatures: '-' followed by a digit 5-7 and then 0 or 5,
# i.e. -50, -55, -60, -65, -70 or -75.
temp_matcher = RegexMatchSpan(rgx=r'-[5-7][05]')

In [8]:
from snorkel.candidates import CandidateExtractor

# Pair the span generators with their matchers; both lists follow the same
# (part, temp) order as the Part_Temp arguments.
ce = CandidateExtractor(Part_Temp, [part_ngrams, temp_ngrams], [parts_matcher, temp_matcher])

## Running the `CandidateExtractor`

In [9]:
%time train = ce.extract(corpus.documents, 'Hardware Development Candidates', session)
print "%s contains %d Candidates" % (train, len(train))


CPU times: user 38.3 s, sys: 717 ms, total: 39 s
Wall time: 38.8 s
Candidate Set (Hardware Development Candidates) contains 6293 Candidates


In [10]:
# Number of tables attached to the first document of the corpus.
print(len(corpus.documents[0].tables))

9


In [11]:
# Show the first three extracted candidates.
for c in train[:3]:
    print(c)

Part_Temp(ImplicitSpan("BC807-25", parent=125805, words=[0,0], position=[0]), ImplicitSpan("-55", parent=155184, words=[0,0], position=[0]))
Part_Temp(ImplicitSpan("BC807-25", parent=125805, words=[0,0], position=[0]), ImplicitSpan("-50", parent=155146, words=[0,0], position=[0]))
Part_Temp(ImplicitSpan("BC807-25", parent=125805, words=[0,0], position=[0]), ImplicitSpan("-50", parent=155200, words=[0,0], position=[0]))


### Saving the extracted candidates

In [None]:
# Register the extracted candidate set with the session, then commit it.
session.add(train)
session.commit()

### Reloading the candidates

In [None]:
from snorkel.models import CandidateSet
from snorkel.utils import get_ORM_instance

# Reload the candidate set by name. The original cell looked it up twice
# (helper call, then an explicit query that overwrote the result); one lookup
# through the helper used elsewhere in this notebook is enough.
# NOTE(review): the set created above is named 'Hardware Development Candidates';
# confirm that a 'Hardware Training Candidates' set exists in this database.
candidates = get_ORM_instance(CandidateSet, session, 'Hardware Training Candidates')
print("%s contains %d Candidates" % (candidates, len(candidates)))

### Repeating for development and test corpora

In [None]:
for corpus_name in ['Hardware Development'] # Skip 'Hardware Test' for now
    corpus = get_ORM_instance(Corpus, session, corpus_name)
#     corpus = session.query(Corpus).filter(Corpus.name == corpus_name).one()
    %time candidates = ce.extract(corpus.documents, corpus_name + ' Candidates', session)
    session.add(c)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()

## TEMPORARY - Assessing Total Recall

In [None]:
# TEMP
# from hardware_utils import entity_level_total_recall
# from snorkel.utils import get_ORM_instance

# candidates = get_ORM_instance(CandidateSet, session, 'Hardware Candidates')
# train = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Training Candidates').one()
# dev = session.query(CandidateSet).filter(
#     CandidateSet.name == 'Hardware Development Candidates').one()
# total_set = set([])
# for c in train:
#     total_set.add(c)
# for c in dev:
#     total_set.add(c)

In [None]:
# import os
# gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# (tp, fp, fn) = entity_level_total_recall(candidates, gold_file, relation=True)

In [None]:
# from snorkel.utils import ProgressBar
# target = sorted(list(fn))[-1]
# print target
# print "-------------------------------"
# pb = ProgressBar(len(candidates))
# for i, c in enumerate(list(candidates)[:]):
#     pb.bar(i)
#     if (c.part.parent.document.name.upper() == target[0].upper())
#         and c.part.get_span().upper() == target[1].upper()):
#         print c
# pb.close()
# print "-------------------------------"
# for c in sorted(list(fn))[:]:
#     print c

In [None]:
# corpus = session.query(Corpus).filter(Corpus.name == 'Hardware').one()

# for doc in corpus.documents:
#     if doc.name == 'PNJIS00254-1':
#         d = doc
#         break
# print d

In [None]:
# for phrase in d.phrases:
#     if '55' in phrase.words:
#         p = phrase
#         print p.cell
#         import pdb; pdb.set_trace()

In [None]:
# candies = sorted(candidates, key=lambda x: x[0])

In [None]:
# from hardware_utils import count_hardware_labels

# filename = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
# %time count_hardware_labels(candidates, filename, attrib='stg_temp_min', attrib_class='temp')

## TEMPORARY - Return to Normalcy

In [None]:
# If necessary
# import os
# os.system('cp snorkel.db snorkel.db\ candidates');

Next, in Part III, we will load `Labels` for each of our `Candidates` so that we can evaluate performance.