In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')
snorkel_postgres = os.environ.get('SNORKELDB','').startswith('postgres')
print snorkel_postgres

PARALLEL = 10
PARALLEL_EXTRACTION = 8

True


### Parse

In [2]:
# If necessary: start from a clean database so that state left over from an
# earlier run cannot leak into this one.
import errno
import os

if snorkel_postgres:
    # Postgres backend: drop and recreate the scratch database. The exit
    # status of dropdb is deliberately ignored (best effort), since the
    # database may simply not exist yet on a first run.
    os.environ['SNORKELDBNAME'] = 'parts_regex_test'
    os.system("dropdb " + os.environ['SNORKELDBNAME'])
    os.system("createdb " + os.environ['SNORKELDBNAME'])
else:
    # Non-postgres backend: remove the local snorkel.db file.
    try:
        os.remove('snorkel.db')
    except OSError as e:
        # A missing file means there is nothing to reset. Any other failure
        # (e.g. permissions) would silently leave stale data behind, so it is
        # re-raised instead of being swallowed.
        if e.errno != errno.ENOENT:
            raise

# NOTE(review): the import is kept after the environment setup above,
# presumably because snorkel reads SNORKELDBNAME when imported - confirm.
from snorkel import SnorkelSession
session = SnorkelSession()

In [3]:
import os

docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/test/html/'
pdf_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/test/pdf/'

if snorkel_postgres:
    from snorkel.async_parser import parse_corpus, HTMLParser, AsyncOmniParser
   
    # PARSE TEST
    print "Starting async parse..."
    doc_parser = HTMLParser()
    context_parser = AsyncOmniParser(blacklist=['style'], flatten=['span','br'], 
                                     tabular=True, lingual=True,
                                     visual=True, pdf_path=pdf_path)
    %time corpus = parse_corpus(session, 'Hardware Test', docs_path,\
                                doc_parser, context_parser,\
                                max_docs=125, parallel=PARALLEL)
    
else:
    pass

print "%s contains %d documents" % (corpus, len(corpus))

Starting async parse...
CPU times: user 284 ms, sys: 148 ms, total: 432 ms
Wall time: 49.4 s
Corpus (Hardware Test) contains 75 documents


### Extraction

In [4]:
from snorkel.models import candidate_subclass
from hardware_spaces import get_space
from hardware_matchers import get_matcher

# Candidate class named 'Part' with a single argument, 'part'.
Part = candidate_subclass('Part', ['part'])

# Candidate space and matcher for 'part', both supplied by the tutorial
# helper modules; the matcher is given the path to a part dictionary CSV.
part_ngrams = get_space('part')
dict_path = (os.environ['SNORKELHOME'] +
             '/tutorials/tables/data/hardware/gold_raw/digikey_part_dictionary.csv')
part_matcher = get_matcher('part', dict_path=dict_path)

Using combined matcher.


In [5]:
# Disabled experiment: an alternative, purely regex-based definition of part_matcher.
# from snorkel.matchers import RegexMatchSpan, Inverse, Intersect
# long_enough = RegexMatchSpan(rgx='[A-Z2][A-Z0-9\-]{4,}(?:\/(?:DG|HF))?', ignore_case=False)
# at_least_one_number = RegexMatchSpan(rgx='[A-Z0-9\-\/]*[0-9][A-Z0-9\-\/]*', ignore_case=False)
# at_least_one_letter = RegexMatchSpan(rgx='[A-Z0-9\-\/]*[A-Z][A-Z0-9\-\/]*', ignore_case=False) 
# bad_form = Inverse(RegexMatchSpan(rgx='(?:[A-Z]+\d)|(?:[A-Z0-9\-\/]{0,2}\-[A-Z0-9\-\/]*)'))
# bad_start = Inverse(RegexMatchSpan(rgx='(?:T[O0]|SOT|BRD|FIGURE|EHP)[A-Z0-9\-\/]*'))
# bad_end = Inverse(RegexMatchSpan(rgx='[A-Z0-9\-\/]*(TYP|MAX|MIN|BASE|HZ|MM|VDC|VDF)'))
# part_matcher = Intersect(long_enough, at_least_one_number, at_least_one_letter, 
#                          bad_form, bad_start, bad_end)

### Throttlers

In [6]:
from collections import namedtuple
from hardware_lfs import get_lfs
# from hardware_throttlers import get_part_throttler_wrapper

# Choose exactly one of the options below by leaving only that one
# uncommented. `part_throttler` is later passed to CandidateExtractor as its
# `throttler` argument.

# Option A: No throttler
part_throttler = None

# Option B: Standard throttler (needs the hardware_throttlers import above)
# part_throttler = get_part_throttler_wrapper()

# Option C: LF-based throttler - rejects a candidate as soon as any of the
# selected labeling functions returns -1 for it.
# FakeCandidate = namedtuple('FakeCandidate',['part','attr'])
# parts_LFs = get_lfs('part')[:-1]
# def part_throttler(Part):
#     c = FakeCandidate(Part[0], None)
#     for lf in parts_LFs:
#         if lf(c) == -1:
#             return False
#     return True

In [7]:
from snorkel.candidates import CandidateExtractor

# Extractor for the single-argument Part candidate: one candidate space and
# one matcher, plus the (possibly None) throttler chosen in the Throttlers
# section.
ce = CandidateExtractor(
    Part,
    [part_ngrams],
    [part_matcher],
    throttler=part_throttler,
)

In [8]:
from snorkel.async_candidates import parallel_extract
# %time candidates = ce.extract(corpus.documents.all(), 'Hardware Test Candidates', session)

corpus_name = 'Hardware Test'
%time candidates = parallel_extract(session, ce, corpus, \
                                    corpus_name + ' Candidates', \
                                    parallel=PARALLEL_EXTRACTION)
print "%s contains %d Candidates" % (candidates, len(candidates))

CPU times: user 332 ms, sys: 256 ms, total: 588 ms
Wall time: 32.5 s
Candidate Set (Hardware Test Candidates) contains 805 Candidates


### Second Pass

In [9]:
from snorkel.models import Corpus
from snorkel.utils import get_ORM_instance
from hardware_utils import get_gold_parts_by_doc, get_manual_parts_by_doc

# Look the corpus up again by name through the session.
corpus = get_ORM_instance(Corpus, session, 'Hardware Test')

# Source of the per-document parts used by the scoring cell. Keep exactly one
# of the three assignments active:
#   - gold parts,
#   - parts obtained from the corpus via get_manual_parts_by_doc (active),
#   - None.
# parts_by_doc = get_gold_parts_by_doc()
parts_by_doc = get_manual_parts_by_doc(corpus)
# parts_by_doc = None

Finding part numbers...
Appending suffixes...


In [10]:
# import cPickle as pickle
# pickle_file = os.environ['SNORKELHOME'] + '/tutorials/tables/sandbox/parts_by_doc_test.pkl'

# with open(pickle_file, 'w') as f:
#     pickle.dump(parts_by_doc, f)

In [11]:
# import cPickle as pickle
# pickle_file = os.environ['SNORKELHOME'] + '/tutorials/tables/sandbox/parts_by_doc_test.pkl'
# with open(pickle_file, 'r') as f:
#     parts_by_doc = pickle.load(f)

### Evaluation

In [12]:
from hardware_utils import get_gold_dict

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/test/hardware_test_gold.csv'
gold_parts = get_gold_dict(gold_file, doc_on=True, part_on=True, val_on=False)
print "%d gold parts loaded" % len(gold_parts)

226 gold parts loaded


In [13]:
from hardware_utils import parts_f1

# Score the extracted candidates against the gold parts, timing the call.
# parts_by_doc (from the Second Pass section) is forwarded to parts_f1 as its
# third argument. The call prints corpus-level precision / recall / F1;
# TP, FP and FN are presumably the true-positive, false-positive and
# false-negative collections behind the printed counts (TODO confirm).
%time TP, FP, FN = parts_f1(candidates, gold_parts, parts_by_doc)

Scoring on Entity-Level Gold Data
Corpus Precision 0.698
Corpus Recall    0.942
Corpus F1        0.802
----------------------------------------
TP: 213 | FP: 92 | FN: 13

CPU times: user 2.28 s, sys: 120 ms, total: 2.4 s
Wall time: 2.94 s


The End.