In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')
snorkel_postgres = os.environ.get('SNORKELDB','').startswith('postgres')
print snorkel_postgres

PARALLEL = 50
PARALLEL_EXTRACTION = 8

True


### Parse

In [2]:
# If necessary:
import os
if snorkel_postgres:
    os.environ['SNORKELDBNAME'] = 'parts_regex_dev'
    os.system("dropdb " + os.environ['SNORKELDBNAME'])
    os.system("createdb " + os.environ['SNORKELDBNAME'])
else:
    try:
        os.remove('snorkel.db')
    except:
        pass

from snorkel import SnorkelSession
session = SnorkelSession()

In [3]:
import os

docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/html/'
pdf_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/pdf/'

if snorkel_postgres:
    from snorkel.async_parser import parse_corpus, HTMLParser, AsyncOmniParser
   
    # PARSE DEV
    print "Starting async parse..."
    doc_parser = HTMLParser()
    context_parser = AsyncOmniParser(blacklist=['style'], flatten=['span','br'], 
                                     tabular=True, lingual=True,
                                     visual=True, pdf_path=pdf_path)
    %time corpus = parse_corpus(session, 'Hardware Dev', docs_path,\
                                doc_parser, context_parser,\
                                max_docs=125, parallel=PARALLEL)
    
else:
    from snorkel.parser import CorpusParser, HTMLParser, OmniParser
    from snorkel.utils import get_ORM_instance

    # PARSE DEV
    print "Starting sync parse..."
    doc_parser = HTMLParser(path=docs_path)
    context_parser = OmniParser(blacklist=['style'], flatten=['span','br'], 
                                tabular=True, lingual=True,
                                visual=True, pdf_path=pdf_path)
    cp = CorpusParser(doc_parser, context_parser, max_docs=125)
    %time corpus = cp.parse_corpus(name='Hardware Dev', session=session)

    session.add(corpus)
    session.commit()

print "%s contains %d documents" % (corpus, len(corpus))

Starting async parse...
CPU times: user 356 ms, sys: 412 ms, total: 768 ms
Wall time: 51.1 s
Corpus (Hardware Dev) contains 122 documents


In [4]:
# If necessary:
if not snorkel_postgres:
    import os
    os.system('cp snorkel.db snorkel.db\ corpus');

### Load

In [5]:
# from snorkel.models import Corpus
# import os

# os.system('cp snorkel.db\ corpus snorkel.db');
# from snorkel import SnorkelSession
# session = SnorkelSession()

# corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Dev').one()
# print "%s contains %d Documents" % (corpus, len(corpus))

### Extraction

In [6]:
from snorkel.models import candidate_subclass
from hardware_spaces import get_space
from hardware_matchers import get_matcher

Part = candidate_subclass('Part', ['part'])
part_ngrams = get_space('part')
dict_path = os.environ['SNORKELHOME'] +\
    '/tutorials/tables/data/hardware/gold_raw/digikey_part_dictionary.csv'
part_matcher = get_matcher('part', dict_path=dict_path)

Using combined matcher.


In [7]:
# Attempt at alternative regex:
# from snorkel.matchers import RegexMatchSpan, Inverse, Intersect
# long_enough = RegexMatchSpan(rgx='[A-Z2][A-Z0-9\-]{4,}(?:\/(?:DG|HF))?', ignore_case=False)
# at_least_one_number = RegexMatchSpan(rgx='[A-Z0-9\-\/]*[0-9][A-Z0-9\-\/]*', ignore_case=False)
# at_least_one_letter = RegexMatchSpan(rgx='[A-Z0-9\-\/]*[A-Z][A-Z0-9\-\/]*', ignore_case=False) 
# bad_form = Inverse(RegexMatchSpan(rgx='(?:[A-Z]+\d)|(?:[A-Z0-9\-\/]{0,2}\-[A-Z0-9\-\/]*)'))
# bad_start = Inverse(RegexMatchSpan(rgx='(?:T[O0]|SOT|BRD|FIGURE|EHP)[A-Z0-9\-\/]*'))
# bad_end = Inverse(RegexMatchSpan(rgx='[A-Z0-9\-\/]*(TYP|MAX|MIN|BASE|HZ|MM|VDC|VDF)'))
# part_matcher = Intersect(long_enough, at_least_one_number, at_least_one_letter, 
#                          bad_form, bad_start, bad_end)

### Throttlers

In [8]:
# from hardware_throttlers import get_part_throttler_wrapper
from hardware_lfs import get_lfs
from collections import namedtuple

# Option A: No throttler
part_throttler = None

# Option B: Standard throttler
# part_throttler = get_part_throttler_wrapper()

# Option C: LF-based throttler
# FakeCandidate = namedtuple('FakeCandidate',['part','attr'])
# parts_LFs = get_lfs('part')[:-1]
# def part_throttler(Part):
#     c = FakeCandidate(Part[0], None)
#     for lf in parts_LFs:
#         if lf(c) == -1:
#             return False
#     return True

In [9]:
from snorkel.candidates import CandidateExtractor

ce = CandidateExtractor(Part, 
                        [part_ngrams], 
                        [part_matcher],
                        throttler=part_throttler)

In [10]:
from snorkel.async_candidates import parallel_extract

corpus_name = 'Hardware Dev'
%time candidates = parallel_extract(session, ce, corpus, \
                                    corpus_name + ' Candidates', \
                                    parallel=PARALLEL_EXTRACTION)
print "%s contains %d Candidates" % (candidates, len(candidates))

CPU times: user 452 ms, sys: 328 ms, total: 780 ms
Wall time: 1min 4s
Candidate Set (Hardware Dev Candidates) contains 3526 Candidates


In [11]:
# If necessary:
if not snorkel_postgres:
    import os
    os.system('cp snorkel.db snorkel.db\ candidates');

### Second Pass

In [12]:
# If necessary:
if not snorkel_postgres:
    import os
    os.system('cp snorkel.db\ candidates snorkel.db');

In [13]:
from hardware_utils import get_gold_parts_by_doc, get_manual_parts_by_doc
from snorkel.utils import get_ORM_instance
from snorkel.models import Corpus

corpus = get_ORM_instance(Corpus, session, 'Hardware Dev')

# parts_by_doc = get_gold_parts_by_doc()
parts_by_doc = get_manual_parts_by_doc(corpus)
# parts_by_doc = None

Finding part numbers...
Appending suffixes...


In [22]:
import cPickle as pickle
pickle_file = os.environ['SNORKELHOME'] + '/tutorials/tables/sandbox/parts_by_doc_dev.pkl'

with open(pickle_file, 'w') as f:
    pickle.dump(parts_by_doc, f)

In [15]:
# import cPickle as pickle
# pickle_file = os.environ['SNORKELHOME'] + '/tutorials/tables/sandbox/parts_by_doc_dev.pkl'
# with open(pickle_file, 'r') as f:
#     parts_by_doc = pickle.load(f)

### Evaluation

In [16]:
from hardware_utils import get_gold_dict

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/hardware_dev_gold.csv'
gold_parts = get_gold_dict(gold_file, doc_on=True, part_on=True, val_on=False)
print "%d gold parts loaded" % len(gold_parts)

927 gold parts loaded


In [17]:
from hardware_utils import parts_f1

%time TP, FP, FN = parts_f1(candidates, gold_parts, parts_by_doc)

Scoring on Entity-Level Gold Data
Corpus Precision 0.886
Corpus Recall    0.943
Corpus F1        0.913
----------------------------------------
TP: 874 | FP: 113 | FN: 53

CPU times: user 10.6 s, sys: 376 ms, total: 11 s
Wall time: 13.6 s


In [23]:
from pprint import pprint
print len(FP)
pprint(FP)

113
[(u'2N6427', u'MPSA14'),
 (u'BC337', u'BC33740BU'),
 (u'BC337', u'BC338-16'),
 (u'BC337', u'BC338-40'),
 (u'BC337-D', u'BC337-XX'),
 (u'BC546-BC548C(TO-92)', u'BC546-BC548C'),
 (u'BC546_DIOTEC', u'BC546C'),
 (u'BC546_DIOTEC', u'BC549A'),
 (u'BC547', u'BC548BU'),
 (u'BC818-40LT1-D', u'BC818-40LT1'),
 (u'BOURNSINC_BD246BS', u'TIS633AA'),
 (u'BOURNSINC_TIP152S', u'BY205-600'),
 (u'CENTRALSEMICONDUCTORCORP_CMPT5401ETR', u'CMPT5401'),
 (u'CENTRALSEMICONDUCTORCORP_CMPT5401ETR', u'CMPT5551'),
 (u'CENTRALSEMICONDUCTORCORP_CMPT5401ETR', u'CMPT5551E'),
 (u'CENTRALSEMICONDUCTORCORP_CMPT5401ETR', u'CXT5401'),
 (u'CENTRALSEMICONDUCTORCORP_CMPT5401ETR', u'CXT5401E'),
 (u'CENTRALSEMICONDUCTORCORP_CMPT5401ETR', u'CXT5551'),
 (u'CENTRALSEMICONDUCTORCORP_CMPT5401ETR', u'CXT5551E'),
 (u'CSEMS05382-1', u'BC546-48ABC'),
 (u'DIODESINCORPORATED_FCX491ATA', u'FCX491AQTA'),
 (u'DIODESINCORPORATED_FCX491ATA', u'FCX491ATA'),
 (u'DIODESINCORPORATED_ZXT690BKTC', u'ZXT690B'),
 (u'DIODS13249-1', u'BC81840'),
 (u

In [19]:
pprint(FN)

[('DIODS13249-1', 'BC817-40'),
 ('DIODS13249-1', 'BC818-40'),
 ('DISES00189-1', 'BC546B'),
 ('DISES00189-1', 'BC547B'),
 ('DISES00189-1', 'BC548B'),
 ('DISES00189-1', 'BC549B'),
 ('KECCS03676-1', 'BC858A'),
 ('KECCS03676-1', 'BC858B'),
 ('LITES00689-1', 'BC337-16'),
 ('LITES00689-1', 'BC337-25'),
 ('LITES00689-1', 'BC337-40'),
 ('MICROCOMMERCIALCO_2N3904AP', '2N3904-AP'),
 ('MICROCOMMERCIALCO_2N3904AP', '2N3904-AP-HF'),
 ('MICROCOMMERCIALCO_2N3904AP', '2N3904-BP'),
 ('MICROCOMMERCIALCO_2N3904AP', '2N3904-BP-HF'),
 ('MICROCOMMERCIALCO_TIP29ABP', 'TIP29A-BP'),
 ('MICROCOMMERCIALCO_TIP29ABP', 'TIP29A-BP-HF'),
 ('MICROCOMMERCIALCO_TIP29ABP', 'TIP29B-BP'),
 ('MICROCOMMERCIALCO_TIP29ABP', 'TIP29B-BP-HF'),
 ('MICROCOMMERCIALCO_TIP29ABP', 'TIP29C-BP'),
 ('MICROCOMMERCIALCO_TIP29ABP', 'TIP29C-BP-HF'),
 ('MICROCOMMERCIALCO_TIP29ABP', 'TIP30A-BP'),
 ('MICROCOMMERCIALCO_TIP29ABP', 'TIP30A-BP-HF'),
 ('MICROCOMMERCIALCO_TIP29ABP', 'TIP30B-BP'),
 ('MICROCOMMERCIALCO_TIP29ABP', 'TIP30B-BP-HF'),
 ('MIC

In [20]:
from hardware_utils import entity_to_candidates
from snorkel.lf_helpers import *

entity = FP[0]
print entity
print

matches = entity_to_candidates(entity, candidates)
print "# Matches: %d" % len(matches)
print

(u'2N6427', u'MPSA14')

# Matches: 1



In [21]:
from hardware_utils import print_table_info

candidate = matches[1]
# print candidate
# print
print_table_info(candidate.part)
# throttler = get_part_throttler()
# print throttler(candidate)

IndexError: list index out of range

In [None]:
# c = candidates[100]
# print c
# print c.part.get_attrib_tokens('top'), c.part.get_attrib_tokens('bottom')
# from snorkel.lf_helpers import *
# print get_min_col_num(c)
# print get_max_col_num(c)
# print list(get_row_ngrams(c))
# print_table_info(c.part)
# print list(get_vert_ngrams(c.part))

The End.