In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')
snorkel_postgres = os.environ.get('SNORKELDB','').startswith('postgres')
print snorkel_postgres

PARALLEL = 3
PARALLEL_EXTRACTION = 3

False


### Parse

In [2]:
# If necessary:
import os
if snorkel_postgres:
    os.environ['SNORKELDBNAME'] = 'parts_regex_dev'
    os.system("dropdb " + os.environ['SNORKELDBNAME'])
    os.system("createdb " + os.environ['SNORKELDBNAME'])
else:
    try:
        os.remove('snorkel.db')
    except:
        pass

from snorkel import SnorkelSession
session = SnorkelSession()

In [3]:
import os

docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/html/DISES00490-1.html'
pdf_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/pdf/'

if snorkel_postgres:
    from snorkel.async_parser import parse_corpus, HTMLParser, AsyncOmniParser
   
    # PARSE DEV
    print "Starting async parse..."
    doc_parser = HTMLParser()
    context_parser = AsyncOmniParser(blacklist=['style'], flatten=['span','br'], 
                                     tabular=True, lingual=True,
                                     visual=True, pdf_path=pdf_path)
    %time corpus = parse_corpus(session, 'Hardware Dev', docs_path,\
                                doc_parser, context_parser,\
                                max_docs=125, parallel=PARALLEL)
    
else:
    from snorkel.parser import CorpusParser, HTMLParser, OmniParser
    from snorkel.utils import get_ORM_instance

    # PARSE DEV
    print "Starting sync parse..."
    doc_parser = HTMLParser(path=docs_path)
    context_parser = OmniParser(blacklist=['style'], flatten=['span','br'], 
                                tabular=True, lingual=True,
                                visual=True, pdf_path=pdf_path)
    cp = CorpusParser(doc_parser, context_parser, max_docs=125)
    %time corpus = cp.parse_corpus(name='Hardware Dev', session=session)

    session.add(corpus)
    session.commit()

print "%s contains %d documents" % (corpus, len(corpus))

Starting sync parse...
CPU times: user 556 ms, sys: 40 ms, total: 596 ms
Wall time: 1.87 s
Corpus (Hardware Dev) contains 1 documents


In [4]:
# If necessary:
if not snorkel_postgres:
    import os
    os.system('cp snorkel.db snorkel.db\ corpus');

### Load

In [5]:
from snorkel.models import Corpus
import os, sys

os.system('cp snorkel.db\ corpus snorkel.db');
from snorkel import SnorkelSession
session = SnorkelSession()

sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')
snorkel_postgres = False
print snorkel_postgres

corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Dev').one()
print "%s contains %d Documents" % (corpus, len(corpus))

False
Corpus (Hardware Dev) contains 1 Documents


### Extraction

In [6]:
from snorkel.models import candidate_subclass
from hardware_spaces import get_space
from hardware_matchers import get_matcher

Part = candidate_subclass('Part', ['part'])
part_ngrams = get_space('part')
dict_path = os.environ['SNORKELHOME'] +\
    '/tutorials/tables/data/hardware/gold_raw/digikey_part_dictionary.csv'
part_matcher = get_matcher('part', dict_path=dict_path)

Using combined matcher.


In [7]:
# Attempt at alternative regex:
# from snorkel.matchers import RegexMatchSpan, Inverse, Intersect
# long_enough = RegexMatchSpan(rgx='[A-Z2][A-Z0-9\-]{4,}(?:\/(?:DG|HF))?', ignore_case=False)
# at_least_one_number = RegexMatchSpan(rgx='[A-Z0-9\-\/]*[0-9][A-Z0-9\-\/]*', ignore_case=False)
# at_least_one_letter = RegexMatchSpan(rgx='[A-Z0-9\-\/]*[A-Z][A-Z0-9\-\/]*', ignore_case=False) 
# bad_form = Inverse(RegexMatchSpan(rgx='(?:[A-Z]+\d)|(?:[A-Z0-9\-\/]{0,2}\-[A-Z0-9\-\/]*)'))
# bad_start = Inverse(RegexMatchSpan(rgx='(?:T[O0]|SOT|BRD|FIGURE|EHP)[A-Z0-9\-\/]*'))
# bad_end = Inverse(RegexMatchSpan(rgx='[A-Z0-9\-\/]*(TYP|MAX|MIN|BASE|HZ|MM|VDC|VDF)'))
# part_matcher = Intersect(long_enough, at_least_one_number, at_least_one_letter, 
#                          bad_form, bad_start, bad_end)

### Throttlers

In [8]:
# from hardware_throttlers import get_part_throttler_wrapper
from hardware_lfs import get_lfs
from collections import namedtuple
from pdb import set_trace as t

# Option A: No throttler
part_throttler = None

# Option B: Standard throttler
# part_throttler = get_part_throttler_wrapper()

# Option C: LF-based throttler
# FakeCandidate = namedtuple('FakeCandidate',['part','attr'])
# parts_LFs = get_lfs('part')[:-1]
# def part_throttler(Part):
#     c = FakeCandidate(Part[0], None)
#     for lf in parts_LFs:
#         if lf(c) == -1:
#             return False
#     return True

# Option D: Testing throttler
# def part_throttler(Part):
#     part = Part[0]
#     if part.get_span() == '2N1613':
#         t()
#     return True

In [9]:
from snorkel.candidates import CandidateExtractor

ce = CandidateExtractor(Part, 
                        [part_ngrams], 
                        [part_matcher],
                        throttler=part_throttler)

In [10]:
from snorkel.lf_helpers import *
corpus_name = 'Hardware Dev'

if snorkel_postgres:
    from snorkel.async_candidates import parallel_extract
    %time candidates = parallel_extract(session, ce, corpus, \
                                    corpus_name + ' Candidates', \
                                    parallel=PARALLEL_EXTRACTION)
else:   
    from snorkel.candidates import CandidateExtractor
    %time candidates = ce.extract(corpus.documents.all(), corpus_name + ' Candidates', session)

print "%s contains %d Candidates" % (candidates, len(candidates))

CPU times: user 574 ms, sys: 17.5 ms, total: 591 ms
Wall time: 933 ms
Candidate Set (Hardware Dev Candidates) contains 38 Candidates


In [11]:
# If necessary:
if not snorkel_postgres:
    import os
    os.system('cp snorkel.db snorkel.db\ candidates');

### Second Pass

In [12]:
# If necessary:
if not snorkel_postgres:
    import os
    os.system('cp snorkel.db\ candidates snorkel.db');

In [13]:
from hardware_utils import get_gold_parts_by_doc, get_manual_parts_by_doc
from snorkel.utils import get_ORM_instance
from snorkel.models import Corpus

corpus = get_ORM_instance(Corpus, session, 'Hardware Dev')

# parts_by_doc = get_gold_parts_by_doc()
parts_by_doc = get_manual_parts_by_doc(corpus)
# parts_by_doc = None

Finding part numbers...
[set([u'A', u'C', u'B'])]
> /Users/bradenhancock/snorkel/tutorials/tables/hardware_utils.py(377)generate_parts_by_doc()
-> if any(s in part[5:] for part in parts):
(Pdb) c
> /Users/bradenhancock/snorkel/tutorials/tables/hardware_utils.py(376)generate_parts_by_doc()
-> import pdb; pdb.set_trace()
(Pdb) c
> /Users/bradenhancock/snorkel/tutorials/tables/hardware_utils.py(377)generate_parts_by_doc()
-> if any(s in part[5:] for part in parts):
(Pdb) c
[set([])]
Appending suffixes...


In [14]:
# import cPickle as pickle
# pickle_file = os.environ['SNORKELHOME'] + '/tutorials/tables/sandbox/parts_by_doc_dev.pkl'

# with open(pickle_file, 'w') as f:
#     pickle.dump(parts_by_doc, f)

In [15]:
# import cPickle as pickle
# pickle_file = os.environ['SNORKELHOME'] + '/tutorials/tables/sandbox/parts_by_doc_dev.pkl'
# with open(pickle_file, 'r') as f:
#     parts_by_doc = pickle.load(f)

### Evaluation

In [16]:
from hardware_utils import get_gold_dict

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/hardware_dev_gold.csv'
gold_parts = get_gold_dict(gold_file, doc_on=True, part_on=True, val_on=False)
print "%d gold parts loaded" % len(gold_parts)

927 gold parts loaded


In [17]:
from hardware_utils import parts_f1

%time TP, FP, FN = parts_f1(candidates, gold_parts, parts_by_doc)

Scoring on Entity-Level Gold Data
Corpus Precision 1.0
Corpus Recall    0.0151
Corpus F1        0.0298
----------------------------------------
TP: 14 | FP: 0 | FN: 913

CPU times: user 103 ms, sys: 3.9 ms, total: 107 ms
Wall time: 106 ms


In [18]:
from pprint import pprint
pprint(TP)

[(u'DISES00490-1', u'BC846AW'),
 (u'DISES00490-1', u'BC846BW'),
 (u'DISES00490-1', u'BC846W'),
 (u'DISES00490-1', u'BC847AW'),
 (u'DISES00490-1', u'BC847BW'),
 (u'DISES00490-1', u'BC847CW'),
 (u'DISES00490-1', u'BC847W'),
 (u'DISES00490-1', u'BC848AW'),
 (u'DISES00490-1', u'BC848BW'),
 (u'DISES00490-1', u'BC848CW'),
 (u'DISES00490-1', u'BC848W'),
 (u'DISES00490-1', u'BC849BW'),
 (u'DISES00490-1', u'BC849CW'),
 (u'DISES00490-1', u'BC849W')]


In [19]:
pprint(FP)

[]


In [20]:
pprint(FN)

[('112823', 'BC546'),
 ('112823', 'BC546B'),
 ('112823', 'BC546BG'),
 ('112823', 'BC546BRL1'),
 ('112823', 'BC546BRL1G'),
 ('112823', 'BC546BZL1G'),
 ('112823', 'BC547'),
 ('112823', 'BC547A'),
 ('112823', 'BC547ARL'),
 ('112823', 'BC547ARLG'),
 ('112823', 'BC547AZL1G'),
 ('112823', 'BC547B'),
 ('112823', 'BC547BG'),
 ('112823', 'BC547BRL1G'),
 ('112823', 'BC547BZL1G'),
 ('112823', 'BC547C'),
 ('112823', 'BC547CG'),
 ('112823', 'BC547CZL1G'),
 ('112823', 'BC548'),
 ('112823', 'BC548B'),
 ('112823', 'BC548BG'),
 ('112823', 'BC548BRL1G'),
 ('112823', 'BC548BZL1G'),
 ('112823', 'BC548C'),
 ('112823', 'BC548CG'),
 ('112823', 'BC548CZL1G'),
 ('2N3906', '2N3906'),
 ('2N3906', '2N3906BU'),
 ('2N3906', '2N3906TA'),
 ('2N3906', '2N3906TAR'),
 ('2N3906', '2N3906TF'),
 ('2N3906', '2N3906TFR'),
 ('2N3906', 'MMBT3906'),
 ('2N3906', 'PZT3906'),
 ('2N3906-D', '2N3906'),
 ('2N3906-D', '2N3906G'),
 ('2N3906-D', '2N3906RL1'),
 ('2N3906-D', '2N3906RL1G'),
 ('2N3906-D', '2N3906RLRA'),
 ('2N3906-D', '2N390

In [21]:
from hardware_utils import entity_to_candidates
from snorkel.lf_helpers import *

entity = FP[0]
print entity
print

matches = entity_to_candidates(entity, candidates)
print "# Matches: %d" % len(matches)
print

IndexError: list index out of range

In [None]:
from hardware_utils import print_table_info

candidate = matches[1]
# print candidate
# print
print_table_info(candidate.part)
# throttler = get_part_throttler()
# print throttler(candidate)

In [None]:
c = candidates[100]
print c
print c.part.get_attrib_tokens('top'), c.part.get_attrib_tokens('bottom')
from snorkel.lf_helpers import *
print get_min_col_num(c)
print get_max_col_num(c)
print list(get_row_ngrams(c))
print_table_info(c.part)
# print list(get_vert_ngrams(c.part))

In [None]:
# from snorkel.lf_helpers import *
# c = candidates[110]
# print c
# get_prev_sibling_tags(c.part).count('p')

The End.