In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')
snorkel_postgres = os.environ.get('SNORKELDB','').startswith('postgres')
print snorkel_postgres

PARALLEL = 30
PARALLEL_EXTRACTION = 8

### Parse

In [None]:
# If necessary:
# Start from an empty database, then open a fresh Snorkel session on it.
import errno
import os

if snorkel_postgres:
    # Recreate the Postgres database. os.system only returns the exit status,
    # so a failing `dropdb` (e.g. database not there yet) does not stop the cell.
    os.environ['SNORKELDBNAME'] = 'parts_regex'
    os.system("dropdb " + os.environ['SNORKELDBNAME'])
    os.system("createdb " + os.environ['SNORKELDBNAME'])
else:
    try:
        os.remove('snorkel.db')
    except OSError as e:
        # A missing file is the expected "nothing to clean up" case. Any other
        # failure (permissions, path is a directory, ...) would leave a stale
        # database in place, so surface it instead of silently continuing.
        if e.errno != errno.ENOENT:
            raise

from snorkel import SnorkelSession
session = SnorkelSession()

In [None]:
import os

docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/html/'
pdf_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/pdf/'

if snorkel_postgres:
    from snorkel.async_parser import parse_corpus, HTMLParser, AsyncOmniParser
   
    # PARSE DEV
    print "Starting async parse..."
    doc_parser = HTMLParser()
    context_parser = AsyncOmniParser(blacklist=['style'], flatten=['span','br'], 
                                     tabular=True, lingual=True,
                                     visual=True, pdf_path=pdf_path)
    %time corpus = parse_corpus(session, 'Hardware Dev', docs_path,\
                                doc_parser, context_parser,\
                                max_docs=125, parallel=PARALLEL)
    
else:
    from snorkel.parser import CorpusParser, HTMLParser, OmniParser
    from snorkel.utils import get_ORM_instance

    # PARSE DEV
    print "Starting sync parse..."
    doc_parser = HTMLParser(path=docs_path)
    context_parser = OmniParser(blacklist=['style'], flatten=['span','br'], 
                                tabular=True, lingual=True,
                                visual=True, pdf_path=pdf_path)
    cp = CorpusParser(doc_parser, context_parser, max_docs=125)
    %time corpus = cp.parse_corpus(name='Hardware Dev', session=session)

    session.add(corpus)
    session.commit()

print "%s contains %d documents" % (corpus, len(corpus))

In [None]:
# If necessary:
import os

# When not on Postgres, copy snorkel.db to a file named "snorkel.db corpus".
# The backslash escapes the space for the shell; the exit status is ignored.
if not snorkel_postgres:
    os.system('cp snorkel.db snorkel.db\\ corpus')

### Load

In [None]:
# Disabled alternative to re-parsing: when uncommented, this copies the
# "snorkel.db corpus" snapshot back over snorkel.db, opens a new session and
# looks up the corpus named 'Hardware Dev'.

# from snorkel.models import Corpus
# import os

# os.system('cp snorkel.db\ corpus snorkel.db');
# from snorkel import SnorkelSession
# session = SnorkelSession()

# corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Dev').one()
# print "%s contains %d Documents" % (corpus, len(corpus))

### Extraction

In [None]:
from snorkel.models import candidate_subclass
from hardware_spaces import get_space
from hardware_matchers import get_matcher

# Candidate class named 'Part' with a single argument, 'part'.
Part = candidate_subclass('Part', ['part'])

# Space and matcher for the 'part' argument; both come from tutorial helpers.
part_ngrams = get_space('part')

# CSV dictionary handed to the 'part_rgx' matcher.
dict_path = (os.environ['SNORKELHOME'] +
             '/tutorials/tables/data/hardware/gold_raw/digikey_part_dictionary.csv')
part_matcher = get_matcher('part_rgx', dict_path=dict_path)


In [None]:
# Attempt at alternative regex:
# from snorkel.matchers import RegexMatchSpan, Inverse, Intersect
# long_enough = RegexMatchSpan(rgx='[A-Z2][A-Z0-9\-]{4,}(?:\/(?:DG|HF))?', ignore_case=False)
# at_least_one_number = RegexMatchSpan(rgx='[A-Z0-9\-\/]*[0-9][A-Z0-9\-\/]*', ignore_case=False)
# at_least_one_letter = RegexMatchSpan(rgx='[A-Z0-9\-\/]*[A-Z][A-Z0-9\-\/]*', ignore_case=False) 
# bad_form = Inverse(RegexMatchSpan(rgx='(?:[A-Z]+\d)|(?:[A-Z0-9\-\/]{0,2}\-[A-Z0-9\-\/]*)'))
# bad_start = Inverse(RegexMatchSpan(rgx='(?:T[O0]|SOT|BRD|FIGURE|EHP)[A-Z0-9\-\/]*'))
# bad_end = Inverse(RegexMatchSpan(rgx='[A-Z0-9\-\/]*(TYP|MAX|MIN|BASE|HZ|MM|VDC|VDF)'))
# part_matcher = Intersect(long_enough, at_least_one_number, at_least_one_letter, 
#                          bad_form, bad_start, bad_end)

### Throttlers

In [None]:
# Select the value of `part_throttler`, which the next cell passes to
# CandidateExtractor as `throttler=`. Only one option should be active at a
# time; Option A is the one currently enabled.
# The two live imports below are only used by the commented-out Option C.

# from hardware_throttlers import get_part_throttler_wrapper
from hardware_lfs import get_lfs
from collections import namedtuple

# Option A: No throttler
part_throttler = None

# Option B: Standard throttler
# part_throttler = get_part_throttler_wrapper()

# Option C: LF-based throttler
# Wraps the first element of its argument in a FakeCandidate (attr=None) and
# returns False as soon as any LF in parts_LFs returns -1, True otherwise.
# NOTE(review): the parameter name `Part` shadows the `Part` candidate class
# inside the function; consider renaming it before enabling this option.
# FakeCandidate = namedtuple('FakeCandidate',['part','attr'])
# parts_LFs = get_lfs('part')[:-1]
# def part_throttler(Part):
#     c = FakeCandidate(Part[0], None)
#     for lf in parts_LFs:
#         if lf(c) == -1:
#             return False
#     return True

In [None]:
from snorkel.candidates import CandidateExtractor

# One space and one matcher, for the single 'part' argument of Part.
# part_throttler may be None (no throttling).
ce = CandidateExtractor(Part, [part_ngrams], [part_matcher], throttler=part_throttler)

In [None]:
from snorkel.async_candidates import parallel_extract

corpus_name = 'Hardware Dev'
%time candidates = parallel_extract(session, ce, corpus, \
                                    corpus_name + ' Candidates', \
                                    parallel=PARALLEL_EXTRACTION)
print "%s contains %d Candidates" % (candidates, len(candidates))

In [None]:
# If necessary:
import os

# When not on Postgres, copy snorkel.db to a file named "snorkel.db candidates".
# The backslash escapes the space for the shell; the exit status is ignored.
if not snorkel_postgres:
    os.system('cp snorkel.db snorkel.db\\ candidates')

### Second Pass

In [None]:
# If necessary:
import os

# When not on Postgres, copy the "snorkel.db candidates" file back over
# snorkel.db. The exit status of `cp` is ignored.
if not snorkel_postgres:
    os.system('cp snorkel.db\\ candidates snorkel.db')

In [None]:
# Disabled: when uncommented, this looks up the 'Hardware Dev' corpus and
# builds `parts_by_doc` from get_manual_parts_by_doc(corpus). The doubly
# commented lines are alternative sources (gold parts, or None).

# from hardware_utils import get_gold_parts_by_doc, get_manual_parts_by_doc
# from snorkel.utils import get_ORM_instance
# from snorkel.models import Corpus

# corpus = get_ORM_instance(Corpus, session, 'Hardware Dev')

# # parts_by_doc = get_gold_parts_by_doc()
# parts_by_doc = get_manual_parts_by_doc(corpus)
# # parts_by_doc = None

In [None]:
# Disabled: when uncommented, this pickles `parts_by_doc` to
# sandbox/parts_by_doc_dev.pkl under SNORKELHOME/tutorials/tables, the same
# path the following cell loads from.

# import cPickle as pickle
# pickle_file = os.environ['SNORKELHOME'] + '/tutorials/tables/sandbox/parts_by_doc_dev.pkl'

# with open(pickle_file, 'w') as f:
#     pickle.dump(parts_by_doc, f)

In [None]:
import cPickle as pickle

# Load the cached `parts_by_doc` object that is later passed to parts_f1.
pickle_file = os.environ['SNORKELHOME'] + '/tutorials/tables/sandbox/parts_by_doc_dev.pkl'

# Pickle data is a byte stream, so read it in binary mode; text mode only
# happens to work for protocol 0 on platforms without newline translation.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(pickle_file, 'rb') as f:
    parts_by_doc = pickle.load(f)

### Evaluation

In [None]:
from hardware_utils import get_gold_dict

# Gold labels for the dev split, loaded with doc and part enabled and val disabled.
gold_file = (os.environ['SNORKELHOME'] +
             '/tutorials/tables/data/hardware/dev/hardware_dev_gold.csv')
gold_parts = get_gold_dict(gold_file, doc_on=True, part_on=True, val_on=False)
print("%d gold parts loaded" % len(gold_parts))

In [None]:
from hardware_utils import parts_f1

# Compare the extracted candidates with the gold parts, using parts_by_doc.
# The three returned values are unpacked as TP, FP, FN (presumably true
# positives, false positives and false negatives -- confirm in parts_f1).
%time TP, FP, FN = parts_f1(candidates, gold_parts, parts_by_doc)

In [None]:
from pprint import pprint
# Pretty-print all of FP (the second value returned by parts_f1) for inspection.
pprint(FP)

In [None]:
# Pretty-print all of FN (the third value returned by parts_f1) for inspection.
pprint(FN)

In [None]:
from hardware_utils import entity_to_candidates
# Wildcard import kept as-is: later cells may rely on names it provides.
from snorkel.lf_helpers import *

# Take the first entry of FP and show it, followed by a blank line.
entity = FP[0]
print(entity)
print('')

# Look up the candidates associated with that entity and report how many.
matches = entity_to_candidates(entity, candidates)
print("# Matches: %d" % len(matches))
print('')

In [None]:
from hardware_utils import print_table_info

# Show table information for the `part` of one of the matches found above.
# NOTE(review): index 1 selects the second match, so this raises IndexError
# when `matches` holds fewer than two items -- confirm [1] is intended.
candidate = matches[1]
# print candidate
# print
print_table_info(candidate.part)
# throttler = get_part_throttler()
# print throttler(candidate)

In [None]:
# Explicit imports instead of a mid-cell wildcard import, so it is clear where
# each helper comes from and the cell does not depend on an earlier cell
# having imported print_table_info.
from hardware_utils import print_table_info
from snorkel.lf_helpers import get_min_col_num, get_max_col_num, get_row_ngrams

# Ad-hoc inspection of one candidate. Index 100 is arbitrary and raises
# IndexError if fewer than 101 candidates were extracted.
c = candidates[100]
print(c)
# Same output as the two-argument print statement: both values, space-separated.
print('%s %s' % (c.part.get_attrib_tokens('top'), c.part.get_attrib_tokens('bottom')))
print(get_min_col_num(c))
print(get_max_col_num(c))
print(list(get_row_ngrams(c)))
print_table_info(c.part)
# print list(get_vert_ngrams(c.part))

In [None]:
# from snorkel.lf_helpers import *
# c = candidates[110]
# print c
# get_prev_sibling_tags(c.part).count('p')

The End.