In [None]:
import os

# Start from a clean slate: drop any database left over from a previous run.
# The existence check keeps this cell idempotent, so a fresh checkout (no
# snorkel.db yet) survives "Restart Kernel -> Run All" instead of raising.
if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')

In [None]:
# Notebook configuration: auto-reload imported modules before each cell runs
# and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Add the tutorials/tables directory to the import path.
# Requires the SNORKELHOME environment variable; a KeyError is raised otherwise.
import os, sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

### Parsing

In [None]:
from snorkel import SnorkelSession
# Session object that later cells pass to the parser, the candidate
# extractor and the ORM lookups.
session = SnorkelSession()

#### Define Parser

In [None]:
import os
from snorkel.parser import CorpusParser, HTMLParser, OmniParser
from snorkel.utils import get_ORM_instance
from snorkel.queries import split_corpus

# Input directories for the hardware data set, resolved relative to SNORKELHOME.
html_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html/'
pdf_path  = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_pdf/'
# Document parser reads from the HTML directory.
doc_parser = HTMLParser(path=html_path)
# Context parser is given the PDF directory and the session.
# NOTE(review): presumably the PDFs supply layout information for the HTML
# documents of the same name -- confirm against OmniParser.
context_parser = OmniParser(pdf_path=pdf_path, session=session)
# Combine both parsers; max_docs=100 is passed as the document limit.
cp = CorpusParser(doc_parser, context_parser, max_docs=100) 

#### Run Parser

In [None]:
# Parse the documents into a corpus named 'Hardware'; %time reports the cost.
%time corpus = cp.parse_corpus(name='Hardware', session=session)

# Add the parsed corpus to the session and commit it.
session.add(corpus)
session.commit()

#### Backup

In [None]:
# If necessary: snapshot the freshly parsed database so that the extraction
# stage can be restarted without re-parsing the corpus.
import os
import shutil

# shutil.copy raises if snorkel.db is missing or the copy fails; the previous
# os.system('cp ...') call discarded the exit status and failed silently.
# The destination is the same file the shell command produced: 'snorkel.db corpus'.
shutil.copy('snorkel.db', 'snorkel.db corpus')

### Extraction

In [1]:
# If necessary: replace the working database with the post-parsing snapshot.
import os
import shutil

# Verify the snapshot exists BEFORE touching the working database; previously
# a missing snapshot meant snorkel.db was deleted and the failing `cp` went
# unnoticed because os.system's exit status was ignored.
if not os.path.exists('snorkel.db corpus'):
    raise IOError("snapshot 'snorkel.db corpus' not found; run the Backup cell first")

# The working database may legitimately be absent (fresh kernel, clean
# checkout), so only remove it when it is there.
if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')
shutil.copy('snorkel.db corpus', 'snorkel.db')

# Same notebook configuration as the parsing stage, repeated so the
# extraction stage can be started from this cell on its own.
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Add the tutorials/tables directory to the import path.
import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

# Fresh session for the extraction stage.
from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass

# Candidate class named 'Part', declared with the single argument name 'part'.
Part = candidate_subclass('Part', ['part'])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Define Matchers

In [2]:
from snorkel.matchers import RegexMatchSpan, Union

# Each pattern below describes one family of part-number spellings.
# NOTE(review): eeca_rgx and jis_rgx use lowercase letter classes while the
# other patterns use uppercase; presumably RegexMatchSpan ignores case by
# default -- confirm.

# 'b' + one letter from the listed set + optional whitespace/w/x/y/z +
# 3-5 digits + optional whitespace + up to five of A-Z or '/' + optional digit
# + optional letter + up to two optional '-'-prefixed alphanumeric groups.
eeca_rgx = '([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z\/]{0,5}[0-9]?[A-Z]?([-][A-Z0-9]{1,7})?([-][A-Z0-9]{1,2})?)'
eeca_matcher = RegexMatchSpan(rgx=eeca_rgx, longest_match_only=True)
# 1, 2 or 3 + 'N' + 3-4 digits + up to five letters + optional digit + optional letter.
jedec_rgx = '([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)'
jedec_matcher = RegexMatchSpan(rgx=jedec_rgx, longest_match_only=True)
# '2S' + one letter from the listed set + 2-4 digits.
jis_rgx = '(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})'
jis_matcher = RegexMatchSpan(rgx=jis_rgx, longest_match_only=True)
# One of the listed manufacturer-style prefixes + 2-4 digits + up to three
# letters + up to two optional '-'-prefixed alphanumeric groups.
others_rgx = '((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)'
others_matcher = RegexMatchSpan(rgx=others_rgx, longest_match_only=True)
# Alternative kept for reference: a single combined regex instead of a Union.
# parts_rgx = '|'.join([eeca_rgx, jedec_rgx, jis_rgx, others_rgx])
# Combine the four matchers into the one used by the extraction cells.
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

#### Define ContextSpaces

In [None]:
import os
from collections import defaultdict
from hardware_utils import OmniNgramsPart, get_gold_dict, expand_part_range
from snorkel.utils import ProgressBar
from snorkel.lf_helpers import *
from snorkel.utils import get_ORM_instance
from snorkel.models import Corpus
from snorkel.candidates import OmniNgrams

# First pass: find all suffixes using a suffix matcher.
# Look up the 'Hardware' corpus through the session.
corpus = get_ORM_instance(Corpus, session, 'Hardware')

# A suffix is a whole span equal to A, B, C, 16, 25 or 40 (the numeric ones
# optionally with a leading '-'); matching is case-sensitive.
# RegexMatchSpan is imported in the "Define Matchers" cell, which must run first.
eeca_suffix = '^(A|B|C|-16|-25|-40|16|25|40)$'
suffix_matcher = RegexMatchSpan(rgx=eeca_suffix, ignore_case=False)

# Candidate spans for suffixes are limited to n_max=1.
suffix_ngrams = OmniNgrams(n_max=1)

# Candidate spans for part numbers, n_max=3.
part_ngrams = OmniNgramsPart(n_max=3) # TODO: need to expand it automatically

def extract_dicts(contexts, session):
    """Collect part numbers and suffixes per document and combine them.

    Every context is scanned twice: once with ``suffix_matcher`` over
    ``suffix_ngrams`` and once with ``parts_matcher`` over ``part_ngrams``
    (all four are notebook-level names). A suffix span is kept only when its
    row n-grams contain one of 'classification', 'group', 'rank' or 'grp.'.

    Args:
        contexts: sized iterable of contexts to scan (``len()`` is taken for
            the progress bar).
        session: accepted for interface compatibility; not used here.

    Returns:
        A tuple ``(final_set, suffixes_by_doc, parts_by_doc)`` of
        ``defaultdict(set)`` objects keyed by upper-cased document name.
        ``final_set`` holds each part plus, for parts that do not already end
        in one of the document's suffixes, the part combined with every suffix.
    """
    header_keywords = set(['classification', 'group', 'rank', 'grp.'])

    suffixes_by_doc = defaultdict(set)
    parts_by_doc = defaultdict(set)
    final_set = defaultdict(set)

    def doc_key(span):
        # Documents are keyed by their upper-cased name.
        return span.parent.document.name.upper()

    pb = ProgressBar(len(contexts))
    for i, context in enumerate(contexts):
        pb.bar(i)

        # Suffixes: keep a match only if its row mentions a grouping keyword.
        for ts in suffix_matcher.apply(suffix_ngrams.apply(context)):
            row_ngrams = set(get_row_ngrams(ts, infer=True))
            if not header_keywords.isdisjoint(row_ngrams):
                suffixes_by_doc[doc_key(ts)].add(ts.get_span())

        # Parts: every match is kept.
        for ts in parts_matcher.apply(part_ngrams.apply(context)):
            parts_by_doc[doc_key(ts)].add(ts.get_span())
    pb.close()

    # Combine each part with the suffixes found in the same document.
    for doc, doc_parts in parts_by_doc.items():
        # Indexing (rather than .get) registers an empty entry for documents
        # without suffixes, so the returned mapping has a key for every such doc.
        doc_suffixes = suffixes_by_doc[doc]
        known_endings = tuple(doc_suffixes)
        expanded = final_set[doc]
        for part in doc_parts:
            expanded.add(part)
            if part.endswith(known_endings):
                continue  # already carries one of the suffixes
            for suffix in doc_suffixes:
                # Letter suffixes and suffixes that already start with '-' are
                # appended directly; bare numeric ones get a '-' separator.
                needs_dash = (suffix not in ('A', 'B', 'C')
                              and not suffix.startswith('-'))
                if needs_dash:
                    expanded.add(part + '-' + suffix)
                else:
                    expanded.add(part + suffix)

    return final_set, suffixes_by_doc, parts_by_doc
            
# Run the first pass over every document of the corpus.
# parts: expanded part set per doc; suff: suffixes per doc;
# parts_by_doc: raw matched parts per doc.
parts, suff, parts_by_doc = extract_dicts(corpus.documents, session)      
# Number of documents with at least one matched part, then number of
# documents in the expanded dictionary.
print len(parts_by_doc)
print len(parts)

In [None]:
# import pickle
# filename = '1stpass.pkl'
# with open(filename, 'w') as f:
#     pickle.dump([parts, suff, parts_by_doc], f)

In [3]:
import pickle

# Reload the cached first-pass dictionaries instead of recomputing them.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load a 1stpass.pkl that you produced yourself.
filename = '1stpass.pkl'
# Pickle data is a byte stream, so open the file in binary mode ('rb');
# text mode can corrupt the stream on platforms that translate newlines.
with open(filename, 'rb') as f:
    parts, suff, parts_by_doc = pickle.load(f)

In [4]:
from pprint import pprint

# Spot-check: sorted suffixes collected for a single document.
pprint(sorted(suff['LITES00689-1']))
# print(sorted([x for x in expand_part_range("BC337-025G")]))

[u'16', u'40', u'C']


In [5]:
# Enhance CandidateExtractor to add these suffixes to the parts found in the parts_matcher
from hardware_utils import get_gold_dict
from collections import defaultdict

# Make parts list
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'

# # Gold parts looks like this:
#     # set([('112823', 'BC546'),
#     #      ('112823', 'BC546B'),
#     #      ('112823', 'BC546BG'),            if ts.parent.document.name.upper() == "MOTOS03189-1":
#     #      ('112823', 'BC546BRL1'),
#     #      ('112823', 'BC546BRL1G'),
#     #      ('112823', 'BC546BZL1G'),
#     #      ('112823', 'BC547'),
#     #      ('112823', 'BC547A'),
#     #      ('112823', 'BC547ARL'),
#     #      ('112823', 'BC547ARLG'),
#     #      ('112823', 'BC547AZL1G'),
#     #      ('112823', 'BC547B'),
#     #      ('112823', 'BC547BG'),
gold_parts = get_gold_dict(gold_file, doc_on=True, part_on=True, val_on=False)

gold = set()
gold_parts_by_doc = defaultdict(set)
for part in gold_parts:
    gold_parts_by_doc[part[0]].add(part[1])
    gold.add((part[0], part[1]))
print len(gold)

homemade = set()
for doc, parts in parts_by_doc.iteritems():
    for part in parts:
        homemade.add((doc,part))
print len(homemade)

829
1188


In [6]:
tp = gold.intersection(homemade)
fp = homemade - gold
fn = gold - homemade
print "TP: ", len(tp)
print "FP: ", len(fp)
print "FN: ", len(fn)
print "TOTAL: ", len(tp) + len(fp) + len(fn)

TP:  689
FP:  499
FN:  140
TOTAL:  1328


In [7]:
from pprint import pprint
# Inspect the false positives: pairs the matchers found that are not in gold.
pprint(fp)

set([(u'112823', u'BC548A'),
     (u'2N3906-D', u'1N916'),
     (u'2N3906-D', u'2N390D'),
     (u'2N4123-D', u'2N412D'),
     (u'2N4123-D', u'2n4123'),
     (u'2N4124', u'1N916'),
     (u'2N6426-D', u'2N642D'),
     (u'AUKCS04635-1', u'2N3904'),
     (u'BC182', u'BC182 NPN'),
     (u'BC337', u'BC327'),
     (u'BC337', u'BC327 /'),
     (u'BC337', u'BC328'),
     (u'BC337', u'BC337 /'),
     (u'BC337', u'BC33716'),
     (u'BC337', u'BC33725'),
     (u'BC337', u'BC33740'),
     (u'BC337', u'BC33740BU'),
     (u'BC337', u'BC33825'),
     (u'BC546', u'BC546 /'),
     (u'BC546', u'BC547 /'),
     (u'BC546', u'BC548 /'),
     (u'BC546', u'BC549 /'),
     (u'BC546', u'BC556'),
     (u'BC546', u'BC557'),
     (u'BC546', u'BC558'),
     (u'BC546', u'BC559'),
     (u'BC546', u'BC560'),
     (u'BC546-BC548C(TO-92)', u'BC546 thru'),
     (u'BC546-BC548C(TO-92)', u'BC546-BC548C'),
     (u'BC546-D', u'BC548A'),
     (u'BC546_DIOTEC', u'BC546 /'),
     (u'BC546_DIOTEC', u'BC548 /'),
     (u'BC546_DIO

In [8]:
from hardware_utils import OmniNgramsPart

# Rebuild the part context space, this time handing it the per-document
# parts collected in the first pass.
# NOTE(review): how OmniNgramsPart uses parts_by_doc is defined in
# hardware_utils -- confirm there.
part_ngrams = OmniNgramsPart(parts_by_doc=parts_by_doc, n_max=3)

#### Run CandidateExtractor

In [9]:
from snorkel.models import Corpus
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance


# Extractor producing Part candidates from the part context space and the
# combined part matcher.
ce = CandidateExtractor(Part, 
                        [part_ngrams], 
                        [parts_matcher])

# Extract candidates for each listed corpus (only 'Hardware' here) and store
# them in a set named '<corpus name> Candidates'.
for corpus_name in ['Hardware']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    # %time reports how long the extraction took.
    %time candidates = ce.extract(\
        corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
# Commit once, after all corpora have been processed.
session.commit()

Extracting Candidates from Corpus (Hardware)
CPU times: user 2min 6s, sys: 1.68 s, total: 2min 8s
Wall time: 2min 11s
Candidate Set (Hardware Candidates) contains 48917 Candidates


### Assess Recall

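Using the dictionary approach, an earlier run gave:

- **24820** candidates for part numbers
- 811 entity-level candidates and **100% recall**.

Note: these figures are stale. The outputs recorded in this notebook show 48917 candidates, 1161 entity-level candidates, and an overlap with gold of 0.8739 (102 false negatives).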

In [13]:
from hardware_utils import entity_level_total_recall, most_common_document, get_gold_dict
from snorkel.utils import get_ORM_instance
from snorkel.models import Candidate, Corpus

# Load every stored candidate and score it against the gold file.
all_candidates = session.query(Candidate).all()
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'

corpus = get_ORM_instance(Corpus, session, 'Hardware')
# relation=False is passed because the candidates here are single parts.
# NOTE(review): the meaning of the third positional argument (None) is
# defined in hardware_utils -- confirm there.
# This rebinds tp/fp/fn from the earlier first-pass comparison.
(tp, fp, fn) = entity_level_total_recall(
    all_candidates, gold_file, None, corpus=corpus, relation=False)
print "TP: ", len(tp)
print "FP: ", len(fp)
print "FN: ", len(fn)
print "TOTAL: ", len(tp) + len(fp) + len(fn)

Preparing candidates...
Scoring on Entity-Level Total Recall
Entity-level Candidates extracted: 1161 
Entity-level Gold: 809
Intersection Candidates: 707
----------------------------------------
Overlap with Gold:  0.8739

TP:  707
FP:  454
FN:  102
TOTAL:  1263


In [14]:
from pprint import pprint
# Show the first 100 false negatives in sorted order.
fns = list(fn)
pprint(sorted(fns)[:100])

[('BC337', 'BC337-16'),
 ('BC337', 'BC337-25'),
 ('BC337', 'BC337-40'),
 ('BC337', 'BC338-25'),
 ('BC337-D', 'BC337-025G'),
 ('BC337-D', 'BC337-040G'),
 ('BC337-D', 'BC337-25RL1G'),
 ('BC337-D', 'BC337-25RLRAG'),
 ('BC337-D', 'BC337-25ZL1G'),
 ('BC337-D', 'BC337-40RL1G'),
 ('BC337-D', 'BC337-40ZL1G'),
 ('BC550', 'BC546A'),
 ('BC550', 'BC546B'),
 ('BC550', 'BC546C'),
 ('BC550', 'BC547A'),
 ('BC550', 'BC547B'),
 ('BC550', 'BC547C'),
 ('BC550', 'BC548A'),
 ('BC550', 'BC548B'),
 ('BC550', 'BC548C'),
 ('BC550', 'BC549A'),
 ('BC550', 'BC549B'),
 ('BC550', 'BC549C'),
 ('BC550', 'BC550A'),
 ('BC550', 'BC550B'),
 ('BC550', 'BC550C'),
 ('BC818', 'BC817-16'),
 ('BC818', 'BC817-25'),
 ('BC818', 'BC817-40'),
 ('BC818', 'BC818-16'),
 ('BC818', 'BC818-25'),
 ('BC818', 'BC818-40'),
 ('BC818-40LT1-D', 'BC818-40LT1G'),
 ('BC818-40LT1-D', 'NSVBC818-40LT1G'),
 ('FAIRS19194-1', 'BC856A'),
 ('FAIRS19194-1', 'BC856B'),
 ('FAIRS19194-1', 'BC856C'),
 ('FAIRS19194-1', 'BC857A'),
 ('FAIRS19194-1', 'BC857B'),
 ('

In [None]:
# If necessary: snapshot the database after candidate extraction.
import os
import shutil

# shutil.copy raises if snorkel.db is missing or the copy fails; the previous
# os.system('cp ...') call discarded the exit status and failed silently.
# The destination is the same file the shell command produced: 'snorkel.db candidates'.
shutil.copy('snorkel.db', 'snorkel.db candidates')