# HARDWARE

## Setup

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

"""
To change attributes:
1) Change ATTRIBUTE and you're good to go
"""
ATTRIBUTE = 'polarity'
COUNTER = '_baseline2'
PARALLEL = 20
PARALLEL_EXTRACTION = 8
DEV_SIZE = 125

import os
import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')
snorkel_postgres = os.environ['SNORKELDB'].startswith('postgres')
print snorkel_postgres

True


In [2]:
import os
if snorkel_postgres:
    os.environ['SNORKELDBNAME'] = ATTRIBUTE + str(COUNTER)
    print os.system("dropdb " + os.environ['SNORKELDBNAME'])
    print os.system("createdb " + os.environ['SNORKELDBNAME'])
    print "SNORKELDBNAME = %s" % os.environ['SNORKELDBNAME']
else:
    try:
        os.remove('snorkel.db')
    except:
        pass

from snorkel import SnorkelSession
session = SnorkelSession()

256
0
SNORKELDBNAME = polarity_baseline2


## Parsing

In [3]:
import os
if snorkel_postgres:
    from snorkel.async_parser import parse_corpus, HTMLParser, AsyncOmniParser
    print "Starting async parse..."
    
    # PARSE DEV
    docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/html/'
    pdf_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/pdf/'
    doc_parser = HTMLParser()
    context_parser = AsyncOmniParser(blacklist=['style'], flatten=['span','br'], 
                                     tabular=True, lingual=True,
                                     visual=True, pdf_path=pdf_path)
    %time corpus = parse_corpus(session, 'Hardware Dev', docs_path,\
                                doc_parser, context_parser,\
                                max_docs=DEV_SIZE, parallel=PARALLEL)
    print "%s contains %d documents" % (corpus, len(corpus))
else:
    from snorkel.parser import CorpusParser, HTMLParser, OmniParser
    from snorkel.utils import get_ORM_instance
    from snorkel.queries import split_corpus

    print "Starting sync parse..."
    
    # PARSE DEV
    docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/html/'
    pdf_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/pdf/'
    doc_parser = HTMLParser(path=docs_path)
    context_parser = AsyncOmniParser(blacklist=['style'], flatten=['span','br'], 
                                 tabular=True, lingual=True,
                                 visual=True, pdf_path=pdf_path)
    cp = CorpusParser(doc_parser, context_parser, max_docs=125)

    %time corpus = cp.parse_corpus(name='Hardware Dev', session=session)
    print "%s contains %d documents" % (corpus, len(corpus))
    session.add(corpus)
    session.commit()

Starting async parse...
CPU times: user 412 ms, sys: 180 ms, total: 592 ms
Wall time: 1min 16s
Corpus (Hardware Dev) contains 122 documents


### Save Corpus

In [4]:
# If necessary
# Snapshot the parsed corpus. In the non-Postgres case the notebook works
# against the local snorkel.db file (the one deleted during setup), so it is
# copied aside here. Nothing is done when Postgres is in use.
if not snorkel_postgres:
    import os
    # The backslash is passed through to the shell, where it escapes the space:
    # the copy is a single file named "snorkel.db corpus". The exit status of
    # cp is not checked; the trailing ';' only suppresses the cell's output.
    os.system('cp snorkel.db snorkel.db\ corpus');

## Extraction

In [5]:
from snorkel.models import candidate_subclass

# Candidate class used for extraction below. It is created with two named
# arguments, 'part' and 'attr' (later cells access `c.part` on instances).
Part_Attr = candidate_subclass('Part_Attr', ['part','attr'])

### Define Matchers

In [6]:
from hardware_matchers import get_matcher

# One matcher per candidate argument, in the same order as Part_Attr's
# ('part', 'attr'). The attribute matcher is selected by the ATTRIBUTE
# config value; 'part_rgx' is presumably a regex-based part-number matcher
# (defined in hardware_matchers — confirm there).
part_matcher = get_matcher('part_rgx')
attr_matcher = get_matcher(ATTRIBUTE)

### Define ContextSpaces

In [7]:
from hardware_spaces import get_space
from hardware_spaces import OmniNgramsPart
    
# One context space per candidate argument: a part-specific n-gram space
# (n_max=3, expand=False) and an attribute space selected by ATTRIBUTE.
part_ngrams = OmniNgramsPart(n_max=3, expand=False)
attr_ngrams = get_space(ATTRIBUTE)

### Run CandidateExtractor

In [8]:
from snorkel.models import Corpus
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance
from snorkel.async_candidates import parallel_extract

# Extractor for Part_Attr candidates. The spaces and matchers are given as
# parallel lists (part first, then attr); no throttler is applied.
ce = CandidateExtractor(Part_Attr, 
                        [part_ngrams, attr_ngrams], 
                        [part_matcher, attr_matcher], 
                        throttler=None)


# Only the dev corpus is processed here; the loop form allows more corpus
# names to be added to the list.
for corpus_name in ['Hardware Dev']:
    # Look the corpus up by name through the session.
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    # The result is named "<corpus name> Candidates"; `candidates` stays bound
    # after the loop and is used by the rule-filtering cell.
    %time candidates = parallel_extract(session, ce, corpus, \
                                        corpus_name + ' Candidates', \
                                        parallel=PARALLEL_EXTRACTION)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
# Single commit after all corpora have been processed.
session.commit()

Extracting Candidates from Corpus (Hardware Dev)
CPU times: user 308 ms, sys: 184 ms, total: 492 ms
Wall time: 20.3 s
Candidate Set (Hardware Dev Candidates) contains 32156 Candidates


## Apply Rules

In [9]:
from snorkel.lf_helpers import is_horz_aligned, get_horz_ngrams

def baseline_rules(c):
    """Decide whether candidate `c` passes the baseline filter.

    A candidate is rejected only when its part is tabular
    (`c.part.is_tabular()` is truthy) and `is_horz_aligned(c)` is falsy.
    Every other candidate is accepted.

    Returns:
        bool: True if the candidate is accepted, False otherwise.
    """
    # Guard clause: a tabular part that is not horizontally aligned is rejected.
    if c.part.is_tabular() and not is_horz_aligned(c):
        return False
    return True

passing_candidates = set()
for c in candidates:
    if baseline_rules(c):
        passing_candidates.add(c)

print "%d/%d candidates accepted" % (len(passing_candidates), len(candidates))

15300/32156 candidates accepted


In [10]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1_from_candidates
import os

# Gold-label CSV for the dev set, located under SNORKELHOME.
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/hardware_dev_gold.csv'
# Re-fetch the dev corpus by name; .one() raises unless exactly one row matches.
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Dev').one()
# Score only the rule-accepted candidates against the gold file for ATTRIBUTE.
# The result unpacks into three collections (TP, FP, FN) that are inspected
# in the following cells.
(TP, FP, FN) = entity_level_f1_from_candidates(passing_candidates, gold_file, ATTRIBUTE, corpus)

Preparing candidates...
Scoring on Entity-Level Gold Data
Corpus Precision 0.442
Corpus Recall    0.483
Corpus F1        0.462
----------------------------------------
TP: 448 | FP: 565 | FN: 479



In [11]:
from pprint import pprint
# Dump the FP collection returned by the scorer for error analysis.
# Entries print as 3-tuples — presumably (document, part, attribute value);
# confirm against hardware_utils.
pprint(FP)

[(u'AUKCS04635-1', u'2N3904', u'PNP'),
 (u'BC337', u'BC327', u'NPN'),
 (u'BC337', u'BC328', u'NPN'),
 (u'BC337-D', u'BC337-XX', u'NPN'),
 (u'BC546', u'BC556', u'NPN'),
 (u'BC546', u'BC557', u'NPN'),
 (u'BC546', u'BC558', u'NPN'),
 (u'BC546', u'BC559', u'NPN'),
 (u'BC546', u'BC560', u'NPN'),
 (u'BC546-BC548C(TO-92)', u'BC546-BC548C', u'NPN'),
 (u'BC546_DIOTEC', u'BC546', u'PNP'),
 (u'BC546_DIOTEC', u'BC546A', u'PNP'),
 (u'BC546_DIOTEC', u'BC546B', u'PNP'),
 (u'BC546_DIOTEC', u'BC547', u'PNP'),
 (u'BC546_DIOTEC', u'BC547A', u'PNP'),
 (u'BC546_DIOTEC', u'BC547B', u'PNP'),
 (u'BC546_DIOTEC', u'BC547C', u'PNP'),
 (u'BC546_DIOTEC', u'BC548', u'PNP'),
 (u'BC546_DIOTEC', u'BC548A', u'PNP'),
 (u'BC546_DIOTEC', u'BC548B', u'PNP'),
 (u'BC546_DIOTEC', u'BC548C', u'PNP'),
 (u'BC546_DIOTEC', u'BC549', u'PNP'),
 (u'BC546_DIOTEC', u'BC549B', u'PNP'),
 (u'BC546_DIOTEC', u'BC549C', u'PNP'),
 (u'BC546_DIOTEC', u'BC556', u'NPN'),
 (u'BC546_DIOTEC', u'BC556', u'PNP'),
 (u'BC546_DIOTEC', u'BC557', u'NPN'),


In [12]:
from pprint import pprint
# Dump the FN collection returned by the scorer for error analysis.
# Entries print as 3-tuples — presumably (document, part, attribute value);
# confirm against hardware_utils.
pprint(FN)

[('112823', 'BC546BG', 'NPN'),
 ('112823', 'BC546BRL1', 'NPN'),
 ('112823', 'BC546BZL1G', 'NPN'),
 ('112823', 'BC547ARL', 'NPN'),
 ('112823', 'BC547ARLG', 'NPN'),
 ('112823', 'BC547AZL1G', 'NPN'),
 ('112823', 'BC547BG', 'NPN'),
 ('112823', 'BC547BRL1G', 'NPN'),
 ('112823', 'BC547BZL1G', 'NPN'),
 ('112823', 'BC547C', 'NPN'),
 ('112823', 'BC547CG', 'NPN'),
 ('112823', 'BC547CZL1G', 'NPN'),
 ('112823', 'BC548BG', 'NPN'),
 ('112823', 'BC548BRL1G', 'NPN'),
 ('112823', 'BC548BZL1G', 'NPN'),
 ('112823', 'BC548CG', 'NPN'),
 ('112823', 'BC548CZL1G', 'NPN'),
 ('2N3906', '2N3906BU', 'PNP'),
 ('2N3906', '2N3906TA', 'PNP'),
 ('2N3906', '2N3906TAR', 'PNP'),
 ('2N3906', '2N3906TF', 'PNP'),
 ('2N3906', '2N3906TFR', 'PNP'),
 ('2N3906-D', '2N3906G', 'PNP'),
 ('2N3906-D', '2N3906RLRA', 'PNP'),
 ('2N3906-D', '2N3906RLRAG', 'PNP'),
 ('2N3906-D', '2N3906RLRM', 'PNP'),
 ('2N3906-D', '2N3906RLRMG', 'PNP'),
 ('2N3906-D', '2N3906RLRP', 'PNP'),
 ('2N3906-D', '2N3906RLRPG', 'PNP'),
 ('2N4123-D', '2N4123RLRM', 'NP