# HARDWARE

## Setup

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

"""
To change attributes:
1) Change ATTRIBUTE and you're good to go
"""
ATTRIBUTE = 'stg_temp_max'
COUNTER = 0
PARALLEL = 20
PARALLEL_EXTRACTION = 8
DEV_SIZE = 125

import os
import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')
snorkel_postgres = os.environ['SNORKELDB'].startswith('postgres')
print snorkel_postgres

True


In [2]:
import errno
import os

if snorkel_postgres:
    # The database is named after the attribute plus the run counter.
    os.environ['SNORKELDBNAME'] = ATTRIBUTE + str(COUNTER)
    # Drop and recreate the database; each call prints the status returned by os.system.
    print(os.system("dropdb " + os.environ['SNORKELDBNAME']))
    print(os.system("createdb " + os.environ['SNORKELDBNAME']))
    print("SNORKELDBNAME = %s" % os.environ['SNORKELDBNAME'])
else:
    # Start from a clean SQLite file. A missing file is expected on a first
    # run; any other failure (e.g. permissions) would leave a stale database
    # in place, so it is re-raised instead of being silently ignored.
    try:
        os.remove('snorkel.db')
    except OSError as err:
        if err.errno != errno.ENOENT:
            raise

# NOTE(review): snorkel is imported only after SNORKELDBNAME is set -- presumably
# it reads the variable at import time; keep this ordering unless confirmed otherwise.
from snorkel import SnorkelSession
session = SnorkelSession()

0
0
SNORKELDBNAME = stg_temp_max0


## Parsing

In [3]:
import os
if snorkel_postgres:
    from snorkel.async_parser import parse_corpus, HTMLParser, AsyncOmniParser
    print "Starting async parse..."
    
    # PARSE DEV
    docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/html/'
    pdf_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/pdf/'
    doc_parser = HTMLParser()
    context_parser = AsyncOmniParser(blacklist=['style'], flatten=['span','br'], 
                                     tabular=True, lingual=True,
                                     visual=True, pdf_path=pdf_path)
    %time corpus = parse_corpus(session, 'Hardware Dev', docs_path,\
                                doc_parser, context_parser,\
                                max_docs=DEV_SIZE, parallel=PARALLEL)
    print "%s contains %d documents" % (corpus, len(corpus))
else:
    from snorkel.parser import CorpusParser, HTMLParser, OmniParser
    from snorkel.utils import get_ORM_instance
    from snorkel.queries import split_corpus

    print "Starting sync parse..."
    
    # PARSE DEV
    docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/html/'
    pdf_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/pdf/'
    doc_parser = HTMLParser(path=docs_path)
    context_parser = AsyncOmniParser(blacklist=['style'], flatten=['span','br'], 
                                 tabular=True, lingual=True,
                                 visual=True, pdf_path=pdf_path)
    cp = CorpusParser(doc_parser, context_parser, max_docs=125)

    %time corpus = cp.parse_corpus(name='Hardware Dev', session=session)
    print "%s contains %d documents" % (corpus, len(corpus))
    session.add(corpus)
    session.commit()

Starting async parse...
CPU times: user 408 ms, sys: 224 ms, total: 632 ms
Wall time: 1min 2s
Corpus (Hardware Dev) contains 122 documents


### Save Corpus

In [4]:
# Only the SQLite backend has a database file to snapshot; skipped on Postgres.
if not snorkel_postgres:
    import os
    # Copies snorkel.db to a file named "snorkel.db corpus" (the space is escaped for the shell).
    snapshot_cmd = 'cp snorkel.db snorkel.db\ corpus'
    os.system(snapshot_cmd)

## Extraction

In [5]:
from snorkel.models import candidate_subclass

# Candidate class with two arguments: a `part` and an `attr`.
Part_Attr = candidate_subclass(
    'Part_Attr',
    ['part', 'attr'],
)

### Define Matchers

In [6]:
from hardware_matchers import get_matcher

# Matchers are looked up by name: a fixed 'part_rgx' matcher for parts and the
# matcher registered under the configured ATTRIBUTE for attribute values.
part_matcher, attr_matcher = (
    get_matcher('part_rgx'),
    get_matcher(ATTRIBUTE),
)

### Define ContextSpaces

In [7]:
# TEMP: OmniNgramsPart is imported directly for now, alongside get_space.
from hardware_spaces import get_space, OmniNgramsPart

# Part spans: n-grams of up to three tokens, with expansion turned off.
part_ngrams = OmniNgramsPart(expand=False, n_max=3)
# Attribute spans: whatever space is registered for the configured ATTRIBUTE.
attr_ngrams = get_space(ATTRIBUTE)

### Define Candidate Throttler

In [8]:
from hardware_throttlers import get_throttler

# Candidate throttling is switched off for this run: the extractor receives None.
# To turn it on, replace the assignment below with:
#     throttler = get_throttler(ATTRIBUTE)
throttler = None

### Run CandidateExtractor

In [9]:
from snorkel.models import Corpus
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance
from snorkel.async_candidates import parallel_extract

ce = CandidateExtractor(Part_Attr, 
                        [part_ngrams, attr_ngrams], 
                        [part_matcher, attr_matcher], 
                        throttler=throttler)


for corpus_name in ['Hardware Dev']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = parallel_extract(session, ce, corpus, \
                                        corpus_name + ' Candidates', \
                                        parallel=PARALLEL_EXTRACTION)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()

Extracting Candidates from Corpus (Hardware Dev)
CPU times: user 36 ms, sys: 96 ms, total: 132 ms
Wall time: 42.2 s
Candidate Set (Hardware Dev Candidates) contains 66405 Candidates


## Apply Rules

In [10]:
from snorkel.lf_helpers import is_horz_aligned

def baseline_rules(c):
    """Baseline acceptance rule for a candidate.

    A candidate whose part is not tabular is always accepted. A candidate
    whose part is tabular is accepted according to ``is_horz_aligned(c)``,
    whose result is returned unchanged.
    """
    if not c.part.is_tabular():
        return True
    return is_horz_aligned(c)

# Collect the candidates that satisfy the baseline rule.
passing_candidates = set()
for c in candidates:
    if not baseline_rules(c):
        continue
    passing_candidates.add(c)

# Report how many of the extracted candidates survived the rule.
print("%d/%d candidates accepted" % (len(passing_candidates), len(candidates)))

26224/66405 candidates accepted


In [11]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1_from_candidates
import os

# Entity-level gold labels for the dev documents.
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/hardware_dev_gold.csv'

# Re-fetch the dev corpus by name; .one() requires exactly one match.
corpus = (
    session.query(Corpus)
    .filter(Corpus.name == 'Hardware Dev')
    .one()
)

# Score the rule-passing candidates against the gold file; the three results
# are bound to TP, FP and FN and inspected in the following cells.
TP, FP, FN = entity_level_f1_from_candidates(
    passing_candidates, gold_file, ATTRIBUTE, corpus)

Preparing candidates...
Scoring on Entity-Level Gold Data
Corpus Precision 0.228
Corpus Recall    0.519
Corpus F1        0.317
----------------------------------------
TP: 461 | FP: 1557 | FN: 428



In [12]:
from pprint import pprint
# Pretty-print FP from the scoring cell for error analysis.
# The recorded run reports "FP: 1557", so expect a long output.
pprint(FP)

[(u'112823', u'BC546', u'180'),
 (u'112823', u'BC546', u'200'),
 (u'112823', u'BC546B', u'180'),
 (u'112823', u'BC546B', u'200'),
 (u'112823', u'BC547', u'180'),
 (u'112823', u'BC547', u'200'),
 (u'112823', u'BC547A', u'180'),
 (u'112823', u'BC547A', u'200'),
 (u'112823', u'BC547AZL1G', u'200'),
 (u'112823', u'BC547B', u'180'),
 (u'112823', u'BC547B', u'200'),
 (u'112823', u'BC547CG', u'200'),
 (u'112823', u'BC547CZL1G', u'180'),
 (u'112823', u'BC547CZL1G', u'200'),
 (u'112823', u'BC548', u'180'),
 (u'112823', u'BC548', u'200'),
 (u'112823', u'BC548B', u'180'),
 (u'112823', u'BC548B', u'200'),
 (u'112823', u'BC548BRL1G', u'180'),
 (u'112823', u'BC548C', u'180'),
 (u'112823', u'BC548C', u'200'),
 (u'2N3906', u'2N3906', u'200'),
 (u'2N3906', u'MMBT3906', u'200'),
 (u'2N3906', u'PZT3906', u'200'),
 (u'2N3906-D', u'2N3906', u'160'),
 (u'2N3906-D', u'2N3906', u'180'),
 (u'2N3906-D', u'2N3906', u'200'),
 (u'2N3906-D', u'2N3906G', u'200'),
 (u'2N3906-D', u'2N3906RL1G', u'200'),
 (u'2N3906-D',

In [13]:
from pprint import pprint
# Pretty-print FN from the scoring cell for error analysis.
# The recorded run reports "FN: 428".
pprint(FN)

[('112823', 'BC546BG', '150'),
 ('112823', 'BC546BRL1', '150'),
 ('112823', 'BC546BRL1G', '150'),
 ('112823', 'BC546BZL1G', '150'),
 ('112823', 'BC547ARL', '150'),
 ('112823', 'BC547ARLG', '150'),
 ('112823', 'BC547AZL1G', '150'),
 ('112823', 'BC547BG', '150'),
 ('112823', 'BC547BRL1G', '150'),
 ('112823', 'BC547C', '150'),
 ('112823', 'BC547CG', '150'),
 ('112823', 'BC547CZL1G', '150'),
 ('112823', 'BC548BG', '150'),
 ('112823', 'BC548BRL1G', '150'),
 ('112823', 'BC548CG', '150'),
 ('112823', 'BC548CZL1G', '150'),
 ('2N3906', '2N3906BU', '150'),
 ('2N3906', '2N3906TA', '150'),
 ('2N3906', '2N3906TAR', '150'),
 ('2N3906', '2N3906TF', '150'),
 ('2N3906', '2N3906TFR', '150'),
 ('2N3906-D', '2N3906G', '150'),
 ('2N3906-D', '2N3906RL1', '150'),
 ('2N3906-D', '2N3906RL1G', '150'),
 ('2N3906-D', '2N3906RLRA', '150'),
 ('2N3906-D', '2N3906RLRAG', '150'),
 ('2N3906-D', '2N3906RLRM', '150'),
 ('2N3906-D', '2N3906RLRMG', '150'),
 ('2N3906-D', '2N3906RLRP', '150'),
 ('2N3906-D', '2N3906RLRPG', '1