# HARDWARE

## Setup

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

"""
To change attributes:
1) Change ATTRIBUTE and you're good to go
"""
# Target attribute for this run. It selects the attribute matcher and context space,
# is handed to the entity-level scorer, and is the prefix of the Postgres database name.
ATTRIBUTE = 'ce_v_max'
# Suffix appended to ATTRIBUTE to build the Postgres database name (SNORKELDBNAME).
COUNTER = '_baseline'
# Passed as `parallel` to the async corpus parse.
PARALLEL = 20
# Passed as `parallel` to candidate extraction.
PARALLEL_EXTRACTION = 8
# Passed as `max_docs` when parsing the dev corpus.
DEV_SIZE = 125

import os
import sys
# Extend the import path with the tables tutorial directory -- presumably where the
# hardware_* helper modules imported by later cells live (TODO confirm).
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')
# True when the SNORKELDB environment variable starts with 'postgres'; later cells
# branch on this flag to choose between the Postgres/async and SQLite/sync code paths.
snorkel_postgres = os.environ['SNORKELDB'].startswith('postgres')
print snorkel_postgres

True


In [2]:
import os
if snorkel_postgres:
    os.environ['SNORKELDBNAME'] = ATTRIBUTE + str(COUNTER)
    print os.system("dropdb " + os.environ['SNORKELDBNAME'])
    print os.system("createdb " + os.environ['SNORKELDBNAME'])
    print "SNORKELDBNAME = %s" % os.environ['SNORKELDBNAME']
else:
    try:
        os.remove('snorkel.db')
    except:
        pass

from snorkel import SnorkelSession
session = SnorkelSession()

0
0
SNORKELDBNAME = ce_v_max_baseline


## Parsing

In [3]:
import os
if snorkel_postgres:
    from snorkel.async_parser import parse_corpus, HTMLParser, AsyncOmniParser
    print "Starting async parse..."
    
    # PARSE DEV
    docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/html/'
    pdf_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/pdf/'
    doc_parser = HTMLParser()
    context_parser = AsyncOmniParser(blacklist=['style'], flatten=['span','br'], 
                                     tabular=True, lingual=True,
                                     visual=True, pdf_path=pdf_path)
    %time corpus = parse_corpus(session, 'Hardware Dev', docs_path,\
                                doc_parser, context_parser,\
                                max_docs=DEV_SIZE, parallel=PARALLEL)
    print "%s contains %d documents" % (corpus, len(corpus))
else:
    from snorkel.parser import CorpusParser, HTMLParser, OmniParser
    from snorkel.utils import get_ORM_instance
    from snorkel.queries import split_corpus

    print "Starting sync parse..."
    
    # PARSE DEV
    docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/html/'
    pdf_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/pdf/'
    doc_parser = HTMLParser(path=docs_path)
    context_parser = AsyncOmniParser(blacklist=['style'], flatten=['span','br'], 
                                 tabular=True, lingual=True,
                                 visual=True, pdf_path=pdf_path)
    cp = CorpusParser(doc_parser, context_parser, max_docs=125)

    %time corpus = cp.parse_corpus(name='Hardware Dev', session=session)
    print "%s contains %d documents" % (corpus, len(corpus))
    session.add(corpus)
    session.commit()

Starting async parse...
CPU times: user 420 ms, sys: 188 ms, total: 608 ms
Wall time: 1min 17s
Corpus (Hardware Dev) contains 122 documents


### Save Corpus

In [4]:
# If necessary
if not snorkel_postgres:
    import shutil
    # Snapshot the SQLite database file right after parsing. Copying through shutil
    # avoids the shell and the backslash-escaped space in the target name, and a failed
    # copy now raises instead of being silently ignored.
    shutil.copy('snorkel.db', 'snorkel.db corpus')

## Extraction

In [5]:
from snorkel.models import candidate_subclass

# Candidate type with two named arguments, 'part' and 'attr'. It is the class handed to
# CandidateExtractor below, and the baseline rule reads c.part / c.attr from its instances.
Part_Attr = candidate_subclass('Part_Attr', ['part','attr'])

### Define Matchers

In [6]:
from hardware_matchers import get_matcher

# The part matcher is always looked up under the fixed key 'part_rgx'; the attribute
# matcher follows whichever ATTRIBUTE is set in the config cell.
part_matcher = get_matcher('part_rgx')
attr_matcher = get_matcher(ATTRIBUTE)

### Define ContextSpaces

In [7]:
from hardware_spaces import get_space
# TEMP: the part n-gram space is built directly from OmniNgramsPart instead of being
# looked up through get_space like the attribute space -- TODO confirm whether this
# should eventually go through get_space as well.
from hardware_spaces import OmniNgramsPart
    
part_ngrams = OmniNgramsPart(n_max=3, expand=False)
# The attribute space is selected by ATTRIBUTE from the config cell.
attr_ngrams = get_space(ATTRIBUTE)

### Run CandidateExtractor

In [8]:
from snorkel.models import Corpus
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance
from snorkel.async_candidates import parallel_extract

# Extractor for Part_Attr candidates: context spaces and matchers are given as parallel
# lists in [part, attr] order. No throttler is supplied.
ce = CandidateExtractor(Part_Attr, 
                        [part_ngrams, attr_ngrams], 
                        [part_matcher, attr_matcher], 
                        throttler=None)


# Only the dev corpus is processed here. `corpus` and `candidates` are rebound on every
# iteration, so after the loop they refer to the last corpus handled; later cells read
# `candidates` directly.
for corpus_name in ['Hardware Dev']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = parallel_extract(session, ce, corpus, \
                                        corpus_name + ' Candidates', \
                                        parallel=PARALLEL_EXTRACTION)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
# A single commit after the loop covers every candidate set added above.
session.commit()

Extracting Candidates from Corpus (Hardware Dev)
CPU times: user 412 ms, sys: 280 ms, total: 692 ms
Wall time: 2min 57s
Candidate Set (Hardware Dev Candidates) contains 630170 Candidates


## Apply Rules

In [10]:
from snorkel.lf_helpers import is_horz_aligned, get_horz_ngrams, overlap
from snorkel.utils import ProgressBar

def baseline_rules(c):
    """Return True if candidate ``c`` passes the baseline heuristic, otherwise False.

    A candidate passes when both of the following hold:
      * its part is not tabular, or ``is_horz_aligned(c)`` is true; and
      * the n-grams from ``get_horz_ngrams(c.attr)`` overlap with one of the
        collector-emitter spellings listed below.
    """
    keywords = ['collector-emitter', 'collector emitter', 'collector - emitter']
    # Guard clause: a tabular part that is not horizontally aligned is rejected
    # before any n-grams are collected.
    if c.part.is_tabular() and not is_horz_aligned(c):
        return False
    return bool(overlap(keywords, get_horz_ngrams(c.attr)))

passing_candidates = set()
pb = ProgressBar(len(candidates))
for i, c in enumerate(candidates):
    pb.bar(i)
    if baseline_rules(c):
        passing_candidates.add(c)
pb.close()
        
print "%d/%d candidates accepted" % (len(passing_candidates), len(candidates))

7290/630170 candidates accepted


In [12]:
from snorkel.models import Corpus
from hardware_utils import entity_level_f1
import os

# Gold labels for the dev set, located under SNORKELHOME.
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/hardware_dev_gold.csv'
# Re-fetch the dev corpus by name; this rebinds the notebook-level `corpus` variable.
# NOTE(review): .one() presumably raises unless exactly one corpus has this name -- confirm.
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Dev').one()
# Score the accepted candidates against the gold file for ATTRIBUTE. The call prints the
# precision/recall/F1 summary and returns three collections, unpacked as TP, FP and FN.
(TP, FP, FN) = entity_level_f1(passing_candidates, gold_file, ATTRIBUTE, corpus)

Preparing candidates...
Scoring on Entity-Level Gold Data
Corpus Precision 0.135
Corpus Recall    0.245
Corpus F1        0.174
----------------------------------------
TP: 211 | FP: 1348 | FN: 651



In [None]:
from pprint import pprint
# Dump every false positive returned by entity_level_f1 for manual inspection.
# NOTE(review): the recorded run reported 1348 FP entries, so this output is long.
pprint(FP)

In [None]:
from pprint import pprint
# Dump every false negative returned by entity_level_f1 for manual inspection.
# NOTE(review): the recorded run reported 651 FN entries, so this output is long.
pprint(FN)