# Adding Visual Features

In [1]:
%load_ext autoreload
%autoreload 2

import os
os.remove('snorkel.db')

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from pdb import set_trace as t

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Step 1: Coordinate Extraction

In [2]:
import os

# Build the path of the sample PDF inside the Snorkel checkout.
filename = 'bc546-d'
pdf_path = ''.join([os.environ['SNORKELHOME'],
                    '/tutorials/tables/data/hardware/hardware100_pdf/'])
pdf_file = ''.join([pdf_path, filename, '.pdf'])  # Path to PDF file

In [3]:
from visual_linking import extract_coordinates

# Read the PDF and obtain two structures from it.
# NOTE(review): judging by the output printed further down, pdf_word_list
# holds ((page, index), word) pairs and coordinate_map maps the same
# (page, index) keys to 5-tuples starting with the page number -- confirm
# against extract_coordinates.
pdf_word_list, coordinate_map = extract_coordinates(pdf_file)

In [4]:
from pprint import pprint

# Preview the first five words extracted from the PDF ...
pprint(pdf_word_list[:5])

# ... and five entries of the coordinate map (dict order is arbitrary).
coordinate_items = coordinate_map.items()
pprint(coordinate_items[:5])

[((1, 0), u'BC546B,'),
 ((1, 1), u'BC547A,'),
 ((1, 2), u'B,'),
 ((1, 3), u'C,'),
 ((1, 4), u'BC548B,')]
[((3, 271), (3, 645, 325, 653, 332)),
 ((4, 135), (4, 638, 314, 645, 322)),
 ((5, 31), (5, 138, 434, 148, 451)),
 ((6, 167), (6, 349, 367, 355, 385)),
 ((1, 215), (1, 575, 453, 584, 460))]


### Step 2: PDF to HTML Conversion

Use Adobe Acrobat (or another program of your choice) to convert the PDF to HTML with its structure preserved.

### Step 3: HTML Parsing

In [5]:
from snorkel.parser import CorpusParser
from snorkel.parser import HTMLParser
from snorkel.parser import OmniParser

html_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html/'
filename = 'bc546-d'
html_file = html_path + filename + '.html'
doc_parser = HTMLParser(path=html_file)
context_parser = OmniParser()
cp = CorpusParser(doc_parser, context_parser, max_docs=100) 

%time corpus = cp.parse_corpus(name='Hardware', session=session)

CPU times: user 3.12 s, sys: 163 ms, total: 3.28 s
Wall time: 5.74 s


In [6]:
from visual_linking import extract_words
from pprint import pprint

html_word_list = extract_words(corpus)

print len(pdf_word_list)
print len(html_word_list)
pprint(html_word_list[:5])

2199
2662
[((2, 0), u'BC546'),
 ((2, 1), u'-'),
 ((2, 2), u'NPN'),
 ((2, 3), u'Amplifier'),
 ((2, 4), u'Transistors')]


### Step 4: Visual Linking

In [7]:
from visual_linking import link_lists

# Link the HTML word list to the PDF word list and time the call.
# NOTE(review): the preview printed further down suggests `links` maps HTML
# (page, index) keys to PDF (page, index) keys -- confirm against link_lists.
%time links = link_lists(html_word_list, pdf_word_list)

CPU times: user 439 ms, sys: 3.22 ms, total: 442 ms
Wall time: 445 ms


In [8]:
# Preview five link entries. Python 2 only: dict.items() returns a list here,
# so it can be sliced directly.
pprint(links.items()[:5])

[((2, 0), (1, 31)),
 ((2, 1), (1, 23)),
 ((2, 2), (1, 8)),
 ((2, 3), (1, 6)),
 ((2, 4), (1, 7))]


### Step 5: Updating with coordinates

In [9]:
from visual_linking import load_coordinates

# Pass the corpus, the HTML->PDF links, the coordinate map and the session to
# the loader.
# NOTE(review): presumably this fills the top/bottom/left/right attributes of
# each phrase that are printed in the following cell -- confirm against
# load_coordinates.
load_coordinates(corpus, links, coordinate_map, session)

In [10]:
print corpus.documents[0].phrases[1].words
print corpus.documents[0].phrases[1].top
print corpus.documents[0].phrases[1].bottom
print corpus.documents[0].phrases[1].left
print corpus.documents[0].phrases[1].right

[u'BC546B', u',', u'BC547A', u',', u'B', u',', u'C', u',', u'BC548B', u',', u'C']
(50, 182, 50, 182, 182, 182, 70, 70, 70, 70, 70)
(73, 197, 73, 197, 197, 197, 93, 93, 93, 93, 93)
(59, 59, 149, 59, 59, 59, 149, 149, 59, 149, 149)
(143, 65, 234, 65, 65, 65, 163, 163, 143, 163, 163)


In [11]:
import os
from collections import defaultdict
from snorkel.models import Corpus, candidate_subclass
from snorkel.matchers import RegexMatchSpan, Union
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance
from hardware_utils import OmniNgramsPart, OmniNgramsTemp, get_gold_dict

# Candidate Type
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])
    
# CandidateSpaces
part_ngrams = OmniNgramsPart(parts_by_doc=None, n_max=3)
temp_ngrams = OmniNgramsTemp(n_max=2)

# Matchers
eeca_matcher = RegexMatchSpan(rgx='([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z\/]{0,5}[0-9]?[A-Z]?([-][A-Z0-9]{1,7})?([-][A-Z0-9]{1,2})?)')
jedec_matcher = RegexMatchSpan(rgx='([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)')
jis_matcher = RegexMatchSpan(rgx='(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})')
others_matcher = RegexMatchSpan(rgx='((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)')
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

temp_matcher = RegexMatchSpan(rgx=r'1[4-6]0', longest_match_only=False)

# Throttler
part_throttler = None

# Extractor
ce = CandidateExtractor(Part_Temp, 
                        [part_ngrams, temp_ngrams], 
                        [parts_matcher, temp_matcher], 
                        throttler=part_throttler)

# Extract
for corpus_name in ['Hardware']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = ce.extract(\
        corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()

Extracting Candidates from Corpus (Hardware)
CPU times: user 1.28 s, sys: 15.1 ms, total: 1.3 s
Wall time: 1.33 s
Candidate Set (Hardware Candidates) contains 540 Candidates


In [12]:
c = candidates[5]
print c.part
print c.part.char_start, c.part.char_end
print c.part.get_word_start()
print c.part.get_word_end()
print c.part.parent.page
print c.part.get_attrib_tokens('top')
print c.part.get_attrib_tokens('bottom')
print c.part.get_attrib_tokens('left')
print c.part.get_attrib_tokens('right')

Span("BC546", parent=1005, chars=[0,4], words=[0,0])
0 4
0
0
2
(186,)
(196,)
(314,)
(339,)


### Step 6: Assess Results

In [24]:
from visual_linking import display_candidates, display_boxes

# Show the extracted candidates on page 1 of the PDF.
# NOTE(review): display_boxes is imported here but not used in this cell.
display_candidates(pdf_file, candidates, page_num=1)

Boxes per page:
Page 1: 892 (24)
Page 2: 2988 (72)
Page 3: 40 (4)
Page 4: 40 (8)
Page 5: 360 (26)


In [18]:
from visual_linking import display_boxes, get_box

# Collect the box of every candidate's part span, dropping duplicates
# (several candidates can share the same part), then show page 2.
boxes = list(set(get_box(candidate.part) for candidate in candidates))
display_boxes(pdf_file, boxes, page_num=2)

Boxes per page:
Page 1: 8
Page 2: 30
Page 3: 1
Page 4: 2
Page 5: 9


### Display Ordering of PDF Word List 

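Draw the bounding box of every word on one page onto a black canvas and label each box with its position in the PDF word list. The result can be hard to read because boxes and labels overlap.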

In [29]:
import numpy as np
import cv2

# Page to draw and canvas size in pixels.
page_num = 2
page_height = 792
page_width = 612

# Black canvas with three colour channels.
img = np.zeros((page_height,page_width,3))
font = cv2.FONT_HERSHEY_SIMPLEX

# i numbers the words of the selected page in word-list order.
i = 0
# Throwaway fields get explicit names rather than `_` / `__`: binding those at
# notebook level shadows IPython's output-history variables.
for word_id, _word in pdf_word_list:
    if word_id[0] != page_num:
        continue
    i += 1
    _page, top, left, bottom, right = coordinate_map[word_id]
    # Convert once; coordinates may arrive as strings or numbers.
    top, left, bottom, right = float(top), float(left), float(bottom), float(right)
    # Green outline around the word.
    cv2.rectangle(img, (int(left), int(top)), (int(right), int(bottom)), (0,255,0), 1)
    # Index label, horizontally centred on the box, on its bottom edge.
    cv2.putText(img, str(i), (int((left + right)/2), int(bottom)), font, 0.3, (0,0,255), 1)
cv2.imshow('Ordering',img)
cv2.waitKey() # press any key to exit the opencv output 
cv2.destroyAllWindows() 

The end.