# Adding Visual Features

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
# Make the modules under $SNORKELHOME/tutorials/tables/ importable
# (presumably where parseHTMLoutput, visual_linking and hardware_utils live -- TODO confirm).
# Guarded so that re-running this cell does not keep appending duplicate entries.
tables_tutorial_dir = os.environ['SNORKELHOME'] + '/tutorials/tables/'
if tables_tutorial_dir not in sys.path:
    sys.path.append(tables_tutorial_dir)

from pdb import set_trace as t

### Step 1: Coordinate Extraction

In [2]:
# Ines's code takes PDF and yields pdf_word_list, coordinate_map
import os 
# import time
import subprocess
from parseHTMLoutput import extract_coordinates_HTML

pdf_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_pdf/'
filename = 'bc546-d'
pdf_file = pdf_path + filename + '.pdf'  # Path to PDF file

# Ask pdfinfo for the page count. The command is passed as an argument list
# (no shell), so a path containing spaces or shell metacharacters is handled
# correctly, and the "Pages:" line is parsed in Python instead of via grep/sed.
pdfinfo_output = subprocess.check_output(['pdfinfo', pdf_file]).decode('utf-8', 'replace')
page_count_lines = [line for line in pdfinfo_output.splitlines() if line.startswith('Pages:')]
if not page_count_lines:
    raise ValueError('pdfinfo reported no page count for {}'.format(pdf_file))
# Kept as a string of digits (as before); the extraction loop converts it with int().
nb_pages = page_count_lines[0].split(':', 1)[1].strip()

In [3]:
# Create words list and coordinates map.
# pdf_word_list accumulates the per-page word lists returned by
# extract_coordinates_HTML; coordinate_map merges the per-page mappings.
pdf_word_list = []
coordinate_map = {}
for page_number in range(1, int(nb_pages) + 1):
    page_label = str(page_number)
    # Convert exactly one page (-f N -l N) and capture what pdftotext writes to
    # stdout ("-"). The command is an argument list rather than a shell string,
    # so pdf_file is passed through verbatim even if it contains spaces or
    # shell metacharacters.
    html_content = subprocess.check_output(
        ['pdftotext', '-f', page_label, '-l', page_label, '-bbox-layout', pdf_file, '-'])
    page_words, page_coordinates = extract_coordinates_HTML(html_content, page_label)
    pdf_word_list.extend(page_words)
    coordinate_map.update(page_coordinates)

In [4]:
# Sort pdf_word_list by the first three fields of each word's coordinate entry.
# Those entries are unpacked elsewhere in this notebook as
# (page, top, left, bottom, right), so the resulting order is: page by page,
# top to bottom, then left to right. The fields appear to be stored as numeric
# strings, hence the float() conversion before comparing.
# The key takes the whole (word_id, word) entry and indexes it, which avoids the
# Python-2-only tuple-parameter lambda and looks the coordinates up only once.
sorted_pdf_word_list = sorted(
    pdf_word_list,
    key=lambda entry: tuple(float(value) for value in coordinate_map[entry[0]][:3]))

### Step 2: PDF to HTML Conversion

Use Adobe Acrobat (or another program of your choice) to convert the PDF to HTML while preserving the document structure.

### Step 3: HTML Parsing

In [5]:
# Payal's parser takes HTML and yields corpus object, html_word_list
import os
os.remove('snorkel.db')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.parser import CorpusParser
from snorkel.parser import HTMLParser
from snorkel.parser import OmniParser

html_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html/'
filename = 'bc546-d'
html_file = html_path + filename + '.html'
doc_parser = HTMLParser(path=html_file)
context_parser = OmniParser()
cp = CorpusParser(doc_parser, context_parser, max_docs=100) 

%time corpus = cp.parse_corpus(name='Hardware', session=session)

CPU times: user 3.05 s, sys: 164 ms, total: 3.21 s
Wall time: 5.63 s


In [6]:
from pprint import pprint

html_word_list = []
for phrase in corpus.documents[0].phrases:
    for i, word in enumerate(phrase.words):
        html_word_list.append(((phrase.id, i), word))

print len(html_word_list)
pprint(html_word_list[:10])

2662
[((2, 0), u'BC546'),
 ((2, 1), u'-'),
 ((2, 2), u'NPN'),
 ((2, 3), u'Amplifier'),
 ((2, 4), u'Transistors'),
 ((3, 0), u'BC546B'),
 ((3, 1), u','),
 ((3, 2), u'BC547A'),
 ((3, 3), u','),
 ((3, 4), u'B')]


### Step 4: Visual Linking

In [7]:
# Link the words parsed from the HTML to the words extracted from the PDF.
# The result is later indexed by (phrase.id, word index) and its values are used
# as coordinate_map keys, so `links` presumably maps HTML word ids to PDF word
# ids -- TODO confirm against visual_linking.link_lists.
from visual_linking import link_lists

# %time reports how long the linking takes.
%time links = link_lists(html_word_list, sorted_pdf_word_list)

CPU times: user 525 ms, sys: 6.4 ms, total: 532 ms
Wall time: 550 ms


  offsets.append(wordsB.index(wordsA[i]) - i)
  if a[1] == b[1]:


### Step 5: Updating with coordinates

In [8]:
# Payal's code walks through phrases, updating each one's five visual attributes
# (page, top, left, bottom, right)
import pickle
from snorkel.models import Phrase

for phrase in corpus.documents[0].phrases:
    # A phrase without words has no coordinates to store, and unpacking the
    # empty zip(*[]) below would raise ValueError, so skip it.
    if len(phrase.words) == 0:
        continue
    # One coordinate entry per word, found by following the HTML-word -> PDF-word
    # link, then transposed into five parallel per-word sequences.
    word_boxes = [coordinate_map[links[(phrase.id, i)]] for i in range(len(phrase.words))]
    (page, top, left, bottom, right) = zip(*word_boxes)
    # Only the first word's page is stored for the whole phrase.
    page = page[0]
    session.query(Phrase).filter(Phrase.id == phrase.id).update({"page": page,
                                                                 "top": top,
                                                                 "left": left,
                                                                 "bottom": bottom,
                                                                 "right": right})
# Persist all phrase updates in a single commit.
session.commit()

In [9]:
print corpus.documents[0].phrases[0].words
print corpus.documents[0].phrases[0].top

[u'BC546', u'-', u'NPN', u'Amplifier', u'Transistors']
('277.176000', '268.296000', '136.652000', '109.020000', '109.020000')


In [10]:
import os
from collections import defaultdict
from snorkel.models import Corpus, candidate_subclass
from snorkel.matchers import RegexMatchSpan, Union
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance
from hardware_utils import OmniNgramsPart, OmniNgramsTemp, get_gold_dict

# Candidate Type: a relation between a 'part' span and a 'temp' span.
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

# CandidateSpaces
part_ngrams = OmniNgramsPart(parts_by_doc=None, n_max=3)
temp_ngrams = OmniNgramsTemp(n_max=2)

# Matchers
# The patterns are written as raw strings (like temp_matcher below) so that
# regex escapes such as \s, \d and \/ are never interpreted as Python string
# escapes. The pattern text itself is unchanged.
eeca_matcher = RegexMatchSpan(rgx=r'([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z\/]{0,5}[0-9]?[A-Z]?([-][A-Z0-9]{1,7})?([-][A-Z0-9]{1,2})?)')
jedec_matcher = RegexMatchSpan(rgx=r'([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)')
jis_matcher = RegexMatchSpan(rgx=r'(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})')
others_matcher = RegexMatchSpan(rgx=r'((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)')
# A part is anything accepted by any of the four part-number patterns above.
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

# Temperature values: the three-digit strings 140-169.
temp_matcher = RegexMatchSpan(rgx=r'1[4-6]0', longest_match_only=False)

# Throttler: none is supplied for this run.
part_throttler = None

# Extractor: candidate spaces and matchers are given in the same
# (part, temp) order as the Part_Temp fields.
candidate_spaces = [part_ngrams, temp_ngrams]
candidate_matchers = [parts_matcher, temp_matcher]
ce = CandidateExtractor(Part_Temp, candidate_spaces, candidate_matchers, throttler=part_throttler)

# Extract
for corpus_name in ['Hardware']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = ce.extract(\
        corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()

Extracting Candidates from Corpus (Hardware)
CPU times: user 1.23 s, sys: 11.7 ms, total: 1.24 s
Wall time: 1.25 s
Candidate Set (Hardware Candidates) contains 540 Candidates


In [13]:
c = candidates[5]
print c.part
print c.part.char_start, c.part.char_end
print c.part.get_word_start()
print c.part.get_word_end()
print c.part.get_attrib_span('top')

print c.part.parent
print c.part.parent.char_offsets
print c.part.parent.top
print c.part.parent.bottom
print c.part.parent.page

ImplicitSpan("BC546B", parent=3, words=[0,2], position=[0])
0 13
0
2
50.100000
Phrase(Doc: bc546-d, Table: X, Row: X, Col: X, Position: 0, Text: BC546B, BC547A, B, C, BC548B, C)
[0, 6, 8, 14, 16, 17, 19, 20, 22, 28, 30]
('50.100000', '70.020600', '50.100000', '70.020600', '70.020600', '70.020600', '70.020600', '70.020600', '70.020600', '70.020600', '70.020600')
('73.500000', '93.420600', '73.500000', '93.420600', '93.420600', '93.420600', '93.420600', '93.420600', '93.420600', '93.420600', '93.420600')
1


### Step 6: Assess Results

In [5]:
# Use Ines's code to plot locations of words on the original PDF
import numpy as np
import cv2
from visual_linking import display_box

In [6]:
# Page selected for the visual checks below. The value is a string and is
# compared with == against box[0] and word_id[0] in the following cells, so
# those page fields are presumably strings as well -- TODO confirm.
page_to_diplay = '2' # page number to visualize

Bounding boxes drawn on the PDF page image

In [7]:
# Keep only the coordinate entries whose first field (the page) matches the
# selected page, then draw them with display_box.
boxes_on_page = [box for box in coordinate_map.values() if box[0] == page_to_diplay]
display_box(pdf_file, boxes_on_page)

Display the word ordering on a black page. The result is hard to read because overlapping boxes and labels are superimposed.

In [9]:
# Black canvas of 792 x 612 pixels (presumably a US Letter page at one pixel per
# PDF point -- TODO confirm for other documents).
page_height = 792
page_width = 612
img = np.zeros((page_height, page_width, 3))
font = cv2.FONT_HERSHEY_SIMPLEX

# Words on the selected page, in sorted order; each one gets an outline and a
# running number starting at 1.
page_word_ids = [word_id for word_id, _ in sorted_pdf_word_list if word_id[0] == page_to_diplay]
for rank, word_id in enumerate(page_word_ids, 1):
    __, top, left, bottom, right = coordinate_map[word_id]
    x_left, y_top, x_right, y_bottom = float(left), float(top), float(right), float(bottom)
    # Outline of the word's bounding box.
    cv2.rectangle(img, (int(x_left), int(y_top)), (int(x_right), int(y_bottom)), (0, 255, 0), 1)
    # Rank label, horizontally centred on the box and placed at its bottom edge.
    cv2.putText(img, str(rank), (int((x_left + x_right) / 2), int(y_bottom)), font, 0.3, (0, 0, 255), 1)
cv2.imshow('Ordering', img)
cv2.waitKey()  # press any key to exit the opencv output
cv2.destroyAllWindows()