# Adding Visual Features

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell is executed, so edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os

# Start from a clean database file. The removal is guarded so this cell also
# works on a fresh checkout (or after a manual cleanup), where 'snorkel.db'
# does not exist yet and an unconditional os.remove would raise OSError.
if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')

import sys
# Make modules under $SNORKELHOME/tutorials/tables/ importable from this
# notebook (a later cell imports hardware_utils).
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
# Session object handed to the parser, the candidate extractor and the ORM
# lookups in the cells below.
session = SnorkelSession()

# Debugging shortcut: calling t() invokes pdb.set_trace().
from pdb import set_trace as t

### Convert PDF to HTML

Use Adobe Acrobat (or another program of your choice) to convert each PDF to HTML that preserves the document structure.

### Parse HTML and PDF

In [2]:
import os
from snorkel.parser import CorpusParser
from snorkel.parser import HTMLParser
from snorkel.parser import OmniParser

# Directories holding the source PDFs and their HTML conversions, both
# located under $SNORKELHOME.
pdf_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_pdf/'
html_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html/'

# Document to parse (base name without extension); other documents are kept
# below as commented-out alternatives.
filename = 'bc546-d'
# filename = 'bc550'
# filename = 'VISHS23888-1' 
# filename = 'BC546-BC548C(TO-92)'
# filename = 'INFNS19372-1'
# filename = 'PHGLS18216-1'
html_file = html_path + filename + '.html'
# Alternative: point the parser at the whole HTML directory instead of one file.
# html_file=html_path

# The document parser reads the selected HTML file; the context parser is
# given the PDF directory with visual=True.
# NOTE(review): presumably this links parsed text to PDF coordinates - confirm
# against OmniParser.
doc_parser = HTMLParser(path=html_file)
context_parser = OmniParser(visual=True, pdf_path=pdf_path, session=session)
# max_docs=1 caps parsing at a single document.
cp = CorpusParser(doc_parser, context_parser, max_docs=1) 

# Parse into a corpus named 'Hardware'; %time reports how long the call took.
%time corpus = cp.parse_corpus(name='Hardware', session=session)

# # Save results
# os.system('cp snorkel.db snorkel.db\ corpus');

CPU times: user 4.6 s, sys: 122 ms, total: 4.72 s
Wall time: 6.88 s


In [3]:
# Show the `left` attribute of the first phrase of the first parsed document.
first_phrase = corpus.documents[0].phrases[0]
print(first_phrase.left)

[59, 99, 59, 59, 155]


### Load results

In [None]:
import os
import shutil
from snorkel.models import Corpus
from snorkel.utils import get_ORM_instance

# Restore the database snapshot (file name "snorkel.db corpus") over the
# working database, then look up the 'Hardware' corpus in it.
#
# shutil.copyfile replaces the previous shell `cp` call: it does not depend on
# a POSIX shell, needs no escaped space in the file name, and - unlike
# os.system, whose exit status was discarded - fails loudly when the snapshot
# is missing.
corpus_snapshot = 'snorkel.db corpus'
if not os.path.exists(corpus_snapshot):
    raise IOError(
        "Snapshot %r not found; run the (commented-out) 'Save results' step "
        "of the parsing cell first." % corpus_snapshot
    )
shutil.copyfile(corpus_snapshot, 'snorkel.db')
corpus = get_ORM_instance(Corpus, session, 'Hardware')

### Extract Candidates

In [4]:
import os
from collections import defaultdict
from snorkel.models import Corpus, candidate_subclass
from snorkel.matchers import RegexMatchSpan, Union
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance
from hardware_utils import OmniNgramsPart, OmniNgramsTemp, get_gold_dict

# Candidate type: a relation with two arguments, 'part' and 'temp'.
Part_Temp = candidate_subclass('Part_Temp', ['part', 'temp'])

# Candidate spaces: n-gram generators for each argument.
# NOTE: no part linking right now (parts_by_doc=None).
part_ngrams = OmniNgramsPart(parts_by_doc=None, n_max=3)
temp_ngrams = OmniNgramsTemp(n_max=2)

# Part-number matchers, one regular expression per naming family; any of
# them matching is enough (Union). Patterns are raw strings so that the
# backslash sequences reach the regex engine untouched.
eeca_matcher = RegexMatchSpan(
    rgx=r'([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z\/]{0,5}[0-9]?[A-Z]?([-][A-Z0-9]{1,7})?([-][A-Z0-9]{1,2})?)')
jedec_matcher = RegexMatchSpan(
    rgx=r'([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)')
jis_matcher = RegexMatchSpan(
    rgx=r'(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})')
others_matcher = RegexMatchSpan(
    rgx=r'((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)')
parts_matcher = Union(
    eeca_matcher,
    jedec_matcher,
    jis_matcher,
    others_matcher)

# Temperature matcher: one or more digits followed by a final 0 or 5.
temp_matcher = RegexMatchSpan(rgx=r'\d+[05]', longest_match_only=False)

# Throttler
def part_throttler(x):
    """Return True when both spans of the pair share the same first 'page' token.

    `x` is indexed as a pair: x[0] is compared against x[1].
    """
    first_page = x[0].get_attrib_tokens('page')[0]
    second_page = x[1].get_attrib_tokens('page')[0]
    return first_page == second_page
# part_throttler = None

# Extractor: pairs each candidate space with its matcher, in argument order,
# and applies the throttler defined above.
ce = CandidateExtractor(
    Part_Temp,
    [part_ngrams, temp_ngrams],
    [parts_matcher, temp_matcher],
    throttler=part_throttler)

# Extract
for corpus_name in ['Hardware']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = ce.extract(corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()

Extracting Candidates from Corpus (Hardware)
CPU times: user 5.46 s, sys: 23 ms, total: 5.49 s
Wall time: 5.5 s
Candidate Set (Hardware Candidates) contains 5367 Candidates


In [None]:
# # %%time
# from snorkel.lf_helpers import *

# yes = 0
# no = 0
# subset = []
# # d = (candidates[0][0], candidates[1][0])
# # print bbox_vert_aligned(bbox_from_span(d[0]), bbox_from_span(d[1]))
# for c in candidates:
#     if is_vert_aligned_center(c):
#         yes += 1
#         subset.append(c)
#     else:
#         no += 1
# print yes, no

In [5]:
from snorkel.lf_helpers import *

# Inspect a single candidate (index 100): the candidate itself, the parent of
# its part span, whether that span is visual, and the 'pos_tags' n-grams
# yielded by get_horz_ngrams for it.
c = candidates[100]
print(c)
print(c.part.parent)
print(c.part.is_visual())
print(list(get_horz_ngrams(c.part, attrib='pos_tags')))

Part_Temp(Span("BC547AZL1G", parent=1488, chars=[0,9], words=[0,0]), ImplicitSpan("5000", parent=1543, words=[0,0], position=[0]))
Phrase (Doc: bc546-d, Table: 6, Row: 8, Col: 0, Index: 0, Text: BC547AZL1G)
True
[u'to', u'nn', u'cd', u'cd', u':', u'nnp', u'nnp']


### View Results

In [None]:
# c = candidates[15]
# print c.part
# print c.part.char_start, c.part.char_end
# print c.part.parent.page
# print c.part.get_attrib_tokens('top')
# print c.part.get_attrib_tokens('bottom')
# print c.part.get_attrib_tokens('left')
# print c.part.get_attrib_tokens('right')


# from snorkel.entity_features import visual_binary_features
# from snorkel.lf_helpers import get_aligned_lemmas, _bbox_from_span

# print c.part.parent.document
# for c in candidates:
    
#     span1, span2 = c.get_arguments()
    
#     feats = set()
#     for f in visual_binary_features(span1, span2):
#         feats.add(f)
#     text1 =   span1.get_span()
#     text2 = span2.get_span()
#     if not  'Y_ALIGNED' in feats or span1.parent.page!=2: continue
#     print '='*20
#     print 'For candidate pair:'
#     print span1.get_span()
#     print span2.get_span()
#     print 'Visual features are:'
#     for f in feats: print f
#     print 'LF is_aligned_with_lemmas:', 'min' in get_aligned_lemmas(span2)
#     print 'Phrase1', span1.parent,  span1.parent.page#.text, span1.parent.bbox
#     print 'Phrase2', span2.parent,  span2.parent.page#.text, span2.parent.bbox, span2.parent.page
#     print _bbox_from_span(span1), _bbox_from_span(span2)
    
# print len(candidates)

In [None]:
# context_parser.vizlink.display_links(10)

In [None]:
# context_parser.vizlink.display_words(page_num=2, display=True)

In [7]:
# Display words for target 'BC546' on page 2 via the parser's vizlink helper.
# NOTE(review): the recorded output lists box counts per page, so this
# presumably draws one box per matching word - confirm against vizlink.
context_parser.vizlink.display_words(target='BC546', page_num=2)

Boxes per page: total (unique)
Page 1: 3 (3)
Page 2: 8 (8)
Page 4: 1 (1)


In [6]:
# Display the extracted candidates on page 2 via the parser's vizlink helper.
# NOTE(review): the recorded output reports total vs. unique boxes per page,
# so many candidates presumably share the same box - confirm against vizlink.
context_parser.vizlink.display_candidates(candidates, page_num=2)

Boxes per page: total (unique)
Page 1: 750 (29)
Page 2: 8928 (108)
Page 3: 288 (50)
Page 4: 84 (42)
Page 5: 648 (36)
Page 6: 36 (7)


In [None]:
# from visual_linking import display_boxes, get_box

# boxes = []
# for c in candidates:
#     boxes.append(get_box(c.part))
# boxes = list(set(boxes))
# display_boxes(pdf_file, boxes, page_num=2)

### Display Ordering of PDF Word List 

Display the ordering on a blank page. The result is not easy to read because the drawn elements overlap one another.

In [None]:
# import numpy as np
# import cv2
# import math

# page_num = 2
# page_height = 792
# page_width = 612
# img = np.ones((page_height,page_width,3))*255
# font = cv2.FONT_HERSHEY_SIMPLEX
# letter_width = 3
# i = 0
# for word_id, _ in pdf_word_list:
#     if word_id[0] == page_num:
#         i += 1
#         _, top, left, bottom, right = coordinate_map[word_id]
#         cv2.rectangle(img, (left, top), (right, bottom), (0,255,0), 1)
#         cv2.putText(img, 
#                     str(i), 
#                     ((left + right)/2 - letter_width*int(math.ceil(math.log10(i))), 
#                     bottom + (top - bottom)/4), 
#                     font, 
#                     0.3, 
#                     (255,0,0), 
#                     1)
# cv2.imshow('PDF Word List Order',img)
# cv2.waitKey() # press any key to exit the opencv output 
# cv2.destroyAllWindows() 

# The end.