# PDF PROCESSING#####################################################

In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter,TextConverter,XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO, StringIO
from bs4 import BeautifulSoup
from bs4 import NavigableString
from bs4 import Tag
import re
import pandas as pd
import operator
import math
from collections import Counter

In [2]:
def convert(fname, case='text', pages=None):
    if not pages: pagenums = set();
    else:         pagenums = set(pages);      
    manager = PDFResourceManager() 
    la = LAParams()
    caching = True

    if case == 'text' :
        output = StringIO()
        converter = TextConverter(manager, output, laparams=la)     
    elif case == 'HTML' :
        output = StringIO()
        converter = HTMLConverter(manager, output, laparams=la)
    elif case == 'XML' :
        output = StringIO()
        converter = XMLConverter(manager, output, laparams=la)
    else:
        return

    interpreter = PDFPageInterpreter(manager, converter)   
    infile = open(fname, 'rb')

    for page in PDFPage.get_pages(infile, pagenums, caching=caching, check_extractable=True):
        interpreter.process_page(page)

    convertedPDF = output.getvalue()  

    infile.close(); converter.close(); output.close()
    return convertedPDF

In [3]:
def soup(doc):
    soup = BeautifulSoup(doc)
    return soup

In [4]:
def extractInfoFromHTML(soup):
    #extract font size and text from soup object
    font_spans = [data for data in soup.select('span') if 'font-size' in str(data) ]
    output = []
    for i in font_spans:
        tup = ()
        fonts_size = re.search(r'(?is)(font-size:)(.*?)(px)',str(i.get('style'))).group(2)
        fonts_family = re.search(r'(?is)(font-family:)(.*?)(;)',str(i.get('style'))).group(2)
        tup = (str(i.text).strip(),fonts_family.strip(), fonts_size.strip())
        output.append(tup)
    
    #make a dictionary with font size as keys and text as value list
    dictionary = {}
    for index, element in enumerate(output):
        if(element[2] not in dictionary):
            dictionary[element[2]] = [(element[1], element[0].replace("\n", " "))]
        else:
            dictionary[element[2]].append((element[1], element[0].replace("\n", " ")))

    data = pd.DataFrame.from_dict(dictionary, orient='index')
            
    return (dictionary,data,output)

In [43]:
def extractInfoFromXML(soup):
    #bottom left and top right coordinates in xml bbox 
    wordList = soup.find_all('textline')
    words_in_textline = ""
    fonts = {}
    point_size = {}
    font = ""
    size = ""
    wList = []
    
    for textline in wordList:
        for text in textline:
            if isinstance(text, NavigableString):
                continue
            if isinstance(text, Tag) and text.get('bbox') != None:
                words_in_textline += text.text
                #Point size of the majority of its constituent words.
                #Font of the majority of its constituent words.
                #Coordinates relating to its position on the page.
                if text.get('font') in fonts.keys(): 
                    fonts[""+text.get('font')] += 1
                else:
                    fonts[""+text.get('font')] = 1
                
                if text.get('size') in point_size.keys(): 
                    point_size[""+text.get('size')] += 1
                else:
                    point_size[""+text.get('size')] = 1
                    
        font = max(fonts.items(), key=operator.itemgetter(1))[0]
        size = max(point_size.items(), key=operator.itemgetter(1))[0]
        bb = textline.get('bbox').split(',')
        xm = (float(bb[2])+float(bb[0]))/2
        ym = (float(bb[3])+float(bb[1]))/2
        wList.append((size,words_in_textline,bb[0],bb[1],xm,ym,bb[2],bb[3],font))
        
        fonts = {}
        point_size = {}
        words_in_textline = ""
        
    data = pd.DataFrame(wList)
    data.columns = ["font_size", "text", "x0","y0","xm","ym","x1","y1","font_family"]
    data[["x0","y0","xm","ym","x1","y1"]] = data[["x0","y0","xm","ym","x1","y1"]].apply(pd.to_numeric)
    
    return (data)

In [44]:
def wrapStringInHTMLWindows(program, url, output):
    import datetime
    from webbrowser import open_new_tab

    now = datetime.datetime.today().strftime("%Y%m%d-%H%M%S")

    filename = program + '.html'
    f = open(filename,'w')

    wrapper = """<html>
    <head>
    <title>%s</title>
    </head>
    <body><p>URL:%s</p></body>
    </html>"""
    
    for ele in output:
        wrapper.append(ele[0],ele[1],ele[2])

    whole = wrapper % (program, now, url)
    f.write(whole)
    f.close()

    open_new_tab(filename)


In [45]:
def segmentByFontAndSize(data):
    #Phase two, geometric segmentation of text regions
    dfs = dict(tuple(data.groupby([data['font_size'],data['font_family']])))
    df_list = []
    for key in dfs.keys():
        df_list.append(pd.DataFrame(dfs[key]))
    return df_list

In [46]:
def make_sub_bins(df_list):
    sub_bins_list = []
    for df in df_list:
        #apply counter at each coordinate
        x0 = Counter(list(df['x0'])).most_common()
        x1 = Counter(list(df['x1'])).most_common()

        #select all line with x0 having max count
        most_common_x0 = x0[0][0] 
        dfx0 = df.loc[df['x0'] == most_common_x0]
        #select the longest line from above lines
        longest_common_line = dfx0.sort_values(by ='x1', axis=0, ascending = False).iloc[0]
        x0_line = float(longest_common_line['x0'])
        x1_line = float(longest_common_line['x1'])
        #create subbins of lines on the left and right side of this line
        left_sub_bin = df.loc[df['x1'].astype(float) < x0_line]
        right_sub_bin = df.loc[df['x0'].astype(float) > x1_line]
        #remove left and right subbin from main dataframe to find the mioddle subbin 
        temp = df.loc[~df.set_index(list(df.columns)).index.isin(right_sub_bin.set_index(list(right_sub_bin.columns)).index)]
        middle_sub_bin = temp.loc[~temp.set_index(list(temp.columns)).index.isin(left_sub_bin.set_index(list(left_sub_bin.columns)).index)]
        sub_bins_list.append((left_sub_bin,middle_sub_bin,right_sub_bin))
    
    return sub_bins_list

In [47]:
doc = convert('./data/sample.pdf','XML')
s = soup(doc)
data = extractInfoFromXML(s)
df_list = segmentByFontAndSize(data)
sub_bins_list = make_sub_bins(df_list)

In [84]:
df_list[1]

Unnamed: 0,font_size,text,x0,y0,xm,ym,x1,y1,font_family
24,11.574,Android Workshop (02/2019),38.85,426.465,110.5945,432.2515,182.339,438.038,QEBAAA+Ubuntu-Regular
27,11.574,Co-Supervisor of Final Year Students (Mobile,38.85,388.986,140.198,394.773,241.546,400.56,QEBAAA+Ubuntu-Regular
28,11.574,Application to Detect Drowsiness and Alert Dri...,38.85,378.017,153.747,383.8035,268.644,389.59,QEBAAA+Ubuntu-Regular
29,11.574,(01/2019),38.85,367.048,61.038,372.8345,83.226,378.621,QEBAAA+Ubuntu-Regular
39,11.574,Android Application Development (07/2017),38.85,267.181,145.588,272.968,252.326,278.755,QEBAAA+Ubuntu-Regular
71,11.574,Package Caliber Assessment (2017 – 2018),309.426,503.25,405.866,509.037,502.306,514.824,QEBAAA+Ubuntu-Regular
79,11.574,Picture Quotes Text Photo Editor,309.426,400.184,390.5855,405.971,471.745,411.758,QEBAAA+Ubuntu-Regular
83,11.574,GPS: Proﬁling & Tracking,309.426,352.879,371.212,358.6655,432.998,364.452,QEBAAA+Ubuntu-Regular
88,11.574,Movie Suggestion Application,309.426,296.661,377.1225,302.448,444.819,308.235,QEBAAA+Ubuntu-Regular
91,11.574,"Muslim Kit (Prayer Time, Tasbeeh ,Zakat Calcul...",309.426,258.269,425.3595,264.0555,541.293,269.842,QEBAAA+Ubuntu-Regular


In [107]:
#calculate the threshold of line spacing
#-1 for empty dataframe and 
#group lines with spacing less then the threshold
thresh_list = []
for sub_bin_tuple in sub_bins_list:
    thresh_tuple = []
    for df in sub_bin_tuple:
        if len(df) > 1:
            thresh = 0
            df = df.sort_values(by ='y0', axis=0, ascending = False)
            df['diff_y0'] = (df['y0'] - df['y0'].shift(-1))
            th_list = Counter(list(map(int,(list(df['diff_y0'][:-1]))))).most_common()
            temp = [i[0] for i in th_list]
            if(len(set([i[1] for i in th_list]))==1):
                temp.sort()
                thresh = temp[0]
            else:
                if th_list[0][0] != 0:
                    thresh = th_list[0][0]
                else:
                    thresh = th_list[1][0]
            thresh_tuple.append(thresh)
        elif len(df) == 1:
            thresh_tuple.append(-2)
    thresh_list.append(thresh_tuple)

IndexError: list index out of range

In [149]:
df = sub_bins_list[5][1]
if len(df) > 1:
    thresh = 0
    df = df.sort_values(by ='y0', axis=0, ascending = False)
    df['diff_y0'] = (df['y0'] - df['y0'].shift(-1))
    th_list = Counter(list(map(int,(list(df['diff_y0'][:-1]))))).most_common()
    temp = [i[0] for i in th_list]
    if(len(set([i[1] for i in th_list]))==1):
        temp.sort()
        thresh = temp[0]
    else:
        if th_list[0][0] != 0:
            thresh = th_list[0][0]
        else:
            thresh = th_list[1][0]
    print(thresh)

In [147]:
sub_bins_list[5][1]

Unnamed: 0,font_size,text,x0,y0,xm,ym,x1,y1,font_family
0,26.567,Waleed Mushtaq,37.936,795.132,125.6615,808.415,213.387,821.698,QEBAAA+Ubuntu-Regular


In [97]:
ar = [1,2.1,3,0]
ar.sort()
ar

[0, 1, 2.1, 3]

In [76]:
df['diff_y0'][:-1]

2       10.970
3       10.969
4      147.857
14     102.381
21     334.107
98       0.000
103     12.797
99       0.000
104     12.798
105      0.000
100     12.798
106      0.000
101     10.969
107     12.797
102      0.000
108     10.970
Name: diff_y0, dtype: float64

In [73]:
list(map(int, [1.0,2.05]))

ValueError: invalid literal for int() with base 10: 'd'

In [32]:
df = sub_bins_list[0][1]
# convert just columns "a" and "b"
df[["x0", "y0"]] = df[["x0", "y0"]].apply(pd.to_numeric)
df.iloc[2]['x0'].type()
#df.sort_values(by ='y0', axis=0, ascending = False)

AttributeError: 'numpy.float64' object has no attribute 'type'

In [13]:
#phase3 algorithm sudo code

In [14]:
#step0: sort the bin on the basis of x1 in desc order and set split == true
#step1: if len(bin) > 1 select the bounding box with max x1 say it maxb else go to step 6
#step2: for b in bin if x1 of b is less then the x0 of maxb then move it into left sub-bin 
#       and set split == true else split == false
#step3: if split == false, throw maxb into temp sub-bin and go to step 1 else throw bin in 
#       right sub-bin and repeat step 1 to 3 on right and left sub-bins respectively as bin
#step5: throw b in new sub-bin and stop the algorithm 

In [23]:
s

<?xml version="1.0" encoding="utf-8" ?>
<pages>
<page bbox="0.000,0.000,595.000,842.000" id="1" rotate="0">
<textbox bbox="37.936,779.837,213.387,821.698" id="0">
<textline bbox="37.936,795.132,213.387,821.698">
<text bbox="37.936,795.132,59.217,821.698" font="QEBAAA+Ubuntu-Regular" size="26.567">W</text>
<text bbox="58.692,795.132,70.648,821.698" font="QEBAAA+Ubuntu-Regular" size="26.567">a</text>
<text bbox="70.740,795.132,76.995,821.698" font="QEBAAA+Ubuntu-Regular" size="26.567">l</text>
<text bbox="77.039,795.132,89.849,821.698" font="QEBAAA+Ubuntu-Regular" size="26.567">e</text>
<text bbox="89.940,795.132,102.750,821.698" font="QEBAAA+Ubuntu-Regular" size="26.567">e</text>
<text bbox="102.841,795.132,116.320,821.698" font="QEBAAA+Ubuntu-Regular" size="26.567">d</text>
<text bbox="116.435,795.132,121.720,821.698" font="QEBAAA+Ubuntu-Regular" size="26.567"> </text>
<text bbox="121.766,795.132,141.708,821.698" font="QEBAAA+Ubuntu-Regular" size="26.567">M</text>
<text bbox="141.869,7