# PDF PROCESSING#####################################################

In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter,TextConverter,XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO, StringIO
from bs4 import BeautifulSoup
from bs4 import NavigableString
from bs4 import Tag
import re
import pandas as pd
import operator
import math
from collections import Counter

In [2]:
def convert(fname, case='text', pages=None):
    """Convert a PDF file to text, HTML or XML using pdfminer.

    Args:
        fname: Path of the PDF file to read.
        case: Output format; one of ``'text'``, ``'HTML'`` or ``'XML'``
            (case-sensitive).
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``. A falsy value passes an empty set.

    Returns:
        The converted document as a string, or ``None`` when ``case`` is not
        one of the supported formats.
    """
    # Reject unsupported formats before any pdfminer object is created.
    if case not in ('text', 'HTML', 'XML'):
        return None

    converter_classes = {
        'text': TextConverter,
        'HTML': HTMLConverter,
        'XML': XMLConverter,
    }
    pagenums = set(pages) if pages else set()

    manager = PDFResourceManager()
    output = StringIO()
    converter = converter_classes[case](manager, output, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        # The context manager closes the PDF even if a page fails to process.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # The buffer is read before the converter is closed, so the returned
        # string matches what this function has always produced.
        return output.getvalue()
    finally:
        converter.close()
        output.close()

In [3]:
def soup(doc):
    """Parse ``doc`` with BeautifulSoup and return the parsed object."""
    return BeautifulSoup(doc)

In [4]:
def extractInfoFromHTML(soup):
    """Collect text, font family and font size from styled ``<span>`` tags.

    Only spans whose own ``style`` attribute declares a ``font-size`` in
    ``px`` are used. Spans without one are skipped instead of raising, and a
    missing ``font-family`` declaration yields an empty family string.

    Args:
        soup: Parsed document offering ``select('span')``; each span must
            offer ``get('style')`` and ``text``.

    Returns:
        A tuple ``(dictionary, data, output)`` where

        * ``dictionary`` maps each font-size string to a list of
          ``(font_family, text)`` tuples, with newlines in the text replaced
          by spaces,
        * ``data`` is ``pd.DataFrame.from_dict(dictionary, orient='index')``,
        * ``output`` is the list of ``(text, font_family, font_size)`` tuples
          in document order.
    """
    size_pattern = re.compile(r'(?is)(font-size:)(.*?)(px)')
    family_pattern = re.compile(r'(?is)(font-family:)(.*?)(;)')

    # Extract font size, font family and text from every usable span.
    output = []
    for span in soup.select('span'):
        style = str(span.get('style'))
        size_match = size_pattern.search(style)
        if size_match is None:
            # No font size on this span itself, so there is nothing to bin.
            continue
        family_match = family_pattern.search(style)
        family = family_match.group(2).strip() if family_match else ''
        output.append(
            (str(span.text).strip(), family, size_match.group(2).strip())
        )

    # Group the spans by font size.
    dictionary = {}
    for text, family, size in output:
        dictionary.setdefault(size, []).append((family, text.replace("\n", " ")))

    data = pd.DataFrame.from_dict(dictionary, orient='index')

    return (dictionary, data, output)

In [5]:
def extractInfoFromXML(soup):
    """Build one row per ``<textline>`` element of a parsed XML document.

    The text of a line is the concatenation of the text of its child tags
    that carry a ``bbox`` attribute. The font and size recorded for the line
    are those of the last such child. A line without any such child gets
    empty font and size strings rather than inheriting the previous line's.

    Args:
        soup: Parsed XML document offering ``find_all('textline')``.

    Returns:
        A DataFrame with the columns ``font_size``, ``text``, ``bbox`` and
        ``font_family``; it is empty (but has these columns) when the
        document contains no text lines.
    """
    columns = ["font_size", "text", "bbox", "font_family"]
    rows = []
    for textline in soup.find_all('textline'):
        pieces = []
        font = ""
        size = ""
        for text in textline:
            # Skip bare strings and tags without a bounding box.
            if not isinstance(text, Tag) or text.get('bbox') is None:
                continue
            pieces.append(text.text)
            font = text.get('font')
            size = text.get('size')
        # The bbox attribute of the text line is stored unparsed.
        rows.append((size, "".join(pieces), textline.get('bbox'), font))

    # Passing the column names to the constructor also works for zero rows.
    return pd.DataFrame(rows, columns=columns)

In [6]:
def extractInfoFromXMLnew(soup):
    """Build one row per ``<textline>`` with majority font, size and geometry.

    For every text line, the child tags carrying a ``bbox`` attribute are
    concatenated into the line text, and the most frequent ``font`` and
    ``size`` attribute values among them are recorded (the first one seen
    wins a tie). A line without any such child gets empty font and size
    strings instead of raising. The line's own ``bbox`` attribute is split on
    commas into ``x0, y0, x1, y1``, which are kept as strings; ``xm`` and
    ``ym`` are the float midpoints.

    Args:
        soup: Parsed XML document offering ``find_all('textline')``.

    Returns:
        A DataFrame with the columns ``font_size``, ``text``, ``x0``, ``y0``,
        ``xm``, ``ym``, ``x1``, ``y1`` and ``font_family``; it is empty (but
        has these columns) when the document contains no text lines.
    """
    columns = ["font_size", "text", "x0", "y0", "xm", "ym", "x1", "y1",
               "font_family"]
    rows = []

    for textline in soup.find_all('textline'):
        pieces = []
        font_counts = Counter()
        size_counts = Counter()
        for text in textline:
            # Skip bare strings and tags without a bounding box.
            if not isinstance(text, Tag) or text.get('bbox') is None:
                continue
            pieces.append(text.text)
            font_counts[text.get('font')] += 1
            size_counts[text.get('size')] += 1

        # max() over the keys returns the first key with the highest count;
        # the default covers lines that contributed no glyphs at all.
        font = max(font_counts, key=font_counts.get, default="")
        size = max(size_counts, key=size_counts.get, default="")

        bb = textline.get('bbox').split(',')
        xm = (float(bb[2]) + float(bb[0])) / 2
        ym = (float(bb[3]) + float(bb[1])) / 2
        rows.append((size, "".join(pieces), bb[0], bb[1], xm, ym, bb[2],
                     bb[3], font))

    # Passing the column names to the constructor also works for zero rows.
    return pd.DataFrame(rows, columns=columns)

In [7]:
def wrapStringInHTMLWindows(program, url, output):
    """Write a small HTML report to ``<program>.html`` and open it in a browser.

    The page carries ``program`` as its title, the generation timestamp, the
    given URL and one table row per element of ``output``.

    Args:
        program: Report name (a string); also used as the file name stem.
        url: URL shown in the page body.
        output: Iterable of indexable items; the first three entries of each
            item become the cells of one table row.
            NOTE(review): presumably the ``(text, font_family, font_size)``
            tuples produced by ``extractInfoFromHTML`` - confirm with caller.

    Returns:
        None. The file is written to the current working directory and then
        opened in a new browser tab.
    """
    import datetime
    import html
    from webbrowser import open_new_tab

    now = datetime.datetime.today().strftime("%Y%m%d-%H%M%S")

    filename = program + '.html'

    # One placeholder per value substituted below: title, timestamp, URL, rows.
    wrapper = """<html>
    <head>
    <title>%s</title>
    </head>
    <body><p>Generated:%s</p><p>URL:%s</p>
    <table>
%s
    </table>
    </body>
    </html>"""

    # Strings are immutable, so the rows are rendered separately and
    # substituted into the template; all inserted text is HTML-escaped.
    rows = "\n".join(
        "    <tr><td>%s</td><td>%s</td><td>%s</td></tr>" % (
            html.escape(str(ele[0])),
            html.escape(str(ele[1])),
            html.escape(str(ele[2])),
        )
        for ele in output
    )

    whole = wrapper % (html.escape(program), now, html.escape(str(url)), rows)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(whole)

    open_new_tab(filename)


In [8]:
def segmentByFontAndSize(data):
    """Phase two: split ``data`` into one DataFrame per (font_size, font_family).

    Returns the groups as a list, in the order ``groupby`` yields them.
    """
    grouped = data.groupby([data['font_size'], data['font_family']])
    return [pd.DataFrame(group) for _key, group in grouped]

In [None]:
def make_sub_bins(df):
    """Split the lines of ``df`` into left, middle and right sub-bins.

    The reference line is the one with the largest ``x1`` among the rows
    whose ``x0`` value is the most frequent in ``df``. Rows ending before the
    reference line starts go to the left bin, rows starting after it ends go
    to the right bin, and all remaining rows form the middle bin.

    Args:
        df: DataFrame with ``x0`` and ``x1`` columns holding numbers or
            numeric strings.

    Returns:
        A tuple ``(left_sub_bin, middle_sub_bin, right_sub_bin)`` of
        DataFrames; three empty frames when ``df`` has no rows.
    """
    if df.empty:
        return (df.copy(), df.copy(), df.copy())

    # Compare coordinates numerically: as strings, '90.0' sorts above '200.0'.
    x0 = df['x0'].astype(float)
    x1 = df['x1'].astype(float)

    # Select all lines sharing the most frequent x0 (first seen wins a tie).
    most_common_x0 = Counter(df['x0']).most_common(1)[0][0]
    shares_x0 = df['x0'] == most_common_x0

    # The longest of those lines spans from x0_line to x1_line.
    x0_line = float(most_common_x0)
    x1_line = x1[shares_x0].max()

    # Lines entirely to the left / right of the reference line.
    is_left = x1 < x0_line
    is_right = x0 > x1_line
    left_sub_bin = df.loc[is_left]
    right_sub_bin = df.loc[is_right]
    # Whatever is neither left nor right is the middle sub-bin.
    middle_sub_bin = df.loc[~(is_left | is_right)]

    return (left_sub_bin, middle_sub_bin, right_sub_bin)

In [115]:
# Run the pipeline on the sample PDF: convert it to XML, parse the XML,
# build one row per <textline>, then split the rows by (font_size, font_family).
doc = convert('./data/sample.pdf','XML')
s = soup(doc)
data = extractInfoFromXMLnew(s)
df_list = segmentByFontAndSize(data)

In [125]:
# Split the fifth (font_size, font_family) group into left/middle/right sub-bins.
sub_bins = make_sub_bins(df_list[4])

In [130]:
# Display the right sub-bin (third element of the tuple from make_sub_bins).
sub_bins[2]

Unnamed: 0,font_size,text,x0,y0,xm,ym,x1,y1,font_family


In [None]:
#phase3 algorithm pseudocode

In [None]:
#step0: sort the bin by x1 in descending order and set split = true
#step1: if len(bin) > 1, select the bounding box with the maximum x1 (call it maxb); otherwise go to step 5
#step2: for each b in bin, if the x1 of b is less than the x0 of maxb, move b into the left sub-bin
#       and set split = true; otherwise set split = false
#step3: if split == false, move maxb into a temporary sub-bin and go to step 1; otherwise move the bin
#       into the right sub-bin and repeat steps 1 to 3 on the right and left sub-bins in turn, each as bin
#step5: move b into a new sub-bin and stop the algorithm