# PDF PROCESSING#####################################################

In [5]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter,TextConverter,XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO, StringIO
from bs4 import BeautifulSoup
from bs4 import NavigableString
from bs4 import Tag
import re
import pandas as pd
import operator
import math

In [6]:
def convert(fname, case='text', pages=None):
    """Convert a PDF file to text, HTML or XML with pdfminer.

    Args:
        fname: Path of the PDF file; it is opened in binary mode.
        case: Output format, one of 'text', 'HTML' or 'XML' (case-sensitive).
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``. A falsy value forwards an empty set
            (presumably "all pages" -- confirm against the pdfminer docs).

    Returns:
        The converted document as a string, or ``None`` when ``case`` is not
        one of the supported formats.
    """
    # Reject unsupported formats before allocating any pdfminer objects.
    if case not in ('text', 'HTML', 'XML'):
        return None

    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    la = LAParams()
    output = StringIO()

    if case == 'text':
        converter = TextConverter(manager, output, laparams=la)
    elif case == 'HTML':
        converter = HTMLConverter(manager, output, laparams=la)
    else:
        converter = XMLConverter(manager, output, laparams=la)

    try:
        interpreter = PDFPageInterpreter(manager, converter)
        # The context manager closes the PDF even if a page fails to process.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # NOTE(review): the value is read before converter.close(), exactly as
        # the original did; anything the converter writes on close is
        # therefore not part of the result -- confirm this is intended.
        convertedPDF = output.getvalue()
    finally:
        converter.close()
        output.close()
    return convertedPDF

In [7]:
def soup(doc, parser='html.parser'):
    """Parse *doc* into a BeautifulSoup tree.

    Args:
        doc: Markup string (for example the HTML or XML produced by
            ``convert``).
        parser: Name of the BeautifulSoup parser to use. Naming one explicitly
            avoids bs4 guessing a parser, which emits a warning and can yield
            different trees depending on which parsers are installed.

    Returns:
        The parsed ``BeautifulSoup`` object.
    """
    return BeautifulSoup(doc, parser)

In [8]:
# Patterns for the inline style attribute, compiled once instead of per span.
_FONT_SIZE_RE = re.compile(r'(?is)font-size:(.*?)px')
# The family runs up to the next ';' or, if it is the last property, to the end.
_FONT_FAMILY_RE = re.compile(r'(?is)font-family:(.*?)(?:;|$)')


def extractInfoFromHTML(soup):
    """Collect text, font family and font size from styled ``<span>`` tags.

    Only spans whose own ``style`` attribute carries a ``font-size:...px``
    declaration are used; other spans are skipped. A span without a
    ``font-family`` declaration is kept with an empty family string.

    Args:
        soup: Parsed document exposing ``select('span')``.

    Returns:
        A tuple ``(dictionary, data, output)`` where

        * ``output`` is a list of ``(text, font_family, font_size)`` tuples in
          document order (text stripped, size without the ``px`` suffix);
        * ``dictionary`` maps each font size to a list of
          ``(font_family, text)`` pairs with newlines in the text replaced by
          spaces;
        * ``data`` is ``dictionary`` loaded into a DataFrame, one row per
          font size.
    """
    # extract font size and text from soup object
    output = []
    for span in soup.select('span'):
        style = span.get('style')
        if not style:
            continue
        style = str(style)
        size_match = _FONT_SIZE_RE.search(style)
        if size_match is None:
            # No pixel size declared on this span: nothing to index it by.
            continue
        family_match = _FONT_FAMILY_RE.search(style)
        family = family_match.group(1).strip() if family_match else ''
        output.append((str(span.text).strip(), family,
                       size_match.group(1).strip()))

    # make a dictionary with font size as keys and text as value list
    dictionary = {}
    for text, family, size in output:
        dictionary.setdefault(size, []).append((family, text.replace("\n", " ")))

    data = pd.DataFrame.from_dict(dictionary, orient='index')

    return (dictionary, data, output)

In [9]:
def extractInfoFromXML(soup):
    """Build one row per ``<textline>`` of a parsed XML document.

    For every textline the text of its child tags that carry a ``bbox``
    attribute is concatenated. The ``font`` and ``size`` recorded for the
    line are those of the last such child. A textline with no such child
    yields empty strings for text, font and size instead of inheriting the
    values of the previous line.

    Args:
        soup: Parsed document exposing ``find_all('textline')``.

    Returns:
        DataFrame with columns ``font_size``, ``text``, ``bbox`` and
        ``font_family``; ``bbox`` is the textline's raw ``bbox`` attribute.
        The columns are present even when the document has no textlines.
    """
    columns = ["font_size", "text", "bbox", "font_family"]
    rows = []
    # bottom left and top right coordinates in xml bbox
    for textline in soup.find_all('textline'):
        pieces = []
        # Reset per line so values never leak in from the previous textline.
        font = ""
        size = ""
        for child in textline:
            # Skip bare strings and tags without a position.
            if not isinstance(child, Tag) or child.get('bbox') is None:
                continue
            pieces.append(child.text)
            font = child.get('font')
            size = child.get('size')
        rows.append((size, "".join(pieces), textline.get('bbox'), font))

    # Passing the columns here keeps an empty result well-formed.
    return pd.DataFrame(rows, columns=columns)

In [10]:
def extractInfoFromXMLnew(soup):
    """Build one row per ``<textline>`` with majority font/size and geometry.

    For every textline the text of its child tags that carry a ``bbox``
    attribute is concatenated. The line's font and size are the values that
    occur most often among those children (the first one seen wins a tie).
    A textline with no such child yields empty strings for text, font and
    size.

    Args:
        soup: Parsed document exposing ``find_all('textline')``. Every
            textline is expected to have a comma-separated ``bbox`` attribute
            with four numeric fields.

    Returns:
        DataFrame with columns ``font_size``, ``text``, ``x0``, ``y0``,
        ``xm``, ``ym``, ``x1``, ``y1`` and ``font_family``. ``x0``/``y0``/
        ``x1``/``y1`` are the bbox fields as strings; ``xm``/``ym`` are the
        float midpoints. The columns are present even when the document has
        no textlines.
    """
    columns = ["font_size", "text", "x0", "y0", "xm", "ym", "x1", "y1",
               "font_family"]
    rows = []

    # bottom left and top right coordinates in xml bbox
    for textline in soup.find_all('textline'):
        pieces = []
        font_votes = {}
        size_votes = {}
        for child in textline:
            # Skip bare strings and tags without a position.
            if not isinstance(child, Tag) or child.get('bbox') is None:
                continue
            pieces.append(child.text)
            child_font = child.get('font')
            child_size = child.get('size')
            font_votes[child_font] = font_votes.get(child_font, 0) + 1
            size_votes[child_size] = size_votes.get(child_size, 0) + 1

        # max() over an empty mapping raises, so fall back to "" for a
        # textline that contained no positioned children.
        font = max(font_votes, key=font_votes.get) if font_votes else ""
        size = max(size_votes, key=size_votes.get) if size_votes else ""

        bb = textline.get('bbox').split(',')
        xm = (float(bb[2]) + float(bb[0])) / 2
        ym = (float(bb[3]) + float(bb[1])) / 2
        rows.append((size, "".join(pieces), bb[0], bb[1], xm, ym,
                     bb[2], bb[3], font))

    # Passing the columns here keeps an empty result well-formed.
    return pd.DataFrame(rows, columns=columns)

In [11]:
def wrapStringInHTMLWindows(program, url, output):
    """Write a small HTML report to ``<program>.html`` and open it in a browser tab.

    The page title holds ``program`` and a timestamp, the body shows ``url``
    followed by one paragraph per entry of ``output``.

    Args:
        program: Report name; used for the page title and, with ``.html``
            appended, as the output file name.
        url: Value shown after ``URL:`` in the body.
        output: Iterable of indexable entries; the first three fields of
            each entry are written.

    Returns:
        None. Side effects: creates/overwrites the HTML file and asks the
        default browser to open it.
    """
    import datetime
    from html import escape
    from webbrowser import open_new_tab

    now = datetime.datetime.today().strftime("%Y%m%d-%H%M%S")

    filename = program + '.html'

    # NOTE(review): the original called wrapper.append(ele[0], ele[1], ele[2])
    # on a str, which cannot work. The three fields are written here as one
    # comma-separated paragraph per entry -- adjust if another layout was meant.
    rows = "\n".join(
        "    <p>%s</p>" % escape(", ".join(str(field)
                                           for field in (ele[0], ele[1], ele[2])))
        for ele in output
    )

    # Four placeholders, four values: program, timestamp, url, rendered rows.
    wrapper = """<html>
    <head>
    <meta charset="utf-8">
    <title>%s - %s</title>
    </head>
    <body><p>URL:%s</p>
%s
    </body>
    </html>"""

    whole = wrapper % (escape(str(program)), now, escape(str(url)), rows)

    # The context manager closes the file even if the write fails.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(whole)

    open_new_tab(filename)


In [12]:
def segmentByFontAndSize(data):
    """Split *data* into one DataFrame per (font_size, font_family) pair.

    Args:
        data: DataFrame with ``font_size`` and ``font_family`` columns.

    Returns:
        List of DataFrames, one per distinct pair, in the order produced by
        ``groupby``; each keeps the original row index.
    """
    # Phase two, geometric segmentation of text regions
    grouped = data.groupby([data['font_size'], data['font_family']])
    return [pd.DataFrame(chunk) for _, chunk in grouped]

In [13]:
# Run the pipeline on the sample PDF: convert it to XML, parse the markup,
# build one row per <textline>, then split the rows by font size and family.
doc = convert('./data/sample.pdf','XML')
s = soup(doc)
data = extractInfoFromXMLnew(s)
df_list = segmentByFontAndSize(data)
# Notebook display of the first (font_size, font_family) group.
df_list[0]

Unnamed: 0,font_size,text,x0,y0,xm,ym,x1,y1,font_family
2,10.258,To work in a highly professional and competitive,38.85,761.931,135.919,767.06,232.988,772.189,QEBAAA+Ubuntu-Regular
3,10.258,environment where i can learn and gain practical,38.85,750.961,136.362,756.0905,233.874,761.22,QEBAAA+Ubuntu-Regular
4,10.258,experience that will help me improve my skills.,38.85,739.992,132.371,745.121,225.892,750.25,QEBAAA+Ubuntu-Regular
14,10.258,Android Application Development Engineer,47.762,592.135,135.59,597.264,223.418,602.393,QEBAAA+Ubuntu-Regular
21,10.258,Information Technology Department,47.762,489.754,121.903,494.8835,196.044,500.013,QEBAAA+Ubuntu-Regular
51,10.258,Android Development (SDK),316.282,678.518,373.103,683.647,429.924,688.776,QEBAAA+Ubuntu-Regular
52,10.258,Android UI/UX,448.371,678.518,477.461,683.647,506.551,688.776,QEBAAA+Ubuntu-Regular
53,10.258,Android API Integration,316.282,654.751,364.1705,659.8805,412.059,665.01,QEBAAA+Ubuntu-Regular
54,10.258,Android Git,430.546,654.751,453.7055,659.8805,476.865,665.01,QEBAAA+Ubuntu-Regular
55,10.258,JSON,495.448,654.751,506.728,659.8805,518.008,665.01,QEBAAA+Ubuntu-Regular


In [14]:
#courses<textline bbox="38.850,167.703,65.938,176.918">
#Software Construction<textline bbox="47.762,155.647,138.912,165.905">
#smwm674@gmail.com<textline bbox="471.224,809.866,553.015,819.072">

In [15]:
#phase3 algorithm pseudocode

In [18]:
#step0: sort the bin by x1 in descending order and set split = true
#step1: if len(bin) > 1, select the bounding box with the max x1 (call it maxb); else go to step 5
#step2: for each b in bin: if x1 of b is less than the x0 of maxb, move b into the left sub-bin
#       and set split = true; else set split = false
#step3: if split == false, throw maxb into a temp sub-bin and repeat steps 1 to 3 on bin; else throw the dataframe
#       into the right sub-bin and repeat steps 1 to 3 on the right and left sub-bins respectively, each as bin
#step5: throw b into a new sub-bin and stop the algorithm