# PDF PROCESSING#####################################################

In [53]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter,TextConverter,XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO, StringIO
from bs4 import BeautifulSoup
from bs4 import NavigableString
from bs4 import Tag
import re
import pandas as pd
import operator
import math

In [2]:
def convert(fname, case='text', pages=None):
    """Convert a PDF file to text, HTML or XML markup using pdfminer.

    Args:
        fname: Path of the PDF file to read.
        case: Output format; one of ``'text'``, ``'HTML'`` or ``'XML'``
            (case-sensitive).
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``. When it is empty or ``None`` an empty set
            is forwarded instead (presumably meaning "all pages" -- confirm
            against the pdfminer documentation).

    Returns:
        The converted document as a string, or ``None`` when ``case`` is not
        one of the supported formats.
    """
    # Reject unsupported formats before any pdfminer object is created.
    if case not in ('text', 'HTML', 'XML'):
        return None
    converter_classes = {
        'text': TextConverter,
        'HTML': HTMLConverter,
        'XML': XMLConverter,
    }

    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    output = StringIO()
    converter = converter_classes[case](manager, output, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        # The context manager closes the file even if a page fails to parse.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # The buffer is read before the converter is closed, so anything the
        # converter writes during close() is not part of the result.
        # NOTE(review): confirm whether that trailing output is wanted.
        return output.getvalue()
    finally:
        converter.close()
        output.close()

In [3]:
def soup(doc, parser=None):
    """Parse a markup string into a BeautifulSoup tree.

    Args:
        doc: Markup to parse (for example the string returned by ``convert``).
        parser: Optional parser name forwarded to ``BeautifulSoup`` (such as
            ``'html.parser'``). Passing one makes the result independent of
            which parser libraries happen to be installed. When ``None``
            BeautifulSoup chooses a parser itself.

    Returns:
        The resulting ``BeautifulSoup`` object.
    """
    if parser is None:
        return BeautifulSoup(doc)
    return BeautifulSoup(doc, parser)

In [4]:
def extractInfoFromHTML(soup):
    """Collect text, font family and font size from styled ``<span>`` tags.

    Args:
        soup: Parsed HTML document; only its ``select`` method is used.

    Returns:
        A tuple ``(dictionary, data, output)``:

        * ``output`` -- list of ``(text, font_family, font_size)`` string
          tuples, one per usable span, in document order.
        * ``dictionary`` -- maps each font size to a list of
          ``(font_family, text)`` tuples, with newlines in the text replaced
          by spaces.
        * ``data`` -- ``dictionary`` as a DataFrame with one row per font
          size.
    """
    size_pattern = re.compile(r'(?is)(font-size:)(.*?)(px)')
    family_pattern = re.compile(r'(?is)(font-family:)(.*?)(;)')

    output = []
    for span in soup.select('span'):
        style = str(span.get('style'))
        size_match = size_pattern.search(style)
        if size_match is None:
            # A wrapper span may only contain styled children; it carries no
            # font size of its own, so there is nothing to record for it.
            continue
        family_match = family_pattern.search(style)
        # A style without a terminated font-family entry yields an empty name.
        family = family_match.group(2).strip() if family_match else ''
        output.append(
            (str(span.text).strip(), family, size_match.group(2).strip())
        )

    # Group (family, text) pairs under their font size.
    dictionary = {}
    for text, family, size in output:
        dictionary.setdefault(size, []).append(
            (family, text.replace("\n", " "))
        )

    data = pd.DataFrame.from_dict(dictionary, orient='index')

    return (dictionary, data, output)

In [5]:
def extractInfoFromXML(soup):
    """Build one row per ``<textline>`` with its text, bbox, font and size.

    The font and size recorded for a line are those of its last child tag
    that carries a ``bbox`` attribute; a line with no such child gets empty
    strings for both.

    Args:
        soup: Parsed XML document; only its ``find_all`` method is used.

    Returns:
        A DataFrame with the columns ``font_size``, ``text``, ``bbox`` and
        ``font_family``. ``bbox`` is the textline's raw ``bbox`` attribute
        (per the original author's note: bottom-left and top-right
        coordinates). The frame is empty, but still has these columns, when
        the document contains no textlines.
    """
    columns = ["font_size", "text", "bbox", "font_family"]
    rows = []
    for textline in soup.find_all('textline'):
        # Reset per line so a line without positioned children cannot inherit
        # the font and size of the previous line.
        font = ""
        size = ""
        pieces = []
        for text in textline:
            if isinstance(text, NavigableString):
                continue
            # Only children with a bbox contribute text and font information.
            if isinstance(text, Tag) and text.get('bbox') is not None:
                pieces.append(text.text)
                font = text.get('font')
                size = text.get('size')
        rows.append((size, "".join(pieces), textline.get('bbox'), font))

    # Passing the column names here keeps an empty result well-formed.
    return pd.DataFrame(rows, columns=columns)

In [44]:
def extractInfoFromXMLnew(soup):
    """Build one row per ``<textline>`` with dominant font, size and geometry.

    For every textline the child tags that carry a ``bbox`` attribute are
    concatenated into the line text. The font and point size reported for
    the line are the values occurring most often among those children (the
    first one seen wins a tie). A line with no such children gets empty
    strings for both.

    Args:
        soup: Parsed XML document; only its ``find_all`` method is used.

    Returns:
        A DataFrame with the columns ``font_size``, ``text``, ``x0``, ``y0``,
        ``xm``, ``ym``, ``x1``, ``y1`` and ``font_family``. ``x0, y0, x1, y1``
        are the four comma-separated fields of the textline's ``bbox``
        attribute, kept as strings; ``xm`` and ``ym`` are the float
        differences ``x1 - x0`` and ``y1 - y0``. The frame is empty, but
        still has these columns, when the document contains no textlines.
    """
    columns = ["font_size", "text", "x0", "y0", "xm", "ym", "x1", "y1",
               "font_family"]
    by_count = operator.itemgetter(1)
    rows = []

    for textline in soup.find_all('textline'):
        fonts = {}
        point_sizes = {}
        pieces = []
        for text in textline:
            if isinstance(text, NavigableString):
                continue
            # Only children with a bbox contribute text and font information.
            if isinstance(text, Tag) and text.get('bbox') is not None:
                pieces.append(text.text)
                font_name = text.get('font')
                if font_name is not None:
                    fonts[font_name] = fonts.get(font_name, 0) + 1
                glyph_size = text.get('size')
                if glyph_size is not None:
                    point_sizes[glyph_size] = point_sizes.get(glyph_size, 0) + 1

        # max() raises on an empty mapping, so fall back to an empty string
        # for lines that contain no positioned glyphs.
        font = max(fonts.items(), key=by_count)[0] if fonts else ""
        size = max(point_sizes.items(), key=by_count)[0] if point_sizes else ""

        bb = textline.get('bbox').split(',')
        xm = float(bb[2]) - float(bb[0])
        ym = float(bb[3]) - float(bb[1])
        rows.append(
            (size, "".join(pieces), bb[0], bb[1], xm, ym, bb[2], bb[3], font)
        )

    # Passing the column names here keeps an empty result well-formed.
    return pd.DataFrame(rows, columns=columns)

In [7]:
def wrapStringInHTMLWindows(program, url, output):
    """Write ``output`` to ``<program>.html`` and open it in a browser tab.

    Args:
        program: Base name of the file to create (``'.html'`` is appended);
            also used in the page title.
        url: Value shown on the page after the ``URL:`` label.
        output: Iterable of sequences; the first three items of each element
            are rendered as one paragraph, separated by ``" | "``.

    Returns:
        None. The function writes the file and calls
        ``webbrowser.open_new_tab`` on it.
    """
    import datetime
    import html
    from webbrowser import open_new_tab

    now = datetime.datetime.today().strftime("%Y%m%d-%H%M%S")

    filename = program + '.html'

    # Strings are immutable, so the rows are rendered separately and passed
    # to the template as one more value. Every value is escaped because the
    # extracted text may contain markup characters.
    rows = "\n".join(
        "    <p>%s</p>" % html.escape(" | ".join(str(part) for part in ele[:3]))
        for ele in output
    )

    wrapper = """<html>
    <head>
    <meta charset="utf-8">
    <title>%s - %s</title>
    </head>
    <body><p>URL:%s</p>
%s
    </body>
    </html>"""

    whole = wrapper % (html.escape(str(program)), now,
                       html.escape(str(url)), rows)

    # The context manager closes the file even if the write fails.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(whole)

    open_new_tab(filename)


In [8]:
def segmentByFontAndSize(data):
    """Split ``data`` into one DataFrame per (font size, font family) pair.

    This is the second phase of the pipeline: rows that share the same
    ``font_size`` and ``font_family`` values end up in the same segment.

    Args:
        data: DataFrame with ``font_size`` and ``font_family`` columns.

    Returns:
        A list of DataFrames, one per distinct pair, in groupby key order.
    """
    grouped = data.groupby([data['font_size'], data['font_family']])
    return [pd.DataFrame(segment) for _pair, segment in grouped]

In [45]:
# Run the pipeline on the sample PDF: convert it to XML, parse the markup,
# build the per-textline table, then split that table by font size and family.
doc = convert('./data/sample.pdf','XML')
s = soup(doc)
data = extractInfoFromXMLnew(s)
df_list = segmentByFontAndSize(data)

In [50]:
# Display the third font-size/font-family segment.
df_list[2]

Unnamed: 0,font_size,text,x0,y0,xm,ym,x1,y1,font_family
1,13.941,Software Engineer,38.85,779.837,102.156,13.94,141.006,793.777,QEBAAA+Ubuntu-Regular
7,13.941,Khas Tech Solutions,38.85,666.944,122.54,13.941,161.39,680.885,QEBAAA+Ubuntu-Regular
16,13.941,Murree Brewery,38.85,546.739,103.472,13.94,142.322,560.679,QEBAAA+Ubuntu-Regular
43,13.941,Foundation University Islamabad,38.85,192.978,192.947,13.941,231.797,206.919,QEBAAA+Ubuntu-Regular


In [11]:
#courses<textline bbox="38.850,167.703,65.938,176.918">
#Software Construction<textline bbox="47.762,155.647,138.912,165.905">
#smwm674@gmail.com<textline bbox="471.224,809.866,553.015,819.072">

In [57]:
# A quarter of the row count of the first segment, rounded up.
threshold = math.ceil(len(df_list[0])/4)

In [58]:
# Display the computed threshold.
threshold

11

In [61]:
# Display the fifth font-size/font-family segment.
df_list[4]

Unnamed: 0,font_size,text,x0,y0,xm,ym,x1,y1,font_family
5,16.491,WORK EXPERIENCE,38.85,697.45,131.758,16.491,170.608,713.941,QZAAAA+Ubuntu-Bold
23,16.491,CERTIFICATES,38.85,441.27,95.344,16.492,134.194,457.762,QZAAAA+Ubuntu-Bold
41,16.491,EDUCATION,38.85,223.484,81.395,16.491,120.245,239.975,QZAAAA+Ubuntu-Bold
50,16.491,SKILLS,309.426,697.45,46.011,16.491,355.437,713.941,QZAAAA+Ubuntu-Bold
70,16.491,SEMESTER PROJECTS,309.426,518.056,145.535,16.491,454.961,534.547,QZAAAA+Ubuntu-Bold
78,16.491,PORTFOLIO (ANDROID APPS),309.426,414.99,198.177,16.491,507.603,431.481,QZAAAA+Ubuntu-Bold
97,16.491,LANGUAGES,309.426,173.208,84.161,16.491,393.587,189.699,QZAAAA+Ubuntu-Bold
112,16.491,INTERESTS,309.426,100.764,74.17,16.492,383.596,117.256,QZAAAA+Ubuntu-Bold
