In [1]:
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine

import re
import nltk

In [2]:
def readPDF(fp):
    """Extract the text of every text box / text line found in a PDF file.

    Parameters
    ----------
    fp : str or path-like
        Path of the PDF file; it is opened in binary mode.

    Returns
    -------
    str
        The text of all ``LTTextBox`` / ``LTTextLine`` layout objects,
        concatenated in the order in which pages and their layout objects
        are iterated.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # I changed the following 2 parameters to get rid of white spaces inside words:
    laparams.char_margin = 1.0
    laparams.word_margin = 1.0
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    text_chunks = []

    # The context manager guarantees the file handle is closed, even when
    # parsing raises. All parsing stays inside the block because the parser
    # may still read from the handle while the pages are being iterated.
    with open(fp, 'rb') as file:
        parser = PDFParser(file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # NOTE(review): '' is presumably the "no password" value -- confirm
        # against the pdfminer version in use.
        doc.initialize('')

        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                # Only text-carrying layout objects contribute to the result.
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_chunks.append(lt_obj.get_text())

    return ''.join(text_chunks)

In [3]:
# Extract the raw text of one sample resume and display it split on newlines
# (the list produced by the last expression is what the cell shows as output).
ext_data  = readPDF("TejasSresume.pdf")
ext_data.split('\n')



['Tejas Shahasane                                                                                                          ',
 'Email         :  tejas.shahasane92@gmail.com ',
 'Mobile      :  +91-8097058997/9082944018 ',
 'SUMMARY ',
 'ISTQB Certified Tester with 4+ years’ of experience in software testing. Good understanding of Software testing  ',
 'Life cycle, Defect life cycle, Planning activities, Execution and ensuring client requirements are met with excellent ',
 'solutions.  ',
 'EDUCATION  ',
 'Bachelor in Information Technology, 2013 (MUMBAI UNIVERSITY) ',
 'CERTIFICATION ',
 'ISTQB Foundation Level Certification (December 2015) ',
 'TECHNOLOGY SKILLS  ',
 'Defect Management    :   ALM, HP 10 Quality Center, JIRA, Mantis ',
 'Database                        :   MS SQL  ',
 'Operating System         :   Windows  ',
 'EXPERIENCE ',
 '1. Organization :  Capgemini – Mumbai (Airoli) ',
 'Client:  Guidewire  ',
 'Role:  Associate Consultant ',
 'Project: GuideOne Insurance (Oct 2

In [4]:
def tagWords(data):
    """Tokenise and part-of-speech tag every non-blank line of ``data``.

    Parameters
    ----------
    data : str
        Text whose lines are separated by ``'\\n'``.

    Returns
    -------
    list
        One entry per non-blank line, each being the list of
        ``(token, tag)`` pairs produced by NLTK for that line.
    """
    # Strip *before* filtering so that whitespace-only lines are dropped
    # instead of surviving as empty token lists.
    stripped = (el.strip() for el in data.split('\n'))
    tokenised = [nltk.word_tokenize(el) for el in stripped if el]
    # Tag all lines through the batch API rather than one pos_tag call per line.
    return nltk.pos_tag_sents(tokenised)

In [5]:
def expExtract(dataLines):
    """Pull a years-of-experience figure out of POS-tagged resume lines.

    Every line whose lower-cased text contains ``experience`` is
    re-tokenised and re-tagged; the digits of each cardinal-number (``CD``)
    token are then extracted. Later matches overwrite earlier ones, so the
    last number found wins.

    Parameters
    ----------
    dataLines : iterable
        Lines as sequences of ``(token, tag)`` pairs; only the token
        (index 0) of each pair is read.

    Returns
    -------
    str or int
        The digits of the last matching number as a string, or the
        integer ``0`` when no line yields a number.
    """
    experiance = 0
    for sentences in dataLines:
        sen = " ".join(words[0].lower() for words in sentences)

        # Only lines that mention the word "experience" are considered.
        if 'experience' not in sen:
            continue

        # The lower-cased line is tagged afresh rather than reusing the tags
        # already present in `sentences`, so results match the lower-cased text.
        tagged = nltk.pos_tag(nltk.word_tokenize(sen))

        # No named-entity chunking is needed: only the (token, tag) leaves
        # were ever inspected, and those are exactly the tagged tokens.
        for token, tag in tagged:
            if tag == 'CD':
                # Keep digits only, e.g. "4+" -> "4".
                exValue = re.sub('[^0-9]', '', token)
                if exValue != "":
                    experiance = exValue
    return experiance

In [6]:
# Re-read the sample resume and POS-tag each of its lines.
ext_data  = readPDF("TejasSresume.pdf")
#ext_data.split('\n')
taggedLine = tagWords(ext_data)
# Debugging output, left disabled:
#print(taggedLine)
#print("total years of experiance is :",expExtract(taggedLine))



In [7]:
# Run the full pipeline (PDF text -> tagged lines -> experience figure) on
# each resume and print one summary line per file.
resumes = ["Nilesh Resume.pdf" , "Pallavi CV.pdf", "TejasSresume.pdf", "RahulDS.pdf"]
for resume in resumes:
    resumeData = readPDF(resume)
    dataLines = tagWords(resumeData)
    # expExtract returns the digits as a string, or the integer 0 when
    # nothing is found.
    exp = expExtract(dataLines)
    print(f"Resume name: '{resume}' Experiance '{exp}'")

Resume name: 'Nilesh Resume.pdf' Experiance '2'




Resume name: 'Pallavi CV.pdf' Experiance '2'




Resume name: 'TejasSresume.pdf' Experiance '4'
Resume name: 'RahulDS.pdf' Experiance '3'
