In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import TextConverter
from io import StringIO
from pdfminer.pdfpage import PDFPage

import os

In [2]:
def get_pdf_file_content(path_to_pdf):
    '''
    Extract the text content of a PDF file with pdfminer.

    Parameters
    ----------
    path_to_pdf : str or path-like
        Location of the PDF file whose content we want to extract.
        It is opened in binary mode.

    Returns
    -------
    str
        Everything the TextConverter wrote while every page of the
        document was processed.

    Raises
    ------
    OSError
        If the file cannot be opened.
    Exception
        Nothing is caught here: any error raised by pdfminer while reading
        or interpreting the document propagates to the caller. The opened
        resources are still released in that case.
    '''

    # PDFResourceManager stores shared resources such as fonts or images
    # that we might encounter in the file.
    resource_manager = PDFResourceManager(caching=True)

    # LAParams holds the layout-analysis parameters; the defaults are used.
    la_params = LAParams()

    # The context managers guarantee that the in-memory text buffer and the
    # file handle are released even when processing a page raises.
    with StringIO() as out_text, open(path_to_pdf, 'rb') as fp:
        # The converter writes the text representation of each page
        # into out_text.
        text_converter = TextConverter(resource_manager, out_text, laparams=la_params)
        try:
            # The interpreter processes page contents and feeds them to
            # the converter.
            interpreter = PDFPageInterpreter(resource_manager, text_converter)

            # Process the content of each page of the original PDF file.
            for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0, password="", caching=True, check_extractable=True):
                interpreter.process_page(page)

            # The buffer must be read before the StringIO object is closed,
            # i.e. before leaving the with-block.
            return out_text.getvalue()
        finally:
            text_converter.close()
    

Extract the PDF content:

In [5]:
# Extract one PDF and show its text content.
path_to_pdf = "./targeted_resume/csv-converter/original_resumes/pdf/1.pdf"
print(
    get_pdf_file_content(path_to_pdf=path_to_pdf)
)

CURRICULUM VITAE

Personal 
Information

22859930

0717 550926

: Mike KisasatiWanaswa

: P.O. Box 85575 80100Mombasa

FullNames
IDCardNo.
PostalAddress
TelephoneNo.
EmailAddress
mLanguages
SwahiliPurpose
To put in use the latest inventions in Telecommunication and Information Technology for a 
positive impact
in Individuals, Business Enterprises and Corporate Organizations.

:mikewanaswa@gmail.co
: Well spoken English and 

Work Experience
Date
Position
Employer
Duties

Date
Position
Employer
Duties

Date
Position
Employer
Duties

:April 2011 – ToDate
:Fixed Data NetworkTechnician
:Ben’s Electronics Services Ltd,Mombasa
: Survey, Installation, Integration, Maintenance, Support and 

Decommissioning of Fixed Data Services using various Access
Technologies (WIMAX, FIBER, MICROWAVES and Wi-Fi) for 
SafaricomLtd.

: Survey, Installation and Support of Ceragon’s IP20 Access Technology for 
Airtel (K)
: Survey , Installation and Support of Cambridge P2MP Solutions for 
Safaricom.
: Fiber Op

In [6]:
# Run the extraction on every PDF in the directory and print the name of
# each file that could not be processed.
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a configurable base directory so the notebook runs elsewhere.
path_to_pdf = '/home/roberta/code/rmeijden/targeted_resume/targeted_resume/csv-converter/original_resumes/pdf'

for filename in sorted(os.listdir(path_to_pdf)):
    if not filename.endswith(".pdf"):
        continue
    try:
        get_pdf_file_content(os.path.join(path_to_pdf, filename))
    except Exception:
        # Best-effort batch run: report the failing file and keep going.
        # Catching Exception (instead of a bare except) lets
        # KeyboardInterrupt / SystemExit still stop the loop.
        print(filename)
            
    

1124.pdf
1648.pdf
3013.pdf
