In [20]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import HTMLConverter, TextConverter, XMLConverter
from io import StringIO
from pdfminer.pdfpage import PDFPage

In [33]:
def get_pdf_file_content_Text(path_to_pdf):
    '''
    Extract the content of a PDF file as plain text.

    path_to_pdf: path of the PDF file whose content we want to extract.

    Returns a string holding the text produced by TextConverter for every
    page of the PDF.

    Errors raised while opening or processing the file propagate to the
    caller; the file, the converter and the output buffer are closed first.
    '''
    # PDFResourceManager stores shared resources (such as fonts or images)
    # that are encountered while the pages are processed.
    resource_manager = PDFResourceManager(caching=True)

    # The StringIO buffer receives everything the converter writes; the `with`
    # block closes it once its value has been read.
    with StringIO() as out_text:
        # LAParams() holds the layout-analysis parameters at their defaults.
        text_converter = TextConverter(resource_manager, out_text, laparams=LAParams())
        try:
            # The interpreter feeds each page's content to the converter.
            interpreter = PDFPageInterpreter(resource_manager, text_converter)

            # `with` guarantees the PDF file is closed even if a page fails.
            with open(path_to_pdf, 'rb') as fp:
                for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0, password="", caching=True, check_extractable=True):
                    interpreter.process_page(page)
        finally:
            text_converter.close()

        # Must be read before the buffer is closed by the enclosing `with`.
        return out_text.getvalue()

In [28]:
def get_pdf_file_content_HTML(path_to_pdf):
    '''
    Extract the content of a PDF file as an HTML document.

    path_to_pdf: path of the PDF file whose content we want to extract.

    Returns a string holding the HTML produced by HTMLConverter for every
    page of the PDF, including what the converter writes when it is closed
    (the closing markup of the document).

    Errors raised while opening or processing the file propagate to the
    caller; the file, the converter and the output buffer are closed first.
    '''
    # PDFResourceManager stores shared resources (such as fonts or images)
    # that are encountered while the pages are processed.
    resource_manager = PDFResourceManager(caching=True)

    # The StringIO buffer receives everything the converter writes; the `with`
    # block closes it once its value has been read.
    with StringIO() as out_text:
        # LAParams() holds the layout-analysis parameters at their defaults.
        html_converter = HTMLConverter(resource_manager, out_text, laparams=LAParams())
        try:
            # The interpreter feeds each page's content to the converter.
            interpreter = PDFPageInterpreter(resource_manager, html_converter)

            # `with` guarantees the PDF file is closed even if a page fails.
            with open(path_to_pdf, 'rb') as fp:
                for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0, password="", caching=True, check_extractable=True):
                    interpreter.process_page(page)
        finally:
            # Close the converter BEFORE reading the buffer: closing is when
            # the converter emits the end of the document, so reading first
            # would return HTML without its closing markup.
            html_converter.close()

        # Must be read before the buffer is closed by the enclosing `with`.
        return out_text.getvalue()

In [29]:
def get_pdf_file_content_XML(path_to_pdf):
    '''
    Extract the content of a PDF file as an XML document.

    path_to_pdf: path of the PDF file whose content we want to extract.

    Returns a string holding the XML produced by XMLConverter for every
    page of the PDF, including what the converter writes when it is closed
    (the closing markup of the document).

    Errors raised while opening or processing the file propagate to the
    caller; the file, the converter and the output buffer are closed first.
    '''
    # PDFResourceManager stores shared resources (such as fonts or images)
    # that are encountered while the pages are processed.
    resource_manager = PDFResourceManager(caching=True)

    # The StringIO buffer receives everything the converter writes; the `with`
    # block closes it once its value has been read.
    with StringIO() as out_text:
        # LAParams() holds the layout-analysis parameters at their defaults.
        xml_converter = XMLConverter(resource_manager, out_text, laparams=LAParams())
        try:
            # The interpreter feeds each page's content to the converter.
            interpreter = PDFPageInterpreter(resource_manager, xml_converter)

            # `with` guarantees the PDF file is closed even if a page fails.
            with open(path_to_pdf, 'rb') as fp:
                for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=0, password="", caching=True, check_extractable=True):
                    interpreter.process_page(page)
        finally:
            # Close the converter BEFORE reading the buffer: closing is when
            # the converter emits the end of the document, so reading first
            # would return XML without its closing markup.
            xml_converter.close()

        # Must be read before the buffer is closed by the enclosing `with`.
        return out_text.getvalue()

## Single Row

In [30]:
# Path to the sample PDF used in the "Single Row" section (relative path).
pdf_path = 'Data/ast_sci_data_tables_sample.pdf'

In [31]:
# Print the content of the PDF at pdf_path converted to HTML.
print(get_pdf_file_content_HTML(pdf_path))

<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head><body>
<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
<div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:53px; top:101px; width:506px; height:118px;"><span style="font-family: FNCIUM+Verdana-Bold; font-size:13px">NATIONAL PARTNERSHIP FOR QUALITY AFTERSCHOOL LEARNING
<br></span><span style="font-family: FNCIUM+Verdana; font-size:13px">www.sedl.org/afterschool/toolkits
<br></span><span style="font-family: EFSCGO+Futura-ExtraBold; font-size:16px">����������� �������� �������
<br></span><span style="font-family: FNCIUM+Verdana-Bold; font-size:16px">Tutoring to Enhance Science Skills
<br></span><span style="font-family: FNCIUM+Verdana; font-size:15px">Tutoring Two:</span><span style="font-family: FNCIUM+Verdana; font-size:16px"> </span><sp

In [34]:
# Print the content of the PDF at pdf_path converted to plain text.
print(get_pdf_file_content_Text(pdf_path))

NATIONAL PARTNERSHIP FOR QUALITY AFTERSCHOOL LEARNING
www.sedl.org/afterschool/toolkits
����������� �������� �������
Tutoring to Enhance Science Skills
Tutoring Two: Learning to Make Data Tables
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Sample Data for Data Tables

Use these data to create data tables following the Guidelines for Making a Data Table and 
Checklist for a Data Table.

Example 1: Pet Survey (GR 2–3)
Ms. Hubert’s afterschool students took a survey of the 600 students at Morales Elementary 
School. Students were asked to select their favorite pet from a list of eight animals. Here 
are the results. 

Lizard 25, Dog 250, Cat 115, Bird 50, Guinea pig 30, Hamster 45, Fish 75, 
Ferret 10 

Example 2: Electromagnets—Increasing Coils (GR 3–5)
The following data were collected using an electromagnet with a 1.5 volt battery, a switch, 
a 

In [35]:
# Print the content of the PDF at pdf_path converted to XML.
print(get_pdf_file_content_XML(pdf_path))

<?xml version="1.0" encoding="utf-8" ?>
<pages>
<page id="1" bbox="0.000,0.000,612.000,792.000" rotate="0">
<textbox id="0" bbox="53.000,622.501,559.200,740.589">
<textline bbox="55.000,727.301,410.548,740.589">
<text font="FNCIUM+Verdana-Bold" bbox="55.000,727.301,63.385,740.589" size="13.288">N</text>
<text font="FNCIUM+Verdana-Bold" bbox="63.138,727.301,70.820,740.589" size="13.288">A</text>
<text font="FNCIUM+Verdana-Bold" bbox="70.573,727.301,77.324,740.589" size="13.288">T</text>
<text font="FNCIUM+Verdana-Bold" bbox="77.077,727.301,82.482,740.589" size="13.288">I</text>
<text font="FNCIUM+Verdana-Bold" bbox="82.235,727.301,90.650,740.589" size="13.288">O</text>
<text font="FNCIUM+Verdana-Bold" bbox="90.402,727.301,98.788,740.589" size="13.288">N</text>
<text font="FNCIUM+Verdana-Bold" bbox="98.540,727.301,106.223,740.589" size="13.288">A</text>
<text font="FNCIUM+Verdana-Bold" bbox="105.975,727.301,112.281,740.589" size="13.288">L</text>
<text font="FNCIUM+Verdana-Bold" bbox="11

## Double Columns

In [6]:
# Path to the sample PDF used in the "Double Columns" section (relative path).
pdf_path_2 = 'Data/loose_vertical_table.pdf'

In [7]:
# Print the plain-text content of the two-column PDF. This notebook defines
# get_pdf_file_content_Text (there is no get_pdf_file_content), so calling the
# defined function keeps the cell working after Restart & Run All.
print(get_pdf_file_content_Text(pdf_path_2))

Environmental Earth Sciences          (2021) 80:488  
https://doi.org/10.1007/s12665-021-09775-4

ORIGINAL ARTICLE

Nb–Sr–Pb isotope analysis in soils of abandoned mercury quarry 
in northwest Black Sea (Turkey), soil and plant geochemistry, 
evaluation of ecological risk and its ımpact on human health

Bilgehan Yabgu Horasan1 

 · Alican Ozturk2 · Osman Tugay3

Received: 27 February 2021 / Accepted: 9 July 2021 
© The Author(s), under exclusive licence to Springer-Verlag GmbH Germany, part of Springer Nature 2021

Abstract
Potential toxic element accumulation in soils and plants is one of the leading environmental problems in recent years. In 
many countries, mining enterprises are generally abandoned for reasons such as increasing costs, depleted reserves, and 
changes in ore quality. The negative effects on the environment during or as a result of these activities are known. The focus 
is on investigating the Hg concentration accumulated in the soil and plants around the abandoned m