**Table of contents**<a id='toc0_'></a>    
- 1. [Using camelot-py](#toc1_)    
  - 1.1. [Get tables from pdf file](#toc1_1_)    
  - 1.2. [Insight Extracted Tables](#toc1_2_)    
  - 1.3. [View a table as Data Frame](#toc1_3_)    
  - 1.4. [Export tables in csv format into zip file](#toc1_4_)    
  - 1.5. [Tables Processing](#toc1_5_)    
- 2. [Using plummber](#toc2_)    

<!-- vscode-jupyter-toc-config
	numbering=true
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [1]:
# import libraries
import pandas as pd
import camelot

# 1. <a id='toc1_'></a>[Using camelot-py](#toc0_)

## 1.1. <a id='toc1_1_'></a>[Get tables from pdf file](#toc0_)


In [2]:
url_file = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"


def get_table_from_pdf(
    path_file: str = url_file, method="lattice"):
    # Get the tables in the PDF file
    if path_file.endswith(".pdf"):
        try:
            tables = camelot.read_pdf(filepath=path_file, pages="all", flavor=method)
            tables.export(r'foo.csv', f='csv', compress=True)
            print(f"{path_file} successfully loaded!")
            if len(tables) > 0:
                print(f"{len(tables)} tables extracted")
                return tables
            else:
                raise Exception("No table extracted!")
        except Exception as e:
            print(e)
            print("Try change -method- parameter to 'stream'!")
            # tables = camelot.read_pdf(filepath=url_file, pages="all", flavor="lattice")
    else:
        raise Exception("File is not a PDF")


In [18]:
# set path to pdf file
pdf_file = r"../files/Rapport potablité Eau RADESS 01-04-24.pdf"

tables = get_table_from_pdf(path_file=pdf_file, method="stream")


../files/Rapport potablité Eau RADESS 01-04-24.pdf successfully loaded!
10 tables extracted


## 1.2. <a id='toc1_2_'></a>[Insight Extracted Tables](#toc0_)


In [4]:
def display_tables_info(tables) -> pd.DataFrame:
    assert len(tables) > 0, "No tables Found!"
    table_infos = [
        table.parsing_report | {"n_rows": table.shape[0], "n_cols": table.shape[1]}
        for table in tables
    ]
    return pd.DataFrame(table_infos)


display(display_tables_info(tables))


Unnamed: 0,accuracy,whitespace,order,page,n_rows,n_cols
0,92.77,64.29,1,1,32,7
1,97.62,43.45,2,1,21,8
2,99.61,29.62,1,2,46,8
3,99.5,28.57,1,3,35,8
4,99.53,28.35,1,4,58,9
5,99.84,12.73,1,5,48,9
6,99.85,5.05,1,6,44,9
7,99.84,8.94,1,7,46,9
8,99.74,4.94,1,8,45,9
9,100.0,7.94,1,9,21,3


## 1.3. <a id='toc1_3_'></a>[View a table as Data Frame](#toc0_)


In [14]:
# print the first table as Pandas DataFrame
df_table = tables[2].df
display(df_table)


Unnamed: 0,0,1,2,3,4,5,6,7
0,,,,Rapport N° :185/24-PS,,,,
1,,,,,,Incertitude,,
2,Paramètre au laboratoire,Méthode/Version,Résultat,Unité,LQ,,VMA*,Appréciation
3,,,,,,(%),,
4,pH*,NM ISO 10523 (2012),75,UpH,-,64,"6,5 - 8,5",S
5,,,,µS/cm à,,,,
6,Conductivité électrique*,NM ISO 7888 (2001),802,,-,154,2700,S
7,,,,20°C,,,,
8,Couleur réelle*,NM ISO 7887 (2012),ND,Pt mg/l,-,177,20,S
9,Odeur,NM 03.7.16 (1990),"1,5eme seuil",-,-,10,3,S


## 1.4. <a id='toc1_4_'></a>[Export tables in csv format into zip file](#toc0_)


In [6]:
tables.export(
    r"../outputs/data.csv", f="csv", compress=True
)  # json, excel, html, markdown, sqlite


## 1.5. <a id='toc1_5_'></a>[Tables Processing](#toc0_)

In [15]:
df_table:pd.DataFrame = tables[0].df.copy()
cols = df_table.iloc[6,:]

df_table['date'] = pd.to_datetime(pdf_file[-12:-4])
df_table.set_index('date', inplace=True)
df_table.columns = cols
df_table = df_table.dropna(subset=['Paramètre(s) microbiologiques', 'Résultat'])

df_table

KeyError: ['Paramètre(s) microbiologiques', 'Résultat']

# 2. <a id='toc2_'></a>[Using plumber](#toc0_)

In [8]:
import pdfplumber
from pprint import pprint
import pandas as pd
from tqdm.autonotebook import tqdm

In [None]:

with pdfplumber.open(pdf_file) as pdf:
    tables = [page.extract_table() for page in tqdm(pdf.pages[:3])]

In [None]:
all_df = pd.DataFrame()
for table in tables[:2]:
    print(f"{'table':=^100}")
    df = pd.DataFrame(table)
    df['date'] = pdf_file[-12:-4]
    all_df = pd.concat([all_df, df], axis=0)
all_df

# Get All pdf files tables

In [18]:
# get all pdf file in a directory
import os

def find_pdfs_with_keyword(directory, keyword):
    files_list = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf") and (keyword in file or 'RADESS' in file):
                files_list.append(os.path.join(root, file))
    return files_list

# Replace 'your_directory_path' with the path to the directory you want to search
directory_path = r'../files'
keyword = 'RADEES'
find_pdfs_with_keyword(directory_path, keyword)[:4]

['../files\\Rapport potablité Eau RADESS 01-04-24.pdf',
 '../files\\Rapport potablité Eau RADESS 02-04-24.pdf',
 '../files\\Rapport potablité Eau RADESS 03-04-24.pdf',
 '../files\\Rapport potablité Eau RADESS 04-04-24.pdf']

In [19]:
directory_path = r'../files'
keyword = 'RADEES'
all_df = pd.DataFrame()

for pdf_file in tqdm(find_pdfs_with_keyword(directory_path, keyword)):
    with pdfplumber.open(pdf_file) as pdf:
        tables = [page.extract_table() for page in pdf.pages[:2]]
        for table in tables[:]:
            df = pd.DataFrame(table)
            df['date'] = pdf_file[-12:-4]

            all_df = pd.concat([all_df, df], axis=0)


100%|██████████| 33/33 [00:28<00:00,  1.15it/s]


In [20]:
all_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,date
0,Hydrocarbures polycycliques aromatiques (HAP),,,,,,,,,01-04-24
1,Paramètre au laboratoire,Méthode/Version,Résultat,Unité,LQ,Incertitude\n(%),,VMA*,Appréciation,01-04-24
2,Benzo(b) fluorranthène*,NM ISO 28540 (2014),<LQ,µg/l,001,10,,01,S,01-04-24
3,Benzo(k) fluorranthène*,,<LQ,µg/l,001,10,,01,S,01-04-24
4,Benzo(ghi) pérylène*,,<LQ,µg/l,001,10,,01,S,01-04-24
...,...,...,...,...,...,...,...,...,...,...
28,Manganèse* (Mn),,<LQ,mg/l,00005,18,01,S,,26-04-24
29,Zinc* (Zn),,0006,mg/l,0005,18,3,S,,26-04-24
30,Fer* (Fe),NM ISO 11885 (2014),<LQ,mg/l,0005,18,03,-,,26-04-24
31,Cyanures,Méthode\nPotentiométrique,<LQ,µg/l,10,15,70,S,,26-04-24


In [21]:
all_df.to_excel(r'../outputs/data.xlsx')