**Table of contents**<a id='toc0_'></a>    
- 1. [Using plumber](#toc1_)    
  - 1.1. [Get All pdf files tables](#toc1_1_)    

<!-- vscode-jupyter-toc-config
	numbering=true
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# 1. <a id='toc1_'></a>[Using plumber](#toc0_)

In [1]:

import pdfplumber
from pprint import pprint
import pandas as pd
from tqdm.notebook import tqdm
import os

## 1.1. <a id='toc1_1_'></a>[Get All pdf files tables](#toc0_)

In [2]:
# get all pdf file in a directory that contain keywords in there names
def get_pdfs_with_keyword(directory, keywords:list[str]):
    pdf_files_list = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                if any(keyword in file for keyword in keywords):
                    pdf_files_list.append(os.path.join(root, file))
    return pdf_files_list

In [3]:
def get_potable_water_data(potable_pdf_reprts):
    assert len(potable_pdf_reprts) > 0, "No pdf files Found!"
    print(f"{len(potable_pdf_reprts)} pdf files found")
    all_df = pd.DataFrame()

    for pdf_file in tqdm(potable_pdf_reprts):
        with pdfplumber.open(pdf_file) as pdf:
            tables = [page.extract_table() for page in pdf.pages[:2]]
            for table in tables[:]:
                df = pd.DataFrame(table)
                if (pdf_file[-8:-6] == "20"):
                    df["date"] = "".join([pdf_file[-14:-8] + pdf_file[-6:-4]])
                    # print("Woooow:","".join([pdf_file[-14:-8] + pdf_file[-6:-4]]))
                else:
                    df["date"] = pdf_file[-12:-4]
                    # print(pdf_file[-12:-4])
                all_df = pd.concat([all_df, df], axis=0)
    return all_df

In [4]:
# Replace 'your_directory_path' with the path to the directory you want to search
directory_path = r'../files'
keywords =['RADEES', 'RADESS']
potable_pdf_reprts = get_pdfs_with_keyword(directory_path, keywords)
pprint(potable_pdf_reprts)

['../files\\17-10-23-RADEES.pdf',
 '../files\\20-10-23-RADEES.pdf',
 '../files\\24-10-23-RADEES.pdf',
 '../files\\27-10-23-RADESS.pdf',
 '../files\\Modele rapport potablité Eau RADESS 04-12-23.pdf',
 '../files\\Modele rapport potablité Eau RADESS 05-12-23.pdf',
 '../files\\Rapport de potabilité RADEES 30-11-2023.pdf',
 '../files\\Rapport potabilité eau RADEES 01-11-23.pdf',
 '../files\\Rapport potabilité eau RADEES 02-11-23.pdf',
 '../files\\Rapport potabilité eau RADEES 03-11-23.pdf',
 '../files\\Rapport potabilité eau RADEES 04-11-23.pdf',
 '../files\\Rapport potabilité eau RADEES 05-11-23.pdf',
 '../files\\Rapport potabilité eau RADEES 06-11-23.pdf',
 '../files\\Rapport potabilité eau RADEES 07-11-23.pdf',
 '../files\\Rapport potabilité eau RADEES 08-11-23.pdf',
 '../files\\Rapport potabilité eau RADEES 09-11-23.pdf',
 '../files\\Rapport potabilité eau RADEES 10-11-23.pdf',
 '../files\\Rapport potabilité eau RADEES 11-11-23.pdf',
 '../files\\Rapport potabilité eau RADEES 12-11-23.pd

In [5]:
water_dataframe: pd.DataFrame = get_potable_water_data(potable_pdf_reprts[:])
water_dataframe.head()

268 pdf files found


  0%|          | 0/268 [00:00<?, ?it/s]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,date
0,Paramètre(s) microbiologie,,Méthode/Version,,,Résultat,,,unité,,,Incertitude\n%,,Critères,,Appréciation,3-RADEES
1,,,,,,,,,,,,,,microbiologiques,,,3-RADEES
2,,,,,,,,,,,,,,Marocains (1),,,3-RADEES
3,,,,,,,,,,,,,,(VMA),,,3-RADEES
4,Dénombrement de micro-\norganismes revivifiabl...,,NM ISO 6222 (2007),,,<1,,,ufc/1ml,,,0078,1.102,,,S,3-RADEES


In [69]:
try:
    df = water_dataframe.set_axis(water_dataframe.iloc[0], axis="columns").dropna(subset=["Résultat"]).query("Résultat != 'Résultat'")
except:
    df = water_dataframe.set_axis(water_dataframe.iloc[1], axis="columns").dropna(subset=["Résultat"])
    
df = df[df.columns.dropna()]

date_col = [col for col in df.columns if col.endswith(("23","24", "25", "RADEES"))][0]
df.rename(columns={date_col: "date"}, inplace=True)
df = df[[col for col in df.columns if col.startswith(("Param", "Résu", "date"))]]
df.insert(0, "date", df.pop('date'))
df.query("date != '3-RADEES'", inplace=True)
# drop rows where parametre coliumns start with param
df = df[df["Paramètre(s) microbiologie"].str.startswith("Param") == False]
df.head()

Unnamed: 0,date,Paramètre(s) microbiologie,Résultat
2,04-12-23,Benzo(b) fluorranthène*,10
3,04-12-23,Benzo(k) fluorranthène*,10
4,04-12-23,Benzo(ghi) pérylène*,10
5,04-12-23,Indénol(1.2.3-cd) pyrène*,10
6,04-12-23,Benzo(a) pyrène*,10


In [70]:
df.to_excel(r'../outputs/data.xlsx', index=False)