In [None]:
#Imports nécessaires
import os
import re
import tkinter as tk
from datetime import datetime
from tkinter import filedialog, messagebox, ttk

import cv2
import mysql.connector
import numpy as np
import pytesseract
from pdf2image import convert_from_path
from PIL import Image


ModuleNotFoundError: No module named 'pytesseract'

In [None]:
# File selection: let the user pick an invoice (image or PDF) through a dialog
def choose_file():
    """Ask the user to pick an invoice file and return its path.

    A temporary, hidden Tk root is created so that only the file dialog is
    shown, and it is destroyed once the dialog closes.

    Returns:
        The selected path as returned by ``filedialog.askopenfilename``
        (an empty value when the dialog is cancelled).
    """
    # The file only imports `tkinter as tk`; a bare `Tk()` would raise NameError.
    hidden_root = tk.Tk()
    hidden_root.withdraw()  # hide the empty main window, keep only the dialog
    try:
        return filedialog.askopenfilename(
            title="Choisir une facture",
            filetypes=[("Images et PDFs", "*.jpg *.jpeg *.png *.pdf")]
        )
    finally:
        # Do not leave an invisible root window alive after every call.
        hidden_root.destroy()


In [None]:
#Prétraitement de l'image pour l'amélioration de l'OCR
def preprocess_image(file_path):
    """Binarise and blur an invoice image (or the first page of a PDF).

    The result is written to ``preprocessed/<name>_preprocessed_cleaned.png``
    and then displayed with ``PIL.Image.show``. For a PDF, the first page is
    also saved as ``preprocessed/<name>_page1.png``.

    Args:
        file_path: Path to an image readable by OpenCV, or to a ``.pdf`` file.

    Returns:
        None. An error message is printed when the input cannot be read.
    """
    filename = os.path.basename(file_path)
    name, ext = os.path.splitext(filename)
    output_folder = 'preprocessed'
    os.makedirs(output_folder, exist_ok=True)

    if ext.lower() == '.pdf':
        # Only the first page is used, so do not rasterise the whole document.
        pages = convert_from_path(file_path, first_page=1, last_page=1)
        if not pages:
            # Guard against a PDF that yields no page instead of raising IndexError.
            print("Erreur : Impossible de lire l'image.")
            return
        image_path = os.path.join(output_folder, f"{name}_page1.png")
        pages[0].save(image_path, 'PNG')
        print(f"[PDF] Première page convertie : {image_path}")
        img = cv2.imread(image_path)
    else:
        img = cv2.imread(file_path)

    # cv2.imread signals failure by returning None rather than raising.
    if img is None:
        print("Erreur : Impossible de lire l'image.")
        return

    # Preprocessing: grayscale -> fixed threshold at 150 -> 5x5 Gaussian blur.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    denoised = cv2.GaussianBlur(binary, (5, 5), 0)

    # Save the result next to the other preprocessed files.
    output_path = os.path.join(output_folder, f"{name}_preprocessed_cleaned.png")
    cv2.imwrite(output_path, denoised)
    print(f"Image prétraitée et sauvegardée : {output_path}")

    # Show the preprocessed image for visual inspection.
    Image.fromarray(denoised).show()


In [None]:
# OCR: preprocess the invoice image and extract its text with Tesseract
def extraire_texte_depuis_image(filepath):
    """Run OCR on an invoice and return the extracted text.

    The image is converted to grayscale, binarised with an adaptive mean
    threshold and smoothed with a 5x5 Gaussian blur before being passed to
    Tesseract. The preprocessed image is also written to
    ``facture_preprocessed_cleaned.png`` in the current directory.

    Args:
        filepath: Path to an image readable by OpenCV, or to a ``.pdf`` file
            (only its first page is processed).

    Returns:
        The OCR text, with ``Tota1``/``total`` normalised to ``Total``.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    if os.path.splitext(filepath)[1].lower() == '.pdf':
        # cv2.imread cannot read PDFs: rasterise the first page instead.
        pages = convert_from_path(filepath, first_page=1, last_page=1)
        if not pages:
            raise ValueError(f"Impossible de lire le fichier : {filepath}")
        image = cv2.cvtColor(np.array(pages[0].convert('RGB')), cv2.COLOR_RGB2BGR)
    else:
        image = cv2.imread(filepath)

    # cv2.imread returns None on failure; fail with a readable message instead
    # of an opaque OpenCV assertion from cvtColor.
    if image is None:
        raise ValueError(f"Impossible de lire l'image : {filepath}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    binarized = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 2)
    denoised_image = cv2.GaussianBlur(binarized, (5, 5), 0)
    preprocessed_image = Image.fromarray(denoised_image)
    # Still written to disk so the preprocessing result can be inspected.
    preprocessed_image.save('facture_preprocessed_cleaned.png')

    # Only point pytesseract at the Windows install when it actually exists;
    # otherwise keep its default lookup so other platforms keep working.
    tesseract_exe = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    if os.path.exists(tesseract_exe):
        pytesseract.pytesseract.tesseract_cmd = tesseract_exe

    # OCR the in-memory image directly; re-reading the PNG just saved adds nothing.
    text = pytesseract.image_to_string(preprocessed_image)
    # Normalise a common OCR confusion (1 vs l) and the casing of "total".
    text = text.replace('Tota1', 'Total').replace('total', 'Total')

    return text


In [None]:
def detect_champs_individuels(text):
    """Extract seller/client details, tax ids and IBAN from OCR text.

    A line containing "seller" or "client" (case-insensitive) opens the
    corresponding section and a blank line closes it. Within a section the
    first collected line is taken as the name and the remaining lines as the
    address. Lines carrying a tax id or a recognised IBAN are not added to the
    name/address lines.

    Args:
        text: Raw OCR text of the invoice.

    Returns:
        A 7-tuple ``(seller_name, seller_address, seller_tax_id, client_name,
        client_address, client_tax_id, iban)``. Missing text fields are ``""``
        and a missing IBAN is ``None``.
    """
    sections = {"seller": [], "client": []}
    iban = None
    current_section = None

    for line in text.split('\n'):
        stripped = line.strip()
        line_clean = stripped.lower()

        # A blank line ends the section in progress.
        if not line_clean:
            current_section = None
            continue

        # Strong markers: the marker line itself is not collected.
        if "seller" in line_clean:
            current_section = "seller"
            continue
        if "client" in line_clean:
            current_section = "client"
            continue

        if "iban" in line_clean:
            iban_match = re.search(r'IBAN[:\s]*([A-Z0-9]{15,34})', line, re.IGNORECASE)
            if iban_match:
                iban = iban_match.group(1).strip()
                # A recognised IBAN line is bank data, not part of an address.
                continue

        # Tax ids are extracted from the whole text below; keep them out of
        # the address. Word boundaries avoid dropping names like "Taxidermy".
        if re.search(r'\btax\s*id\b', line_clean):
            continue

        if current_section:
            sections[current_section].append(stripped)

    # Tax ids are assigned by order of appearance: seller first, then client.
    tax_ids = re.findall(r'Tax Id[:\s]*([0-9\-]{8,12})', text, re.IGNORECASE)
    seller_tax_id = tax_ids[0] if len(tax_ids) >= 1 else ""
    client_tax_id = tax_ids[1] if len(tax_ids) >= 2 else ""

    # First line is assumed to be the name, the following ones the address.
    seller_lines = sections["seller"]
    client_lines = sections["client"]
    seller_name = seller_lines[0] if seller_lines else ""
    seller_address = "\n".join(seller_lines[1:])
    client_name = client_lines[0] if client_lines else ""
    client_address = "\n".join(client_lines[1:])

    return seller_name, seller_address, seller_tax_id, client_name, client_address, client_tax_id, iban

In [None]:
#Fonction principale : Analyse complète de la facture
def analyse_facture(filepath):
    """Extract the key fields of an invoice with OCR and store them in MySQL.

    The invoice number, date, IBAN, total, seller and client are parsed from
    the OCR text. The ``sellers``, ``clients`` and ``invoices`` tables are
    created when missing; the seller and client are reused when a row with
    the same name and tax id exists, and one row is inserted into ``invoices``.

    Connection settings can be overridden with the environment variables
    ``FACTURES_DB_HOST``, ``FACTURES_DB_USER``, ``FACTURES_DB_PASSWORD`` and
    ``FACTURES_DB_NAME``.

    Args:
        filepath: Path of the invoice file passed to
            ``extraire_texte_depuis_image``.

    Raises:
        Exception: Any OCR or database error is propagated after the
            transaction has been rolled back and the connection closed.
    """
    text = extraire_texte_depuis_image(filepath)
    lines = text.split('\n')

    (seller_name, seller_address, seller_tax_id,
     client_name, client_address, client_tax_id,
     detected_iban) = detect_champs_individuels(text)

    # Header-based lookup takes precedence over the section-based detection,
    # but an empty result never erases a value that was already found.
    # Word boundaries stop "To"/"From" from matching inside words like "Total".
    seller_block = _find_party_block(lines, r'\b(?:Seller|From|Vendor)\b')
    if seller_block:
        seller_name = seller_block[0] or seller_name
        seller_address = seller_block[1] or seller_address
        seller_tax_id = seller_block[2] or seller_tax_id

    client_block = _find_party_block(lines, r'\b(?:Client|To|Buyer)\b')
    if client_block:
        client_name = client_block[0] or client_name
        client_address = client_block[1] or client_address
        client_tax_id = client_block[2] or client_tax_id

    invoice_number_match = re.search(r'Invoice no[:\s]*([\d]+)', text)
    invoice_number = invoice_number_match.group(1) if invoice_number_match else ''

    # Fall back to the IBAN found by detect_champs_individuels instead of
    # discarding it when the stricter, case-sensitive pattern finds nothing.
    iban_match = re.search(r'IBAN[:\s]*([A-Z0-9]+)', text)
    iban = iban_match.group(1) if iban_match else (detected_iban or '')

    # Prefer a numeric date anywhere in the text, else whatever follows
    # "Date of issue".
    date_match = re.search(r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b', text)
    if not date_match:
        date_match = re.search(r'Date of issue[:\s]*([A-Za-z0-9 ,.-]+)', text)
    # The column is a DATE: store a real date, or NULL when it cannot be parsed.
    invoice_date = _parse_invoice_date(date_match.group(1)) if date_match else None

    # Default total: the largest amount on the page. Only a plain space is
    # accepted as thousands separator so an amount never spans two lines.
    montants = re.findall(r'(\d{1,3}(?:[., ]\d{3})*(?:[.,]\d{2}))', text)
    montants_candidats = [_montant_to_float(m) for m in montants]
    total = max(montants_candidats) if montants_candidats else 0.0
    # An explicit "Total 1,234.56" line overrides the heuristic.
    total_match = re.search(r'Total\s*\$?\s*([0-9,]+\.\d{2})', text)
    if total_match:
        total = float(total_match.group(1).replace(',', ''))

    # NOTE(review): the fallback values keep the current setup working, but a
    # password should not live in the notebook. Set FACTURES_DB_PASSWORD, then
    # rotate and remove the literal below.
    conn = mysql.connector.connect(
        host=os.environ.get("FACTURES_DB_HOST", "localhost"),
        user=os.environ.get("FACTURES_DB_USER", "root"),
        password=os.environ.get("FACTURES_DB_PASSWORD", "ayaDATA2025@"),
        database=os.environ.get("FACTURES_DB_NAME", "extraction_factures")
    )
    cursor = conn.cursor()
    try:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS sellers (
                id INT AUTO_INCREMENT PRIMARY KEY,
                name VARCHAR(255),
                address TEXT,
                tax_id VARCHAR(255)
            )
        """)

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS clients (
                id INT AUTO_INCREMENT PRIMARY KEY,
                name VARCHAR(255),
                address TEXT,
                tax_id VARCHAR(255)
            )
        """)

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS invoices (
                id INT AUTO_INCREMENT PRIMARY KEY,
                invoice_number VARCHAR(255),
                date DATE,
                iban VARCHAR(255),
                total DECIMAL(10, 2),
                seller_id INT,
                client_id INT,
                FOREIGN KEY (seller_id) REFERENCES sellers(id),
                FOREIGN KEY (client_id) REFERENCES clients(id)
            )
        """)

        seller_id = _get_or_create_party(cursor, "sellers", seller_name, seller_address, seller_tax_id)
        client_id = _get_or_create_party(cursor, "clients", client_name, client_address, client_tax_id)

        cursor.execute("INSERT INTO invoices (invoice_number, date, iban, total, seller_id, client_id) VALUES (%s, %s, %s, %s, %s, %s)",
                       (invoice_number, invoice_date, iban, total, seller_id, client_id))

        conn.commit()
    except Exception:
        # Do not leave a half-written invoice behind.
        conn.rollback()
        raise
    finally:
        # Always release the cursor and the connection, even on error.
        cursor.close()
        conn.close()
    print("Analyse terminée avec succès.")


def _find_party_block(lines, header_pattern):
    """Return (name, address, tax_id) read after the first header line, or None.

    The name is the line following the header, the address the one after it,
    and the tax id is looked for in the four lines following the header.
    """
    for i, line in enumerate(lines):
        if not re.search(header_pattern, line, re.IGNORECASE):
            continue
        name = lines[i + 1].strip() if i + 1 < len(lines) else ''
        address = lines[i + 2].strip() if i + 2 < len(lines) else ''
        tax_id = ''
        for candidate in lines[i + 1:i + 5]:
            if re.search(r'(Tax ID|VAT ID)', candidate, re.IGNORECASE):
                # NOTE(review): the value pattern is case-sensitive while the
                # detection above is not, so a "Tax Id" line stops the search
                # without yielding a value; the caller then keeps the tax id
                # from detect_champs_individuels. Confirm before relaxing it.
                tax_search = re.search(r'(Tax ID|VAT ID)[:\s]*([A-Za-z0-9-]+)', candidate)
                if tax_search:
                    tax_id = tax_search.group(2).strip()
                break
        return name, address, tax_id
    return None


def _parse_invoice_date(raw):
    """Convert an OCR date string to a ``datetime.date``, or None if unparseable."""
    cleaned = raw.strip().strip('.,').replace('-', '/')
    # NOTE(review): month-first is tried before day-first because the amount
    # patterns in this notebook are US-style ("$", "1,234.56"); an ambiguous
    # date such as 03/04/2020 is therefore read as March 4th - confirm.
    formats = (
        "%m/%d/%Y", "%d/%m/%Y", "%m/%d/%y", "%d/%m/%y", "%Y/%m/%d",
        "%B %d, %Y", "%b %d, %Y", "%d %B %Y", "%d %b %Y",
    )
    for fmt in formats:
        try:
            return datetime.strptime(cleaned, fmt).date()
        except ValueError:
            continue
    return None


def _montant_to_float(raw):
    """Convert an amount such as '1 234,56', '1,234.56' or '12.50' to a float."""
    # The amount pattern guarantees the string ends with a separator and two
    # decimals; every separator before that is a thousands separator.
    integer_part = re.sub(r'\D', '', raw[:-3])
    return float(f"{integer_part}.{raw[-2:]}")


def _get_or_create_party(cursor, table, name, address, tax_id):
    """Return the id of the row matching (name, tax_id) in `table`, inserting it if absent."""
    # The table name is interpolated into the SQL, so restrict it to known tables.
    if table not in ("sellers", "clients"):
        raise ValueError(f"Unexpected table name: {table}")
    cursor.execute(f"SELECT id FROM {table} WHERE name=%s AND tax_id=%s", (name, tax_id))
    row = cursor.fetchone()
    if row:
        return row[0]
    cursor.execute(f"INSERT INTO {table} (name, address, tax_id) VALUES (%s, %s, %s)", (name, address, tax_id))
    return cursor.lastrowid


In [None]:
#Choisir un fichier et lancer l'analyse de la facture
def choisir_fichier(output_text, status_label, file_label):
    """Let the user pick a file, analyse it and report the outcome in the UI.

    Args:
        output_text: Text widget receiving one log line per analysis.
        status_label: Label showing whether the last analysis succeeded.
        file_label: Label updated with the selected file path.

    Nothing happens when the dialog is cancelled. Any exception raised during
    the analysis is shown in the widgets instead of being propagated.
    """
    selected_path = filedialog.askopenfilename()
    if not selected_path:
        # Dialog cancelled: leave the interface untouched.
        return

    file_label.config(text=selected_path)
    try:
        analyse_facture(selected_path)
        output_text.insert(tk.END, f"Analyse terminée pour : {selected_path}\n")
        status_label.config(text="Analyse réussie.")
    except Exception as error:
        output_text.insert(tk.END, f"Erreur : {str(error)}\n")
        status_label.config(text="Erreur lors de l'analyse.")


In [None]:
# User interface: builds the main window of the invoice analyser.
# Widgets are packed top to bottom in the order they appear below.
root = tk.Tk()
root.title("Analyse automatique de factures")
root.geometry("600x400")
root.resizable(False, False)  # fixed-size window

style = ttk.Style()
style.theme_use("clam")

# Window heading
title = ttk.Label(root, text="Analyseur de Factures", font=("Helvetica", 16, "bold"))
title.pack(pady=10)

# Shows the path of the selected file (updated by choisir_fichier)
file_label = ttk.Label(root, text="Aucun fichier sélectionné")
file_label.pack()

# Log area receiving one line per analysis
output_text = tk.Text(root, height=10, width=70, wrap="word", bg="#f0f0f0")
output_text.pack(padx=10, pady=10)

# status_label is only created below; the lambda looks the name up when the
# button is clicked, by which time it exists.
btn_choose_file = ttk.Button(root, text="Choisir une image", command=lambda: choisir_fichier(output_text, status_label, file_label))
btn_choose_file.pack(pady=5)

# Success / failure message for the last analysis
status_label = ttk.Label(root, text="")
status_label.pack(pady=5)

# Start the Tk event loop; this call blocks until the window is closed.
root.mainloop()


Analyse terminée avec succès.
