In [None]:
import os
import shutil
import pandas as pd
from src.index_graph.configuration import IndexConfiguration

# Bare expression: evaluates the configured images directory so the notebook
# shows it as this cell's output when the cell is run.
IndexConfiguration.images_dir
#from .configuration import Config


In [4]:

def register_image(
    source_png: str,
    pdf_file: str,
    page: int,
    image_index: int,
    caption: str = None
):
    """
    Copy a PNG into the images tree and append a matching row to the charts index.

    The file is copied to
    ``IndexConfiguration.images_dir/<pdf_file>/page_<page>/img_<image_index>.png``
    (directories are created as needed), then one row describing it is appended
    to the CSV located at ``IndexConfiguration.charts_index``.

    Args:
        source_png: Path of the PNG file to copy.
        pdf_file: Identifier of the source PDF; used as a sub-directory name and
            stored in the ``pdf_file`` column.
        page: Page number; used in the ``page_<page>`` directory name and stored
            in the ``page`` column.
        image_index: Index of the image; used in the ``img_<image_index>.png``
            file name and stored in the ``image_index`` column.
        caption: Optional caption. ``None`` (or any falsy value) is stored as
            an empty string.

    Errors raised by ``shutil.copy`` and ``pd.read_csv`` (for example when the
    source file or the index CSV is missing) propagate unchanged.
    """
    # 1) Resolve and create the target directory.
    target_dir = os.path.join(IndexConfiguration.images_dir, pdf_file, f"page_{page}")
    os.makedirs(target_dir, exist_ok=True)

    # 2) Copy the file under its normalised name.
    filename = f"img_{image_index}.png"
    dest_path = os.path.join(target_dir, filename)
    shutil.copy(source_png, dest_path)

    # 3) Append the new row to the CSV index.
    df = pd.read_csv(IndexConfiguration.charts_index)
    new_row = {
        "pdf_file":     pdf_file,
        "page":         page,
        "image_index":  image_index,
        "width":        None,   # optional: could be computed with PIL if needed
        "height":       None,
        "image_path":   dest_path,
        "caption":      caption or ""
    }
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement and, like append, keeps the union of columns.
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
    df.to_csv(IndexConfiguration.charts_index, index=False)

    print(f"Image enregistrée et index mise à jour : {dest_path}")


In [None]:
# Manually register one chart image for page 14 of the "RGPH-5-2023" report.
# NOTE(review): source_png is a relative path, so it is resolved against the
# notebook's current working directory — confirm the folder name before running.
register_image(
    source_png="vos_rapport/rgph/261a7100-dfa2-4754-ace7-f4424f537c17.png",
    pdf_file="RGPH-5-2023",
    page=14,
    image_index=1,
    caption="Répartition des occupés par secteur institutionnel selon le secteur d’activités"
)

In [13]:
# shared/utils.py
"""
Utilitaires partagés pour l'extraction d'images (graphiques) et de tables depuis les PDF
et création des index CSV correspondants avec légendes.
"""
import csv
import io
import re
from pathlib import Path

from PyPDF2 import PdfReader
from PyPDF2.generic import IndirectObject
from PIL import Image
import camelot  # pip install camelot-py[cv]

# Fixed output directories and index files. The paths are relative, so they
# resolve against the current working directory of the running kernel.
IMAGES_DIR = Path("./images")
INDEX_IMAGES_CSV = Path("./charts_index.csv")
TABLES_DIR = Path("./tables")
INDEX_TABLES_CSV = Path("./tables_index.csv")


def extract_captions(text: str, prefix: str) -> list[str]:
    """
    Return the caption lines of ``text`` that start with ``prefix``.

    A caption has the shape ``<prefix> <label> : <caption text>`` where the
    label is made of word characters and hyphens (e.g. ``Graphique 3-1 : ...``).
    Matching is case-insensitive and anchored at the beginning of a line.

    Args:
        text: Text to search (typically the extracted text of one PDF page).
        prefix: Literal caption prefix, e.g. ``"Graphique"`` or ``"Tableau"``.
            It is escaped before being placed in the pattern, so characters
            such as ``.`` are matched literally rather than as regex syntax.

    Returns:
        The full matched caption strings, in order of appearance; an empty
        list when nothing matches.
    """
    # Multiline pattern: start of line, prefix, label, colon, then the rest of the line.
    pattern = re.compile(
        rf"^{re.escape(prefix)}\s+[\w\-]+\s*:\s*.+$",
        re.IGNORECASE | re.MULTILINE
    )
    return pattern.findall(text)


def _resolve_pdf_object(value):
    """Return the object behind ``value`` when it is an indirect PDF reference."""
    return value.get_object() if isinstance(value, IndirectObject) else value


def extract_images(pdf_path: Path) -> list[tuple[int, str, Path, str]]:
    """
    Extract the image XObjects of a PDF, pair them with captions and save them as PNG.

    For every page, the ``/XObject`` entries of the page resources are listed.
    When a page has more than two entries, the first and the last one are
    dropped (they are treated as header/footer artwork). Each remaining entry
    whose ``/Subtype`` is ``/Image`` is decoded with PIL, converted to RGB if
    needed and written to ``IMAGES_DIR``.

    The caption is picked from the ``"Graphique"`` captions found in the page
    text, using the entry's 1-based position among the kept XObjects. The file
    is named ``<caption or pdf stem>_p<page>_i<position>.png``, with the
    caption reduced to word characters, hyphens and underscores.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``(page, image_name, image_path, caption)`` tuples, one per
        saved image. ``caption`` is ``""`` when no caption is available.

    XObjects that cannot be read, decoded or saved are skipped with a logged
    warning instead of aborting the extraction.
    """
    import logging

    logger = logging.getLogger(__name__)
    images: list[tuple[int, str, Path, str]] = []
    reader = PdfReader(str(pdf_path))
    for page_num, page in enumerate(reader.pages, start=1):
        page_text = page.extract_text() or ""
        captions = extract_captions(page_text, "Graphique")
        # dict.get() hands back the stored value as-is, which may be an indirect
        # reference; resolve it the same way the individual XObjects are resolved.
        resources = _resolve_pdf_object(page.get("/Resources", {}))
        if not isinstance(resources, dict):
            continue
        xobjs = _resolve_pdf_object(resources.get("/XObject", {}))
        if not isinstance(xobjs, dict):
            continue
        items = list(xobjs.items())
        # Useful content only: drop the first and last entries (header/footer).
        content = items[1:-1] if len(items) > 2 else items
        for idx, (name_key, ref) in enumerate(content, start=1):
            try:
                obj = _resolve_pdf_object(ref)
                if obj.get("/Subtype") != "/Image":
                    continue
                data = obj.get_data()
                img = Image.open(io.BytesIO(data))
                if img.mode != "RGB":
                    img = img.convert("RGB")
                # Matching caption, by position among the kept XObjects.
                caption = captions[idx-1] if idx-1 < len(captions) else ""
                # File named after the caption (falls back to the PDF stem).
                safe = re.sub(r"[^\w\- ]", "", caption).strip().replace(" ", "_")
                filename = f"{safe or pdf_path.stem}_p{page_num}_i{idx}.png"
                out_path = IMAGES_DIR / filename
                out_path.parent.mkdir(parents=True, exist_ok=True)
                img.save(out_path)
                images.append((page_num, filename, out_path, caption))
            except Exception as exc:
                # Best effort: one unreadable XObject must not stop the run,
                # but leave a trace so missing images can be diagnosed.
                logger.warning(
                    "Skipping XObject %s on page %d of %s: %s",
                    name_key, page_num, pdf_path, exc,
                )
                continue
    return images


def extract_tables(pdf_path: Path) -> list[tuple[int, int, Path, str]]:
    """
    Extract the tables of a PDF with Camelot, pair them with captions and save them as CSV.

    Camelot is run twice over all pages, once with the ``"lattice"`` flavor and
    once with ``"stream"``. Each detected table is written to ``TABLES_DIR`` as
    ``<caption or pdf stem>_p<page>_t<idx>.csv`` where ``idx`` is the table's
    1-based position in that flavor's result list.

    The caption is taken from the ``"Tableau"`` captions found in the text of
    the table's page: the n-th table detected on a page (within one flavor)
    receives the n-th caption of that page.

    NOTE(review): the file name does not include the flavor, so a ``stream``
    table can overwrite a ``lattice`` file that has the same caption, page and
    index — confirm this is intended.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``(page, table_idx, table_path, caption)`` tuples, one per
        saved table. ``caption`` is ``""`` when no caption is available.

    If Camelot (or writing a table) fails for a flavor, the remaining tables of
    that flavor are skipped with a logged warning and the next flavor is tried.
    """
    import logging

    logger = logging.getLogger(__name__)
    tables: list[tuple[int, int, Path, str]] = []
    reader = PdfReader(str(pdf_path))
    # Captions per 1-based page number.
    caps_map = {
        page_num: extract_captions(page.extract_text() or "", "Tableau")
        for page_num, page in enumerate(reader.pages, start=1)
    }
    TABLES_DIR.mkdir(parents=True, exist_ok=True)
    for flavor in ("lattice", "stream"):
        # Number of tables already seen on each page for this flavor; captions
        # are stored per page, so the lookup must use a per-page position and
        # not the document-wide running index.
        seen_on_page = {}
        try:
            found = camelot.read_pdf(str(pdf_path), flavor=flavor, pages="all")
            for idx, table in enumerate(found, start=1):
                page = int(table.page)
                position = seen_on_page.get(page, 0)
                seen_on_page[page] = position + 1
                captions = caps_map.get(page, [])
                caption = captions[position] if position < len(captions) else ""
                safe = re.sub(r"[^\w\- ]", "", caption).strip().replace(" ", "_")
                filename = f"{safe or pdf_path.stem}_p{page}_t{idx}.csv"
                out_path = TABLES_DIR / filename
                table.to_csv(str(out_path))
                tables.append((page, idx, out_path, caption))
        except Exception as exc:
            # Best effort: a failing flavor must not prevent the other one
            # from running, but leave a trace for diagnosis.
            logger.warning(
                "Table extraction with flavor %r failed for %s: %s",
                flavor, pdf_path, exc,
            )
            continue
    return tables


def generate_charts_index(pdf_source: Path) -> None:
    """
    Extract and index all charts and tables of one PDF or of a folder of PDFs.

    Two CSV files are (re)written from scratch:
      - ``INDEX_IMAGES_CSV`` with columns (image_id, pdf_path, page, image_path, caption)
      - ``INDEX_TABLES_CSV`` with columns (table_id, pdf_path, page, table_path, caption)

    Args:
        pdf_source: Either a single PDF file, or a directory that is searched
            recursively for ``*.pdf`` files. A ``str`` is accepted as well.
            Files are processed in sorted path order so the generated indexes
            are reproducible from one run to the next.

    Raises:
        FileNotFoundError: If ``pdf_source`` does not exist. The check happens
            before the index files are opened, so existing indexes are not
            truncated by a mistyped path.
    """
    pdf_source = Path(pdf_source)
    if not pdf_source.exists():
        raise FileNotFoundError(f"PDF source not found: {pdf_source}")
    if pdf_source.is_file():
        paths = [pdf_source]
    else:
        # rglob yields files in filesystem order; sort for a stable index.
        paths = sorted(pdf_source.rglob("*.pdf"))

    IMAGES_DIR.mkdir(parents=True, exist_ok=True)
    TABLES_DIR.mkdir(parents=True, exist_ok=True)
    with INDEX_IMAGES_CSV.open("w", newline="", encoding="utf-8") as imgf, \
         INDEX_TABLES_CSV.open("w", newline="", encoding="utf-8") as tblf:
        iw = csv.writer(imgf)
        iw.writerow(["image_id", "pdf_path", "page", "image_path", "caption"])
        tw = csv.writer(tblf)
        tw.writerow(["table_id", "pdf_path", "page", "table_path", "caption"])
        for pdf in paths:
            for page, name, path, cap in extract_images(pdf):
                # image_id is the image file name without its extension.
                iw.writerow([name.rsplit('.',1)[0], str(pdf), page, str(path), cap])
            for page, _idx, path, cap in extract_tables(pdf):
                tw.writerow([path.stem, str(pdf), page, str(path), cap])
    print(f"[✔] Images indexées -> {INDEX_IMAGES_CSV}")
    print(f"[✔] Tables indexées -> {INDEX_TABLES_CSV}")

In [None]:
# Build the charts and tables indexes from the reports folder.
# NOTE(review): this is an absolute path on one user's machine; it has to be
# adjusted (or made configurable) before the notebook can run elsewhere.
generate_charts_index(Path("/Users/fatousall/Documents/sun-stats/vos_rapports_rgph"))

  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generat