In [1]:
import os
# Show the directory that relative paths in this notebook resolve against
working_dir = os.getcwd()
print("Chemin actuel :", working_dir)


Chemin actuel : c:\Users\cheim\ml


In [2]:
# Install exactly one OpenCV wheel: the opencv-* packages all ship the same
# `cv2` module and conflict when installed side by side. The contrib-headless
# build provides both the contrib modules and the GUI-less build requested here.
# %pip (rather than a bare `pip`) installs into the running kernel's environment.
%pip install opencv-contrib-python-headless


Note: you may need to restart the kernel to use updated packages.


In [3]:
# %pip installs into the running kernel's environment, whereas `!pip` runs
# whichever pip executable is first on the shell PATH.
# NOTE: pdf2image also requires the Poppler utilities to be installed on the system.
%pip install pdf2image




In [1]:
import os
from pdf2image import convert_from_path
import cv2
import pytesseract
from PIL import Image
import json
from tqdm import tqdm

# Folder layout used by the pipeline
PDF_FOLDER = "manga_dataset"  # Folder containing the PDF files
OUTPUT_FOLDER = "extracted_images"  # Folder for the extracted page images
PREPROCESSED_FOLDER = "preprocessed_images"  # Folder for the preprocessed images
ANNOTATION_FILE = "annotations.json"  # JSON file storing the annotations

# Create both output folders if they do not exist yet
for _folder in (OUTPUT_FOLDER, PREPROCESSED_FOLDER):
    os.makedirs(_folder, exist_ok=True)

# 1. Extract page images from the PDFs
def extract_images_from_pdfs(pdf_folder, output_folder):
    """Render every page of every PDF in ``pdf_folder`` to a JPEG file.

    Each page is saved in ``output_folder`` as ``<pdf name>_page_<n>.jpg``,
    where ``n`` starts at 1.

    Args:
        pdf_folder: Folder scanned (non-recursively) for PDF files.
        output_folder: Folder receiving the JPEG pages; created if missing.
    """
    print("Extraction des images depuis les PDF...")
    # Do not depend on a module-level makedirs having run before this call.
    os.makedirs(output_folder, exist_ok=True)
    # Sorted so the processing order is reproducible (os.listdir order is arbitrary).
    for pdf_file in tqdm(sorted(os.listdir(pdf_folder))):
        # Case-insensitive match so a file named "Volume.PDF" is not silently skipped.
        if not pdf_file.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(pdf_folder, pdf_file)
        base_name = os.path.splitext(pdf_file)[0]
        pages = convert_from_path(pdf_path)
        for page_number, page in enumerate(pages, start=1):
            image_name = f"{base_name}_page_{page_number}.jpg"
            page.save(os.path.join(output_folder, image_name), "JPEG")

# 2. Image preprocessing
def preprocess_images(input_folder, output_folder, size=(256, 256)):
    """Resize every JPEG in ``input_folder`` and save it as a grayscale image.

    Args:
        input_folder: Folder scanned (non-recursively) for ``.jpg`` files.
        output_folder: Folder receiving the processed images under the same
            file names; created if missing.
        size: Target ``(width, height)`` handed to ``cv2.resize``.
    """
    # NOTE(review): the pipeline runs OCR on these downsized images; confirm
    # that the default 256x256 keeps the page text legible enough for Tesseract.
    print("Prétraitement des images...")
    os.makedirs(output_folder, exist_ok=True)
    for image_file in tqdm(sorted(os.listdir(input_folder))):
        if not image_file.lower().endswith(".jpg"):
            continue
        image_path = os.path.join(input_folder, image_file)
        image = cv2.imread(image_path)
        # cv2.imread returns None (it does not raise) when a file cannot be
        # decoded; skip it instead of crashing in cv2.resize.
        if image is None:
            print(f"Image illisible ignorée : {image_path}")
            continue

        # Resize first, then convert to grayscale
        resized_image = cv2.resize(image, size)
        gray_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)

        # Save the preprocessed image under the original file name
        cv2.imwrite(os.path.join(output_folder, image_file), gray_image)

# 3. Text extraction with OCR
def extract_text_with_ocr(input_folder, annotation_file=None, lang="eng"):
    """Run Tesseract OCR on every JPEG in ``input_folder`` and save the text.

    The result is a JSON list of ``{"image": <file name>, "text": <ocr text>}``
    records, one per image.

    Args:
        input_folder: Folder scanned (non-recursively) for ``.jpg`` files.
        annotation_file: Path of the JSON file to write; defaults to the
            module-level ``ANNOTATION_FILE``.
        lang: Tesseract language code passed to ``image_to_string``.
    """
    print("Extraction du texte via OCR...")
    if annotation_file is None:
        annotation_file = ANNOTATION_FILE
    annotations = []
    # Sorted so the annotation order is reproducible (os.listdir order is arbitrary).
    for image_file in tqdm(sorted(os.listdir(input_folder))):
        if not image_file.lower().endswith(".jpg"):
            continue
        image_path = os.path.join(input_folder, image_file)
        # The context manager closes the file handle after each image; the
        # original left every opened image to the garbage collector.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image, lang=lang)
        annotations.append({"image": image_file, "text": text})

    # Save the annotations to a JSON file
    with open(annotation_file, "w", encoding="utf-8") as f:
        json.dump(annotations, f, indent=4, ensure_ascii=False)

# 4. Full pipeline
def main():
    """Run the three stages in order: PDF extraction, preprocessing, OCR."""
    # Stage 1: render the PDF pages to images
    extract_images_from_pdfs(pdf_folder=PDF_FOLDER, output_folder=OUTPUT_FOLDER)

    # Stage 2: preprocess the extracted images
    preprocess_images(input_folder=OUTPUT_FOLDER, output_folder=PREPROCESSED_FOLDER)

    # Stage 3: OCR the preprocessed images
    extract_text_with_ocr(input_folder=PREPROCESSED_FOLDER)

    completion_message = f"Pipeline terminé. Les annotations sont sauvegardées dans {ANNOTATION_FILE}."
    print(completion_message)

# Run the pipeline when this cell (or the equivalent script) is executed directly
if __name__ == "__main__":
    main()


Extraction des images depuis les PDF...


100%|██████████| 10/10 [04:15<00:00, 25.52s/it]


Prétraitement des images...


100%|██████████| 530/530 [00:21<00:00, 24.22it/s]


Extraction du texte via OCR...


100%|██████████| 530/530 [00:55<00:00,  9.61it/s]

Pipeline terminé. Les annotations sont sauvegardées dans annotations.json.





In [9]:
# %pip installs into the running kernel's environment; the version is pinned to
# the one this notebook was run with so a re-run gets the same package.
# NOTE: pytesseract is only a wrapper; the Tesseract OCR engine itself must be
# installed separately on the system.
%pip install pytesseract==0.3.13


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Note: you may need to restart the kernel to use updated packages.


In [8]:
# %pip installs into the running kernel's environment; the version is pinned to
# the one this notebook was run with so a re-run gets the same package.
%pip install tqdm==4.67.0


Collecting tqdm
  Downloading tqdm-4.67.0-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.0-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.0
Note: you may need to restart the kernel to use updated packages.


## Prétraitement des images déjà extraites des PDF

In [2]:
import os
import cv2
import pytesseract
from PIL import Image
import json
from tqdm import tqdm

# Folder layout used by the pipeline
ANNOTATION_FILE = "annotations.json"  # JSON file storing the annotations
PREPROCESSED_FOLDER = "preprocessed_images"  # Folder for the preprocessed images
MANGA_FOLDER = "manga_dataset"  # Root folder holding every manga and its chapters

# Create the output folder if it does not exist yet
os.makedirs(name=PREPROCESSED_FOLDER, exist_ok=True)

# 1. Image preprocessing
def preprocess_images(input_folder, output_folder, size=(256, 256)):
    """Resize and grayscale every JPEG found under ``<manga>/<chapter>/``.

    ``input_folder`` is walked two levels deep (manga folders, then chapter
    folders). Each ``.jpg`` file is written to the same relative location
    under ``output_folder``.

    Args:
        input_folder: Root folder containing one sub-folder per manga.
        output_folder: Root folder receiving the mirrored hierarchy.
        size: Target ``(width, height)`` handed to ``cv2.resize``.
    """
    print("Prétraitement des images...")
    for manga_name in sorted(os.listdir(input_folder)):
        manga_path = os.path.join(input_folder, manga_name)
        if not os.path.isdir(manga_path):  # Ignore stray files next to the manga folders
            continue
        for chapter_name in sorted(os.listdir(manga_path)):
            chapter_path = os.path.join(manga_path, chapter_name)
            if not os.path.isdir(chapter_path):  # Ignore stray files next to the chapters
                continue

            # Mirror the manga/chapter hierarchy in the output; the path is the
            # same for every image of the chapter, so compute it once.
            relative_path = os.path.relpath(chapter_path, input_folder)
            output_dir = os.path.join(output_folder, relative_path)

            for image_file in tqdm(sorted(os.listdir(chapter_path))):
                if not image_file.lower().endswith(".jpg"):
                    continue
                image_path = os.path.join(chapter_path, image_file)
                image = cv2.imread(image_path)
                # cv2.imread returns None (it does not raise) when a file
                # cannot be decoded; skip it instead of crashing in cv2.resize.
                if image is None:
                    print(f"Image illisible ignorée : {image_path}")
                    continue

                # Resize first, then convert to grayscale
                resized_image = cv2.resize(image, size)
                gray_image = cv2.cvtColor(resized_image, cv2.COLOR_BGR2GRAY)

                # Created lazily so chapters without usable images leave no empty folder
                os.makedirs(output_dir, exist_ok=True)
                cv2.imwrite(os.path.join(output_dir, image_file), gray_image)

# 2. Text extraction with OCR
def extract_text_with_ocr(input_folder, annotation_file=None, lang="eng"):
    """Run Tesseract OCR on every JPEG found under ``<manga>/<chapter>/``.

    ``input_folder`` is walked two levels deep (manga folders, then chapter
    folders). One record per ``.jpg`` file is written to a JSON list with the
    keys ``manga``, ``chapter``, ``image`` (path relative to ``input_folder``)
    and ``text``.

    Args:
        input_folder: Root folder containing one sub-folder per manga.
        annotation_file: Path of the JSON file to write; defaults to the
            module-level ``ANNOTATION_FILE``.
        lang: Tesseract language code passed to ``image_to_string``.
    """
    print("Extraction du texte via OCR...")
    if annotation_file is None:
        annotation_file = ANNOTATION_FILE
    annotations = []
    # Sorted at every level so the annotation order is reproducible.
    for manga_name in sorted(os.listdir(input_folder)):
        manga_path = os.path.join(input_folder, manga_name)
        if not os.path.isdir(manga_path):  # Ignore stray files next to the manga folders
            continue
        for chapter_name in sorted(os.listdir(manga_path)):
            chapter_path = os.path.join(manga_path, chapter_name)
            if not os.path.isdir(chapter_path):  # Ignore stray files next to the chapters
                continue

            # Keep the folder hierarchy in the annotation; identical for every
            # image of the chapter, so compute it once.
            relative_path = os.path.relpath(chapter_path, input_folder)

            for image_file in tqdm(sorted(os.listdir(chapter_path))):
                if not image_file.lower().endswith(".jpg"):
                    continue
                image_path = os.path.join(chapter_path, image_file)
                # The context manager closes the file handle after each image;
                # the original left every opened image to the garbage collector.
                with Image.open(image_path) as image:
                    text = pytesseract.image_to_string(image, lang=lang)
                annotations.append({
                    "manga": manga_name,
                    "chapter": chapter_name,
                    "image": os.path.join(relative_path, image_file),
                    "text": text
                })

    # Save the annotations to a JSON file
    with open(annotation_file, "w", encoding="utf-8") as f:
        json.dump(annotations, f, indent=4, ensure_ascii=False)

# 3. Full pipeline
def main():
    """Run the two stages in order: preprocessing, then OCR."""
    # Stage 1: preprocess the images found under the manga root
    preprocess_images(input_folder=MANGA_FOLDER, output_folder=PREPROCESSED_FOLDER)

    # Stage 2: OCR the preprocessed images
    extract_text_with_ocr(input_folder=PREPROCESSED_FOLDER)

    completion_message = f"Pipeline terminé. Les annotations sont sauvegardées dans {ANNOTATION_FILE}."
    print(completion_message)

# Run the pipeline when this cell (or the equivalent script) is executed directly
if __name__ == "__main__":
    main()


Prétraitement des images...


100%|██████████| 28/28 [00:00<00:00, 40.85it/s]
100%|██████████| 16/16 [00:00<00:00, 39.33it/s]
100%|██████████| 15/15 [00:00<00:00, 38.01it/s]
100%|██████████| 54/54 [00:00<00:00, 77.76it/s]
100%|██████████| 23/23 [00:00<00:00, 79.75it/s]
100%|██████████| 25/25 [00:00<00:00, 85.88it/s]
100%|██████████| 22/22 [00:00<00:00, 77.51it/s]
100%|██████████| 20/20 [00:00<00:00, 76.77it/s]
100%|██████████| 54/54 [00:00<00:00, 74.47it/s]
100%|██████████| 50/50 [00:00<00:00, 69.05it/s]
100%|██████████| 46/46 [00:00<00:00, 74.69it/s]
100%|██████████| 19/19 [00:00<00:00, 66.83it/s]
100%|██████████| 16/16 [00:00<00:00, 52.45it/s]


Extraction du texte via OCR...


100%|██████████| 28/28 [00:02<00:00, 10.42it/s]
100%|██████████| 16/16 [00:01<00:00, 10.74it/s]
100%|██████████| 15/15 [00:01<00:00, 10.43it/s]
100%|██████████| 54/54 [00:05<00:00, 10.55it/s]
100%|██████████| 23/23 [00:02<00:00, 10.41it/s]
100%|██████████| 25/25 [00:02<00:00, 10.19it/s]
100%|██████████| 22/22 [00:02<00:00,  9.79it/s]
100%|██████████| 20/20 [00:02<00:00,  9.93it/s]
100%|██████████| 54/54 [00:05<00:00, 10.09it/s]
100%|██████████| 50/50 [00:04<00:00, 10.41it/s]
100%|██████████| 46/46 [00:04<00:00, 10.47it/s]
100%|██████████| 19/19 [00:01<00:00, 10.30it/s]
100%|██████████| 16/16 [00:01<00:00, 10.48it/s]

Pipeline terminé. Les annotations sont sauvegardées dans annotations.json.





In [None]:
import os
from PIL import Image
from pytesseract import image_to_string

# Root folder walked by the loop below as <manga>/<chapter>/<image file>
IMAGE_FOLDER = "manga_dataset"
# Accumulates one record (manga, chapter, image path, text) per processed image
OUTPUT_DATA = []

def extract_text(image_path):
    """Return the text Tesseract reads from the image at ``image_path``.

    Best-effort: any failure (unreadable file, OCR error) is reported on
    stdout and an empty string is returned so a batch run can continue.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The OCR text, or ``""`` when extraction failed.
    """
    try:
        # The context manager closes the file handle once OCR is done; the
        # original left the opened image to the garbage collector.
        with Image.open(image_path) as image:
            return image_to_string(image, lang="eng")  # Change 'eng' if the text is in another language
    except Exception as e:
        print(f"Erreur lors de l'extraction de texte pour {image_path}: {e}")
        return ""

# Walk IMAGE_FOLDER as <manga>/<chapter>/<image> and OCR every image file
for manga in os.listdir(IMAGE_FOLDER):
    manga_path = os.path.join(IMAGE_FOLDER, manga)
    if not os.path.isdir(manga_path):
        continue
    for chapter in os.listdir(manga_path):
        chapter_path = os.path.join(manga_path, chapter)
        if not os.path.isdir(chapter_path):
            continue
        for image_file in os.listdir(chapter_path):
            image_path = os.path.join(chapter_path, image_file)
            # Only PNG/JPEG files are processed, whatever the case of the extension
            if not image_file.lower().endswith((".png", ".jpg", ".jpeg")):
                continue
            text = extract_text(image_path)
            OUTPUT_DATA.append({
                "manga": manga,
                "chapter": chapter,
                "image": os.path.relpath(image_path),
                "text": text.strip()
            })

# Export the collected records as JSON
import json
with open("output.json", "w", encoding="utf-8") as f:
    # Serialize to a string, then write it in one call
    f.write(json.dumps(OUTPUT_DATA, ensure_ascii=False, indent=4))
