In [None]:
import os
import shutil
import xml.etree.ElementTree as ET
from tqdm import tqdm

# ------------------------
# Source paths
# ------------------------
iam_root = "../../IAMdataset"          # IAM folder with numeric sub-folders
rimes_root = "../../Images_Courriers"  # RIMES folder containing DVD*_TIF

# Destination path
merged_root = "../../merged_dataset"
os.makedirs(merged_root, exist_ok=True)

# ------------------------
# 1. Copy IAM as-is
# ------------------------
print("📂 Copio IAM...")
for writer_name in tqdm(os.listdir(iam_root), desc="IAM"):
    writer_src = os.path.join(iam_root, writer_name)
    if os.path.isdir(writer_src):
        # Mirror the writer folder under the merged root.
        writer_dst = os.path.join(merged_root, writer_name)
        os.makedirs(writer_dst, exist_ok=True)

        for entry in os.listdir(writer_src):
            target = os.path.join(writer_dst, entry)
            if os.path.exists(target):
                continue  # already present in the destination: leave it alone
            shutil.copy(os.path.join(writer_src, entry), target)

# ------------------------
# 2. Funzione per autore da XML RIMES
# ------------------------
def extract_writer_from_xml(xml_path):
    """Return the merged-dataset folder name for the writer of a RIMES XML file.

    The text of the ``writer`` element that is a direct child of the XML root
    is trimmed, its spaces are replaced by underscores and the result is
    prefixed with ``rimes_``.

    Args:
        xml_path: Path of the XML file to parse.

    Returns:
        ``"rimes_<writer>"``, or ``"rimes_unknown"`` when the file cannot be
        parsed, has no ``writer`` child, or that element is empty or contains
        only whitespace.
    """
    try:
        root = ET.parse(xml_path).getroot()
    except Exception as e:
        # Best effort: one unreadable XML must not abort the whole merge.
        print(f"⚠️ Errore parsing {xml_path}: {e}")
        return "rimes_unknown"

    # The tag is "writer" (not "IdWriter"). findtext() returns None when the
    # element is missing and "" when it has no text; treat both the same way.
    writer = (root.findtext("writer") or "").strip()
    if not writer:
        # An empty name would otherwise yield a bare "rimes_" folder.
        return "rimes_unknown"
    return "rimes_" + writer.replace(" ", "_")

# ------------------------
# 3. Copy RIMES in an IAM-like layout (one folder per writer)
# ------------------------
print("📂 Copio RIMES...")
rimes_subdirs = ["DVD1_TIF", "DVD2_TIF", "DVD3_TIF"]

for dvd_name in rimes_subdirs:
    dvd_dir = os.path.join(rimes_root, dvd_name)
    if os.path.exists(dvd_dir):
        for entry in tqdm(os.listdir(dvd_dir), desc=f"RIMES {dvd_name}"):
            if entry.lower().endswith(".tif"):
                # The XML is looked up next to the TIF, with the same stem.
                sidecar_xml = os.path.join(dvd_dir, entry[:-4] + ".xml")
                writer_id = (
                    extract_writer_from_xml(sidecar_xml)
                    if os.path.exists(sidecar_xml)
                    else "rimes_unknown"
                )

                writer_dir = os.path.join(merged_root, writer_id)
                os.makedirs(writer_dir, exist_ok=True)

                target = os.path.join(writer_dir, entry)
                if not os.path.exists(target):
                    shutil.copy(os.path.join(dvd_dir, entry), target)

print("✅ Merge completato! Cartella finale:", merged_root)


📂 Copio IAM...


IAM: 100%|██████████| 657/657 [00:25<00:00, 26.12it/s]


📂 Copio RIMES...


RIMES DVD1_TIF: 100%|██████████| 8378/8378 [03:39<00:00, 38.15it/s]
RIMES DVD2_TIF: 100%|██████████| 8349/8349 [03:57<00:00, 35.20it/s]
RIMES DVD3_TIF: 100%|██████████| 8494/8494 [04:25<00:00, 31.97it/s]

✅ Merge completato! Cartella finale: ../../merged_dataset





In [2]:
import os
import shutil
from tqdm import tqdm

# ----------------------------
# Dataset paths
# ----------------------------
iam_root = "data"                     # already organised per writer
rimes_root = "../../RIMES_cropped"    # created by the previous script
# NOTE(review): the other cells of this notebook use "../../merged_dataset";
# confirm that "../merged_dataset" is really the intended destination.
merged_root = "../merged_dataset"     # final dataset


def _copy_writer_folders(src_root, dst_root, prefix=""):
    """Copy every sub-folder of ``src_root`` to ``dst_root/<prefix><name>``.

    Non-directory entries are ignored. Returns the number of folders copied.
    """
    copied = 0
    for writer in tqdm(os.listdir(src_root)):
        src = os.path.join(src_root, writer)
        if os.path.isdir(src):
            shutil.copytree(src, os.path.join(dst_root, prefix + writer))
            copied += 1
    return copied


# Validate the sources BEFORE wiping the destination: with a wrong path the
# previous merge would otherwise be deleted and the cell would then fail.
for source_root in (iam_root, rimes_root):
    if not os.path.isdir(source_root):
        raise FileNotFoundError(f"Cartella sorgente non trovata: {source_root}")

# Recreate the merged folder from scratch
if os.path.exists(merged_root):
    shutil.rmtree(merged_root)
os.makedirs(merged_root, exist_ok=True)

# ----------------------------
# Copy IAM
# ----------------------------
print("📥 Copio IAM...")
# The "iam_" prefix keeps IAM writers distinguishable in the merged folder.
if _copy_writer_folders(iam_root, merged_root, prefix="iam_") == 0:
    print(f"⚠️ Nessuna cartella writer trovata in {iam_root}")

# ----------------------------
# Copy RIMES
# ----------------------------
print("📥 Copio RIMES (crop)...")
# No prefix here: these folders are expected to carry the rimes_ prefix already.
if _copy_writer_folders(rimes_root, merged_root) == 0:
    # An empty source used to pass unnoticed (progress bar showing "0it").
    print(f"⚠️ Nessuna cartella writer trovata in {rimes_root}")

print("✅ Merge completato!")
print("Cartella finale:", merged_root)


📥 Copio IAM...


100%|██████████| 657/657 [00:34<00:00, 19.20it/s]


📥 Copio RIMES (crop)...


0it [00:00, ?it/s]

✅ Merge completato!
Cartella finale: ../merged_dataset





In [22]:
import os
import shutil
import xml.etree.ElementTree as ET
from tqdm import tqdm

# ------------------------
# Source folders
# ------------------------
iam_root = "../../IAM_blocchi"                  # IAM with numeric sub-folders
rimes_xml_root = "../../Images_Courriers"       # RIMES XML
rimes_img_root = "../../images_blocs_de_texte"  # RIMES cropped images

# Destination folder of the merge
merged_root = "../../merged_dataset"
os.makedirs(merged_root, exist_ok=True)

# ------------------------
# 1️⃣ Copy IAM as-is
# ------------------------
print("📂 Copio IAM...")
for writer_name in tqdm(os.listdir(iam_root), desc="IAM"):
    writer_src = os.path.join(iam_root, writer_name)
    if os.path.isdir(writer_src):
        # Same folder name on the destination side.
        writer_dst = os.path.join(merged_root, writer_name)
        os.makedirs(writer_dst, exist_ok=True)

        for entry in os.listdir(writer_src):
            target = os.path.join(writer_dst, entry)
            if os.path.exists(target):
                continue  # never overwrite a file that is already there
            shutil.copy(os.path.join(writer_src, entry), target)

# ------------------------
# 2️⃣ Funzione per estrarre writer da XML RIMES
# ------------------------
def extract_writer_from_xml(xml_path):
    """Return the merged-dataset folder name for the writer of a RIMES XML file.

    The text of the ``writer`` element that is a direct child of the XML root
    is trimmed, its spaces are replaced by underscores and the result is
    prefixed with ``rimes_``.

    Args:
        xml_path: Path of the XML file to parse.

    Returns:
        ``"rimes_<writer>"``, or ``"rimes_unknown"`` when the file cannot be
        parsed, has no ``writer`` child, or that element is empty or contains
        only whitespace.
    """
    try:
        root = ET.parse(xml_path).getroot()
    except Exception as e:
        # Best effort, but not silent: report which file was unreadable so
        # that images landing in "rimes_unknown" can be traced back.
        print(f"⚠️ Errore parsing {xml_path}: {e}")
        return "rimes_unknown"

    # findtext() returns None when the element is missing and "" when it has
    # no text; both cases map to the "unknown" writer.
    writer = (root.findtext("writer") or "").strip()
    if not writer:
        # An empty name would otherwise yield a bare "rimes_" folder.
        return "rimes_unknown"
    return "rimes_" + writer.replace(" ", "_")

# ------------------------
# 3️⃣ Copy RIMES
# ------------------------
print("📂 Copio RIMES...")

# XML sub-folder (under rimes_xml_root) -> image sub-folder (under rimes_img_root)
dvd_map = {
    "DVD1_TIF": "DVD1",
    "DVD2_TIF": "DVD2",
    "DVD3_TIF": "DVD3",
}

for xml_subdir, img_subdir in dvd_map.items():
    xml_dir = os.path.join(rimes_xml_root, xml_subdir)
    img_dir = os.path.join(rimes_img_root, img_subdir)

    # Report a missing folder instead of skipping the whole DVD silently.
    missing_dirs = [d for d in (xml_dir, img_dir) if not os.path.isdir(d)]
    if missing_dirs:
        print(f"⚠️ Cartella mancante, salto {xml_subdir}: {', '.join(missing_dirs)}")
        continue

    # Warnings are collected and printed once the progress bar is finished:
    # printing from inside the tqdm loop breaks the bar into fragments.
    warnings_for_dvd = []

    for xml_file in tqdm(os.listdir(xml_dir), desc=f"RIMES {xml_subdir}"):
        if not xml_file.endswith("_L.xml"):  # skip XML files that are not "_L"
            continue

        base_name = os.path.splitext(xml_file)[0]
        xml_path = os.path.join(xml_dir, xml_file)

        writer_id = extract_writer_from_xml(xml_path)

        # The image is looked up under the same stem as the XML file.
        img_file = base_name + ".jpg"
        img_path = os.path.join(img_dir, img_file)
        if not os.path.exists(img_path):
            warnings_for_dvd.append(f"⚠️ Nessuna immagine per {xml_file} in {img_dir}")
            continue

        # Create the writer folder and copy the image (never overwrite)
        out_dir = os.path.join(merged_root, writer_id)
        os.makedirs(out_dir, exist_ok=True)
        dst_file = os.path.join(out_dir, img_file)
        if not os.path.exists(dst_file):
            shutil.copy(img_path, dst_file)

    for message in warnings_for_dvd:
        print(message)

print("✅ Merge IAM + RIMES completato in:", merged_root)


📂 Copio IAM...


IAM: 100%|██████████| 657/657 [00:39<00:00, 16.75it/s]


📂 Copio RIMES...


RIMES DVD1_TIF:  26%|██▋       | 2218/8378 [00:03<00:05, 1082.86it/s]

⚠️ Nessuna immagine per 00430_L.xml in ../../images_blocs_de_texte\DVD1


RIMES DVD1_TIF:  53%|█████▎    | 4428/8378 [00:05<00:03, 1028.18it/s]

⚠️ Nessuna immagine per 00885_L.xml in ../../images_blocs_de_texte\DVD1


RIMES DVD1_TIF:  81%|████████▏ | 6816/8378 [00:08<00:01, 1070.27it/s]

⚠️ Nessuna immagine per 01405_L.xml in ../../images_blocs_de_texte\DVD1


RIMES DVD1_TIF: 100%|██████████| 8378/8378 [00:12<00:00, 669.39it/s] 
RIMES DVD2_TIF:  85%|████████▍ | 7072/8349 [00:08<00:01, 1037.21it/s]

⚠️ Nessuna immagine per 03368_L.xml in ../../images_blocs_de_texte\DVD2


RIMES DVD2_TIF: 100%|██████████| 8349/8349 [00:10<00:00, 796.51it/s] 
RIMES DVD3_TIF:   3%|▎         | 252/8494 [00:00<00:09, 831.22it/s]

⚠️ Nessuna immagine per 03726_L.xml in ../../images_blocs_de_texte\DVD3


RIMES DVD3_TIF: 100%|██████████| 8494/8494 [00:12<00:00, 682.05it/s]

✅ Merge IAM + RIMES completato in: ../../merged_dataset





In [23]:
import os
from collections import Counter
from PIL import Image
from tqdm import tqdm

merged_root = "../../merged_dataset"

# Every sub-folder of the merged dataset is counted as one writer.
writers = [d for d in os.listdir(merged_root) if os.path.isdir(os.path.join(merged_root, d))]

num_writers = len(writers)
print(f"✍️ Numero writer totali: {num_writers}")

images_per_writer = Counter()  # writer folder -> number of .jpg/.png files
image_sizes = Counter()        # (width, height) -> occurrences in the sample

total_images = 0
for writer in tqdm(writers, desc="Analisi dataset"):
    writer_dir = os.path.join(merged_root, writer)
    imgs = [f for f in os.listdir(writer_dir) if f.lower().endswith((".jpg", ".png"))]
    images_per_writer[writer] = len(imgs)
    total_images += len(imgs)

    # Optional: sizes are checked only on the first 10 files per writer (for
    # speed), so the size counts below describe a sample, not the full dataset.
    for f in imgs[:10]:
        try:
            # The context manager closes the file even if reading .size fails.
            with Image.open(os.path.join(writer_dir, f)) as img:
                image_sizes[img.size] += 1
        except Exception as e:
            print(f"Errore con {f}: {e}")

print(f"🖼️ Immagini totali: {total_images}")
if num_writers:
    print(f"📊 Min immagini per writer: {min(images_per_writer.values())}")
    print(f"📊 Max immagini per writer: {max(images_per_writer.values())}")
    print(f"📊 Media immagini per writer: {total_images / num_writers:.2f}")
else:
    # min()/max() of an empty sequence and the division by zero would raise.
    print("⚠️ Nessun writer trovato: statistiche per writer non disponibili")

# Show the 10 most frequent sizes
print("\n🔝 Top 10 dimensioni immagini:")
for (w, h), c in image_sizes.most_common(10):
    print(f"- {w}x{h}: {c} immagini")


✍️ Numero writer totali: 2067


Analisi dataset: 100%|██████████| 2067/2067 [01:19<00:00, 26.09it/s]

🖼️ Immagini totali: 7139
📊 Min immagini per writer: 1
📊 Max immagini per writer: 59
📊 Media immagini per writer: 3.45

🔝 Top 10 dimensioni immagini:
- 2479x2092: 1488 immagini
- 2171x1130: 3 immagini
- 2471x2081: 2 immagini
- 1935x1208: 2 immagini
- 2427x1366: 2 immagini
- 2236x1234: 2 immagini
- 2163x1135: 2 immagini
- 2274x1801: 2 immagini
- 2301x1489: 2 immagini
- 2270x836: 2 immagini





In [21]:
import os
import shutil
import cv2
from tqdm import tqdm

# ------------------------
# Paths
# ------------------------
iam_root = "../../IAMdataset"           # IAM with numeric sub-folders
output_root = "../../IAM_blocchi"      # destination folder
os.makedirs(output_root, exist_ok=True)

# ------------------------
# Crop parameters
# ------------------------
crop_top = 650      # number of pixel rows removed from the top
crop_bottom = 800   # number of pixel rows removed from the bottom

# (path, reason) of every image that could not be processed; reported at the
# end so the messages do not break up the progress bar.
skipped = []

# ------------------------
# Loop over the author sub-folders
# ------------------------
for author_id in tqdm(os.listdir(iam_root), desc="IAM Autori"):
    author_dir = os.path.join(iam_root, author_id)
    if not os.path.isdir(author_dir):
        continue

    out_author_dir = os.path.join(output_root, author_id)
    os.makedirs(out_author_dir, exist_ok=True)

    for fname in os.listdir(author_dir):
        if not fname.lower().endswith((".png", ".jpg", ".tif")):
            continue

        # Keep the original image file name
        src_file = os.path.join(author_dir, fname)
        dst_file = os.path.join(out_author_dir, fname)

        # Read the image; cv2.imread returns None when it cannot decode the file
        img = cv2.imread(src_file)
        if img is None:
            skipped.append((src_file, "immagine non leggibile"))
            continue

        # Original height
        h = img.shape[0]

        # Row where the crop ends. When the image is too short this value is
        # <= crop_top, or even negative; a negative slice end would wrap around
        # and silently produce a wrong crop, so such images are skipped.
        crop_end = h - crop_bottom
        if crop_end <= crop_top:
            skipped.append((src_file, f"altezza insufficiente ({h}px)"))
            continue

        # Vertical crop
        cropped = img[crop_top:crop_end, :]

        # Save under the same folder structure; imwrite returns False on failure
        if not cv2.imwrite(dst_file, cropped):
            skipped.append((src_file, "salvataggio fallito"))

if skipped:
    print(f"⚠️ Immagini saltate: {len(skipped)}")
    for path, reason in skipped:
        print(f"  - {path}: {reason}")

print("✅ Ritaglio IAM completato mantenendo struttura cartelle!")


IAM Autori: 100%|██████████| 657/657 [08:11<00:00,  1.34it/s]

✅ Ritaglio IAM completato mantenendo struttura cartelle!



