In [None]:
import os
import shutil
import xml.etree.ElementTree as ET
from tqdm import tqdm

# ------------------------
# Source folders
# ------------------------
# NOTE: "../../IAM_blocchi" is the folder written by the IAM cropping cell further
# down in this notebook (its `output_root`), so that cell must have been run first.
iam_root = "../../IAM_blocchi"                 # IAM with numeric subfolders
rimes_xml_root = "../../Images_Courriers"    # RIMES XML
rimes_img_root = "../../images_blocs_de_texte"  # RIMES cropped images

# Destination folder of the merged dataset
merged_root = "../../merged_dataset"
os.makedirs(merged_root, exist_ok=True)

# ------------------------
# Copy IAM as-is
# ------------------------
print("Copio IAM...")
for author_id in tqdm(os.listdir(iam_root), desc="IAM"):
    src_dir = os.path.join(iam_root, author_id)

    # Only directories are treated as author folders; loose files are ignored.
    if os.path.isdir(src_dir):
        dst_dir = os.path.join(merged_root, author_id)
        os.makedirs(dst_dir, exist_ok=True)

        for fname in os.listdir(src_dir):
            dst_file = os.path.join(dst_dir, fname)
            # Never overwrite: a file already present in the destination is kept.
            if os.path.exists(dst_file):
                continue
            src_file = os.path.join(src_dir, fname)
            shutil.copy(src_file, dst_file)

# ------------------------
# Helper that extracts the writer label from a RIMES XML file
# ------------------------
def extract_writer_from_xml(xml_path):
    """Return the merged-dataset folder label for the writer of a RIMES XML file.

    The label is ``"rimes_"`` followed by the text of the ``writer`` element that
    is a direct child of the XML root, stripped and with spaces replaced by
    underscores.

    Args:
        xml_path: Path of the XML file to read.

    Returns:
        ``"rimes_<writer>"`` when a non-empty writer is found, otherwise
        ``"rimes_unknown"`` (file unreadable or malformed, tag missing, or tag
        empty / whitespace-only).
    """
    unknown = "rimes_unknown"

    try:
        root = ET.parse(xml_path).getroot()
    except Exception:
        # Deliberately best-effort: one unreadable or malformed XML must not
        # abort the whole merge, so it is filed under the "unknown" writer.
        return unknown

    # NOTE(review): the tag name "writer" is taken as-is from the original code;
    # confirm it against the actual RIMES XML schema.
    writer = root.findtext("writer")

    # findtext() returns None when the tag is missing and "" when it is present
    # but empty; both (and whitespace-only text) must map to the unknown label,
    # otherwise a bare "rimes_" folder would be created.
    if writer is None or not writer.strip():
        return unknown

    return "rimes_" + writer.strip().replace(" ", "_")

# ------------------------
# Copy RIMES
# ------------------------
print("Copio RIMES...")

# XML subfolder name -> subfolder name holding the matching cropped images
dvd_map = {f"DVD{n}_TIF": f"DVD{n}" for n in (1, 2, 3)}

for xml_subdir, img_subdir in dvd_map.items():
    xml_dir = os.path.join(rimes_xml_root, xml_subdir)
    img_dir = os.path.join(rimes_img_root, img_subdir)

    # A DVD is processed only when both its XML and its image folder exist.
    if not (os.path.exists(xml_dir) and os.path.exists(img_dir)):
        continue

    for xml_file in tqdm(os.listdir(xml_dir), desc=f"RIMES {xml_subdir}"):
        # Only the "*_L.xml" files are used; every other entry is skipped.
        if not xml_file.endswith("_L.xml"):
            continue

        writer_id = extract_writer_from_xml(os.path.join(xml_dir, xml_file))

        # The image is expected to share the XML base name, with a .jpg extension.
        img_file = f"{os.path.splitext(xml_file)[0]}.jpg"
        img_path = os.path.join(img_dir, img_file)

        if os.path.exists(img_path):
            # Create the writer folder and copy the image (never overwriting).
            out_dir = os.path.join(merged_root, writer_id)
            os.makedirs(out_dir, exist_ok=True)
            dst_file = os.path.join(out_dir, img_file)
            if not os.path.exists(dst_file):
                shutil.copy(img_path, dst_file)
        else:
            print(f"Nessuna immagine per {xml_file} in {img_dir}")

print("Merge IAM + RIMES completato in:", merged_root)


In [None]:
import os
import shutil
import cv2
from tqdm import tqdm

# ------------------------
# Paths
# ------------------------
iam_root = "../../IAMdataset"           # IAM with numeric subfolders
output_root = "../../IAM_blocchi"      # Destination folder
os.makedirs(output_root, exist_ok=True)

# ------------------------
# Crop parameters
# ------------------------
crop_top = 650      # number of pixel rows removed from the top
crop_bottom = 800   # number of pixel rows removed from the bottom

# (source path, reason) for every image that could not be processed; reported
# once at the end so the messages do not interleave with the progress bar.
skipped = []

# ------------------------
# Loop over the author subfolders
# ------------------------
for author_id in tqdm(os.listdir(iam_root), desc="IAM Autori"):
    author_dir = os.path.join(iam_root, author_id)
    if not os.path.isdir(author_dir):
        continue

    out_author_dir = os.path.join(output_root, author_id)
    os.makedirs(out_author_dir, exist_ok=True)

    for fname in os.listdir(author_dir):
        if not fname.lower().endswith((".png", ".jpg", ".tif")):
            continue

        # Keep the original image file name
        src_file = os.path.join(author_dir, fname)
        dst_file = os.path.join(out_author_dir, fname)

        # Read the image; cv2.imread returns None when it cannot be decoded
        img = cv2.imread(src_file)
        if img is None:
            skipped.append((src_file, "immagine non leggibile"))
            continue

        # Original height
        h = img.shape[0]

        # An image not taller than the two margins combined would yield an
        # empty slice, which cannot be saved as an image: skip it instead.
        if h <= crop_top + crop_bottom:
            skipped.append((src_file, f"altezza {h}px insufficiente per il ritaglio"))
            continue

        # Vertical crop
        cropped = img[crop_top:h - crop_bottom, :]

        # Save into the same folder structure; imwrite reports failure via
        # its boolean return value rather than by raising.
        if not cv2.imwrite(dst_file, cropped):
            skipped.append((src_file, "scrittura fallita"))

for skipped_path, skipped_reason in skipped:
    print(f"Saltata {skipped_path}: {skipped_reason}")

print("Ritaglio IAM completato mantenendo struttura cartelle!")
