In [1]:
import os
import xml.etree.ElementTree as ET
from PIL import Image
from tqdm import tqdm

# 1. Configuration & paths ---
# Root folder of the annotated source dataset; the TRAIN and TEST sub-folders
# are each passed to process_xml_folder as a source directory.
BASE_DIR = "/mnt/c/Users/sandr/Desktop/Studium/thesis/praxis/data/plantdoc"
TRAIN_DIR = os.path.join(BASE_DIR, "TRAIN") 
TEST_DIR = os.path.join(BASE_DIR, "TEST")

# Destination for the cropped images. Both splits are written into this one
# folder, one sub-folder per target class.
OUTPUT_DIR = "/mnt/c/Users/sandr/Desktop/Studium/thesis/praxis/data/plantdoc_cropped"

# 2. Class mapping (PlantDoc -> PlantVillage) ---
# Keys are the annotation class names as they appear in the XML <name> tags;
# values are the names of the output sub-folders the crops are saved into.
# Objects whose class name is not a key here are skipped by the processing
# function.
# NOTE(review): the values presumably have to match the class folder names of
# the PlantVillage copy used elsewhere in the project (some distributions use
# names such as 'Corn_(maize)___...' or 'Cherry_(including_sour)___...') --
# confirm against that dataset.
CLASS_MAPPING = {
    # Apple
    'Apple Scab Leaf': 'Apple___Apple_scab',
    'Apple leaf': 'Apple___healthy',
    'Apple rust leaf': 'Apple___Cedar_apple_rust',
    
    # Bell Pepper
    'Bell_pepper leaf': 'Pepper,_bell___healthy',
    'Bell_pepper leaf spot': 'Pepper,_bell___Bacterial_spot',
    
    # Blueberry
    'Blueberry leaf': 'Blueberry___healthy',
    
    # Cherry
    'Cherry leaf': 'Cherry___healthy',
    
    # Corn
    'Corn Gray leaf spot': 'Corn___Cercospora_leaf_spot Gray_leaf_spot',
    'Corn leaf blight': 'Corn___Northern_Leaf_Blight',
    'Corn rust leaf': 'Corn___Common_rust',
    
    # Grape
    'grape leaf': 'Grape___healthy',
    'grape leaf black rot': 'Grape___Black_rot',
    
    # Peach
    'Peach leaf': 'Peach___healthy',
    
    # Potato
    'Potato leaf early blight': 'Potato___Early_blight',
    'Potato leaf late blight': 'Potato___Late_blight',
    
    # Raspberry
    'Raspberry leaf': 'Raspberry___healthy',
    
    # Soybean (spelled 'Soyabean' in the source annotations' class name)
    'Soyabean leaf': 'Soybean___healthy',
    
    # Squash
    'Squash Powdery mildew leaf': 'Squash___Powdery_mildew',
    
    # Strawberry
    'Strawberry leaf': 'Strawberry___healthy',
    
    # Tomato
    'Tomato Early blight leaf': 'Tomato___Early_blight',
    'Tomato Septoria leaf spot': 'Tomato___Septoria_leaf_spot',
    'Tomato leaf': 'Tomato___healthy',
    'Tomato leaf bacterial spot': 'Tomato___Bacterial_spot',
    'Tomato leaf late blight': 'Tomato___Late_blight',
    'Tomato leaf mosaic virus': 'Tomato___Tomato_mosaic_virus',
    'Tomato leaf yellow virus': 'Tomato___Tomato_Yellow_Leaf_Curl_Virus',
    'Tomato mold leaf': 'Tomato___Leaf_Mold',
}

# 3. Setup
# Create the output root up front; exist_ok=True makes re-running harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 4. Die Verarbeitungs-Funktion
# Extensions tried, in this order, when looking for the image that belongs to
# an annotation file. The first existing candidate wins.
_IMAGE_EXTENSIONS = ('.jpg', '.JPG', '.png', '.PNG', '.jpeg', '.JPEG')


def _find_image(source_dir, basename):
    """Return ``(path, filename)`` of the image for *basename*, or ``None``."""
    for ext in _IMAGE_EXTENSIONS:
        candidate = basename + ext
        path = os.path.join(source_dir, candidate)
        if os.path.exists(path):
            return path, candidate
    return None


def _read_box(obj, w_img, h_img):
    """Return the object's box clamped to the image, or ``None`` if unusable.

    ``None`` is returned when ``<bndbox>`` or one of its coordinates is
    missing or not numeric, or when the clamped box has no area.
    """
    bndbox = obj.find('bndbox')
    if bndbox is None:
        return None
    try:
        # int(float(...)) accepts coordinates written as "12.0" and truncates.
        xmin, ymin, xmax, ymax = (
            int(float(bndbox.findtext(tag)))
            for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        )
    except (TypeError, ValueError, OverflowError):
        # TypeError: tag missing (findtext -> None); ValueError: empty or
        # non-numeric text, or NaN; OverflowError: infinite coordinate.
        return None

    # Clamp to the image so the crop never reaches outside the pixel data.
    xmin = max(0, xmin)
    ymin = max(0, ymin)
    xmax = min(w_img, xmax)
    ymax = min(h_img, ymax)

    if (xmax - xmin) <= 0 or (ymax - ymin) <= 0:
        return None
    return xmin, ymin, xmax, ymax


def process_xml_folder(source_dir, output_dir, dataset_type="Dataset"):
    """Crop every annotated object of a folder into per-class sub-folders.

    For each ``*.xml`` file in *source_dir* the image with the same base name
    is opened, and every ``<object>`` whose ``<name>`` is a key of
    ``CLASS_MAPPING`` is cropped to its ``<bndbox>`` and saved as a JPEG to
    ``output_dir/<mapped class>/<image name>_crop_<n>.jpg``.

    Args:
        source_dir: Folder containing the XML annotations and their images.
        output_dir: Root folder that receives one sub-folder per mapped class.
        dataset_type: Label used only in progress and report messages.

    Returns:
        None. A summary is printed at the end. If *source_dir* does not exist
        an error message is printed and nothing else happens.

    Note:
        The crop numbering is local to one call. Two calls that write to the
        same *output_dir* overwrite each other's crops if they contain images
        with the same (sanitised) name and class.
    """
    if not os.path.exists(source_dir):
        print(f"FEHLER: Quellordner nicht gefunden: {source_dir}")
        return

    # Sorted so that runs are reproducible regardless of directory order.
    xml_files = sorted(
        f for f in os.listdir(source_dir) if f.lower().endswith('.xml')
    )

    print(f"Starte Verarbeitung von {dataset_type}: {len(xml_files)} XML-Dateien gefunden.")

    # Statistics for the final report.
    success_count = 0
    skip_mapping_count = 0
    skip_missing_img_count = 0
    skip_invalid_box_count = 0
    error_count = 0

    # Crops written so far per *sanitised* image name. Keying by the sanitised
    # name (the one used in the output filename) keeps filenames unique even
    # when two different source names sanitise to the same string.
    crop_counter = {}

    for xml_file in tqdm(xml_files, desc=f"Verarbeite {dataset_type}"):
        xml_path = os.path.join(source_dir, xml_file)

        try:
            root = ET.parse(xml_path).getroot()

            # The image is located via the XML file's base name.
            found = _find_image(source_dir, os.path.splitext(xml_file)[0])
            if found is None:
                skip_missing_img_count += 1
                continue
            src_img_path, image_name = found

            # Sanitised stem used in the output filenames.
            name_part = os.path.splitext(image_name)[0]
            name_part = name_part.replace(" ", "_").replace("+", "")

            with Image.open(src_img_path) as raw_img:
                # RGB so that every crop can be written as JPEG.
                img = raw_img.convert('RGB')
                w_img, h_img = img.size

                for obj in root.findall('object'):
                    # A missing/empty <name> becomes '' and is treated as an
                    # unmapped class instead of aborting the whole file.
                    classname = (obj.findtext('name') or '').strip()
                    if classname not in CLASS_MAPPING:
                        skip_mapping_count += 1
                        continue

                    box = _read_box(obj, w_img, h_img)
                    if box is None:
                        skip_invalid_box_count += 1
                        continue

                    class_dir = os.path.join(output_dir, CLASS_MAPPING[classname])
                    os.makedirs(class_dir, exist_ok=True)

                    crop_counter[name_part] = crop_counter.get(name_part, 0) + 1
                    new_filename = f"{name_part}_crop_{crop_counter[name_part]}.jpg"

                    img.crop(box).save(os.path.join(class_dir, new_filename), quality=95)
                    success_count += 1

        except Exception as e:
            # Per-file boundary of a best-effort batch job: report, count and
            # carry on with the next annotation file.
            error_count += 1
            print(f"Kritischer Fehler bei {xml_file}: {e}")

    # Summary
    print(f"--- Abschlussbericht {dataset_type} ---")
    print(f"Crops erfolgreich erstellt: {success_count}")
    print(f"Objekte übersprungen (Mapping fehlt): {skip_mapping_count}")
    print(f"Dateien übersprungen (Bild fehlt): {skip_missing_img_count}")
    print(f"Boxen übersprungen (Ungültige Größe): {skip_invalid_box_count}")
    print(f"Dateien mit Fehler abgebrochen: {error_count}")
    print("-" * 30)

# 5. Run the extraction for both dataset splits; each call prints its own
# report and writes into the shared output folder.
for split_dir, split_label in ((TRAIN_DIR, "TRAIN"), (TEST_DIR, "TEST")):
    process_xml_folder(split_dir, OUTPUT_DIR, dataset_type=split_label)

Starte Verarbeitung von TRAIN: 2346 XML-Dateien gefunden.


Verarbeite TRAIN: 100%|█████████████████████████████████████████████████████████████| 2346/2346 [05:13<00:00,  7.48it/s]


--- Abschlussbericht TRAIN ---
Crops erfolgreich erstellt: 8425
Objekte übersprungen (Mapping fehlt): 13
Dateien übersprungen (Bild fehlt): 2
Boxen übersprungen (Ungültige Größe): 2
------------------------------
Starte Verarbeitung von TEST: 237 XML-Dateien gefunden.


Verarbeite TEST: 100%|████████████████████████████████████████████████████████████████| 237/237 [00:22<00:00, 10.74it/s]

--- Abschlussbericht TEST ---
Crops erfolgreich erstellt: 452
Objekte übersprungen (Mapping fehlt): 0
Dateien übersprungen (Bild fehlt): 0
Boxen übersprungen (Ungültige Größe): 0
------------------------------



