In [5]:
import os
import shutil
import random
import cv2

# Download the dataset from https://www.kaggle.com/datasets/alexattia/the-simpsons-characters-dataset
# and make sure it is laid out in the folder structure this script expects
# (see BASE_PATH and ANNOTATION_FILE below).

# Input locations and output root, relative to the working directory.
BASE_PATH = "homer/simpsons_dataset"
ANNOTATION_FILE = "homer/annotation.txt"
OUTPUT_PATH = "homer/project"

# Character name -> class id written to the label files.
# Homer is class 0; every other listed character shares class 1.
CHARACTERS = {
    "homer_simpson": 0,
    **dict.fromkeys(
        (
            "principal_skinner",
            "marge_simpson",
            "lisa_simpson",
            "bart_simpson",
            "chief_wiggum",
        ),
        1,
    ),
}

# Number of annotations sampled for Homer and for each other character.
HOMER_TARGET = 500
NON_HOMER_PER_CLASS = 100
# Fraction of the selected annotations that goes to the training split.
TRAIN_RATIO = 0.8

# Build <OUTPUT_PATH>/{images,labels}/{train,val}; existing folders are reused.
for split in ["train", "val"]:
    for kind in ("images", "labels"):
        os.makedirs(os.path.join(OUTPUT_PATH, kind, split), exist_ok=True)
print("Directory structure created.")

# Load annotations, grouped by character.
# Each usable row has at least six comma-separated fields:
#   <image path>,<x1>,<y1>,<x2>,<y2>,<character name>
annotations_by_character = {char: [] for char in CHARACTERS}
malformed_rows = 0

with open(ANNOTATION_FILE, "r") as f:
    for line in f:
        # Strip each field so stray spaces around commas don't break the
        # character-name lookup below.
        parts = [field.strip() for field in line.strip().split(",")]
        if len(parts) < 6:
            continue
        img_path = parts[0]  # Keep original path
        try:
            x1, y1, x2, y2 = map(int, parts[1:5])
        except ValueError:
            # A header or corrupt row: skip it, as short rows are skipped,
            # instead of aborting the whole load.
            malformed_rows += 1
            continue
        cls_name = parts[5]

        # Only keep annotations for characters we care about
        if cls_name in CHARACTERS:
            # Store: (image_path, x1, y1, x2, y2)
            annotations_by_character[cls_name].append((img_path, x1, y1, x2, y2))

print("Annotations loaded:")
for char, annots in annotations_by_character.items():
    print(f"  {char}: {len(annots)} annotations")
if malformed_rows:
    print(f"  (skipped {malformed_rows} rows with non-integer coordinates)")

# Helper: YOLO conversion
def to_yolo(x1, y1, x2, y2, w_img, h_img):
    """Convert a corner-format pixel box to normalized YOLO format.

    Corners may be given in either order; the box is clipped to the image
    so every returned value lies in [0, 1]. Boxes already inside the image
    with ordered corners give the same result as the plain conversion.

    Args:
        x1, y1, x2, y2: Two opposite box corners, in pixels.
        w_img, h_img: Image width and height, in pixels.

    Returns:
        Tuple ``(cx, cy, w, h)``: box centre and box size, each divided by
        the matching image dimension.

    Raises:
        ValueError: If ``w_img`` or ``h_img`` is not positive.
    """
    if w_img <= 0 or h_img <= 0:
        raise ValueError(f"Invalid image size: {w_img}x{h_img}")

    # Order the corners so width/height can never come out negative.
    left, right = sorted((x1, x2))
    top, bottom = sorted((y1, y2))

    # Clip to the image; a box poking past an edge would otherwise produce
    # coordinates outside the normalized range.
    left, right = min(max(left, 0), w_img), min(max(right, 0), w_img)
    top, bottom = min(max(top, 0), h_img), min(max(bottom, 0), h_img)

    w = right - left
    h = bottom - top
    cx = left + w / 2
    cy = top + h / 2
    return cx / w_img, cy / h_img, w / w_img, h / h_img

# Sample a fixed number of annotations per character.
# NOTE: no random seed is set in this cell, so each run may pick a different subset.
selected_annotations = []

# Every loaded character except Homer, derived once so the sampling loop and
# the report below cannot drift apart.
non_homer_chars = [char for char in annotations_by_character if char != "homer_simpson"]

# Homer annotations
homer_annots = annotations_by_character["homer_simpson"]
if len(homer_annots) < HOMER_TARGET:
    raise ValueError(f"Not enough Homer annotations. Found {len(homer_annots)}, need {HOMER_TARGET}")
selected_homer = random.sample(homer_annots, HOMER_TARGET)
selected_annotations.extend(("homer_simpson", ann) for ann in selected_homer)

# Non-Homer annotations
for char in non_homer_chars:
    char_annots = annotations_by_character[char]
    if len(char_annots) < NON_HOMER_PER_CLASS:
        raise ValueError(f"Not enough {char} annotations. Found {len(char_annots)}, need {NON_HOMER_PER_CLASS}")
    selected = random.sample(char_annots, NON_HOMER_PER_CLASS)
    selected_annotations.extend((char, ann) for ann in selected)

# Shuffle so the later positional train/val cut mixes all characters
random.shuffle(selected_annotations)

print(f"\nSelected {len(selected_annotations)} total annotations")
print(f"  Homer: {len(selected_homer)}")
for char in non_homer_chars:
    count = sum(1 for c, _ in selected_annotations if c == char)
    print(f"  {char}: {count}")

# Verify that every selected image exists before anything is copied.
missing_paths = []

for _, (img_path, *_box) in selected_annotations:
    # Convert annotation path to actual file path
    actual_path = img_path.replace("./characters/", BASE_PATH + "/")

    if not os.path.exists(actual_path):
        print(f"Image not found: {actual_path}")
        missing_paths.append(actual_path)

verification_passed = not missing_paths
if verification_passed:
    print("\nAll annotations verified successfully!")
else:
    print("Verification failed. Check errors above.")
    # Stop here: continuing would fail part-way through copying and leave a
    # half-written dataset behind.
    raise FileNotFoundError(
        f"{len(missing_paths)} selected image(s) not found under {BASE_PATH}"
    )

# Split into train/val: the leading TRAIN_RATIO share of the selected
# annotations becomes the training set, the remainder the validation set.
split_idx = int(TRAIN_RATIO * len(selected_annotations))
train_annotations, val_annotations = (
    selected_annotations[:split_idx],
    selected_annotations[split_idx:],
)

# Group annotations by image for processing
def process_split(annotation_list, split_name):
    """Copy the images of one split and write their YOLO label files.

    Annotations are grouped by image path, so each image is copied once and
    gets a single label file with one line per box. Copied images are
    renamed ``<character>_<NNNN><ext>``, numbered per character within this
    call, using the character of the image's first annotation.

    Args:
        annotation_list: Iterable of ``(char_name, (img_path, x1, y1, x2, y2))``.
        split_name: Sub-folder name under ``images/`` and ``labels/`` in
            ``OUTPUT_PATH`` (the script passes "train" and "val").

    Returns:
        Number of images written for this split. Images that cannot be
        read are reported and skipped.
    """
    # Group by image path
    images_dict = {}
    for char_name, (img_path, x1, y1, x2, y2) in annotation_list:
        images_dict.setdefault(img_path, []).append((char_name, x1, y1, x2, y2))

    # Track image numbers per character
    character_counters = {char: 1 for char in CHARACTERS}

    saved_images = 0
    for img_path, annots in images_dict.items():
        # Convert annotation path to actual file path
        src_img = img_path.replace("./characters/", BASE_PATH + "/")

        # Read before copying: cv2.imread returns None instead of raising
        # when a file cannot be decoded, and nothing should be written for
        # an image whose size is unknown.
        img = cv2.imread(src_img)
        if img is None:
            print(f"Skipping unreadable image: {src_img}")
            continue
        h_img, w_img = img.shape[:2]

        # Primary character = first annotation recorded for this image
        primary_char = annots[0][0]
        ext = os.path.splitext(img_path)[1]

        # New filename carries the character name and a running number
        stem = f"{primary_char}_{character_counters[primary_char]:04d}"
        character_counters[primary_char] += 1

        dst_img = os.path.join(OUTPUT_PATH, "images", split_name, stem + ext)
        label_path = os.path.join(OUTPUT_PATH, "labels", split_name, stem + ".txt")

        shutil.copy2(src_img, dst_img)

        # One label line per annotation: "<class> <cx> <cy> <w> <h>"
        with open(label_path, "w") as f:
            for char_name, x1, y1, x2, y2 in annots:
                class_id = CHARACTERS[char_name]
                x_center, y_center, width, height = to_yolo(x1, y1, x2, y2, w_img, h_img)
                f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")

        saved_images += 1

    return saved_images

process_split(train_annotations, "train")
process_split(val_annotations, "val")

# Derive the number of non-Homer classes from CHARACTERS rather than
# hard-coding it, so the totals stay right if the table changes.
non_homer_classes = sum(1 for name in CHARACTERS if name != "homer_simpson")
non_homer_total = NON_HOMER_PER_CLASS * non_homer_classes

# NOTE: these figures are the configured sampling targets, not a count of
# the files actually written.
print("\nFINAL SUMMARY ------------")
print(f"Homer images: {HOMER_TARGET}")
print(f"Non-Homer images (per character): {NON_HOMER_PER_CLASS}")
print(f"Total non-Homer images: {non_homer_total}")
print(f"Total images: {HOMER_TARGET + non_homer_total}")

Directory structure created.
Annotations loaded:
  homer_simpson: 612 annotations
  principal_skinner: 506 annotations
  marge_simpson: 557 annotations
  lisa_simpson: 562 annotations
  bart_simpson: 554 annotations
  chief_wiggum: 209 annotations

Selected 1000 total annotations
  Homer: 500
  principal_skinner: 100
  marge_simpson: 100
  lisa_simpson: 100
  bart_simpson: 100
  chief_wiggum: 100

All annotations verified successfully!

FINAL SUMMARY ------------
Homer images: 500
Non-Homer images (per character): 100
Total non-Homer images: 500
Total images: 1000
