In [1]:
import os
import cv2
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from shutil import copy2

# ===== SETTINGS =====
RAW_DIR = "cropped"            # input folder with original images
AUG_DIR = "augmented"      # output folder (will contain originals + augmented)
TARGET = 120                      # minimum number of images per folder
IMG_SIZE = (224, 224)             # resize target

# Subtle Data Augmentation settings (good for face datasets)
datagen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.05,
    height_shift_range=0.05,
    shear_range=0.05,
    zoom_range=0.1,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2],
    fill_mode='nearest'
)

# ===== PROCESS EACH CHARACTER FOLDER =====
summary = {}

for character in os.listdir(RAW_DIR):
    raw_path = os.path.join(RAW_DIR, character)
    save_path = os.path.join(AUG_DIR, character)
    os.makedirs(save_path, exist_ok=True)

    if not os.path.isdir(raw_path):
        continue

    # Step 1: Copy all original images
    originals = []
    for file_name in os.listdir(raw_path):
        if file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.jfif', '.webp')):
            src = os.path.join(raw_path, file_name)
            dst = os.path.join(save_path, file_name)
            copy2(src, dst)  # copy original
            originals.append(dst)

    current_count = len(originals)
    print(f"ðŸ“‚ {character}: {current_count} original images")

    # Step 2: If fewer than TARGET, generate augmentations
    if current_count < TARGET:
        need = TARGET - current_count
        print(f"   âž• Generating {need} augmentations...")

        # loop through originals and create augmentations
        i = 0
        while current_count < TARGET:
            img_path = originals[i % len(originals)]
            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, IMG_SIZE)
            img = np.expand_dims(img, axis=0)

            # generate augmented images
            for batch in datagen.flow(img, batch_size=1, save_to_dir=save_path,
                                      save_prefix=f"aug_{i}", save_format='jpg'):
                current_count += 1
                i += 1
                if current_count >= TARGET:
                    break

    summary[character] = current_count
    print(f"   âœ… Final count for {character}: {current_count}")

# ===== SUMMARY =====
print("\nðŸ“Š Final Dataset Summary:")
for k, v in summary.items():
    print(f"   {k}: {v} images")

print("\nðŸŽ‰ Augmentation completed! Every folder has at least 120 images now.")


ðŸ“‚ arya: 36 original images
   âž• Generating 84 augmentations...
   âœ… Final count for arya: 120
ðŸ“‚ brandon: 26 original images
   âž• Generating 94 augmentations...
   âœ… Final count for brandon: 120
ðŸ“‚ catlynstark: 19 original images
   âž• Generating 101 augmentations...
   âœ… Final count for catlynstark: 120
ðŸ“‚ cersi: 20 original images
   âž• Generating 100 augmentations...
   âœ… Final count for cersi: 120
ðŸ“‚ jamie: 14 original images
   âž• Generating 106 augmentations...
   âœ… Final count for jamie: 120
ðŸ“‚ joffery: 17 original images
   âž• Generating 103 augmentations...
   âœ… Final count for joffery: 120
ðŸ“‚ johnsnow: 12 original images
   âž• Generating 108 augmentations...
   âœ… Final count for johnsnow: 120
ðŸ“‚ masterluwin: 17 original images
   âž• Generating 103 augmentations...
   âœ… Final count for masterluwin: 120
ðŸ“‚ nedstark: 42 original images
   âž• Generating 78 augmentations...
   âœ… Final count for nedstark: 120
ðŸ“‚ rackon: 9 original i

In [10]:
import os

# Path to your augmented images folder
dataset_path = "augmented_images"

# Loop through each character folder
for character in os.listdir(dataset_path):
    character_path = os.path.join(dataset_path, character)
    if os.path.isdir(character_path):
        # Count only image files (common formats)
        image_count = len([f for f in os.listdir(character_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))])
        print(f"Character '{character}': {image_count} images")


Character 'arya': 118 images
Character 'brandon': 120 images
Character 'catlynstark': 118 images
Character 'cersi': 120 images
Character 'jamie': 120 images
Character 'joffery': 120 images
Character 'johnsnow': 120 images
Character 'masterluwin': 119 images
Character 'nedstark': 120 images
Character 'rackon': 119 images
Character 'robert': 119 images
Character 'robstark': 120 images
Character 'rodrikcassel': 119 images
Character 'sansa': 120 images
Character 'theon': 119 images


In [2]:
import os
import shutil
import random

# Paths
source_dir = "augmented"
target_dir = "dataset"

# Split ratios
train_ratio = 0.7
val_ratio = 0.2
test_ratio = 0.1

# Create target folders
for split in ['train', 'val', 'test']:
    split_path = os.path.join(target_dir, split)
    os.makedirs(split_path, exist_ok=True)

# Process each character folder
for character in os.listdir(source_dir):
    char_path = os.path.join(source_dir, character)
    if os.path.isdir(char_path):
        images = [f for f in os.listdir(char_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
        random.shuffle(images)  # Shuffle before splitting

        # Calculate split sizes
        total = len(images)
        train_end = int(total * train_ratio)
        val_end = train_end + int(total * val_ratio)

        splits = {
            'train': images[:train_end],
            'val': images[train_end:val_end],
            'test': images[val_end:]
        }

        # Copy images to target split folders
        for split, split_images in splits.items():
            split_char_dir = os.path.join(target_dir, split, character)
            os.makedirs(split_char_dir, exist_ok=True)
            for img_file in split_images:
                shutil.copy(os.path.join(char_path, img_file), os.path.join(split_char_dir, img_file))

        print(f"{character}: {len(splits['train'])} train, {len(splits['val'])} val, {len(splits['test'])} test")


arya: 83 train, 23 val, 13 test
brandon: 84 train, 24 val, 12 test
catlynstark: 83 train, 23 val, 13 test
cersi: 84 train, 24 val, 12 test
jamie: 84 train, 24 val, 12 test
joffery: 84 train, 24 val, 12 test
johnsnow: 83 train, 23 val, 13 test
masterluwin: 84 train, 24 val, 12 test
nedstark: 84 train, 24 val, 12 test
rackon: 82 train, 23 val, 13 test
robert: 84 train, 24 val, 12 test
robstark: 84 train, 24 val, 12 test
rodrikcassel: 84 train, 24 val, 12 test
sansa: 82 train, 23 val, 13 test
theon: 84 train, 24 val, 12 test
