# Step 2: Data augmentation
Implement in src/augment.py later

### Apply techniques like:
- Rotation
- Horizontal/Vertical flipping
- Scaling / Cropping
- Color jitter / brightness adjustment
### Target: Increase dataset by at least 30%, especially for minority classes.
- Save augmented images or implement augmentation on-the-fly during training.

> After exploring the data in the previous notebook, removing corrupted images, and counting the number of images in each class, we found the following:
### Images per class:
- cardboard: 259 images
- glass: 401 images
- metal: 328 images
- paper: 476 images
- plastic: 386 images
- trash: 110 images

# Apply Data Augmentation Using ImageDataGenerator (Keras)
> documentation : https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator
> video: https://www.youtube.com/watch?v=Ahy50JCRYNk

- `ImageDataGenerator`: Class used to apply data augmentation to images.
- `img_to_array`: Converts a PIL image to a NumPy array (needed for processing).
- `load_img`: Loads an image from disk as a PIL image.

In [9]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
import os

> “Every time you load an image, randomly rotate it, shift it, zoom it, or flip it.”
| Parameter                    | Meaning                                     |
| ---------------------------- | ------------------------------------------- |
| `rotation_range=20`          | Rotate image randomly between -20° to +20°  |
| `width_shift_range=0.1`      | Move image left/right up to 10% of width    |
| `height_shift_range=0.1`     | Move image up/down up to 10% of height      |
| `zoom_range=0.1`             | Randomly zoom in/out up to ±10%             |
| `horizontal_flip=True`       | Flip image horizontally (mirror)            |
| `brightness_range=[0.8,1.2]` | Randomly change brightness from 80% to 120% |


In [10]:
# Shared augmentation pipeline: each image drawn from it gets a fresh random
# combination of the transformations below.
datagen = ImageDataGenerator(
    # Geometric jitter
    rotation_range=20,            # rotate between -20 and +20 degrees
    zoom_range=0.1,               # zoom in/out by up to 10%
    width_shift_range=0.1,        # shift left/right by up to 10% of the width
    height_shift_range=0.1,       # shift up/down by up to 10% of the height
    horizontal_flip=True,         # randomly mirror the image
    # Photometric jitter
    brightness_range=[0.8, 1.2],  # brightness between 80% and 120%
)

# 2. Function to augment a single class folder


In [11]:
# File extensions (lower-case) that are treated as images.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')


def augment_class(input_dir, output_dir, augment_count=5):
    """Save randomly augmented variants of every image found in ``input_dir``.

    Each readable image is fed through the module-level ``datagen`` and
    ``augment_count`` augmented variants of it are written to ``output_dir``
    as JPEG files. Files without an image extension and files that cannot be
    read are reported on stdout and skipped.

    Args:
        input_dir: Folder holding the original images of one class.
        output_dir: Folder receiving the augmented images; created if missing.
        augment_count: Number of augmented variants to write per source image.
            Zero or a negative value writes nothing.
    """
    os.makedirs(output_dir, exist_ok=True)

    for image_name in os.listdir(input_dir):
        if not image_name.lower().endswith(IMAGE_EXTENSIONS):
            print(f"⏭ Skipping non-image file: {image_name}")
            continue

        img_path = os.path.join(input_dir, image_name)
        try:
            img = load_img(img_path)
            # Pixel data may only be decoded here, so a truncated file can
            # fail at this point rather than in load_img.
            x = img_to_array(img)
        except Exception:
            # Decoding errors come in several exception types; unlike a bare
            # ``except:`` this still lets KeyboardInterrupt/SystemExit through.
            print("skipping corrupted file:", img_path)
            continue

        # flow() expects a batch axis in front: (1, height, width, channels).
        x = x.reshape((1,) + x.shape)

        # splitext keeps the full stem of names such as "img.v2.jpg".
        prefix = os.path.splitext(image_name)[0]
        batches = datagen.flow(
            x,
            batch_size=1,
            save_to_dir=output_dir,
            save_prefix=prefix,
            save_format='jpg'
        )
        # The iterator is endless; each batch drawn writes one augmented
        # image to save_to_dir, so draw exactly augment_count batches.
        for _ in range(augment_count):
            next(batches)

# 3. Function to augment the entire dataset


In [12]:
def augment_dataset(root="/Users/rodynaamr/Image_Classification_SVM_kNN/data", augment_count=10):
    """Augment every class folder found directly under ``root``.

    For each class folder ``<root>/<cls>`` the augmented images are written
    to a sibling folder ``<root>/<cls>_aug`` by ``augment_class``.

    Args:
        root: Dataset folder containing one sub-folder per class.
        augment_count: Number of augmented variants per source image,
            forwarded to ``augment_class``.
    """
    for cls in os.listdir(root):
        # Skip hidden entries (like .DS_Store) and already augmented folders,
        # so augmented output is never used as augmentation input.
        if cls.startswith('.') or cls.endswith('_aug'):
            continue

        input_dir = os.path.join(root, cls)
        # Only directories are class folders; a stray file in root would
        # otherwise get a bogus "<file>_aug" folder and then fail on listing.
        if not os.path.isdir(input_dir):
            continue

        output_dir = os.path.join(root, cls + "_aug")
        print(f"Augmenting class {cls}...")
        augment_class(input_dir, output_dir, augment_count=augment_count)


# 4. Run augmentation


In [13]:
# Generate 10 augmented variants per image for every class folder of the dataset.
augment_dataset(
    root="/Users/rodynaamr/Image_Classification_SVM_kNN/data",
    augment_count=10,
)


Augmenting class paper...
skipping corrupted file: /Users/rodynaamr/Image_Classification_SVM_kNN/data/paper/e9ddd8f5-85d6-45e0-b607-104259ca6fbb.jpg
skipping corrupted file: /Users/rodynaamr/Image_Classification_SVM_kNN/data/paper/3b5bb1a4-7819-4167-a610-5e35c6f4b074.jpg
skipping corrupted file: /Users/rodynaamr/Image_Classification_SVM_kNN/data/paper/1f1f1143-61aa-45c4-b6b2-1908d5ea9c34.jpg
skipping corrupted file: /Users/rodynaamr/Image_Classification_SVM_kNN/data/paper/47b3d05f-860e-43dc-87ff-8cd4b17edb34.jpg
skipping corrupted file: /Users/rodynaamr/Image_Classification_SVM_kNN/data/paper/122d1fdd-5d5b-4b25-a7cd-a7fb688cb448.jpg
skipping corrupted file: /Users/rodynaamr/Image_Classification_SVM_kNN/data/paper/e7e7ee96-4aed-448e-beb6-22a40e70a780.jpg
skipping corrupted file: /Users/rodynaamr/Image_Classification_SVM_kNN/data/paper/2fea66f0-97bd-44f2-813b-5fb10a360900.jpg
skipping corrupted file: /Users/rodynaamr/Image_Classification_SVM_kNN/data/paper/1e75dd85-5680-448c-a661-ebf50fc

In [14]:

import os
def count_images(folder, extensions=('.png', '.jpg', '.jpeg')):
    """Return the number of image files directly inside ``folder``.

    The extension match is case-insensitive, so e.g. ``photo.JPG`` is counted.
    A missing folder (or a path that is not a directory) counts as 0.
    """
    if not os.path.isdir(folder):
        return 0
    return sum(name.lower().endswith(extensions) for name in os.listdir(folder))


# Report, per class, the total of original plus augmented ("_aug") images.
for waste in ["cardboard", "glass", "metal", "paper", "plastic", "trash"]:
    count = sum(
        count_images(f"/Users/rodynaamr/Image_Classification_SVM_kNN/data/{waste}{suffix}")
        for suffix in ["", "_aug"]
    )
    print(f"{waste}: {count}")

cardboard: 2729
glass: 4249
metal: 3477
paper: 4965
plastic: 4014
trash: 1169
