In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import cv2
import shutil
import numpy as np
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm
import random

In [33]:
# Define paths in Google Drive
input_dir = "/content/drive/My Drive/Research/DatasetCropped"  # Path to cropped buds
output_dir = "/content/drive/My Drive/Research/DatasetAugmented"  # Path to save augmented dataset

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

#print image counts
!echo "v1:" && find "/content/drive/My Drive/Research/DatasetCropped/v1" -type f | wc -l
!echo "v2:" && find "/content/drive/My Drive/Research/DatasetCropped/v2" -type f | wc -l
!echo "v3:" && find "/content/drive/My Drive/Research/DatasetCropped/v3" -type f | wc -l
!echo "v4:" && find "/content/drive/My Drive/Research/DatasetCropped/v4" -type f | wc -l



v1:
230
v2:
230
v3:
230
v4:
230


In [34]:
# Augmentation pipeline
augmentations = A.Compose([
    A.HorizontalFlip(p=0.5),  # Flip horizontally
    A.RandomBrightnessContrast(p=0.3),  # Adjust brightness/contrast
    A.GaussianBlur(blur_limit=(3, 7), p=0.3),  # Add noise
    A.HueSaturationValue(p=0.3),  # Slight color shift
    A.GaussianBlur(blur_limit=(3, 7), p=0.3),  # Add blur
    A.Sharpen(p=0.3),  # Sharpen edges
])

In [35]:
# Augment dataset
num_augments = 5  # Number of augmented versions per image
for subfolder in os.listdir(input_dir):
    subfolder_path = os.path.join(input_dir, subfolder)
    output_subfolder_path = os.path.join(output_dir, subfolder)
    os.makedirs(output_subfolder_path, exist_ok=True)

    for img_name in tqdm(os.listdir(subfolder_path)):
        img_path = os.path.join(subfolder_path, img_name)
        image = cv2.imread(img_path)
        if image is None:
            continue

        # Save original
        shutil.copy(img_path, os.path.join(output_subfolder_path, img_name))

        # Generate augmented images
        for i in range(num_augments):
            augmented = augmentations(image=image)['image']
            aug_img_name = f"{os.path.splitext(img_name)[0]}_aug{i}.jpg"
            cv2.imwrite(os.path.join(output_subfolder_path, aug_img_name), augmented)

print("Data augmentation completed!")
!echo "v1:" && find "/content/drive/My Drive/Research/DatasetAugmented/v1" -type f | wc -l
!echo "v2:" && find "/content/drive/My Drive/Research/DatasetAugmented/v2" -type f | wc -l
!echo "v3:" && find "/content/drive/My Drive/Research/DatasetAugmented/v3" -type f | wc -l
!echo "v4:" && find "/content/drive/My Drive/Research/DatasetAugmented/v4" -type f | wc -l

100%|██████████| 230/230 [00:37<00:00,  6.08it/s]
100%|██████████| 230/230 [00:38<00:00,  5.96it/s]
100%|██████████| 230/230 [00:42<00:00,  5.44it/s]
100%|██████████| 230/230 [00:41<00:00,  5.48it/s]


Data augmentation completed!
v1:
1380
v2:
1380
v3:
1380
v4:
1380


In [39]:
# Splitting the dataset into train/val/test
base_dir = "/content/drive/My Drive/Research/DatasetAugmented"
output_dir = "/content/drive/My Drive/Research/DatasetAugmented_Split"

train_dir = os.path.join(output_dir, "train")
val_dir = os.path.join(output_dir, "val")
test_dir = os.path.join(output_dir, "test")

# Create train/val/test folders
for split_dir in [train_dir, val_dir, test_dir]:
    os.makedirs(split_dir, exist_ok=True)

# Get subfolders (v1, v2, v3, v4)
subfolders = [f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))]

for subfolder in subfolders:
    subfolder_path = os.path.join(base_dir, subfolder)
    images = os.listdir(subfolder_path)
    random.shuffle(images)

    train_split = int(len(images) * 0.7)
    val_split = int(len(images) * 0.85)

    # Create class-wise folders inside train/val/test
    for split_name, split_range in zip(["train", "val", "test"], [(0, train_split), (train_split, val_split), (val_split, len(images))]):
        split_subfolder_path = os.path.join(output_dir, split_name, subfolder)
        os.makedirs(split_subfolder_path, exist_ok=True)

        for i in range(*split_range):
            src_path = os.path.join(subfolder_path, images[i])
            dst_path = os.path.join(split_subfolder_path, images[i])
            shutil.copy(src_path, dst_path)

print("Dataset split completed successfully!")


Dataset split completed successfully!


In [41]:
!echo "Train Set:"
!echo "v1:" && find "/content/drive/My Drive/Research/DatasetAugmented_Split/train/v1" -type f | wc -l
!echo "v2:" && find "/content/drive/My Drive/Research/DatasetAugmented_Split/train/v2" -type f | wc -l
!echo "v3:" && find "/content/drive/My Drive/Research/DatasetAugmented_Split/train/v3" -type f | wc -l
!echo "v4:" && find "/content/drive/My Drive/Research/DatasetAugmented_Split/train/v4" -type f | wc -l

!echo "Validation Set:"
!echo "v1:" && find "/content/drive/My Drive/Research/DatasetAugmented_Split/val/v1" -type f | wc -l
!echo "v2:" && find "/content/drive/My Drive/Research/DatasetAugmented_Split/val/v2" -type f | wc -l
!echo "v3:" && find "/content/drive/My Drive/Research/DatasetAugmented_Split/val/v3" -type f | wc -l
!echo "v4:" && find "/content/drive/My Drive/Research/DatasetAugmented_Split/val/v4" -type f | wc -l

!echo "Test Set:"
!echo "v1:" && find "/content/drive/My Drive/Research/DatasetAugmented_Split/test/v1" -type f | wc -l
!echo "v2:" && find "/content/drive/My Drive/Research/DatasetAugmented_Split/test/v2" -type f | wc -l
!echo "v3:" && find "/content/drive/My Drive/Research/DatasetAugmented_Split/test/v3" -type f | wc -l
!echo "v4:" && find "/content/drive/My Drive/Research/DatasetAugmented_Split/test/v4" -type f | wc -l

Train Set:
v1:
965
v2:
965
v3:
965
v4:
965
Validation Set:
v1:
208
v2:
208
v3:
208
v4:
208
Test Set:
v1:
207
v2:
207
v3:
207
v4:
207
