<a href="https://colab.research.google.com/github/shrabonbiswas/Thesis/blob/main/Split%20and%20augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Make Google Drive available inside the Colab runtime
from google.colab import drive

# Folder under which the Drive contents appear after mounting
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [2]:
# Step 2: Copy the preprocessed dataset archive from Drive to local Colab storage
import shutil

# shutil.copy raises if the archive is missing or unreadable, so the notebook
# stops at this cell. A failed `!cp` only prints a message and lets the
# following cells run against a file that is not there.
local_zip_path = shutil.copy(
    "/content/drive/MyDrive/Thesis_Update/preprocessed/Maize_Preprocessed.zip",
    "/content/Maize_Preprocessed.zip",
)


In [3]:
# Step 3: Unpack the copied archive into its own folder under /content
import zipfile

zip_path = "/content/Maize_Preprocessed.zip"
extract_dir = "/content/Maize_Preprocessed"

# Read mode is the ZipFile default; the context manager closes the archive
# once extraction has finished.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_dir)

In [4]:
# Step 4: Create dataframe from image paths
import pandas as pd
from glob import glob
from sklearn.model_selection import train_test_split
import os

# Root folder of the extracted dataset: one sub-directory per class label.
DATASET_DIR = '/content/Maize_Preprocessed/dataset'

# glob returns matches in filesystem order, which is not guaranteed to be
# stable. Sorting fixes the row order so the seeded split below yields the
# same partition on every run. The isfile filter drops directories whose
# name happens to contain a dot (they also match '*.*').
image_paths = sorted(
    p for p in glob(os.path.join(DATASET_DIR, '*', '*.*'))
    if os.path.isfile(p)
)
if not image_paths:
    # Fail here with a clear message instead of inside train_test_split.
    raise FileNotFoundError(f"No image files found under {DATASET_DIR}")

# One row per image; the label is the name of the folder containing the file.
data = pd.DataFrame({
    'filepath': image_paths,
    'label': [os.path.basename(os.path.dirname(p)) for p in image_paths]
})

# Stratified split: 70% train, then the remaining 30% is halved into
# validation and test (15% each), keeping class proportions in every part.
train_df, temp_df = train_test_split(data, test_size=0.3, stratify=data['label'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)


In [5]:
# Step 5: Copy images to split folders
import shutil
from tqdm import tqdm

def copy_images(df, split_name, base_dir='/content/split'):
    """Copy every image listed in ``df`` into ``base_dir/split_name/<label>/``.

    Args:
        df: DataFrame with a 'filepath' column (source file) and a 'label'
            column (name of the destination class folder).
        split_name: Name of the split sub-folder, e.g. 'train'.
        base_dir: Root folder under which the split folders are created.

    Class folders are created on demand; a progress bar is shown while copying.
    """
    split_root = os.path.join(base_dir, split_name)
    path_label_pairs = zip(df['filepath'], df['label'])
    for src_path, label in tqdm(path_label_pairs, total=len(df)):
        dest_dir = os.path.join(split_root, label)
        os.makedirs(dest_dir, exist_ok=True)
        shutil.copy(src_path, dest_dir)

# Materialise each split on disk, in the order train -> val -> test
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    copy_images(split_df, split_name)


100%|██████████| 2931/2931 [00:04<00:00, 706.93it/s] 
100%|██████████| 628/628 [00:00<00:00, 2644.83it/s]
100%|██████████| 629/629 [00:00<00:00, 1903.02it/s]


In [None]:
# Step 6: Augmentation

In [7]:
import os
import cv2
import albumentations as A
from tqdm import tqdm
import random

# Training split folder; it contains one sub-folder per class
base_train_dir = '/content/split/train'

# Number of images each class should hold after augmentation
TARGET_PER_CLASS = 2000

# Transforms applied in this order; `p` is the probability that a given
# transform is applied to an image.
# NOTE(review): the warning printed below this cell appears to come from
# ShiftScaleRotate, which newer albumentations releases flag in favour of
# Affine - confirm against the installed version before changing it.
augmentation_steps = [
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.3),
    A.Rotate(limit=30, p=0.5),
    A.RandomShadow(p=0.3),
    A.ZoomBlur(p=0.2),
    A.ShiftScaleRotate(p=0.3),
]
augment = A.Compose(augmentation_steps)


  original_init(self, **validated_kwargs)


In [10]:
import os

# Report how many image files each class folder of the training split holds
dataset_path = '/content/split/train'

if os.path.exists(dataset_path):
    class_counts = {}
    for class_name in os.listdir(dataset_path):
        class_dir = os.path.join(dataset_path, class_name)
        if not os.path.isdir(class_dir):
            continue  # only sub-folders count as classes
        # Count regular files only; nested folders are ignored
        class_counts[class_name] = sum(
            1
            for entry in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, entry))
        )

    print(f"\n✅ Number of classes: {len(class_counts)}")
    print("📊 Images per class:")
    for cls, count in class_counts.items():
        print(f"  {cls}: {count} images")
else:
    print(f"❌ Dataset path not found: {dataset_path}")


✅ Number of classes: 4
📊 Images per class:
  Gray_Leaf_Spot: 2000 images
  Healthy: 2000 images
  Common_Rust: 2000 images
  Blight: 2000 images


In [11]:
# Top up every training class that holds fewer than TARGET_PER_CLASS images
# by writing augmented copies (prefixed 'aug_') next to the originals.

# Seed the source-image choice so reruns pick the same files. Whether the
# transform parameters are fixed as well depends on how the installed
# albumentations version draws its random numbers - TODO confirm.
random.seed(42)

# Sorted listings keep the seeded choices independent of filesystem order.
for class_name in sorted(os.listdir(base_train_dir)):
    class_path = os.path.join(base_train_dir, class_name)
    if not os.path.isdir(class_path):
        continue  # stray files in the train folder are not classes

    # Count regular files only, matching the per-class count cell above.
    images = sorted(
        name for name in os.listdir(class_path)
        if os.path.isfile(os.path.join(class_path, name))
    )
    current_count = len(images)

    print(f"📁 {class_name}: {current_count} images")

    if current_count >= TARGET_PER_CLASS:
        continue  # Skip if already enough

    # Augment from originals only: after an interrupted run the folder already
    # contains 'aug_' files, and re-augmenting those would stack distortions.
    # Assumes no original filename starts with 'aug_' - verify for the dataset.
    source_paths = [
        os.path.join(class_path, name)
        for name in images
        if not name.startswith('aug_')
    ]

    to_generate = TARGET_PER_CLASS - current_count
    generated = 0
    next_index = 0

    with tqdm(total=to_generate, desc=f"🔄 Augmenting {class_name}") as progress:
        while generated < to_generate and source_paths:
            img_path = random.choice(source_paths)
            img = cv2.imread(img_path)
            if img is None:
                # imread signals an unreadable / non-image file with None;
                # drop it from the pool instead of crashing in cvtColor.
                source_paths.remove(img_path)
                continue
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            augmented = augment(image=img)['image']

            # Advance the index until the name is unused so a rerun never
            # overwrites an existing augmented file (which would leave the
            # class short of the target).
            base_name = os.path.basename(img_path)
            save_path = os.path.join(class_path, f'aug_{next_index}_{base_name}')
            while os.path.exists(save_path):
                next_index += 1
                save_path = os.path.join(class_path, f'aug_{next_index}_{base_name}')
            next_index += 1

            # imwrite reports failure through its return value, not an exception.
            if not cv2.imwrite(save_path, cv2.cvtColor(augmented, cv2.COLOR_RGB2BGR)):
                raise OSError(f"Failed to write augmented image: {save_path}")

            generated += 1
            progress.update(1)

    if generated < to_generate:
        print(
            f"⚠️ {class_name}: generated {generated} of {to_generate} images "
            "(no readable original images left to augment)"
        )

📁 Gray_Leaf_Spot: 2000 images
📁 Healthy: 2000 images
📁 Common_Rust: 2000 images
📁 Blight: 2000 images
