## Summary of the code
- All images are resized to 224×224 pixels
- Images from the 'extracted_prs' folder are labeled 'pr' and images from the 'extracted_oth' folder are labeled 'other'. The label is encoded by the directory layout: the train and test folders each contain one sub-directory per label:
train/pr/ ; train/other/
test/pr/; test/other/
- The code randomly shuffles the images of each class (with a fixed seed of 42 by default) and splits them into 80% for training and 20% for testing
- Images that cannot be opened, converted, resized or saved are reported and skipped instead of stopping the run

In [None]:
import os
import random
import shutil
from PIL import Image
import numpy as np
from tqdm import tqdm

# Define paths
# Source folders hold the original images; train/ and test/ receive the
# resized copies, organised as <split>/<label>/.
base_dir = '/content/drive/MyDrive/biotech/Retina_Lab/Image_data'
pr_dir = os.path.join(base_dir, 'extracted_prs')
other_dir = os.path.join(base_dir, 'extracted_oth')
train_dir = os.path.join(base_dir, 'train')
test_dir = os.path.join(base_dir, 'test')

# Create necessary directories
# os.makedirs creates missing parents (so train/ and test/ themselves need no
# separate call) and exist_ok=True makes re-running the cell a no-op instead
# of relying on a check-then-create sequence.
for directory in [train_dir, test_dir]:
    # Create label subdirectories
    pr_subdir = os.path.join(directory, 'pr')
    other_subdir = os.path.join(directory, 'other')

    os.makedirs(pr_subdir, exist_ok=True)
    os.makedirs(other_subdir, exist_ok=True)

# Function to resize and process images
def process_and_split_images(source_dir, label, target_size=(224, 224), train_ratio=0.8, seed=42):
    """
    Resize the images in ``source_dir`` and split them into train and test sets.

    Every file in ``source_dir`` with a recognised image extension is
    converted to RGB, resized to ``target_size`` and saved under its original
    file name in ``<train_dir>/<label>/`` or ``<test_dir>/<label>/``
    (``train_dir`` and ``test_dir`` are the module-level paths). Files that
    fail at any step are reported and skipped.

    Existing files in the destination folders are overwritten by name but
    never removed, so clear train/ and test/ before re-running with a
    different ``seed`` or ``train_ratio``; otherwise the same image can end
    up in both sets.

    Args:
        source_dir: Directory containing original images
        label: Label for the images ('pr' or 'other'); used as the name of
            the destination sub-directory
        target_size: Size to resize images to (default 224x224 for VGG16)
        train_ratio: Proportion of images to use for training
        seed: Random seed for reproducibility

    Returns:
        Tuple ``(train_success, test_success)`` with the number of images
        successfully written to the training and testing folders.
        ``(0, 0)`` if ``source_dir`` contains no image files.
    """
    # Get all image files.
    # os.listdir() returns entries in arbitrary order, so the list is sorted
    # first; without that the same seed could yield a different split on
    # another machine or after the folder contents are rewritten.
    valid_extensions = ('.png', '.jpg', '.jpeg', '.tif', '.tiff')
    image_files = sorted(
        f for f in os.listdir(source_dir) if f.lower().endswith(valid_extensions)
    )

    if not image_files:
        print(f"Warning: No image files found in {source_dir}")
        return 0, 0

    # Shuffle files for random split.
    # A private generator gives the same sequence as random.seed(seed) would,
    # without resetting the global RNG state for the rest of the program.
    random.Random(seed).shuffle(image_files)

    # Calculate split point and split into training and testing sets
    split_idx = int(len(image_files) * train_ratio)
    train_files = image_files[:split_idx]
    test_files = image_files[split_idx:]

    print(f"Processing {len(image_files)} {label} images: {len(train_files)} for training, {len(test_files)} for testing")

    # Function to process a single image
    def process_image(img_file, destination_dir):
        """Convert, resize and save one image; return True on success."""
        try:
            # The context manager closes the underlying file handle, which
            # Image.open() otherwise keeps open.
            with Image.open(os.path.join(source_dir, img_file)) as img:
                # Convert to RGB if not already (VGG16 expects RGB)
                if img.mode != 'RGB':
                    img = img.convert('RGB')

                # Resize image
                img = img.resize(target_size, Image.LANCZOS)

                # Save to destination directory
                img.save(os.path.join(destination_dir, img_file))
            return True
        except Exception as e:
            # Deliberately broad: any problematic image is skipped, not fatal.
            print(f"Error processing {img_file}: {e}")
            return False

    # Function to process all images of one split
    def process_split(files, split_dir, split_name):
        """Process ``files`` into ``split_dir/label`` and count successes."""
        destination_dir = os.path.join(split_dir, label)
        # Make sure the label folder exists even if it was not pre-created.
        os.makedirs(destination_dir, exist_ok=True)

        success = 0
        for img_file in tqdm(files, desc=f"Processing {label} {split_name} images"):
            if process_image(img_file, destination_dir):
                success += 1
        return success

    # Process training images, then testing images
    train_success = process_split(train_files, train_dir, "training")
    test_success = process_split(test_files, test_dir, "testing")

    print(f"Successfully processed {train_success}/{len(train_files)} training and {test_success}/{len(test_files)} testing {label} images")

    return train_success, test_success

# Run the pipeline for both classes and report the resulting set sizes
print("Starting image processing and dataset preparation...")

# Each source folder goes through the same resize-and-split routine; the
# generator is consumed in order, so 'pr' is processed before 'other'.
(pr_train, pr_test), (other_train, other_test) = (
    process_and_split_images(folder, class_label)
    for folder, class_label in ((pr_dir, 'pr'), (other_dir, 'other'))
)

# Final per-split counts of successfully written images
print("\nDataset preparation complete!")
print(f"Training set: {pr_train} 'pr' images, {other_train} 'other' images")
print(f"Testing set: {pr_test} 'pr' images, {other_test} 'other' images")



Starting image processing and dataset preparation...
Processing 1952 pr images: 1561 for training, 391 for testing


Processing pr training images: 100%|██████████| 1561/1561 [01:20<00:00, 19.30it/s]
Processing pr testing images: 100%|██████████| 391/391 [00:11<00:00, 34.68it/s]


Successfully processed 1561/1561 training and 391/391 testing pr images
Processing 805 other images: 644 for training, 161 for testing


Processing other training images: 100%|██████████| 644/644 [00:29<00:00, 21.56it/s]
Processing other testing images: 100%|██████████| 161/161 [00:05<00:00, 29.32it/s]

Successfully processed 644/644 training and 161/161 testing other images

Dataset preparation complete!
Training set: 1561 'pr' images, 644 'other' images
Testing set: 391 'pr' images, 161 'other' images



