In [3]:
# Importing Libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Cleaning data 

#### Removing Corrupted Images:

In [16]:
import os 
from PIL import Image
from collections import Counter

# Remove Corrupted or Unreadable Images 
dataset_path = 'dataset-resized'
def remove_corrupt_images(dataset_path):
    total_removed = 0 
    
    for class_folder in os.listdir(dataset_path):
        class_path = os.path.join(dataset_path, class_folder)
        
        if not os.path.isdir(class_path):
            continue

        for filename in os.listdir(class_path):
            file_path = os.path.join(class_path, filename)
            try:
                with Image.open(file_path) as img:
                
                    img.verify()
            except Exception as e:
                print(f"Removeing corrupted image: {file_path} - {e}")
                os.remove(file_path)
                total_removed += 1 
    print(f"Removed {total_removed} corrupt images.")

if __name__ == "__main__":
    dataset_path = "dataset-resized/"
    remove_corrupt_images(dataset_path)  
    

Removed 0 corrupt images.


#### Checking Image Sizes:


In [17]:
def check_image_sizes(dataset_path):
    size_counter = Counter()
    
    for class_folder in os.listdir(dataset_path):
        class_path = os.path.join(dataset_path, class_folder)
        if not os.path.isdir(class_path):
            continue
    
    for filename in os.listdir(class_path):
        file_path = os.path.join(class_path, filename)
        try:
            with Image.open(file_path) as img:
                size = img.size
                size_counter[size] += 1
        except Exception as e:
            print(f"Error processing {file_path}):{e}")
            
    print("\nImage Size Distribution:")
    for size, count in size_counter.items():
        print (f"Size{size}: {count} images")
    
check_image_sizes(dataset_path) 


Image Size Distribution:
Size(512, 384): 482 images


#### Resizing images to 224 x 224 pixels

In [8]:
# Resizing images to 224 x 224 pixels
def resize_images(input_path, output_path, size=(224, 224)):
    os.makedirs(output_path, exist_ok=True)
    
    for class_folder in os.listdir(input_path):
        class_input = os.path.join(input_path, class_folder)
        class_output = os.path.join(output_path, class_folder)
        
        if not os.path.isdir(class_input):
            continue
        
        os.makedirs(class_output, exist_ok=True)
        
        for filename in os.listdir(class_input):
            input_file = os.path.join(class_input, filename)
            output_file = os.path.join(class_output, filename)
        
            try:
                with Image.open(input_file) as img:
                    img = img.convert("RGB")
                    img = img.resize(size)
                    img.save(output_file)
            except Exception as e:
                print(f"Skipping {input_file}: {e}")
    print(f"\n All images resized to {size} and saived in '{output_path}' ")

if __name__ == "__main__":
    input_folder = "dataset-resized"
    output_folder = "resized-cleaned-dataset"
    resize_images(input_folder, output_folder)


 All images resized to (224, 224) and saived in 'resized-cleaned-dataset' 


#### Counting Images

In [18]:
# Counting Images Per Class Folder/Total

def count_images_in_folders(dataset_path):
    total_images = 0
    
    for class_folder in os.listdir(dataset_path):
        class_path = os.path.join(dataset_path, class_folder)
        
        if not os.path.isdir(class_path):
            continue
        
        num_images = len([f for f in os.listdir(class_path) if f.lower().endswith(('.jpg','.jpeg','.png'))])
        
        print(f"{class_folder}: {num_images} images")
        total_images += num_images
    print(f"Total images in dataset: {total_images}")
dataset_path = "resized-cleaned-dataset"
count_images_in_folders(dataset_path)         

paper: 594 images
metal: 410 images
cardboard: 403 images
trash: 137 images
glass: 501 images
plastic: 482 images
Total images in dataset: 2527
