In [1]:
# Importing Libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Cleaning data 

#### Removing Corrupted Images:

In [2]:
import os 
from PIL import Image
from collections import Counter

# Remove Corrupted or Unreadable Images 
dataset_path = "../Datasets/dataset-resized"
dataset_path2 = "../Datasets/garbage_classification"

def remove_corrupt_images(dataset_path):
    total_removed = 0 
    
    for class_folder in os.listdir(dataset_path):
        class_path = os.path.join(dataset_path, class_folder)
        
        if not os.path.isdir(class_path):
            continue

        for filename in os.listdir(class_path):
            file_path = os.path.join(class_path, filename)
            try:
                with Image.open(file_path) as img:
                
                    img.verify()
            except Exception as e:
                print(f"Removeing corrupted image: {file_path} - {e}")
                os.remove(file_path)
                total_removed += 1 
    print(f"Removed {total_removed} corrupt images.")

if __name__ == "__main__":
    dataset_path = "../Datasets/dataset-resized"
    remove_corrupt_images(dataset_path)  

if __name__ == "__main__":
    dataset_path = "../Datasets/garbage_classification"
    remove_corrupt_images(dataset_path)  
    

Removed 0 corrupt images.
Removed 0 corrupt images.


#### Checking Image Sizes:


In [3]:
def check_image_sizes(dataset_path):
    size_counter = Counter()
    
    for class_folder in os.listdir(dataset_path):
        class_path = os.path.join(dataset_path, class_folder)
        if not os.path.isdir(class_path):
            continue
    
    for filename in os.listdir(class_path):
        file_path = os.path.join(class_path, filename)
        try:
            with Image.open(file_path) as img:
                size = img.size
                size_counter[size] += 1
        except Exception as e:
            print(f"Error processing {file_path}):{e}")
            
    print("\nImage Size Distribution:")
    for size, count in size_counter.items():
        print (f"Size{size}: {count} images")
    
check_image_sizes("../Datasets/dataset-resized") 
check_image_sizes("../Datasets/garbage_classification")


Image Size Distribution:
Size(512, 384): 482 images

Image Size Distribution:
Size(260, 193): 3 images
Size(400, 534): 180 images
Size(225, 225): 371 images
Size(249, 202): 5 images
Size(215, 234): 2 images
Size(275, 183): 183 images
Size(194, 259): 85 images
Size(183, 276): 1 images
Size(216, 233): 2 images
Size(220, 220): 14 images
Size(225, 224): 8 images
Size(400, 533): 136 images
Size(292, 173): 3 images
Size(205, 246): 3 images
Size(245, 205): 3 images
Size(207, 244): 3 images
Size(318, 159): 3 images
Size(259, 194): 117 images
Size(168, 168): 1 images
Size(294, 171): 2 images
Size(250, 170): 1 images
Size(198, 255): 1 images
Size(258, 196): 1 images
Size(168, 299): 7 images
Size(187, 270): 6 images
Size(189, 267): 2 images
Size(270, 187): 7 images
Size(183, 275): 39 images
Size(301, 167): 3 images
Size(227, 222): 2 images
Size(262, 192): 16 images
Size(217, 232): 1 images
Size(253, 199): 4 images
Size(266, 189): 12 images
Size(270, 186): 1 images
Size(252, 200): 17 images
Size(

#### Resizing images to 224 x 224 pixels

In [4]:
# Resizing images to 224 x 224 pixels
def resize_images(input_path, output_path, size=(224, 224)):
    os.makedirs(output_path, exist_ok=True)
    
    for class_folder in os.listdir(input_path):
        class_input = os.path.join(input_path, class_folder)
        class_output = os.path.join(output_path, class_folder)
        
        if not os.path.isdir(class_input):
            continue
        
        os.makedirs(class_output, exist_ok=True)
        
        for filename in os.listdir(class_input):
            input_file = os.path.join(class_input, filename)
            output_file = os.path.join(class_output, filename)
        
            try:
                with Image.open(input_file) as img:
                    img = img.convert("RGB")
                    img = img.resize(size)
                    img.save(output_file)
            except Exception as e:
                print(f"Skipping {input_file}: {e}")
    print(f"\n All images resized to {size} and saived in '{output_path}' ")

if __name__ == "__main__":
    input_folder = "../Datasets/dataset-resized"
    output_folder = "resized-cleaned-dataset"
    resize_images(input_folder, output_folder)
    
if __name__ == "__main__":
    input_folder = "../Datasets/garbage_classification"
    output_folder = "clean_garbage_classification"
    resize_images(input_folder, output_folder)


 All images resized to (224, 224) and saived in 'resized-cleaned-dataset' 

 All images resized to (224, 224) and saived in 'clean_garbage_classification' 


#### Counting Images

In [7]:
# Counting Images Per Class Folder/Total

def count_images_in_folders(dataset_path):
    total_images = 0
    
    for class_folder in os.listdir(dataset_path):
        class_path = os.path.join(dataset_path, class_folder)
        
        if not os.path.isdir(class_path):
            continue
        
        num_images = len([f for f in os.listdir(class_path) if f.lower().endswith(('.jpg','.jpeg','.png'))])
        
        print(f"{class_folder}: {num_images} images")
        total_images += num_images
    print(f"Total images in dataset: {total_images}")
dataset_path = "resized-cleaned-dataset"
count_images_in_folders(dataset_path)         

paper: 594 images
metal: 410 images
cardboard: 403 images
trash: 137 images
glass: 501 images
plastic: 482 images
Total images in dataset: 2527


In [5]:
import os
import shutil

def merge_datasets(dataset1, dataset2, merged_output):
    os.makedirs(merged_output, exist_ok=True)

    for dataset in [dataset1, dataset2]:
        for class_folder in os.listdir(dataset):
            class_path = os.path.join(dataset, class_folder)
            if not os.path.isdir(class_path):
                continue

            merged_class_path = os.path.join(merged_output, class_folder)
            os.makedirs(merged_class_path, exist_ok=True)

            for file in os.listdir(class_path):
                src = os.path.join(class_path, file)
                dst = os.path.join(merged_class_path, file)

                if os.path.exists(dst):
                    base, ext = os.path.splitext(file)
                    file = f"{base}_{os.path.basename(dataset)}{ext}"
                    dst = os.path.join(merged_class_path, file)

                shutil.copy2(src, dst)

            print(f"Merged '{class_folder}' from '{dataset}'")

if __name__ == "__main__":
    dataset1 = "../Datasets/resized-cleaned-dataset"
    dataset2 = "../Datasets/clean_garbage_classification"
    merged_output = "Datasets/combined-cleaned-dataset"
    merge_datasets(dataset1, dataset2, merged_output)


Merged 'paper' from '../Datasets/resized-cleaned-dataset'
Merged 'metal' from '../Datasets/resized-cleaned-dataset'
Merged 'cardboard' from '../Datasets/resized-cleaned-dataset'
Merged 'trash' from '../Datasets/resized-cleaned-dataset'
Merged 'glass' from '../Datasets/resized-cleaned-dataset'
Merged 'plastic' from '../Datasets/resized-cleaned-dataset'
Merged 'paper' from '../Datasets/clean_garbage_classification'
Merged 'green-glass' from '../Datasets/clean_garbage_classification'
Merged 'clothes' from '../Datasets/clean_garbage_classification'
Merged 'metal' from '../Datasets/clean_garbage_classification'
Merged 'cardboard' from '../Datasets/clean_garbage_classification'
Merged 'trash' from '../Datasets/clean_garbage_classification'
Merged 'biological' from '../Datasets/clean_garbage_classification'
Merged 'white-glass' from '../Datasets/clean_garbage_classification'
Merged 'battery' from '../Datasets/clean_garbage_classification'
Merged 'brown-glass' from '../Datasets/clean_garbage_c

In [8]:
# Counting Data Set images with Combined DataSet
dataset_path = "Datasets/combined-cleaned-dataset"
count_images_in_folders(dataset_path)

paper: 1644 images
green-glass: 629 images
clothes: 5325 images
metal: 1179 images
cardboard: 1294 images
trash: 834 images
glass: 501 images
biological: 985 images
white-glass: 775 images
battery: 945 images
brown-glass: 607 images
plastic: 1347 images
shoes: 1977 images
Total images in dataset: 18042
