In [1]:
import os
import matplotlib.pyplot  as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from PIL import Image, ImageOps
import numpy as np

In [2]:
data_dir = "../data/raw/fruit_images"
bad_quality_path = data_dir + "/bad_quality_fruits"
good_quality_path = data_dir + "/good_quality_fruits"
mixed_quality_path = data_dir + "/mixed_quality_fruits"

file_paths=[]
labels=[]
height = 0
width = 0 
samples = 0
sample_count = 20

for fruit_quality in [bad_quality_path, good_quality_path, mixed_quality_path]:
    fruit_list = os.listdir(fruit_quality)
    for fruit in fruit_list:
        fruit_path = os.path.join(fruit_quality, fruit)
        image_list = os.listdir(fruit_path)
        for i, img in enumerate(image_list):
            image_path = os.path.join(fruit_path, img)
            if i < sample_count:
                img = plt.imread(image_path)               
                height += img.shape[0]
                width += img.shape[1]
                samples +=1
            file_paths.append(image_path)
            if fruit_quality == mixed_quality_path:
                labels.append(fruit + '_Mixed')
            else:
                labels.append(fruit)

fruit_series = pd.Series(file_paths, name='image')
label_series = pd.Series(labels, name='label')
df = pd.concat([fruit_series, label_series], axis=1)

In [3]:
df.head(10)

Unnamed: 0,image,label
0,../data/raw/fruit_images/bad_quality_fruits\Ap...,Apple_Bad
1,../data/raw/fruit_images/bad_quality_fruits\Ap...,Apple_Bad
2,../data/raw/fruit_images/bad_quality_fruits\Ap...,Apple_Bad
3,../data/raw/fruit_images/bad_quality_fruits\Ap...,Apple_Bad
4,../data/raw/fruit_images/bad_quality_fruits\Ap...,Apple_Bad
5,../data/raw/fruit_images/bad_quality_fruits\Ap...,Apple_Bad
6,../data/raw/fruit_images/bad_quality_fruits\Ap...,Apple_Bad
7,../data/raw/fruit_images/bad_quality_fruits\Ap...,Apple_Bad
8,../data/raw/fruit_images/bad_quality_fruits\Ap...,Apple_Bad
9,../data/raw/fruit_images/bad_quality_fruits\Ap...,Apple_Bad


In [4]:
def df_information(df_: pd.DataFrame):    
    class_count= len(list(df["label"].unique()))
    print(f"The dataset contains {df_.shape[0]} images.")
    print(f"The dataset contains the following {class_count} distinct classes. \n")

    for fruit_class in list(df["label"].unique()):
        print(fruit_class)

    items_per_class = list(df_["label"].value_counts())
    print(f"\nEach of the above classses contains {items_per_class} images.")

In [5]:
df_information(df)

The dataset contains 19522 images.
The dataset contains the following 18 distinct classes. 

Apple_Bad
Banana_Bad
Guava_Bad
Lime_Bad
Orange_Bad
Pomegranate_Bad
Apple_Good
Banana_Good
Guava_Good
Lime_Good
Orange_Good
Pomegranate_Good
Apple_Mixed
Banana_Mixed
Guava_Mixed
Lemon_Mixed
Orange_Mixed
Pomegranate_Mixed

Each of the above classses contains [5940, 1216, 1187, 1159, 1152, 1149, 1137, 1129, 1113, 1094, 1087, 1085, 285, 278, 148, 125, 125, 113] images.


In [6]:
# Split the DataFrame into train (70%), validation (15%), and test (15%)
train_df, test_and_val_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(test_and_val_df, test_size=0.5, random_state=42)

# Check the lengths of the resulting DataFrames
print("Train set length:", len(train_df))
print("Validation set length:", len(val_df))
print("Test set length:", len(test_df))


Train set length: 13665
Validation set length: 2928
Test set length: 2929


In [7]:
def df_trim(df_: pd.DataFrame, desired_samples_per_class: int) -> pd.DataFrame:

    # Create an empty DataFrame to store the trimmed data
    trimmed_train_df = pd.DataFrame(columns=df_.columns)

    # Iterate through each class and select the first 200 samples
    for class_name in train_df['label'].unique():
        class_samples = train_df[train_df['label'] == class_name].head(desired_samples_per_class)
        trimmed_train_df = pd.concat([trimmed_train_df, class_samples])

    # Reset the index of the trimmed DataFrame
    trimmed_train_df.reset_index(drop=True, inplace=True)

    return trimmed_train_df

In [8]:
trimmed_train_df = df_trim(df_= train_df, desired_samples_per_class=200)

df_information(df_= trimmed_train_df)

The dataset contains 3149 images.
The dataset contains the following 18 distinct classes. 

Apple_Bad
Banana_Bad
Guava_Bad
Lime_Bad
Orange_Bad
Pomegranate_Bad
Apple_Good
Banana_Good
Guava_Good
Lime_Good
Orange_Good
Pomegranate_Good
Apple_Mixed
Banana_Mixed
Guava_Mixed
Lemon_Mixed
Orange_Mixed
Pomegranate_Mixed

Each of the above classses contains [200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 193, 100, 88, 85, 83] images.


In [9]:
def classes_with_less_than_n_samples(df_: pd.DataFrame, desired_samples_per_class: int):
    class_counts = df_['label'].value_counts()
    classes_with_less_than_n_samples_list =  class_counts[class_counts < desired_samples_per_class].index.tolist()
    print(f"Classes with less than {desired_samples_per_class} samples : {classes_with_less_than_n_samples_list}")

    return classes_with_less_than_n_samples_list

In [17]:
# Define a data augmentation function
import random

def augment_image(image_path, save_folder):
    os.makedirs(save_folder, exist_ok=True)
    image = Image.open(image_path)
    # Apply random rotation (you can customize the rotation angle)
    angle = np.random.randint(-15, 15)
    augmented_image = image.rotate(angle)

    # Apply horizontal flip with a 50% chance
    if np.random.choice([True, False]):
        augmented_image = ImageOps.mirror(augmented_image)

    if save_folder:
        # Ensure the save folder exists
        os.makedirs(save_folder, exist_ok=True)
        filename = f"AUG_IMG_{np.random.randint(1000)}_{angle}.jpg"
        augmented_image.save(os.path.join(save_folder, filename))
    
    return augmented_image


In [22]:
def df_balance(df_: pd.DataFrame, desired_samples_per_class : int) -> pd.DataFrame:
    save_folder = os.path.join(data_dir, "augmented")
    print(save_folder)
    # Iterate through the classes with fewer than 200 samples
    target_class = classes_with_less_than_n_samples(df_=df_, desired_samples_per_class=200)
    for class_name in target_class:
            class_samples = df_[df_["label"]==class_name].value_counts()
            class_df = df_[df_["label"] == class_name]

            images_to_augment = len(class_samples)
            while images_to_augment < desired_samples_per_class:
                
                # Choose an image from the class
                random_index = random.randint(0, len(class_df) - 1)
                # Get the image file path at the random index
                random_image = class_df.iloc[random_index]["image"]

                # Apply data augmentation to generate a new image
                augmented_image = augment_image(image_path=random_image, save_folder=save_folder)

                # Append the augmented image to your dataset
                new_df = pd.DataFrame({'image': [augmented_image], 'label': [class_name]})
                df_ = pd.concat([df_, new_df], ignore_index=True)
                images_to_augment+=1
            print(f"For class [{class_name}] I have augmented [{desired_samples_per_class-len(class_samples)}] images.")
    return df_


In [23]:
df_balanced = df_balance(trimmed_train_df, 200)

../data/raw/fruit_images\augmented
Classes with less than 200 samples : ['Lemon_Mixed', 'Guava_Mixed', 'Orange_Mixed', 'Pomegranate_Mixed', 'Apple_Mixed']
For class [Lemon_Mixed] I have augmented [7] images.
For class [Guava_Mixed] I have augmented [100] images.
For class [Orange_Mixed] I have augmented [112] images.
For class [Pomegranate_Mixed] I have augmented [115] images.
For class [Apple_Mixed] I have augmented [117] images.


In [24]:
df_information(df_balanced)

The dataset contains 3600 images.
The dataset contains the following 18 distinct classes. 

Apple_Bad
Banana_Bad
Guava_Bad
Lime_Bad
Orange_Bad
Pomegranate_Bad
Apple_Good
Banana_Good
Guava_Good
Lime_Good
Orange_Good
Pomegranate_Good
Apple_Mixed
Banana_Mixed
Guava_Mixed
Lemon_Mixed
Orange_Mixed
Pomegranate_Mixed

Each of the above classses contains [200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200] images.
