In [None]:
!pip install split-folders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting split-folders
  Downloading split_folders-0.5.1-py3-none-any.whl (8.4 kB)
Installing collected packages: split-folders
Successfully installed split-folders-0.5.1


In [None]:
import splitfolders
import PIL
import os
import sys
import albumentations as A
import cv2
import pandas as pd
import numpy as np
import random
from pathlib import Path
from PIL import UnidentifiedImageError
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive into the Colab runtime under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): the working-directory switch below is disabled, yet later cells use
# relative paths such as "./training_images_ten" — presumably the kernel's working
# directory has to be the project folder already; confirm before running.
#%cd /content/drive/MyDrive/IS4242 Group 1

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Errno 2] No such file or directory: '/MyDrive/IS4242 Group 1'
/content


### Read Images Ten Classes

In [None]:
# Checks for corrupted images in these folders and removes them.
# PIL.Image.open is lazy (it identifies the file without decoding pixel data), so
# this catches files PIL cannot recognise as an image at all.
ten_class_names = [
    "Capsicum_Green",
    "Capsicum_Red",
    "Capsicum_Yellow",
    "Tomato",
    "Apple_Red",
    "Apple_Green",
    "Banana",
    "Orange",
    "Pear",
    "Lemon",
]
paths = [Path("./training_images_ten/" + name).rglob("*jpg") for name in ten_class_names]
for path in paths:
    for img_p in path:
        try:
            # The context manager closes the file handle that a bare open() would leak.
            with PIL.Image.open(img_p):
                pass
        except PIL.UnidentifiedImageError:
            print(img_p)
            # Delete through pathlib rather than `!rm $img_p`: the shell form splits
            # the interpolated path on spaces and mishandles other special characters.
            img_p.unlink()

In [None]:
# Split the ten-class image folders into train/val subsets; files are copied (move=False).
# For train/val split use 0.75,0.25
# For train/val/test split use 0.8,0.1,0.1
input_folder = "./training_images_ten"
output_folder = "./codes/data/ten_classes"
splitfolders.ratio(
    input_folder,
    output_folder,
    seed=4242,
    ratio=(0.75, 0.25),
    group_prefix=None,
    move=False,
)

Copying files: 200 files [00:43,  4.58 files/s]


### Read Images Seven Classes

In [None]:
# Checks for corrupted images in these folders and removes them.
# PIL.Image.open is lazy (it identifies the file without decoding pixel data), so
# this catches files PIL cannot recognise as an image at all.
seven_class_names = [
    "Capsicum",
    "Tomato",
    "Apple",
    "Banana",
    "Orange",
    "Pear",
    "Lemon",
]
paths = [Path("./training_images_seven/" + name).rglob("*jpg") for name in seven_class_names]
for path in paths:
    for img_p in path:
        try:
            # The context manager closes the file handle that a bare open() would leak.
            with PIL.Image.open(img_p):
                pass
        except PIL.UnidentifiedImageError:
            print(img_p)
            # Delete through pathlib rather than `!rm $img_p`: the shell form splits
            # the interpolated path on spaces and mishandles other special characters.
            img_p.unlink()

In [None]:
# Split the seven-class image folders into train/val subsets; files are copied (move=False).
# For train/val split use 0.75,0.25
# For train/val/test split use 0.8,0.1,0.1
input_folder = "./training_images_seven"
output_folder = "./codes/data/seven_classes"
splitfolders.ratio(
    input_folder,
    output_folder,
    seed=4242,
    ratio=(0.75, 0.25),
    group_prefix=None,
    move=False,
)

Copying files: 140 files [00:29,  4.71 files/s]


### Image Augmentation

In [None]:
#Read the image files
#Set os to the directory containing all the images with the folder name being the label
def read_images(dir_list):
    """Load every image below ``dir_list`` into a DataFrame, labelled by sub-folder name.

    Parameters
    ----------
    dir_list : str
        Directory whose immediate sub-folders each hold the images of one class.
        May be absolute or relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        One row per readable image, with columns ``Image`` (the array from
        ``cv2.imread`` after a BGR -> RGB conversion) and ``Label`` (the name of
        the sub-folder the file was found in).

    Notes
    -----
    The process working directory is changed to ``dir_list``, as it always was.
    Entries of ``dir_list`` that are not directories are ignored, and files that
    OpenCV cannot decode are reported and skipped.
    """
    # Resolve before chdir: afterwards a relative dir_list would point somewhere else.
    root = os.path.abspath(dir_list)
    os.chdir(root)

    images = []
    labels = []
    for folder in os.listdir(root):
        folder_path = os.path.join(root, folder)
        if not os.path.isdir(folder_path):
            # Stray files next to the class folders are not classes.
            continue
        for file in os.listdir(folder_path):
            image = cv2.imread(os.path.join(folder_path, file))
            if image is None:
                # cv2.imread signals an undecodable file by returning None.
                print("Skipping unreadable image: " + folder + "/" + file)
                continue
            images.append(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
            labels.append(folder)

    # Build the frame once; appending rows one at a time copies the frame on every
    # insert and makes pandas try to coerce each (array, str) row into one ndarray.
    return pd.DataFrame({'Image': images, 'Label': labels})

In [None]:
#Image Augmentation
#Can use keras.ImageDataGenerator also
def transform_image(image):
    """Run ``image`` through the albumentations pipeline and return the result.

    The pipeline is applied in this order: LongestMaxSize (max_size 512), horizontal
    flip, vertical flip, ISO noise, advanced blur, and brightness/contrast jitter.
    Every step except the resize carries an explicit probability.
    """
    #Each transformation has probability = 0.5 of occurring by default
    steps = [
        # interpolation flag 3 is cv2.INTER_AREA in OpenCV's numbering
        A.LongestMaxSize(max_size=512, interpolation=3),
        A.HorizontalFlip(p=0.3),
        A.VerticalFlip(p=0.3),
        A.ISONoise(p=0.5),
        A.AdvancedBlur(p=0.3),
        A.RandomBrightnessContrast(p=0.3),
    ]
    pipeline = A.Compose(steps)
    result = pipeline(image=image)
    return result['image']

#First, augment the training data
def augment_data(X, n):
    """Create ``n`` augmented images for every class found in ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with an ``Image`` column (image arrays) and a ``Label`` column.
    n : int
        Number of augmented images to generate per class. Source images are drawn
        with replacement via ``random.choices``, so seed ``random`` for repeatability.

    Returns
    -------
    X : pandas.DataFrame
        Single ``Image`` column holding the augmented images, grouped by class in
        the order of ``le.classes_``.
    y : list of int
        Encoded class for each row of ``X``: ``n`` consecutive entries per class.
    le : sklearn.preprocessing.LabelEncoder
        Encoder fitted on the labels of the input frame.
    """
    # Positional and label indexing must agree for the .iloc lookup below.
    X = X.reset_index(drop=True)
    le = LabelEncoder()
    le.fit(X['Label'])
    classes = list(le.classes_)

    augmented_images = []
    for label in classes:
        #Randomly selects image sequence to augment
        indexes = X[X['Label'] == label].index
        sequence = random.choices(indexes, k = n)
        for index in sequence:
            image = X['Image'].iloc[index]
            # Swaps the first and third channels. NOTE(review): read_images in this
            # notebook already applied the same swap once, so this presumably puts
            # the data back in OpenCV's BGR order for cv2.imwrite — confirm before
            # feeding images from any other source.
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            augmented_images.append(transform_image(image))

    # Build the frame in one go; growing it row by row copies it on every insert
    # and forces pandas to coerce each ndarray-bearing row.
    X = pd.DataFrame({'Image': augmented_images})
    y = [code for code in range(len(classes)) for _ in range(n)]
    return X, y, le

#Save the augmented images
def save_images(X_train_aug, y_train_aug, le, save_dir):
    """Write every augmented image to ``<save_dir>/<label>/<k>.jpg``.

    Parameters
    ----------
    X_train_aug : pandas.DataFrame
        Frame with an ``Image`` column; row ``i`` belongs to ``y_train_aug[i]``.
    y_train_aug : sequence of int
        Encoded class of each row, decodable by ``le``.
    le : fitted label encoder
        Its ``inverse_transform`` turns ``y_train_aug`` back into folder names.
    save_dir : str
        Root output directory; one sub-folder per label is created if missing.

    Raises
    ------
    OSError
        If OpenCV reports that an image could not be written.

    Notes
    -----
    ``k`` counts images per label starting at 0, so files never overwrite each
    other whatever the number of images per class. For 100 images per class this
    gives the same ``0.jpg`` .. ``99.jpg`` names as before. As before, each file
    name is printed and the working directory ends up in the last label folder.
    """
    # Resolve once: after the first chdir a relative save_dir would no longer resolve.
    save_root = os.path.abspath(save_dir)
    # Decode all labels up front instead of once per image.
    labels = le.inverse_transform(y_train_aug)
    written_per_label = {}
    for i in range(X_train_aug.shape[0]):
        image = X_train_aug.iloc[i]['Image']
        label = labels[i]
        target_dir = os.path.join(save_root, label)
        os.makedirs(target_dir, exist_ok=True)
        os.chdir(target_dir)
        position = written_per_label.get(label, 0)
        written_per_label[label] = position + 1
        filename = str(position) + ".jpg"
        print(filename)
        # cv2.imwrite reports failure through its return value, not an exception.
        if not cv2.imwrite(filename, image):
            raise OSError("Could not write " + os.path.join(target_dir, filename))

### Ten Classes

In [None]:
#Set random seed for repeatable results
random.seed(4242)

# Load the ten-class training split, build 100 augmented images per class, and save them.
df_ten = read_images(dir_list="/content/drive/MyDrive/IS4242 Group 1/codes/data/ten_classes/train")
X_train_aug_ten, y_train_aug_ten, le_ten = augment_data(X=df_ten, n=100)
save_images(
    X_train_aug=X_train_aug_ten,
    y_train_aug=y_train_aug_ten,
    le=le_ten,
    save_dir="/content/drive/MyDrive/IS4242 Group 1/codes/data/ten_classes/augmented/",
)

  element = np.asarray(element)


0.jpg
1.jpg
2.jpg
3.jpg
4.jpg
5.jpg
6.jpg
7.jpg
8.jpg
9.jpg
10.jpg
11.jpg
12.jpg
13.jpg
14.jpg
15.jpg
16.jpg
17.jpg
18.jpg
19.jpg
20.jpg
21.jpg
22.jpg
23.jpg
24.jpg
25.jpg
26.jpg
27.jpg
28.jpg
29.jpg
30.jpg
31.jpg
32.jpg
33.jpg
34.jpg
35.jpg
36.jpg
37.jpg
38.jpg
39.jpg
40.jpg
41.jpg
42.jpg
43.jpg
44.jpg
45.jpg
46.jpg
47.jpg
48.jpg
49.jpg
50.jpg
51.jpg
52.jpg
53.jpg
54.jpg
55.jpg
56.jpg
57.jpg
58.jpg
59.jpg
60.jpg
61.jpg
62.jpg
63.jpg
64.jpg
65.jpg
66.jpg
67.jpg
68.jpg
69.jpg
70.jpg
71.jpg
72.jpg
73.jpg
74.jpg
75.jpg
76.jpg
77.jpg
78.jpg
79.jpg
80.jpg
81.jpg
82.jpg
83.jpg
84.jpg
85.jpg
86.jpg
87.jpg
88.jpg
89.jpg
90.jpg
91.jpg
92.jpg
93.jpg
94.jpg
95.jpg
96.jpg
97.jpg
98.jpg
99.jpg
0.jpg
1.jpg
2.jpg
3.jpg
4.jpg
5.jpg
6.jpg
7.jpg
8.jpg
9.jpg
10.jpg
11.jpg
12.jpg
13.jpg
14.jpg
15.jpg
16.jpg
17.jpg
18.jpg
19.jpg
20.jpg
21.jpg
22.jpg
23.jpg
24.jpg
25.jpg
26.jpg
27.jpg
28.jpg
29.jpg
30.jpg
31.jpg
32.jpg
33.jpg
34.jpg
35.jpg
36.jpg
37.jpg
38.jpg
39.jpg
40.jpg
41.jpg
42.jpg
43.jpg
44.jpg
45.jp

In [None]:
#Plot Image Dimensions


### Seven Classes

In [None]:
# Load the seven-class training split, build 100 augmented images per class, and save them.
df_seven = read_images(dir_list="/content/drive/MyDrive/IS4242 Group 1/codes/data/seven_classes/train")
X_train_aug_seven, y_train_aug_seven, le_seven = augment_data(X=df_seven, n=100)
save_images(
    X_train_aug=X_train_aug_seven,
    y_train_aug=y_train_aug_seven,
    le=le_seven,
    save_dir="/content/drive/MyDrive/IS4242 Group 1/codes/data/seven_classes/augmented/",
)

  element = np.asarray(element)


0.jpg
1.jpg
2.jpg
3.jpg
4.jpg
5.jpg
6.jpg
7.jpg
8.jpg
9.jpg
10.jpg
11.jpg
12.jpg
13.jpg
14.jpg
15.jpg
16.jpg
17.jpg
18.jpg
19.jpg
20.jpg
21.jpg
22.jpg
23.jpg
24.jpg
25.jpg
26.jpg
27.jpg
28.jpg
29.jpg
30.jpg
31.jpg
32.jpg
33.jpg
34.jpg
35.jpg
36.jpg
37.jpg
38.jpg
39.jpg
40.jpg
41.jpg
42.jpg
43.jpg
44.jpg
45.jpg
46.jpg
47.jpg
48.jpg
49.jpg
50.jpg
51.jpg
52.jpg
53.jpg
54.jpg
55.jpg
56.jpg
57.jpg
58.jpg
59.jpg
60.jpg
61.jpg
62.jpg
63.jpg
64.jpg
65.jpg
66.jpg
67.jpg
68.jpg
69.jpg
70.jpg
71.jpg
72.jpg
73.jpg
74.jpg
75.jpg
76.jpg
77.jpg
78.jpg
79.jpg
80.jpg
81.jpg
82.jpg
83.jpg
84.jpg
85.jpg
86.jpg
87.jpg
88.jpg
89.jpg
90.jpg
91.jpg
92.jpg
93.jpg
94.jpg
95.jpg
96.jpg
97.jpg
98.jpg
99.jpg
0.jpg
1.jpg
2.jpg
3.jpg
4.jpg
5.jpg
6.jpg
7.jpg
8.jpg
9.jpg
10.jpg
11.jpg
12.jpg
13.jpg
14.jpg
15.jpg
16.jpg
17.jpg
18.jpg
19.jpg
20.jpg
21.jpg
22.jpg
23.jpg
24.jpg
25.jpg
26.jpg
27.jpg
28.jpg
29.jpg
30.jpg
31.jpg
32.jpg
33.jpg
34.jpg
35.jpg
36.jpg
37.jpg
38.jpg
39.jpg
40.jpg
41.jpg
42.jpg
43.jpg
44.jpg
45.jp

In [None]:
#Analysis of final datasets

#Plot image dimensions

def _read_labelled_folders():
    """Load the images in every sub-folder of the current directory, labelled by folder name."""
    images = []
    labels = []
    for folder in os.listdir():
        if not os.path.isdir(folder):
            # Stray files next to the class folders are not classes.
            continue
        for file in os.listdir(folder):
            image = cv2.imread(folder + "/" + file)
            print(folder + "/" + file)
            if image is None:
                # cv2.imread signals an undecodable file by returning None.
                continue
            images.append(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
            labels.append(folder)
    # One-shot construction instead of appending a row per image.
    return pd.DataFrame({'Image': images, 'Label': labels})


def read_images2():
    """Read the augmented and the original training images of one dataset.

    Must be called with the working directory set to a dataset folder that
    contains ``augmented`` and ``train`` sub-folders (seven_classes/ten_classes).

    Returns
    -------
    (aug_df, c_df) : tuple of pandas.DataFrame
        Frames with ``Image`` and ``Label`` columns for ``augmented`` and ``train``
        respectively. Non-directory entries and undecodable files are skipped.

    Notes
    -----
    The working directory is left inside the ``train`` folder, as before.
    """
    #Reads images from the current directory after data preprocessing
    #Change directory to the seven_classes/ten_classes folder
    os.chdir("augmented")
    aug_df = _read_labelled_folders()
    os.chdir("../train")
    c_df = _read_labelled_folders()
    return aug_df, c_df


In [None]:
#Seven classes
# read_images2 resolves "augmented" and "../train" relative to the current working
# directory, so the kernel has to be inside the seven-class data folder when this
# runs. NOTE(review): the disabled chdir below suggests that was done by hand — confirm.
#os.chdir("../seven_classes")
aug7_df, c7_df = read_images2()

#Plot the dimensions
def to_height_width(df):
    """Summarise each image of ``df`` as its height, width and label.

    ``df`` needs an ``Image`` column of arrays and a ``Label`` column. The result
    has one row per input row with columns ``Height`` (``shape[0]``),
    ``Width`` (``shape[1]``) and ``Label``.
    """
    shapes = [picture.shape for picture in df['Image']]
    return pd.DataFrame({
        "Height": [dims[0] for dims in shapes],
        "Width": [dims[1] for dims in shapes],
        "Label": list(df['Label']),
    })

# Replace both image frames with their per-image height/width/label summaries.
aug7_df, c7_df = [to_height_width(frame) for frame in (aug7_df, c7_df)]

In [None]:
# Width vs. height of every augmented seven-class image, coloured by label.
plt.figure(figsize=(12, 8))
ax_aug7 = sns.scatterplot(data=aug7_df, x="Width", y="Height", hue="Label")
ax_aug7.set(title='Image Dimensions for Augmented Dataset (7 Classes)')

In [None]:
# Width vs. height of every original (control) seven-class training image, coloured by label.
plt.figure(figsize=(12, 8))
ax_c7 = sns.scatterplot(data=c7_df, x="Width", y="Height", hue="Label")
ax_c7.set(title='Image Dimensions for Control Dataset (7 Classes)')

In [None]:
#Ten classes
# read_images2 finishes inside a "train" folder, so going two levels up and into
# "ten_classes" presumably lands in the sibling ten-class data folder. This only
# works when the seven-class cell ran immediately before — TODO confirm.
os.chdir("../../ten_classes")
aug10_df, c10_df = read_images2()
# Reduce both frames to per-image height/width/label summaries for plotting.
aug10_df = to_height_width(aug10_df)
c10_df = to_height_width(c10_df)

In [None]:
# Width vs. height of every augmented ten-class image, coloured by label.
plt.figure(figsize=(12, 8))
ax_aug10 = sns.scatterplot(data=aug10_df, x="Width", y="Height", hue="Label")
ax_aug10.set(title='Image Dimensions for Augmented Dataset (10 Classes)')

In [None]:
# Width vs. height of every original (control) ten-class training image, coloured by label.
plt.figure(figsize=(12, 8))
ax_c10 = sns.scatterplot(data=c10_df, x="Width", y="Height", hue="Label")
ax_c10.set(title='Image Dimensions for Control Dataset (10 Classes)')