In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Problem Statement

<li>VegCart(Random Name) is a fresh produce supply chain company. They are pioneers in solving one of the toughest supply chain problems of the world by leveraging innovative technology. They source fresh produce from farmers and deliver them to businesses within 12 hours. An integral component of their automation process is the development of robust classifiers which can distinguish between images of different types of vegetables, while also correctly labeling images that do not contain any one type of vegetable as noise.</li>

<li>As a starting point, we have been tasked with preparing a multiclass classifier for identifying these vegetables. The dataset provided has all the required images to achieve the task.</li>

In [2]:
import os
import glob
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics


# Tensorflow import
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dense, ReLU, Softmax, BatchNormalization, Dropout
from tensorflow.random import set_seed
from keras.preprocessing.image import ImageDataGenerator



In [3]:
set_seed(111) # set random seed

# To supress any warnings during the flow
import warnings
warnings.filterwarnings('ignore')

# Exploratory Data Analysis

In [4]:
os.mkdir("train")
os.mkdir("test")

In [5]:

import numpy as np
from PIL import Image

def create_matrix_collage(image_paths, rows, cols, output_size):
    collage = Image.new('RGB', output_size)
    width_per_image = output_size[0] // cols
    height_per_image = output_size[1] // rows

    for i in range(rows):
        for j in range(cols):
            img_path = image_paths[i * cols + j]
            img = Image.open(img_path)
            img = img.resize((width_per_image, height_per_image), Image.ANTIALIAS)
            collage.paste(img, (j * width_per_image, i * height_per_image))

    return collage



def image_generator(num = 2000):
    base_path = "/kaggle/input/vegetable-image-dataset/Vegetable Images/train"
    veget = os.listdir(base_path)
    veget.sort()
    labels = []
    for i in range(1, num+1):
        image_paths = []  # Replace with your image paths
        rows = 4  # Number of rows in the collage
        cols = 4  # Number of columns in the collage
        output_size = (128, 128)  # Size of the final collage image (rows * cols)
        temp = [0]*15
        for j in range(1, 17):
            num = np.random.randint(0, 15)
            veg_name = veget[num]
            veg_path = os.path.join(base_path, veg_name)
            img_num = np.random.randint(0, 1000)
            lst = os.listdir(veg_path)
            veg_img_path = os.path.join(veg_path, lst[img_num])
            image_paths.append(veg_img_path)
            temp[num] = 1
        
        collage = create_matrix_collage(image_paths, rows, cols, output_size)
        collage.save(f"train/img_{i}.jpg")
        labels.append(temp)
    return veget, labels



In [6]:
veget, labels = image_generator(2000)

In [7]:
labels_df = pd.DataFrame(labels, columns = veget)

In [8]:
labels_df.head()

Unnamed: 0,Bean,Bitter_Gourd,Bottle_Gourd,Brinjal,Broccoli,Cabbage,Capsicum,Carrot,Cauliflower,Cucumber,Papaya,Potato,Pumpkin,Radish,Tomato
0,1,0,1,1,0,1,1,1,1,0,0,1,1,1,0
1,1,0,1,1,0,1,1,0,1,1,1,0,0,1,1
2,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1
3,1,1,0,1,0,1,1,1,0,1,0,1,1,1,1
4,1,1,1,0,0,1,0,1,1,1,0,1,1,1,1


In [9]:
L = os.listdir("/kaggle/working/train")
L.sort()

In [10]:
veget

['Bean',
 'Bitter_Gourd',
 'Bottle_Gourd',
 'Brinjal',
 'Broccoli',
 'Cabbage',
 'Capsicum',
 'Carrot',
 'Cauliflower',
 'Cucumber',
 'Papaya',
 'Potato',
 'Pumpkin',
 'Radish',
 'Tomato']

In [11]:
newsize = (128, 128)
base_path = "/kaggle/input/vegetable-image-dataset/Vegetable Images/validation"
new_path = "/kaggle/working/train"
img_num = 2001
add_num = 133
labels = []
for id, name in enumerate(veget):
    if id == 14:
        add_num = 138
    temp = [0]*15
    temp[id] = 1
    veg_name_path = os.path.join(base_path, name)
    lst = os.listdir(veg_name_path)
    for i in range(add_num):
        num = np.random.randint(0, len(lst))
        veg_img_path = os.path.join(veg_name_path, lst[num])
        img = Image.open(veg_img_path)
        img = img.resize(newsize)
        img_name = f"img_{img_num}" + ".jpg"
        new_train_path = os.path.join(new_path, img_name)
        img.save(new_train_path)
        labels.append(temp)
        img_num += 1
    

In [12]:
temp_df = pd.DataFrame(labels, columns = veget)


In [13]:
labels_df = pd.concat([labels_df, temp_df])

In [14]:
labels_df.reset_index(drop = True, inplace = True)

In [15]:
labels_df.head()

Unnamed: 0,Bean,Bitter_Gourd,Bottle_Gourd,Brinjal,Broccoli,Cabbage,Capsicum,Carrot,Cauliflower,Cucumber,Papaya,Potato,Pumpkin,Radish,Tomato
0,1,0,1,1,0,1,1,1,1,0,0,1,1,1,0
1,1,0,1,1,0,1,1,0,1,1,1,0,0,1,1
2,1,1,1,1,0,0,1,1,1,1,0,0,1,1,1
3,1,1,0,1,0,1,1,1,0,1,0,1,1,1,1
4,1,1,1,0,0,1,0,1,1,1,0,1,1,1,1


In [16]:
# !zip -r file.zip /kaggle/working/train


In [17]:
# Import the inception v3 model


In [18]:
# def inception_v3(inputs,
#                  num_classes=15,
#                  is_training=True,
#                  dropout_keep_prob=0.8,
#                  min_depth=16,
#                  depth_multiplier=1.0,
#                  prediction_fn=tf.sigmoid,
#                  spatial_squeeze=True,
#                  reuse=None,
#                  scope='InceptionV3'):

In [19]:
# import the inception v3 model
from tensorflow.keras.applications.inception_v3 import InceptionV3

In [20]:
pre_trained_model = InceptionV3(input_shape = (128, 128, 3),
                                include_top = False,
                               weights = 'imagenet')

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5


In [21]:
for layer in pre_trained_model.layers:
    layers.trainable = False

In [22]:
from keras.models import Model

In [23]:
from tensorflow.keras.optimizers import RMSprop
from sklearn.metrics import f1_score

def model_creator(pre_trained_model):
    # flatten the output layer to one dimension
    x = layers.Flatten()(pre_trained_model.output)
    # add a fully layer with 1024 hidden units and ReLU activation
    x = layers.Dense(1024, activation = 'relu')(x)
    # add a dropout rate of 0.2
    x = layers.Dropout(0.2)(x)
    # add a final sigmoid layer for classfication
    x = layers.Dense(15, activation = "sigmoid")(x)
    model = Model(pre_trained_model.input, x)
    model.compile(optimizer = RMSprop(lr = 0.0001),
                 loss = "binary_crossentropy", 
                 metrics = ['accuracy'])
    return model


In [24]:
from keras.preprocessing.image import ImageDataGenerator

In [25]:
# Preprocessing step

In [26]:
from keras import layers
import tensorflow as tf
data_augmentation = tf.keras.Sequential([
  layers.Rescaling(1./255)
])

In [27]:
import numpy as np
from sklearn.model_selection import StratifiedKFold


In [28]:
X = []; y = []
for img in os.listdir("/kaggle/working/train"):
    img_path = "/kaggle/working/train" + "/" + img
    label = int(img.split("_")[1].split(".")[0])-1
    label = list(labels_df.iloc[label, :].values)
    img = Image.open(img_path)
    arr = np.asarray(img)
    arr = data_augmentation(arr)
    X.append(arr)
    y.append(label)


In [29]:
X = np.array(X)
y = np.array(y)

In [32]:
from sklearn.model_selection import KFold


# Define the number of folds
k = 10  # You can adjust this as needed

# Initialize KFold
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Lists to store training histories
all_histories = []
model = model_creator(pre_trained_model)
# Loop through the folds
for train_index, val_index in kf.split(X):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Create and compile your Keras model
#     model = model_creator(pre_trained_model)

    # Train the model
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        batch_size=50,
        epochs=100,
        verbose=2  # You can adjust verbosity
    )

    # Append the training history to the list
    all_histories.append(history)




Epoch 1/100
72/72 - 47s - loss: 0.5313 - accuracy: 0.0894 - val_loss: 1.5330 - val_accuracy: 0.0350 - 47s/epoch - 658ms/step
Epoch 2/100
72/72 - 6s - loss: 0.4515 - accuracy: 0.0933 - val_loss: 1.6550 - val_accuracy: 0.0350 - 6s/epoch - 86ms/step
Epoch 3/100
72/72 - 6s - loss: 0.4212 - accuracy: 0.1525 - val_loss: 2.0265 - val_accuracy: 0.0800 - 6s/epoch - 85ms/step
Epoch 4/100
72/72 - 6s - loss: 0.4204 - accuracy: 0.1847 - val_loss: 0.4282 - val_accuracy: 0.1875 - 6s/epoch - 86ms/step
Epoch 5/100
72/72 - 6s - loss: 0.4061 - accuracy: 0.2528 - val_loss: 5.4567 - val_accuracy: 0.1575 - 6s/epoch - 86ms/step
Epoch 6/100
72/72 - 6s - loss: 0.3961 - accuracy: 0.2617 - val_loss: 9.9689 - val_accuracy: 0.3225 - 6s/epoch - 86ms/step
Epoch 7/100
72/72 - 6s - loss: 0.3733 - accuracy: 0.3278 - val_loss: 0.4612 - val_accuracy: 0.2550 - 6s/epoch - 85ms/step
Epoch 8/100
72/72 - 6s - loss: 0.3588 - accuracy: 0.4072 - val_loss: 0.3677 - val_accuracy: 0.3400 - 6s/epoch - 86ms/step
Epoch 9/100
72/72 - 6

In [33]:
y_pred = model.predict(X_val)



In [36]:
y_pred_val = y_pred>0.5


In [37]:
y_val

array([[1, 0, 1, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 0],
       [0, 0, 1, ..., 1, 1, 1]])

In [39]:
true_labels = y_val.copy()
predicted_labels = y_pred_val.copy()

In [40]:
from sklearn.metrics import f1_score

macro_f1 = f1_score(true_labels, predicted_labels, average='macro')

# Calculate micro-average F1 score
micro_f1 = f1_score(true_labels, predicted_labels, average='micro')
print("macro_f1 ", macro_f1)
print("micro_f1 ", micro_f1)

macro_f1  0.9997799779977998
micro_f1  0.9997778271495222


In [41]:
# testing dataset

In [42]:
model.save("model.h5")

In [43]:

import numpy as np
from PIL import Image

def create_matrix_collage(image_paths, rows, cols, output_size):
    collage = Image.new('RGB', output_size)
    width_per_image = output_size[0] // cols
    height_per_image = output_size[1] // rows

    for i in range(rows):
        for j in range(cols):
            img_path = image_paths[i * cols + j]
            img = Image.open(img_path)
            img = img.resize((width_per_image, height_per_image), Image.ANTIALIAS)
            collage.paste(img, (j * width_per_image, i * height_per_image))

    return collage



def image_generator(num = 205):
    base_path = "/kaggle/input/vegetable-image-dataset/Vegetable Images/test"
    veget = os.listdir(base_path)
    veget.sort()
    labels = []
    for i in range(1, num+1):
        image_paths = []  # Replace with your image paths
        rows = 4  # Number of rows in the collage
        cols = 4  # Number of columns in the collage
        output_size = (128, 128)  # Size of the final collage image (rows * cols)
        temp = [0]*15
        for j in range(1, 17):
            num = np.random.randint(0, 15)
            veg_name = veget[num]
            veg_path = os.path.join(base_path, veg_name)
            img_num = np.random.randint(0, 200)
            lst = os.listdir(veg_path)
            veg_img_path = os.path.join(veg_path, lst[img_num])
            image_paths.append(veg_img_path)
            temp[num] = 1
        
        collage = create_matrix_collage(image_paths, rows, cols, output_size)
        collage.save(f"test/img_{i}.jpg")
        labels.append(temp)
    return veget, labels



In [44]:
veget, labels = image_generator(205)
labels_df = pd.DataFrame(labels, columns = veget)
newsize = (128, 128)
base_path = "/kaggle/input/vegetable-image-dataset/Vegetable Images/test"
new_path = "/kaggle/working/test"
img_num = 206
add_num = 3
labels = []
for id, name in enumerate(veget):
    temp = [0]*15
    temp[id] = 1
    veg_name_path = os.path.join(base_path, name)
    lst = os.listdir(veg_name_path)
    for i in range(add_num):
        num = np.random.randint(0, len(lst))
        veg_img_path = os.path.join(veg_name_path, lst[num])
        img = Image.open(veg_img_path)
        img = img.resize(newsize)
        img_name = f"img_{img_num}" + ".jpg"
        new_train_path = os.path.join(new_path, img_name)
        img.save(new_train_path)
        labels.append(temp)
        img_num += 1
    

In [45]:
# !zip -r file.zip /kaggle/working
temp_df = pd.DataFrame(labels, columns = veget)
labels_df = pd.concat([labels_df, temp_df])
labels_df.reset_index(drop = True, inplace = True)

In [46]:
len(labels_df)

250

In [47]:
X_test = []; y_test = []
for img in os.listdir("/kaggle/working/test"):
    img_path = "/kaggle/working/test" + "/" + img
    label = int(img.split("_")[1].split(".")[0])-1
    label = list(labels_df.iloc[label, :].values)
    img = Image.open(img_path)
    arr = np.asarray(img)
    arr = data_augmentation(arr)
    X_test.append(arr)
    y_test.append(label)


In [49]:
X_test = np.array(X_test)
y_test = np.array(y_test)

In [50]:
y_pred_test = model.predict(X_test)



In [51]:
true_labels = y_test.copy()
predicted_labels = y_pred_test>0.5

In [52]:
from sklearn.metrics import f1_score

macro_f1 = f1_score(true_labels, predicted_labels, average='macro')

# Calculate micro-average F1 score
micro_f1 = f1_score(true_labels, predicted_labels, average='micro')
print("macro_f1 ", macro_f1)
print("micro_f1 ", micro_f1)

macro_f1  0.8100320575789599
micro_f1  0.811175785797439


In [None]:
!pip install langchain