In [None]:
#importing required libraries


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import tensorflow as tf
from keras import Sequential
from keras.layers import Dense,Dropout, Flatten, BatchNormalization,Activation
from keras.layers import Lambda, Input, GlobalAveragePooling2D
from keras.optimizers import Adam, SGD
from keras.utils import to_categorical
from keras.models import Model
from keras.preprocessing.image import load_img
from keras.callbacks import ReduceLROnPlateau,EarlyStopping, ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.autonotebook import tqdm

from sklearn.model_selection import train_test_split


import os
import gc


# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Report whether TensorFlow can see a GPU, then list the detected GPU devices.

gpu_devices = tf.config.list_physical_devices('GPU')
if gpu_devices:
    print('Yes !! GPU is available')
else:
    print('GPU in not available !')
print(gpu_devices)

In [None]:
# Print the installed TensorFlow version and the version reported by tf.keras.
print(tf.__version__)
print(tf.keras.__version__)

In [None]:
# Fixed seed shared by the random number generators and the data generators.
seed = 42

def seed_everything(seed):
    """Seed every random number generator this notebook can reach.

    Args:
        seed: Integer seed applied to Python hashing, the standard-library
            ``random`` module, NumPy and TensorFlow.
    """
    import random  # local import: the notebook's import cell does not bring in `random`

    # PYTHONHASHSEED is read at interpreter start-up, so setting it here only
    # affects processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

seed_everything(seed)

In [None]:
# Directories holding the competition's train and test images
# (paths are relative to the notebook's working directory).

train_dir = '../input/cassava-leaf-disease-classification/train_images/'
test_dir = '../input/cassava-leaf-disease-classification/test_images/'

In [None]:
# Load train.csv (image ids and their labels) and preview the first 10 rows.
train_df = pd.read_csv('../input/cassava-leaf-disease-classification/train.csv')
train_df.head(10)

In [None]:
# Compare how many files sit in the train folder with how many ids train.csv lists.
# Only the counts are compared, not the individual file names.
n_train_files = len(os.listdir(train_dir))
n_train_ids = len(train_df['image_id'])

if n_train_files != n_train_ids:
    print('Number of image ids in train.csv file does not match with the actual number of images present in train folder')
else:
    print('Number of image ids in train.csv file matches with the actual number of images present in train folder')

### Observation:
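1. The number of image ids in train.csv matches the number of image files in the train folder. Note that the check above compares counts only; it does not verify that the files appear in the same order as the rows of train.csv.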

In [None]:
# Column dtypes and non-null counts of the training table.
train_df.info()

In [None]:
import json

# Read the JSON file that maps each label number (as a string key) to its disease name.
label_map_path = '../input/cassava-leaf-disease-classification/label_num_to_disease_map.json'
with open(label_map_path) as label_map_file:
    classes = json.load(label_map_file)

classes

### Observation:
1. We have 5 different classes
2. Classes 0 - 3 ---> represent diseases
3. Class 4 ---> Healthy leaf
4. We don't have any null values

In [None]:
# `classes` is keyed by strings, so the integer labels are cast to str before
# being mapped to their disease names.
label_as_text = train_df['label'].astype('str')
train_df['class_name'] = label_as_text.map(classes)
train_df.head()

In [None]:
#Potting the categorical ratio

#function to plot bar height information

def barh(ax, offset=100):
    """Write each bar's height just above the bar.

    Args:
        ax: Axes-like object exposing ``patches`` (the bars) and ``annotate``.
        offset: Vertical gap, in data units, between the top of a bar and its
            label. Defaults to 100, the gap that used to be hard-coded.
    """
    for p in ax.patches:
        val = p.get_height()  # height of the bar
        x = p.get_x() + p.get_width() / 2  # horizontal centre of the bar
        # The label belongs above the bar's top edge (baseline + height). The
        # baseline is get_y(); using get_x() here made the label height depend
        # on the bar's horizontal position.
        y = p.get_y() + p.get_height() + offset
        ax.annotate(round(val, 2), (x, y))
    
# Bar chart of images per class, most frequent class first, with each bar's
# count written above it.

class_order = train_df['class_name'].value_counts().index
plt.figure(figsize=(15, 7))
ax0 = sns.countplot(x=train_df['class_name'], order=class_order)
barh(ax0)
plt.show()

### Observation:
1. We have class imbalance here, i.e. the label 'Cassava Mosaic Disease (CMD)' has 13158 images while the other classes have fewer than 3000 images each.
2. We will use stratify = train_df.label.values when splitting off the validation data.

In [None]:
from plotly.express import pie

# Build an explicit two-column frame (class name, image count). Relying on the
# column name that pd.DataFrame(series.value_counts()) happens to produce is
# fragile: older pandas names it after the series ('class_name'), pandas >= 2.0
# names it 'count', which breaks values='class_name'.
class_val = (
    train_df['class_name']
    .value_counts()
    .rename_axis('class_name')
    .reset_index(name='count')
)
print(class_val)

fig = pie(class_val, values='count', names='class_name', title='Image Class distribution ')
fig.show()


### Observation:
1. We have at least 1000 images per class or category

### Reading images from the train DIR

In [None]:
# (rows, columns) of the full training table, before the train/validation split.
train_df.shape

### Taking data in batches

As we have limited RAM and a huge number of images, we will load the images in batches and extract features from them.

In [None]:
# Full path of every training image, in the row order of train.csv.
img_dir = [os.path.join(train_dir, image_name) for image_name in tqdm(train_df['image_id'].values)]

train_df['img_dir'] = img_dir  # new column holding the image paths
train_df = train_df.astype('str')  # datagen requires the target value in str format
train_df.head()


In [None]:
# Hold out 20% of the rows as a validation set. The split is stratified on the
# label because the classes are imbalanced.
# NOTE: train_df is overwritten here, so re-running this cell splits the already
# reduced training frame again.

train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=100,
    stratify=train_df['label'].values,
)
train_df.shape

In [None]:
#validation dataframe shape
# (rows, columns) of the held-out validation frame.
val_df.shape

In [None]:
# Hyperparameters
batch_size = 32  # Batch size > 32 will cause ResourceExhaustedError during model.fit()
epochs = 20
learn_rate = 0.001
# sgd=SGD(lr=learn_rate,momentum=.9,nesterov=False)
# `learning_rate` replaces the deprecated `lr` alias. epsilon is left at its
# default (1e-7) instead of being passed as None, which newer Keras releases
# do not support.
adam = Adam(learning_rate=learn_rate, beta_1=0.9, beta_2=0.999, amsgrad=False)

input_shape = (300, 300, 3)  # 300x300 RGB images
n_classes = len(classes)

In [None]:
# #using Xception 

# from keras.applications.xception import Xception, preprocess_input
# # xception_preprocessor = preprocess_input
# xception = Xception(include_top= False, weights = 'imagenet',
#                        input_shape = input_shape,
#                        classes = n_classes) 

### Data augmentation:
1. Data augmentation encompasses a wide range of techniques used to generate “new” training samples from the original ones by applying random jitters and perturbations (but at the same time ensuring that the class labels of the data are not changed).
2. The basic idea behind the augmentation is to train the model on all kind of possible transformations of an image
3. Here we are using flow_from_dataframe. This is because we have limited ram and we need to get images in batches with respect to the image_id available in the train.csv file

In [None]:
# # we are defining ImageDataGenerator

# datagen = ImageDataGenerator(horizontal_flip = True,
#                             vertical_flip = True,
#                              zoom_range = 0.2,
# #                              shear_range = 0.2,
#                              rescale = 1.0/255,  # Ar RGB colors are presented in 0-155 range (1 pixel = 8 bits, since each bit can be 1 or 0, 8 bits info 2^8 = 256 , 0-255 , total 256)
# #                              width_shift_range = 0.2,
# #                              height_shift_range = 0.2,
#                              fill_mode = 'nearest',
# #                              preprocessing_function = preprocess_input
                             
#                             ) 
# datagen_val = ImageDataGenerator(#preprocessing_function = preprocess_input,
#                                  rescale = 1.0/255) # as we don not need all transformation during validation

# datagen_pred = ImageDataGenerator(#preprocessing_function = preprocess_input,
#                                   rescale = 1.0/255 )

In [None]:
# Random augmentations applied to the training images only.
augmentation_options = dict(
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    brightness_range=[0.5, 1.5],
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest',
)
datagen = ImageDataGenerator(**augmentation_options)

# Validation and prediction images are passed through without any augmentation.
datagen_val = ImageDataGenerator()
datagen_pred = ImageDataGenerator()



In [None]:
# Stream augmented training batches from the dataframe instead of loading every image at once.
# Tutorial: https://vijayabhaskar96.medium.com/tutorial-on-keras-imagedatagenerator-with-flow-from-dataframe-8bd5776e45c1

train_generator = datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=train_dir,
    x_col="image_id",
    y_col="label",
    target_size=(300, 300),
    color_mode='rgb',
    class_mode="sparse",
    batch_size=batch_size,
    shuffle=True,
    seed=seed,
    interpolation="nearest",
)

In [None]:
# Validation batches: same source folder and image size as training, but
# unaugmented and in a fixed (unshuffled) order.
val_generator = datagen_val.flow_from_dataframe(
    dataframe=val_df,
    directory=train_dir,
    x_col="image_id",
    y_col="label",
    target_size=(300, 300),
    color_mode='rgb',
    class_mode="sparse",
    batch_size=batch_size,
    shuffle=False,
    seed=seed,
    interpolation="nearest",
)

In [None]:
# Test-image generator used for prediction.
# flow_from_directory expects one sub-folder per class, so the parent folder is
# given and 'test_images' is listed as the only "class":
# https://kylewbanks.com/blog/loading-unlabeled-images-with-imagedatagenerator-flowfromdirectory-keras

pred_datagen = datagen_pred.flow_from_directory(
    "../input/cassava-leaf-disease-classification/",
    classes=['test_images'],
    target_size=(300, 300),
    color_mode='rgb',
    batch_size=1,  # one image per batch
    # flow_from_directory shuffles by default. Without shuffle=False the rows
    # returned by model.predict() come back in random order and cannot be
    # matched to the image files (pred_datagen.filenames).
    shuffle=False,
    seed=seed,
)

In [None]:
#plotting some images from image generator https://www.analyticsvidhya.com/blog/2020/08/image-augmentation-on-the-fly-using-keras-imagedatagenerator/

fig, ax = plt.subplots(nrows=1, ncols=5, figsize=(16, 16))

for i in range(5):
    # next() yields an (images, labels) batch; keep the first image, which is
    # already a single (height, width, channels) array.
    image = next(train_generator)[0][0]

    # The generator applies no rescaling, so pixels are floats in [0, 255].
    # imshow expects floats in [0, 1] or integers in [0, 255]; drawing the raw
    # floats clips almost every pixel to white. Cast to uint8 before plotting.
    ax[i].imshow(np.clip(image, 0, 255).astype('uint8'))
    ax[i].axis('off')

In [None]:
# Stop training when the validation loss metric has stopped decreasing for 5 epochs.
early_stopping = EarlyStopping(monitor = 'val_loss',
                               patience = 5,
                               mode = 'min',
                               restore_best_weights = True)

# Save the model with the minimum validation loss
checkpoint = ModelCheckpoint('best_model.hdf5',
                             monitor = 'val_loss',
                             verbose = 1,
                             mode = 'min',
                             save_best_only = True)

# Multiply the learning rate by 0.2 once val_loss has not improved for 3 epochs.
# min_lr is the floor for the reduced rate and must be lower than the starting
# rate (learn_rate = 0.001). The previous min_lr = 0.001 equalled the starting
# rate, so the learning rate could never actually be lowered.
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss',
                              factor = 0.2,
                              patience = 3,
                              min_lr = 1e-6,
                              mode = 'min',
                              verbose = 1)

## Model Building:


In [None]:
from tensorflow.keras.applications import EfficientNetB3

# EfficientNetB3 backbone without its classification head, initialised with ImageNet weights.
effB3 = EfficientNetB3(
    include_top=False,
    weights='imagenet',
    input_shape=input_shape,
)

In [None]:
#creating a function to build the FC by taking the base model and return the final model

def build_model(base_modelx):
    """Stack a small classification head on ``base_modelx`` and compile the result.

    Nothing here freezes the base model, so its layers keep whatever
    ``trainable`` state they arrive with. The head is
    GlobalAveragePooling2D -> Dense(256, relu) -> Dense(n_classes, softmax).

    Args:
        base_modelx: Feature-extractor model used as the first layer.

    Returns:
        The compiled Sequential model. It uses sparse categorical
        cross-entropy, the module-level ``adam`` optimizer and the accuracy
        metric.
    """
    # Sequential takes a list of layers, so the base model is wrapped explicitly
    # rather than passed as a bare object.
    model = Sequential([base_modelx])
    model.add(GlobalAveragePooling2D())
    model.add(Dense(256, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only adds a stray "None" line.
    model.summary()

    model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    return model

In [None]:
# Build and compile the classifier on top of the EfficientNetB3 base.
model = build_model(effB3)

In [None]:
# Batches per epoch. len(generator) is ceil(n / batch_size), so the final
# partial batch is included. Floor division (n // batch_size) left the last
# n % batch_size validation samples out of every evaluation pass.
STEP_SIZE_TRAIN = len(train_generator)
STEP_SIZE_VALID = len(val_generator)

In [None]:
# Train the model. The callbacks stop early, checkpoint the best model and
# adjust the learning rate.
training_callbacks = [early_stopping, checkpoint, reduce_lr]
history = model.fit(
    train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    epochs=epochs,
    validation_data=val_generator,
    validation_steps=STEP_SIZE_VALID,
    callbacks=training_callbacks,
)

In [None]:
# (model.history.history)

In [None]:
#Ploting acc and loss

def plot_result(modelx):
    """Plot training vs. validation accuracy, then training vs. validation loss.

    Args:
        modelx: A fitted model whose ``history.history`` dict contains the keys
            'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    """
    hist = modelx.history.history
    results = pd.DataFrame({'epochs': list(range(1, len(hist['accuracy']) + 1)),
                            'Training_acc': hist['accuracy'],
                            'Validation_acc': hist['val_accuracy'],
                            'Training_loss': hist['loss'],
                            'Validation_loss': hist['val_loss']})

    # Each curve gets a label so the legend tells training from validation;
    # the y-axis is renamed because it is shared by both curves.
    plt.figure(figsize=(12,5))
    sns.lineplot(x = 'epochs', y ='Training_acc', data = results, color='r', label='Training')
    sns.lineplot(x = 'epochs', y ='Validation_acc', data = results, color='blue', label='Validation')
    plt.ylabel('accuracy')
    plt.title('Training Accuracy vs Validation Accuracy')
    plt.legend()
    plt.show()

    plt.figure(figsize=(12,5))
    sns.lineplot(x = 'epochs', y ='Training_loss', data = results, color='r', label='Training')
    sns.lineplot(x = 'epochs', y ='Validation_loss', data = results, color='blue', label='Validation')
    plt.ylabel('loss')
    plt.title('Training Loss vs Validation Loss')
    plt.legend()
    plt.show()

plot_result(model)

In [None]:
# Save the trained model to an .h5 file.
model.save("final_model_effB3.h5")

In [None]:
# Run the model over the test-image generator; `pred` holds the softmax output for each image.
pred = model.predict(pred_datagen)

In [None]:
# Inspect the first prediction
first_pred = pred[0]
top_idx = np.argmax(first_pred)  # index of the largest probability

print(first_pred)
print(f"Max value (probability of prediction): {np.max(first_pred)}") # the max probability value predicted by the model
print(f"Sum: {np.sum(first_pred)}") # softmax output, so this will be close to 1
print(f"Max index: {top_idx}")
print(f"Predicted label: {classes[str(top_idx)]}") # `classes` is keyed by strings

### Creating submission file

In [None]:
# Empty submission frame with the two columns that are filled in below.
sub = pd.DataFrame(columns =['image_id','label'])
sub

In [None]:
# Take the image ids from the generator that produced `pred`, not from
# os.listdir(). listdir returns entries in arbitrary order and may include
# files the generator skipped, so they need not line up with the rows of `pred`.
# pred_datagen.filenames is the generator's own file order; it matches `pred`
# as long as the generator does not shuffle.
sub['image_id'] = [os.path.splitext(os.path.basename(path))[0]+'.jpg' for path in pred_datagen.filenames]
sub['label']= np.argmax(pred,axis=1)  # predicted class = index of the largest probability

sub


In [None]:
# Write the submission file without the dataframe index column.

sub.to_csv('submission.csv', index=False)

### NOTE:
1. This competition requires no internet connection. Hence we will save the model and use that in the inference notebook.
2. The model can be fine tuned to get better accuracy


Inference notebook: https://www.kaggle.com/deepakat002/efficientnetb3-inference-0-860

### Please give an upvote if you like this notebook :)