In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    # Print the full path of every file found beneath the input directory.
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import json, cv2
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf 
import keras
from tqdm.notebook import tqdm

In [None]:
# Root directory of the competition data. The trailing slash matters because
# other cells build file paths by plain string concatenation with this value.
base_path = "../input/cassava-leaf-disease-classification/"

In [None]:
# Parse the JSON file that maps each numeric label (stored as a string key)
# to its human-readable class name.
with open(base_path + "label_num_to_disease_map.json") as fh:
    map_classes = json.load(fh)

print(json.dumps(map_classes, indent=4))

We can see there are 5 classes: labels 0-3 indicate diseases and label 4 indicates a healthy cassava plant.

In [None]:
# Load train.csv and attach a readable class name to every row.
# The JSON map is keyed by strings, so the integer label is cast to str first.
tr = (
    pd.read_csv(base_path + 'train.csv')
    .assign(label_name=lambda df: df["label"].astype(str).map(map_classes))
)

tr.head()

In [None]:
#Loading the sample submission file
# `ss` is reused by the test-prediction cells near the end of the notebook.
ss = pd.read_csv(base_path + 'sample_submission.csv')
ss

In [None]:
# Number of files in the training image directory.
train_image_names = os.listdir(os.path.join(base_path, 'train_images'))
len(train_image_names)

In [None]:
#Checking the shape of images
# tqdm wraps the file list itself (not the enumerate iterator) so it knows the
# total number of items and can render a real progress bar.
train_image_dir = os.path.join(base_path, 'train_images')
size_dict = {}
for ind, fname in enumerate(tqdm(os.listdir(train_image_dir), desc='Reading image shapes')):
    img = cv2.imread(os.path.join(train_image_dir, fname))
    # cv2.imread returns None instead of raising when a file cannot be decoded;
    # fail with the offending filename rather than an opaque AttributeError.
    if img is None:
        raise ValueError(f"Could not read image: {fname}")
    size_dict[ind] = img.shape

In [None]:
# One row per image, one column per shape component.
# The head/tail previews below show shapes of 600 x 800 x 3.
df_size = pd.DataFrame.from_dict(
    size_dict, orient='index', columns=['Height', 'Width', 'Channels']
)
display(df_size.head())
display(df_size.tail())

In [None]:
# Pass the series through the `x` keyword: recent seaborn releases treat the
# first positional argument as `data`, not as the vector to count.
sns.countplot(x=tr['label_name'])
plt.title('# of each label in the dataset')
plt.xticks(rotation= 60);

In [None]:
#Plot random images and their labels
# np.random.randint's upper bound is exclusive, so pass the row count itself;
# using `shape[0] - 1` would make the last row impossible to sample.
random_index = np.random.randint(0, tr.shape[0], 16)

fig, axes = plt.subplots(4, 4, figsize = (15, 15))
axes = axes.ravel()

for ind, i in enumerate(random_index):
    row = tr.iloc[i]
    # Select columns by name rather than by position so the cell keeps working
    # if the column order of `tr` changes.
    img = cv2.imread(os.path.join(base_path, 'train_images', row['image_id']))
    # OpenCV decodes images as BGR while matplotlib expects RGB; convert so the
    # leaves are displayed with their true colours.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    axes[ind].imshow(img)
    axes[ind].set_title(row['label_name'])
    axes[ind].axis('off')

plt.subplots_adjust(hspace = 0.4);

## Modelling

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Flatten, Dropout, Activation, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

In [None]:
#Train Validation Split
X = tr['image_id']
Y = tr['label']

# Stratify on the label to keep class proportions equal in both parts, and fix
# the seed so the split (and every metric computed from it) is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, Y, test_size = 0.3, stratify = Y, random_state = 42
)

In [None]:
# Cast both target series to strings, since this is treated as a
# classification problem.
y_train, y_val = (labels.astype(str) for labels in (y_train, y_val))

In [None]:
#Since we used the stratify argument, the label distribution is maintained in the train and validation sets

plt.figure(figsize = (15, 6))
plt.subplot(1, 2, 1)
# Use the `x` keyword: recent seaborn releases treat the first positional
# argument as `data`, not as the vector to count.
sns.countplot(x=y_train)
plt.title('Train Distribution')
plt.subplot(1, 2, 2)
sns.countplot(x=y_val)
# The second panel shows the validation part of the split, not a test set.
plt.title('Validation Distribution');

In [None]:
# Put the image ids and their labels side by side, as the image generators
# read both from a single dataframe.
train, valid = (
    pd.concat(pair, axis=1)
    for pair in ((X_train, y_train), (X_val, y_val))
)

print('Train Shape: ', train.shape)
print('Val Shape: ', valid.shape)

In [None]:
#Creating Training Generator
BATCH_SIZE  = 16

# The only preprocessing is multiplying pixel values by 1/255.
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

# shuffle=False so that batches are produced in the row order of `train`.
train_generator = train_datagen.flow_from_dataframe(
    train,
    directory=os.path.join(base_path, "train_images"),
    x_col="image_id",
    y_col="label",
    target_size=(300, 400),
    batch_size=BATCH_SIZE,
    class_mode="categorical",
    shuffle=False,
)

In [None]:
#Creating Validation Generator
BATCH_SIZE  = 16

# The only preprocessing is multiplying pixel values by 1/255.
valid_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

# shuffle=False so that batches are produced in the row order of `valid`.
valid_generator = valid_datagen.flow_from_dataframe(
    valid,
    directory=os.path.join(base_path, "train_images"),
    x_col="image_id",
    y_col="label",
    target_size=(300, 400),
    batch_size=BATCH_SIZE,
    class_mode="categorical",
    shuffle=False,
)

In [None]:
# Rebuild the architecture used for prediction. Training happens in another
# notebook; its saved weights are attached through the "upload notebook" option.
# Internet must be enabled because the ImageNet weights are downloaded here.

base_model = tf.keras.applications.Xception(
    weights='imagenet',
    include_top=False,
    input_shape=(300, 400, 3),
    input_tensor=None,
    pooling=None,
    classifier_activation="softmax",
)

# Mark every layer of the Xception backbone as non-trainable.
for backbone_layer in base_model.layers:
    backbone_layer.trainable = False

# One output unit per distinct label value found in the training frame.
classes_to_predict = sorted(tr.label.unique())

# Classification head stacked on the backbone:
# pooling -> Dense(256) -> BatchNorm -> ReLU -> Dropout -> softmax.
model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dense(256),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.4),
    Dense(len(classes_to_predict), activation="softmax"),
])

model.summary()

In [None]:
# Load the weights saved by the separate training notebook into the
# architecture built in the previous cell.
model.load_weights("../input/cassava-leaf-disease-transfer-learning/xception_best_model.h5")

## Model Performance

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score

In [None]:
# Model.predict accepts generators directly; predict_generator is deprecated
# and is no longer available in newer Keras releases.
train_predictions = model.predict(train_generator, verbose=1)
val_prediction = model.predict(valid_generator, verbose = 1)

In [None]:
# Turn each row of class probabilities into the index of its largest entry.
train_preds = train_predictions.argmax(axis=1)
val_preds = val_prediction.argmax(axis=1)

In [None]:
# Labels were cast to strings for the generator; convert them back to integers
# once so they can be compared with the predicted class indices.
y_train_true = y_train.astype(int)

print('Train Accuracy Score:', accuracy_score(y_train_true, train_preds))
print('Train Recall Score:', recall_score(y_train_true, train_preds, average = 'weighted'))
print('Train Precision Score:', precision_score(y_train_true, train_preds, average = 'weighted'))
print('Train Confusion Matrix: \n', confusion_matrix(y_train_true, train_preds))

In [None]:
# Labels were cast to strings for the generator; convert them back to integers
# once so they can be compared with the predicted class indices.
y_val_true = y_val.astype(int)

print('Validation Accuracy Score:', accuracy_score(y_val_true, val_preds))
print('Validation Recall Score:', recall_score(y_val_true, val_preds, average = 'weighted'))
print('Validation Precision Score:', precision_score(y_val_true, val_preds, average = 'weighted'))
print('Validation Confusion Matrix: \n', confusion_matrix(y_val_true, val_preds))

In [None]:
#Prediction for test set
test_folder = '../input/cassava-leaf-disease-classification/test_images/'

# Build the submission frame from the directory listing itself. Assigning the
# listing into the existing sample-submission frame raises whenever the number
# of test images differs from the number of rows already in `ss`.
ss = pd.DataFrame({"image_id": sorted(os.listdir(test_folder))})
ss["label"] = 0  # placeholder, overwritten with the predicted class later

test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1. / 255
)

test_generator = test_datagen.flow_from_dataframe(
    dataframe = ss,
    x_col='image_id',
    y_col='label',
    directory= test_folder,
    # Same size as the train/validation generators and the model's input shape;
    # the original referenced an undefined `target_size` variable here.
    target_size=(300, 400),
    batch_size=1,
    class_mode=None,
    # flow_from_dataframe shuffles by default; disable it so predictions come
    # back in the same order as the rows of `ss`.
    shuffle=False)

In [None]:
#Saving as csv file
# Model.predict accepts generators directly; predict_generator is deprecated
# and is no longer available in newer Keras releases.
predictions = model.predict(test_generator, verbose=0)
# The predicted class is the index of the highest probability in each row.
ss["label"] = predictions.argmax(axis=1)
ss.to_csv("submission.csv", index=False)