In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read data

In [None]:
#import
import matplotlib.pyplot as plt
import seaborn as sn
import missingno as msno
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ReduceLROnPlateau
from PIL import Image
# Notebook-wide configuration
TARGET_SZ = 300  # edge length (pixels) images are resized to before entering the network
basepath = '/kaggle/input/cassava-leaf-disease-classification'  # root of the competition data
pd.set_option('mode.chained_assignment', None)  # silence SettingWithCopyWarning

In [None]:
# Load the training labels (one row per image: image_id, label).
# The path is built from `basepath` so the data location is defined in exactly one place.
data = pd.read_csv(os.path.join(basepath, 'train.csv'))
data.head()

# What does an image look like?

In [None]:
# Show one training image and print its (width, height).
# Built from `basepath` rather than a relative "../input" path, so the cell does not
# depend on the current working directory and agrees with the other cells.
img = Image.open(os.path.join(basepath, 'train_images', '1000723321.jpg'))
plt.imshow(img)
plt.show()
print(img.size)

# Learn about data

In [None]:
# How are the labels distributed?
# Table: number of distinct values of every other column per label; chart: rows per label.
per_label_unique = data.groupby('label').nunique()
print(per_label_unique)
sn.countplot(data=data, x='label')

# It is an Imbalanced training data set

Remember the disease map (label → class):

- 0: Cassava Bacterial Blight (CBB)
- 1: Cassava Brown Streak Disease (CBSD)
- 2: Cassava Green Mottle (CGM)
- 3: Cassava Mosaic Disease (CMD)
- 4: Healthy

So we have imbalanced data.
Category 3 (Cassava Mosaic Disease) has by far the largest number of samples. Does this imbalance matter? Yes, and we can try to fix it.

It is surprising that Healthy is not the largest class, since photos of healthy leaves should be easy to obtain.

# Now to remove the imbalance
There are different techniques. We go for a simple method

In [None]:

# To balance the data set we could keep only 1000 samples of the CMD (3) class (currently disabled).

#balanced_data = data.loc[data['label'].isin([0,1,2,4])] 
#data=data.loc[data['label']==3]
#data=data.sample(n=1000,random_state=1)
#data=data.append(balanced_data)


In [None]:
# Display the DataFrame as this cell's output (the re-balancing code in the previous cell is commented out).
data

# Now let's see how the data looks

In [None]:
# Same distribution check as before: per-label distinct counts, then a bar chart of rows per label.
per_label_unique = data.groupby('label').nunique()
print(per_label_unique)
sn.countplot(data=data, x='label')

# Check for null values

In [None]:
# Count the missing entries in each column (the missingno bar chart is left disabled).
#msno.bar(data)
data.isna().sum()

Good. No null values.

In [None]:
# For quick experiments, optionally train on a small random sample (disabled; the line below would keep 300 records)

#data=data.sample(n=300,random_state=1)
# and again see the data distribution
#sn.countplot(x='label',data=data)

# Split data to training and validation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled images for validation.
# - stratify keeps the (imbalanced) label proportions identical in both splits, so the
#   rare classes are represented in validation as well as in training.
# - random_state makes the split reproducible across "Restart & Run All".
train, val = train_test_split(
    data,
    test_size=0.2,
    stratify=data['label'],
    random_state=42,
)

# Now define a CNN model

In [None]:
# Distinct labels present in the training split; their count sizes the output layer.
classes_to_prdict = train.label.unique()

# Architecture: six Conv(3x3, relu) + MaxPool(2x2) stages, then a stack of relu Dense
# layers, finishing with one softmax unit per class.
conv_filters = [16, 32, 64, 64, 64, 64]
dense_units = [512, 256, 128, 64, 32]

model = tf.keras.models.Sequential()
for stage, n_filters in enumerate(conv_filters):
    # Only the first layer declares the input: TARGET_SZ x TARGET_SZ images with 3 colour channels.
    first_layer_kwargs = {'input_shape': (TARGET_SZ, TARGET_SZ, 3)} if stage == 0 else {}
    model.add(tf.keras.layers.Conv2D(n_filters, (3, 3), activation='relu', **first_layer_kwargs))
    model.add(tf.keras.layers.MaxPooling2D(2, 2))

# Flatten the feature maps before handing them to the fully connected layers.
model.add(tf.keras.layers.Flatten())
for n_units in dense_units:
    model.add(tf.keras.layers.Dense(n_units, activation='relu'))

# Multi-class output.
model.add(tf.keras.layers.Dense(len(classes_to_prdict), activation='softmax'))
model.summary()

In [None]:
# Compile with the Adam optimizer.
# The generators yield one-hot labels (class_mode='categorical'), hence categorical_crossentropy.

from tensorflow.keras.optimizers import Adam

# `learning_rate` is the supported argument name; `lr` is a deprecated alias that newer
# Keras versions reject.
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=0.001),
              metrics=['accuracy'])

In [None]:
# All images are rescaled from [0, 255] to [0, 1]; no other augmentation is applied.
train_datagen = ImageDataGenerator(rescale=1./255)
validation_datagen = ImageDataGenerator(rescale=1./255)
targetSz = TARGET_SZ
batchSz = 333

# class_mode='categorical' expects the label column to hold strings.
train['label'] = train['label'].astype(str)
val['label'] = val['label'].astype(str)

# Arguments shared by both generators, kept in one place so they cannot drift apart.
flow_kwargs = dict(
    directory=os.path.join(basepath, 'train_images'),
    x_col='image_id',
    y_col='label',
    target_size=(TARGET_SZ, TARGET_SZ),
    batch_size=batchSz,
    class_mode='categorical',
)

# Flow training images in batches of `batchSz` using train_datagen.
train_generator = train_datagen.flow_from_dataframe(train, **flow_kwargs)

# Flow validation images in batches of `batchSz` using validation_datagen (the original
# reused train_datagen here, leaving validation_datagen unused). Shuffling is disabled
# because evaluation does not need a random order.
validation_generator = validation_datagen.flow_from_dataframe(val, shuffle=False, **flow_kwargs)

# Train the model

In [None]:
# "steps_per_epoch" and "validation_steps" are the number of batches drawn per epoch.
# len(generator) is the number of batches in one full pass over its DataFrame, so
# deriving the steps from it keeps every epoch at exactly one pass, whatever the data
# size or batch size. The previously hard-coded 27 only covered part of the training
# data and more than one pass of the validation data.

# Reduce the learning rate when validation accuracy stops improving.
# - The metric is compiled as 'accuracy', so the logged key is 'val_accuracy'
#   (a monitor of 'val_acc' is never found and the callback would do nothing).
# - min_lr must be below the optimizer's initial learning rate (0.001 in the compile
#   cell), otherwise no reduction can ever take effect.
callbacks = ReduceLROnPlateau(monitor='val_accuracy',
                              factor=0.2,
                              patience=5,
                              verbose=1,
                              min_lr=1e-5)

# Model.fit accepts generators directly; fit_generator is deprecated.
history = model.fit(
            train_generator,
            steps_per_epoch = len(train_generator),
            epochs = 25,
            verbose = 1,
            validation_data = validation_generator,
            validation_steps = len(validation_generator),
            callbacks = [callbacks])

# Plot loss progression 

In [None]:
# Training vs. validation loss, one point per epoch.
fig, ax = plt.subplots()
for series in ('loss', 'val_loss'):
    ax.plot(history.history[series])
ax.set_title('Loss over epochs')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='best')
plt.show()

# Plot accuracy 

In [None]:
# Training vs. validation accuracy, one point per epoch.
fig, ax = plt.subplots()
for series in ('accuracy', 'val_accuracy'):
    ax.plot(history.history[series])
ax.set_title('Accuracy over epochs')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='best')
plt.show()

# OK Model is ready. Now apply on test

In [None]:
# Prepare inference: start with an empty list of predicted labels and collect the
# file names found in the competition's test image folder.
predictions = list()
test_folder = os.path.join(basepath, "test_images")
test_images = os.listdir(test_folder)

In [None]:
# Predict a label for every test image and assemble the submission table.

# Re-initialise here so that re-running this cell never appends to predictions left over
# from a previous run (which would make the two DataFrame columns differ in length).
predictions = []

for file_name in test_images:
    # The context manager closes the file handle once the pixels have been read.
    with Image.open(os.path.join(test_folder, file_name)) as img_file:
        # Force 3 channels so grayscale/RGBA/palette files still match the model's
        # (TARGET_SZ, TARGET_SZ, 3) input.
        resized = img_file.convert('RGB').resize((TARGET_SZ, TARGET_SZ))

    # Same 1/255 rescaling as the training generators, plus a leading batch axis.
    batch = np.expand_dims(np.asarray(resized, dtype=np.float32) / 255.0, axis=0)

    # The index of the highest softmax output is the predicted class.
    predictions.append(int(np.argmax(model.predict(batch, verbose=0))))

submission = pd.DataFrame({'image_id': test_images, 'label': predictions})
submission


# Finally submit

In [None]:
# Save the predictions as CSV in the Kaggle working directory, without the index column.
submission_path = '/kaggle/working/submission.csv'
submission.to_csv(submission_path, index=False)