# ***Data2040 Midterm: Cassava Leaf Disease Classification (Kaggle)***

**Authors: Haoda Song, Siyuan Li, Yuyang Li**

**Brown Data Science Initiative**



In [None]:
#Load packages
import keras
from keras.layers import Dense, Dropout, Input, MaxPooling2D, ZeroPadding2D, Conv2D, Flatten,BatchNormalization
from tensorflow.keras.layers import AveragePooling2D, ZeroPadding2D
from tensorflow.keras.layers import Concatenate
from keras.models import Sequential, Model
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam, SGD
from keras.preprocessing.image import img_to_array, load_img, ImageDataGenerator
from keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers

from tensorflow.keras.layers import MaxPool2D, AveragePooling2D, GlobalAveragePooling2D
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid

from zipfile import ZipFile
import time
from datetime import timedelta
from io import BytesIO

import PIL.Image

import pickle
import os
import random

import json

**Before running the cells below, add a "shortcut" from the "Shared Drive" to "My Drive". The data folder is called "Data_leaf".**

In [None]:
# Mount Google Drive inside the Colab runtime so the data folder can be read.
from google.colab import drive
drive.mount('/content/drive')
# Base folder for every data file; the trailing slash matters because later
# cells build file names with plain string concatenation (`path + name`).
path = "/content/drive/MyDrive/Data_leaf/"

In [None]:
# Open the zipped training images read-only. The handle is kept open on
# purpose: later cells read individual images straight out of the archive.
train_zip_location = path + "train_images.zip"
archive_train = ZipFile(train_zip_location, mode='r')

In [None]:
# Names of every entry stored in the archive, in archive order.
train_list = archive_train.namelist()

In [None]:
# Keep every second archive entry, starting at index 2 and stopping before
# index 42796.
# NOTE(review): this presumably skips two leading non-image entries and an
# interleaved metadata entry after each image -- confirm against the archive
# listing before reusing these bounds with a different zip file.
FIRST_IMAGE_ENTRY = 2
ENTRY_STOP = 42796
ENTRY_STEP = 2
train_img_list = train_list[FIRST_IMAGE_ENTRY:ENTRY_STOP:ENTRY_STEP]
len(train_img_list)  # 21397 images in the file

In [None]:
# Load the label table; later cells use its `image_id` and `label` columns.
labels_csv = path + "train.csv"
class_data = pd.read_csv(labels_csv, sep=',', quotechar='"', header=0)

# Quick sanity check: table dimensions and the first five rows.
print(class_data.shape)
print(class_data.head(5))

In [None]:
# Map each numeric disease code to its human-readable name for visualization.
class_data = pd.read_csv(path + "train.csv", header=0, sep=',', quotechar='"')

# os.path.join copes with `path` ending in "/" (or not) without producing "//".
with open(os.path.join(path, 'label_num_to_disease_map.json')) as f:
    disease_dict = json.load(f)
disease_df = pd.DataFrame(list(disease_dict.items()), columns=['label', 'real_label'])

# JSON object keys are always strings, while `train.csv` stores integer codes.
# Cast the whole column in one step: the previous per-row chained assignment
# (`df['label'][i] = ...`) is unreliable in pandas (SettingWithCopy, and a
# silent no-op under Copy-on-Write) and replaced each code by its row position
# instead of the code actually written in the JSON file.
disease_df['label'] = disease_df['label'].astype(int)

# Attach the readable name to every training row.
actual_class = pd.merge(class_data, disease_df, on='label')

# Count the number of images in each disease class.
obs_in_actual = actual_class.groupby(['label', 'real_label']).size()
print(obs_in_actual)

In [None]:
# Horizontal bar chart of how many training images each disease class has.
# Count on the column itself: DataFrame.value_counts expects column *labels*
# as its first argument, so handing it a whole Series only worked by accident.
class_counts = actual_class['real_label'].value_counts(ascending=True)

ax = class_counts.plot(kind='barh', fontsize=20,
                       title="Class Distribution", figsize=(8, 5))

ax.set(xlabel="Images per class", ylabel="Classes")
# Bring axis labels and title to a common size.
ax.xaxis.label.set_size(15)
ax.yaxis.label.set_size(15)
ax.title.set_size(15)
plt.show()

# Image Preprocessing




In [None]:
# Decode every training image from the archive, resize it to a square of
# `image_resize` pixels, scale pixel values to [0, 1] and collect the labels.
image_resize = 100
s = (len(train_img_list), image_resize, image_resize, 3)

# float32 is plenty for pixel data in [0, 1] and needs half the memory of
# NumPy's default float64 for the full image tensor.
allImage = np.zeros(s, dtype=np.float32)
labels = np.empty(len(train_img_list), dtype="object")

# Build the image_id -> label lookup once instead of scanning the whole label
# table with a boolean mask for every single image.
label_by_image_id = dict(zip(class_data['image_id'], class_data['label']))

for i, member in enumerate(train_img_list):
    # Read the compressed bytes into memory and let PIL decode them; the
    # context manager releases the decoder as soon as the copy is made.
    with PIL.Image.open(BytesIO(archive_train.read(member))) as raw_image:
        # Force three channels so the pixels always fit the (H, W, 3) slot.
        image = raw_image.convert("RGB").resize((image_resize, image_resize))

    pixels = np.asarray(image, dtype=np.float32)
    allImage[i] = np.clip(pixels / 255.0, 0.0, 1.0)

    # The label table is keyed by bare file name, so drop the folder part of
    # the archive member name.
    # NOTE(review): assumes member names look like "<folder>/<image_id>".
    labels[i] = label_by_image_id[os.path.basename(member)]

In [None]:
# One-hot encode the labels (one column per class) for categorical
# cross-entropy. The earlier intermediate assignments were dead code: their
# results were overwritten by this call without ever being used.
train_img = allImage
# An explicit float dtype keeps the targets numeric on every pandas version
# (newer releases would otherwise produce boolean columns).
labels_reshape = pd.get_dummies(labels, dtype='float32')
labels_reshape.shape

In [None]:
# Training and test split (80% / 20%).
# Seeding Python's `random` module alone does not make the split repeatable:
# train_test_split draws from its own random state, so the seed is passed to
# it explicitly as well.
SPLIT_SEED = 2040
random.seed(SPLIT_SEED)

X = train_img
y = labels_reshape
print(X.shape)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
# Data augmentation: random geometric distortions applied to training batches.
augmentation_options = dict(
    rotation_range=45,
    width_shift_range=0.3,
    height_shift_range=0.3,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**augmentation_options)

# The held-out images get a generator with no augmentation options at all.
test_datagen = ImageDataGenerator()

In [None]:
# Wrap the in-memory arrays in batch iterators that Keras `fit` can consume.
BATCH_SIZE = 64
training_set = train_datagen.flow(X_train, y_train, batch_size=BATCH_SIZE)
testing_set = test_datagen.flow(X_test, y_test, batch_size=BATCH_SIZE)

# Baseline CNN




In [None]:
# Baseline CNN: three conv / max-pool / dropout stages followed by a dense
# classifier head with one softmax unit per disease class.
input_shape = (image_resize, image_resize, 3)

model = Sequential([
    # Stage 1
    Conv2D(32, kernel_size=(3, 3), activation='relu',
           kernel_initializer='he_normal', input_shape=input_shape),
    MaxPooling2D((2, 2)),
    Dropout(0.2),

    # Stage 2
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.2),

    # Stage 3
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.4),

    # Classifier head
    Flatten(),
    Dense(384, activation='relu'),
    Dropout(0.3),
    Dense(5, activation='softmax'),
])

# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(learning_rate=0.001),
              metrics=['accuracy'])
model.summary()

In [None]:
# Stop training once validation accuracy has gone 10 epochs without improving.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=10)

# NOTE(review): `tensorboard_callback` is defined in the TensorBoard cell
# further down in this notebook; that cell has to be executed before this one.
baseline_callbacks = [early_stopping, tensorboard_callback('project')]

history = model.fit(
    training_set,
    validation_data=testing_set,
    steps_per_epoch=X_train.shape[0] // BATCH_SIZE,
    epochs=100,
    callbacks=baseline_callbacks,
    verbose=1,
)

# GoogLeNet

(For later use.)

Citation: https://www.kaggle.com/luckscylla/googlenet-implementation

In [None]:
def inception(x, filters):
    """Build one Inception module on top of ``x``.

    Four parallel branches read the same input and their outputs are joined
    along the last (channel) axis. Every convolution uses stride 1, 'same'
    padding and ReLU.

    Args:
        x: Input tensor of the module.
        filters: Indexable of four entries:
            ``filters[0]``    -- filters of the plain 1x1 branch,
            ``filters[1]``    -- (1x1 reduce, 3x3) filter pair,
            ``filters[2]``    -- (1x1 reduce, 5x5) filter pair,
            ``filters[3]``    -- filters of the 1x1 conv after max-pooling.

    Returns:
        The concatenation of the four branch outputs.
    """
    def conv(n_filters, size):
        # All convolutions in the module differ only in width and kernel size.
        return Conv2D(filters=n_filters, kernel_size=(size, size), strides=1,
                      padding='same', activation='relu')

    # Branch 1: 1x1
    branch_1x1 = conv(filters[0], 1)(x)

    # Branch 2: 1x1 -> 3x3
    branch_3x3 = conv(filters[1][0], 1)(x)
    branch_3x3 = conv(filters[1][1], 3)(branch_3x3)

    # Branch 3: 1x1 -> 5x5
    branch_5x5 = conv(filters[2][0], 1)(x)
    branch_5x5 = conv(filters[2][1], 5)(branch_5x5)

    # Branch 4: 3x3 max-pool -> 1x1
    branch_pool = MaxPooling2D(pool_size=(3, 3), strides=1, padding='same')(x)
    branch_pool = conv(filters[3], 1)(branch_pool)

    branches = [branch_1x1, branch_3x3, branch_5x5, branch_pool]
    return Concatenate(axis=-1)(branches)


def auxiliary(x, name=None):
    """Attach an auxiliary 5-way softmax classifier to the tensor ``x``.

    The head is: 5x5 average pooling (stride 3, 'valid') -> 1x1 conv with 128
    filters -> flatten -> dense 256 (ReLU) -> dropout 0.4 -> dense 5 (softmax).

    Args:
        x: Intermediate feature tensor to branch from.
        name: Optional name given to the final softmax layer.

    Returns:
        The output tensor of the softmax layer.
    """
    pooled = AveragePooling2D(pool_size=(5, 5), strides=3, padding='valid')(x)
    reduced = Conv2D(filters=128, kernel_size=(1, 1), strides=1,
                     padding='same', activation='relu')(pooled)
    flat = Flatten()(reduced)
    hidden = Dense(units=256, activation='relu')(flat)
    hidden = Dropout(0.4)(hidden)
    return Dense(units=5, activation='softmax', name=name)(hidden)

In [None]:
def googlenet():
    """Assemble the GoogLeNet-style network used in this notebook.

    The input is a square RGB image whose side length is read from the
    module-level ``image_resize``. The network stacks two convolutional stem
    stages, nine Inception modules (3a-3b, 4a-4e, 5a-5b) and a dense head, and
    branches off two auxiliary classifiers after modules 4a and 4d.

    Returns:
        A Keras ``Model`` with three 5-way softmax outputs, in the order
        ``[main, aux1, aux2]``.
    """
    def conv(n_filters, size, stride=1):
        # Stem convolutions: 'same' padding and ReLU throughout.
        return Conv2D(filters=n_filters, kernel_size=(size, size),
                      strides=stride, padding='same', activation='relu')

    def downsample():
        # 3x3 max-pool with stride 2, used between stages.
        return MaxPooling2D(pool_size=(3, 3), strides=2, padding='same')

    inputs = Input(shape=(image_resize, image_resize, 3))

    # stage-1
    net = conv(64, 7, stride=2)(inputs)
    net = downsample()(net)
    net = BatchNormalization()(net)

    # stage-2
    net = conv(64, 1)(net)
    net = conv(192, 3)(net)
    net = BatchNormalization()(net)
    net = downsample()(net)

    # stage-3
    stage3_specs = (
        [64, (96, 128), (16, 32), 32],     # 3a
        [128, (128, 192), (32, 96), 64],   # 3b
    )
    for spec in stage3_specs:
        net = inception(net, spec)
    net = downsample()(net)

    # stage-4
    net = inception(net, [192, (96, 208), (16, 48), 64])    # 4a
    aux1 = auxiliary(net, name='aux1')
    stage4_middle_specs = (
        [160, (112, 224), (24, 64), 64],   # 4b
        [128, (128, 256), (24, 64), 64],   # 4c
        [112, (144, 288), (32, 64), 64],   # 4d
    )
    for spec in stage4_middle_specs:
        net = inception(net, spec)
    aux2 = auxiliary(net, name='aux2')
    net = inception(net, [256, (160, 320), (32, 128), 128])  # 4e
    net = downsample()(net)

    # stage-5
    stage5_specs = (
        [256, (160, 320), (32, 128), 128],  # 5a
        [384, (192, 384), (48, 128), 128],  # 5b
    )
    for spec in stage5_specs:
        net = inception(net, spec)
    net = AveragePooling2D(pool_size=(3, 3), strides=1, padding='valid')(net)

    # stage-6
    net = Flatten()(net)
    net = Dropout(0.4)(net)
    net = Dense(units=256, activation='linear')(net)
    main = Dense(units=5, activation='softmax', name='main')(net)

    return Model(inputs=inputs, outputs=[main, aux1, aux2])

# Build the three-output network and compile it with plain SGD.
model_google = googlenet()

opt_google = keras.optimizers.SGD(learning_rate=0.01)

# The main head carries full weight in the total loss; each auxiliary head
# contributes with weight 0.3.
head_loss_weights = {'main': 1.0, 'aux1': 0.3, 'aux2': 0.3}
model_google.compile(optimizer=opt_google,
                     loss='categorical_crossentropy',
                     loss_weights=head_loss_weights,
                     metrics=['accuracy'])
model_google.summary()

In [None]:
# Early stopping watches the validation accuracy of the 'main' output.
# NOTE(review): with patience=10 and only 10 epochs requested below, this
# callback cannot trigger before training ends on its own -- confirm whether
# a longer run or a smaller patience was intended.
early_stopping = EarlyStopping(monitor='val_main_accuracy', patience=10)

history = model_google.fit(
    training_set,
    validation_data=testing_set,
    steps_per_epoch=X_train.shape[0] // BATCH_SIZE,
    epochs=10,
    callbacks=[early_stopping],
    verbose=1,
)

# TensorBoard

In [None]:
# imports
import tensorflow as tf
import numpy as np
from sklearn.datasets import make_circles
from matplotlib import pyplot
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
 
# setup tensorboard, directories
# The three shell commands below start from a clean slate: any existing
# ./logs tree (including logs from earlier runs) is deleted, then ./logs and
# ./logs/project are created again.
!rm -rf ./logs
!mkdir ./logs/
!mkdir ./logs/project
 
# Root folder under which every experiment gets its own log sub-directory.
log_dir = "./logs/project/"


def tensorboard_callback(exp_name):
    """Create a TensorBoard callback for one experiment.

    Args:
        exp_name: String appended to the module-level ``log_dir`` to form the
            directory this experiment logs into.

    Returns:
        A ``tf.keras.callbacks.TensorBoard`` instance configured with
        ``histogram_freq=1`` and ``profile_batch=0``.
    """
    experiment_dir = log_dir + exp_name
    return tf.keras.callbacks.TensorBoard(
        log_dir=experiment_dir,
        histogram_freq=1,
        profile_batch=0,
    )
# Launch TensorBoard inside the notebook, pointed at the experiment log folder.
# `%reload_ext` (re)loads the TensorBoard notebook extension, so this cell can
# be re-run safely; `%tensorboard` then opens the UI on logs/project.
%reload_ext tensorboard
%tensorboard --logdir logs/project

# Additional EDA 

In [None]:
# Show a sample image for each disease class.
# label_k holds the raw `np.where` result for class k; s_k is the plain Python
# list of image positions whose label equals k.
label_0, label_1, label_2, label_3, label_4 = (
    np.where(labels == class_code) for class_code in range(5)
)
s_0, s_1, s_2, s_3, s_4 = (
    hits[0].flatten().tolist()
    for hits in (label_0, label_1, label_2, label_3, label_4)
)

### Class 0

In [None]:
# Draw five randomly chosen class-0 images side by side.
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(15, 15))
print("Class 0: Cassava Bacterial Blight (CBB)")
for panel in axes:
    sample_position = random.choice(s_0)
    panel.imshow(train_img[sample_position])

### Class 1

In [None]:
# Draw five randomly chosen class-1 images side by side.
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(15, 15))
print("Class 1: Cassava Brown Streak Disease (CBSD)")
for panel in axes:
    sample_position = random.choice(s_1)
    panel.imshow(train_img[sample_position])

### Class 2

In [None]:
# Draw five randomly chosen class-2 images side by side.
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(15, 15))
print("Class 2: Cassava Green Mottle (CGM) ")
for panel in axes:
    sample_position = random.choice(s_2)
    panel.imshow(train_img[sample_position])

### Class 3

In [None]:
# Draw five randomly chosen class-3 images side by side.
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(15, 15))
print("Class 3: Cassava Mosaic Disease (CMD)")
for panel in axes:
    sample_position = random.choice(s_3)
    panel.imshow(train_img[sample_position])

### Class 4

In [None]:
# Draw five randomly chosen class-4 images side by side.
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(15, 15))
print("Class 4: Healthy")
for panel in axes:
    sample_position = random.choice(s_4)
    panel.imshow(train_img[sample_position])