# Introduction

As the second-largest provider of carbohydrates in Africa, cassava is a key food security crop grown by smallholder farmers because it can withstand harsh conditions. At least 80% of household farms in Sub-Saharan Africa grow this starchy root, but viral diseases are major sources of poor yields. With the help of data science, it may be possible to identify common diseases so they can be treated. Our dataset consists of 21,367 labeled images collected during a regular survey in Uganda. Our goal is to classify each cassava image into four disease categories or a fifth category indicating a healthy leaf. With our help, farmers may be able to quickly identify diseased plants, potentially saving their crops before they inflict irreparable damage.

In [None]:
#shutil.rmtree("/kaggle/working/train_data")
#shutil.rmtree("/kaggle/working/valid_data")
#os.remove("/kaggle/working/0")
#os.remove("/kaggle/working/2")
#os.remove("/kaggle/working/3")
#os.remove("/kaggle/working/4")
#os.remove("/kaggle/working/1")

# Set up environment
## Load libraries

In [None]:
# Imports
import pandas as pd 
import os 
import numpy as np 
import shutil 
import math, re, os
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers
from functools import partial
import tensorflow as tf
print("Tensorflow version " + tf.__version__)
import os, warnings
import matplotlib.pyplot as plt
from matplotlib import gridspec
from tensorflow.keras.preprocessing import image_dataset_from_directory
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split
#pip install --upgrade tensorflow

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Matplotlib defaults applied to every figure drawn later in the notebook
plt.rc('figure', autolayout=True)
plt.rc(
    'axes',
    labelweight='bold',
    labelsize='large',
    titleweight='bold',
    titlesize=18,
    titlepad=10,
)
plt.rc('image', cmap='magma')

# Suppress all warnings so the output cells stay readable
warnings.filterwarnings("ignore")

## Set up variables
We'll set up some of our variables for our notebook here.

* Use `KaggleDatasets().get_gcs_path()` to retrieve public GCS paths from a public Kaggle dataset.
* `BATCH_SIZE`: The number of images processed in each training step, i.e. per weight update within an epoch.
* `IMAGE_SIZE`: Dimensions of the images in the dataset in pixels.
* `CLASSES`: Four disease categories and a fifth category indicating a healthy leaf.
* `EPOCHS`: The number of times the whole training dataset is passed through the model.

In [None]:
# GCS path of the attached Kaggle dataset; used below to locate the test TFRecords
GCS_PATH = KaggleDatasets().get_gcs_path()
BATCH_SIZE = 48  # alternative value left by the author: 1024
# Image dimensions in pixels
IMAGE_SIZE = [512, 512]
# Class labels as strings; the per-class image folders created below use the same names
CLASSES = ['0', '1', '2', '3', '4']
# Number of passes over the whole training dataset
EPOCHS = 25

## Create output folders for each of the 5 classes

In [None]:
# Create the training and validation folders.
# Leftovers from a previous run are removed first: the split below is random,
# so stale files could otherwise end up in both the training and validation set.
training_destination = '/kaggle/working/train_data'
validation_destination = '/kaggle/working/valid_data'
for root in (training_destination, validation_destination):
    shutil.rmtree(root, ignore_errors=True)
    os.makedirs(root)

# Open dataset file (one row per image: `image_id`, `label`)
dataset = pd.read_csv("../input/cassava-leaf-disease-classification/train.csv")

# Split the labelled images into training and validation subsets
training_data, validation_data = train_test_split(dataset, test_size=0.33)

training_file_names = list(training_data['image_id'].values)
training_img_labels = list(training_data['label'].values)
validation_file_names = list(validation_data['image_id'].values)
validation_img_labels = list(validation_data['label'].values)

# Create one sub-folder per label inside both destinations.
# `image_dataset_from_directory(labels='inferred')` derives the classes from
# these sub-folder names.
folders_to_be_created = np.unique(dataset['label'])
source = "../input/cassava-leaf-disease-classification/train_images"

for new_path in folders_to_be_created:
    # exist_ok=True replaces the old guard, which tested an unrelated path in
    # the current working directory instead of the folder being created.
    os.makedirs(os.path.join(training_destination, str(new_path)), exist_ok=True)
    os.makedirs(os.path.join(validation_destination, str(new_path)), exist_ok=True)

folders = folders_to_be_created.copy()


def _copy_images(file_names, labels, source_dir, destination_root):
    """Copy every image from `source_dir` into `destination_root/<label>/`.

    Args:
        file_names: Image file names relative to `source_dir`.
        labels: Label of each image, in the same order as `file_names`.
        source_dir: Folder holding the original images.
        destination_root: Folder that contains one sub-folder per label.
    """
    for file_name, label in zip(file_names, labels):
        src = os.path.join(source_dir, file_name)
        dst = os.path.join(destination_root, str(label))
        shutil.copy(src, dst)


_copy_images(training_file_names, training_img_labels, source, training_destination)
_copy_images(validation_file_names, validation_img_labels, source, validation_destination)

## Load the data

In [None]:
# TFRecord files of the test set inside the dataset's GCS bucket
TEST_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/test_tfrecords/ld_test*.tfrec')

# Load training data.
# Image size and batch size come from the notebook-level constants so that the
# data, the model input shape and the documented settings cannot drift apart
# (the previous hard-coded batch of 1024 images of 512x512x3 is also very
# large to hold in memory at once).
TRAIN_PATH = "/kaggle/working/train_data"

ds_train_ = image_dataset_from_directory(
    TRAIN_PATH,
    labels='inferred',      # class = name of the sub-folder
    label_mode='int',       # integer labels, as expected by the sparse loss
    image_size=IMAGE_SIZE,
    interpolation='nearest',
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Load validation data with the same settings
VALID_PATH = "/kaggle/working/valid_data"

ds_valid_ = image_dataset_from_directory(
    VALID_PATH,
    labels='inferred',
    label_mode='int',
    image_size=IMAGE_SIZE,
    interpolation='nearest',
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Data Pipeline
def convert_to_float(image, label):
    """Return `(image, label)` with the image converted to `tf.float32`.

    The conversion is delegated to `tf.image.convert_image_dtype`; the label
    is passed through unchanged.
    """
    float_image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    return float_image, label

AUTOTUNE = tf.data.experimental.AUTOTUNE

# Training pipeline: convert images to float, cache the result, prefetch batches.
# NOTE(review): cache() is called without a filename — confirm the converted
# dataset fits in memory.
ds_train = ds_train_.map(convert_to_float)
ds_train = ds_train.cache()
ds_train = ds_train.prefetch(buffer_size=AUTOTUNE)

# Validation pipeline: same three steps.
ds_valid = ds_valid_.map(convert_to_float)
ds_valid = ds_valid.cache()
ds_valid = ds_valid.prefetch(buffer_size=AUTOTUNE)

# Brief exploratory data analysis (EDA)

# Building the model

We are using `sparse_categorical_crossentropy` as our loss function because our labels are integer class ids rather than one-hot vectors. The four disease categories and the fifth category indicating a healthy leaf are mutually exclusive (i.e., each image belongs to exactly one class).

## Our own model

In [None]:
# Show the Python type of the prepared training dataset
type(ds_train)

In [None]:
# Print the string representation of the training dataset object
print(ds_train)

In [None]:
#tfds.as_dataframe(ds_train)

In [None]:
# NOTE(review): list(ds_train) iterates the complete training dataset and keeps
# every (images, labels) batch in memory at once before a new dataset is built
# from it. The result is not assigned, so this looks exploratory — TODO confirm
# the cell is still needed.
tf.data.Dataset.from_tensor_slices(list(ds_train))

In [None]:
# Small baseline CNN: one convolutional block followed by a dense head.
model = keras.Sequential([
    # Input shape follows IMAGE_SIZE so it always matches the loaded images
    layers.InputLayer(input_shape=[*IMAGE_SIZE, 3]),

    # Data Augmentation (disabled)
    #preprocessing.RandomContrast(factor=0.1),
    #preprocessing.RandomFlip(mode='horizontal'),
    #preprocessing.RandomRotation(factor=0.1),

    # Block One
    layers.BatchNormalization(renorm=True),
    layers.Conv2D(filters=32, kernel_size=3, activation='relu', padding='same'),
    layers.MaxPool2D(),

    # Block Two (disabled)
    #layers.BatchNormalization(renorm=True),
    #layers.Conv2D(filters=64, kernel_size=3, activation='relu', padding='same'),
    #layers.MaxPool2D(),

    # Block Three (disabled)
    #layers.BatchNormalization(renorm=True),
    #layers.Conv2D(filters=256, kernel_size=3, activation='relu', padding='same'),
    #layers.Conv2D(filters=256, kernel_size=3, activation='relu', padding='same'),
    #layers.MaxPool2D(),

    # Head: one softmax output per class
    layers.BatchNormalization(renorm=True),
    layers.Flatten(),
    layers.Dense(8, activation='relu'),
    layers.Dense(len(CLASSES), activation='softmax'),
])

# Sparse loss/metric because the labels are integer class ids (label_mode='int')
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

# Train for the number of epochs declared in the notebook settings
history = model.fit(
    ds_train,
    validation_data=ds_valid,
    epochs=EPOCHS,
    verbose=0,
)

In [None]:
# custom filter
def my_filter(shape, dtype=None):
    """Return a fixed 3x3 difference filter as a float32 Keras variable.

    Every row of the filter is ``[1, 0, -1]`` and the array has shape
    ``(3, 3, 1, 1)``.
    NOTE(review): the `(shape, dtype)` signature suggests use as a kernel
    initializer, but nothing in the visible notebook references it — confirm.

    Args:
        shape: Requested shape; must compare equal to ``(3, 3, 1, 1)``.
        dtype: Accepted but not used; the result is always float32.

    Returns:
        The filter wrapped by `keras.backend.variable`.

    Raises:
        AssertionError: If `shape` does not equal the filter's shape.
    """
    # Repeat the row three times, then append two singleton axes
    weights = np.tile([1, 0, -1], (3, 1)).reshape(3, 3, 1, 1)
    assert weights.shape == shape
    return keras.backend.variable(weights, dtype='float32')


# CNN with two convolutional blocks followed by a dense head.
# Rebinds `model` and `history` from any earlier cell.
model = keras.Sequential([
    # Input shape follows IMAGE_SIZE so it always matches the loaded images
    layers.InputLayer(input_shape=[*IMAGE_SIZE, 3]),

    # Data Augmentation (disabled)
    #preprocessing.RandomContrast(factor=0.1),
    #preprocessing.RandomFlip(mode='horizontal'),
    #preprocessing.RandomRotation(factor=0.1),

    # Block One
    layers.BatchNormalization(renorm=True),
    layers.Conv2D(filters=64, kernel_size=3, activation='relu', padding='same'),
    layers.MaxPool2D(),

    # Block Two
    layers.BatchNormalization(renorm=True),
    layers.Conv2D(filters=128, kernel_size=3, activation='relu', padding='same'),
    layers.MaxPool2D(),

    # Block Three (disabled)
    #layers.BatchNormalization(renorm=True),
    #layers.Conv2D(filters=256, kernel_size=3, activation='relu', padding='same'),
    #layers.Conv2D(filters=256, kernel_size=3, activation='relu', padding='same'),
    #layers.MaxPool2D(),

    # Head: one softmax output per class
    layers.BatchNormalization(renorm=True),
    layers.Flatten(),
    layers.Dense(8, activation='relu'),
    layers.Dense(len(CLASSES), activation='softmax'),
])

# Sparse loss/metric because the labels are integer class ids (label_mode='int')
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

# Train for the number of epochs declared in the notebook settings
history = model.fit(
    ds_train,
    validation_data=ds_valid,
    epochs=EPOCHS,
    verbose=0,
)

In [None]:
# Code from kaggle courses
from tensorflow import keras
from tensorflow.keras import layers
# Augmentation layers used by the model below.
# NOTE(review): this is the import path for the TF releases contemporary with
# `tf.data.experimental.AUTOTUNE`; newer releases expose the same layers
# directly under `tensorflow.keras.layers` — adjust if the import fails.
from tensorflow.keras.layers.experimental import preprocessing

# --- Manual convolution / ReLU / pooling demo on one training image ---

# Define a kernel
kernel = tf.constant([
    [-1, -1, -1],
    [-1,  8, -1],
    [-1, -1, -1],
])

# tf.nn.conv2d needs a float 4-D input [batch, height, width, channels] and a
# 4-D filter [height, width, in_channels, out_channels]. A Dataset cannot be
# passed directly, so take one image from the first training batch, reduce it
# to a single channel, and reshape/cast the kernel to match.
sample_images, _ = next(iter(ds_train))
image = tf.image.rgb_to_grayscale(sample_images[:1])
kernel = tf.reshape(kernel, [*kernel.shape, 1, 1])
kernel = tf.cast(kernel, dtype=tf.float32)

# Apply convolution
image_filter = tf.nn.conv2d(
    input=image,
    filters=kernel,
    strides=1, # or (1, 1)
    padding='SAME',
)

# Apply ReLU
image_detect = tf.nn.relu(image_filter)

# Apply pooling
image_condense = tf.nn.pool(
    input=image_detect,
    window_shape=(2, 2),
    pooling_type='MAX',
    strides=(2, 2),
    padding='SAME',
)

# --- All together: three-block CNN with data augmentation ---
model = keras.Sequential([
    # Input shape follows IMAGE_SIZE so it matches the images in ds_train
    layers.InputLayer(input_shape=[*IMAGE_SIZE, 3]),

    # Data Augmentation
    preprocessing.RandomContrast(factor=0.1),
    preprocessing.RandomFlip(mode='horizontal'),
    preprocessing.RandomRotation(factor=0.1),

    # Block One
    layers.BatchNormalization(renorm=True),
    layers.Conv2D(filters=64, kernel_size=3, activation='relu', padding='same'),
    layers.MaxPool2D(),

    # Block Two
    layers.BatchNormalization(renorm=True),
    layers.Conv2D(filters=128, kernel_size=3, activation='relu', padding='same'),
    layers.MaxPool2D(),

    # Block Three
    layers.BatchNormalization(renorm=True),
    layers.Conv2D(filters=256, kernel_size=3, activation='relu', padding='same'),
    layers.Conv2D(filters=256, kernel_size=3, activation='relu', padding='same'),
    layers.MaxPool2D(),

    # Head: one softmax output per class. A single sigmoid unit cannot
    # represent five mutually exclusive classes under the sparse categorical
    # loss used below.
    layers.BatchNormalization(renorm=True),
    layers.Flatten(),
    layers.Dense(8, activation='relu'),
    layers.Dense(len(CLASSES), activation='softmax'),
])

# Compile AFTER the model is defined; compiling first would configure the
# model from the previous cell and leave this one uncompiled.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

# Train for the number of epochs declared in the notebook settings
history = model.fit(
    ds_train,
    validation_data=ds_valid,
    epochs=EPOCHS,
    verbose=0,
)

## Model with a pre-trained base (ResNet50)

## Model with a pre-trained base (VGG-16)

## Model with a pre-trained base (InceptionV3)

# Train the model

With `model.summary()` we'll see a printout of each of our layers, their corresponding output shape, and the associated number of parameters. At the bottom of the printout we'll see the total, trainable, and non-trainable parameter counts. When a model is built on a pre-trained base, we expect a large number of non-trainable parameters (because the weights have already been assigned in the pre-trained model).

In [None]:
# Print the layer-by-layer summary of the model currently bound to `model`
model.summary()

# Evaluating our model

In [None]:
# Print the names of the series stored in `history.history`
# (the learning-curve cell below selects columns by these names)
print(history.history.keys())

In [None]:
# Learning curves: one figure for the losses, one for the accuracies
history_frame = pd.DataFrame(history.history)
loss_columns = ['loss', 'val_loss']
accuracy_columns = ['sparse_categorical_accuracy', 'val_sparse_categorical_accuracy']
history_frame[loss_columns].plot()
history_frame[accuracy_columns].plot();

# Making predictions
Now that we've trained our model we can use it to make predictions! 

In [None]:
# this code will convert our test image data to a float32 
def to_float32(image, label):
    """Cast `image` to `tf.float32` and return it together with `label`.

    The second element is returned unchanged.
    """
    image_f32 = tf.cast(image, tf.float32)
    return image_f32, label

In [None]:
# NOTE(review): `get_test_dataset` is not defined in the visible part of this
# notebook — confirm it is provided before this cell runs.
test_ds = get_test_dataset(ordered=True)
# Cast the test images to float32; the second element of each pair is kept as is
test_ds = test_ds.map(to_float32)

print('Computing predictions...')
# Keep only the images for prediction (each element is an (image, idnum) pair).
# The former `test_images_ds = testing_dataset` line was dropped: the name is
# undefined and its value was overwritten immediately.
test_images_ds = test_ds.map(lambda image, idnum: image)
probabilities = model.predict(test_images_ds)
# Predicted class = index of the highest probability
predictions = np.argmax(probabilities, axis=-1)
print(predictions)

# Creating a submission file

In [None]:
print('Generating submission.csv file...')
test_ids_ds = test_ds.map(lambda image, idnum: idnum).unbatch()
test_ids = next(iter(test_ids_ds.batch(NUM_TEST_IMAGES))).numpy().astype('U') # all in one batch
np.savetxt('submission.csv', np.rec.fromarrays([test_ids, predictions]), fmt=['%s', '%d'], delimiter=',', header='id,label', comments='')
!head submission.csv