In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# A baseline model based on VGG16
# Group members: Bangxi Xiao, Daxin Niu, Wendy Huai
# Contact: bangxi_xiao@brown.edu, daxin_niu@brown.edu, zuxuan_huai@brown.edu

# This file implements a baseline model for cassava leaf disease classification

import json
import os
import pickle
import shutil
import time

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from matplotlib import pyplot as plt
from PIL import Image, ImageStat
from scipy import stats
from skimage import io, color
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model, Sequential
from tensorflow.keras.applications import VGG16
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.layers import Dense, Flatten, Dropout, Conv2D, MaxPool2D, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
from tensorflow.keras.regularizers import l1

# Read the competition's training manifest (one row per image: image_id, label)
# and print the first rows as a quick sanity check.
train_raw = pd.read_csv(
    '/kaggle/input/cassava-leaf-disease-classification/train.csv',
    engine='python',
    encoding='utf_8_sig',
)
print(train_raw.head())


In [None]:
# Image-pipeline settings shared by both generators below.
_batch_size = 32
_image_width = 160
_image_height = 120
# NOTE: Keras reads `target_size` as (height, width), so despite the constant
# names above the images end up 160 px high by 120 px wide. The same (160, 120)
# ordering is used for the model input shape and at inference time, so the
# values are deliberately kept unchanged here.
_target_size = (_image_width, _image_height)
_validation_split = 0.2
_train_image_dir = '/kaggle/input/cassava-leaf-disease-classification/train_images'

# flow_from_dataframe with class_mode='categorical' expects string labels.
train_raw['label'] = train_raw['label'].astype(str)

# Hold-out split of the manifest.
# NOTE: the generators below do not consume these frames; they split
# `train_raw` themselves through `validation_split` / `subset`.
x_train, x_test, y_train, y_test = train_test_split(train_raw['image_id'], train_raw['label'], test_size=0.2)
xy_train = pd.DataFrame({'x': x_train, 'y': y_train})
xy_test = pd.DataFrame({'x': x_test, 'y': y_test})

# Training images: rescaled to [0, 1] and randomly augmented.
train_data_gen = ImageDataGenerator(
    rescale=1. / 255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    validation_split=_validation_split
)

# Validation images: same rescaling but NO random augmentation, so that
# val_accuracy is measured on undistorted images. Both generators slice the
# same dataframe with the same validation_split, so the 'training' and
# 'validation' subsets stay disjoint.
val_data_gen = ImageDataGenerator(
    rescale=1. / 255,
    validation_split=_validation_split
)


train_generator = train_data_gen.flow_from_dataframe(dataframe=train_raw,
                                                     directory=_train_image_dir,
                                                     subset='training',
                                                     x_col='image_id',
                                                     y_col='label',
                                                     shuffle=True,
                                                     target_size=_target_size,
                                                     batch_size=_batch_size,
                                                     class_mode='categorical')


validation_generator = val_data_gen.flow_from_dataframe(dataframe=train_raw,
                                                        directory=_train_image_dir,
                                                        subset='validation',
                                                        x_col='image_id',
                                                        y_col='label',
                                                        shuffle=True,
                                                        target_size=_target_size,
                                                        batch_size=_batch_size,
                                                        class_mode='categorical')


def tb_callback(exp_name):
    """Build a TensorBoard callback for one experiment.

    Args:
        exp_name: Name appended to the module-level `_log_dir` prefix to form
            the log directory of this run.

    Returns:
        A `TensorBoard` callback configured with `histogram_freq=1` and
        `profile_batch=0`.
    """
    experiment_log_dir = _log_dir + exp_name
    return TensorBoard(log_dir=experiment_log_dir, histogram_freq=1, profile_batch=0)


def build_baseline_vgg(input_shape=(160, 120, 3), num_classes=5):
    """Build and compile the VGG16-based baseline classifier.

    The network is an untrained VGG16 convolutional base (`weights=None`,
    `include_top=False`) followed by three Dense+Dropout stages
    (512, 256, 128 units, dropout rate 0.25) and a softmax output layer.

    The model is compiled with the module-level `_opt`, `_loss` and `_metrics`,
    which must therefore be defined before this function is called.

    Args:
        input_shape: Shape of one input image passed to VGG16. Defaults to the
            (160, 120, 3) shape used throughout this notebook.
        num_classes: Number of units in the softmax output layer. Defaults to 5.

    Returns:
        The compiled Keras `Model`.
    """
    backbone = VGG16(weights=None, include_top=False, input_shape=input_shape)

    x = Flatten()(backbone.output)
    # Three identical fully connected stages of decreasing width
    for units in (512, 256, 128):
        x = Dense(units, activation='relu')(x)
        x = Dropout(rate=0.25)(x)

    predictions = Dense(num_classes, activation='softmax')(x)
    model = Model(inputs=backbone.input, outputs=predictions)
    model.compile(optimizer=_opt,
                  loss=_loss,
                  metrics=_metrics)

    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    model.summary()

    return model

In [None]:
_shuffle = True

# TensorBoard logs go under /kaggle/working, the writable directory that Kaggle
# preserves as notebook output.
_log_dir = '/kaggle/working/logs/baseline_model/'
_seed = 27
# Apply the seed so that weight initialisation and NumPy-driven randomness
# start from a fixed state on every run.
np.random.seed(_seed)
tf.random.set_seed(_seed)
_learning_rate = 0.0001
_schedule = ExponentialDecay(_learning_rate, decay_steps=100_000, decay_rate=0.96)
_opt = Adam(learning_rate=_schedule)
# NOTE: patience (20) is larger than _epochs (4), so this callback cannot stop
# the run as currently configured.
_es = EarlyStopping(monitor='val_accuracy', patience=20)
_tb = tb_callback('Baseline_model_1')
_callbacks = [_es, _tb]
_metrics = ['accuracy']
_loss = 'categorical_crossentropy'
_epochs = 4


# build_baseline_vgg() reads _opt, _loss and _metrics, so it is called only
# after they have been defined above.
baseline_model1 = build_baseline_vgg()
# NOTE: fit() ignores `shuffle` when the input is a generator; batch order is
# governed by the shuffle=True setting given to the generators themselves.
baseline_model1_hist = baseline_model1.fit(train_generator,
                                           epochs=_epochs,
                                           validation_data=validation_generator,
                                           callbacks=_callbacks,
                                           shuffle=_shuffle)


In [None]:
# save model
from keras.models import load_model
import os
def save_model(model, name):
    """Save `model` to ``<current working dir>/saved_models/<name>.h5``.

    The ``saved_models`` directory is created when it does not exist yet.
    The destination path is printed after saving.

    Args:
        model: Object exposing a ``save(path)`` method (e.g. a Keras model).
        name: File name without extension; ``.h5`` is appended.
    """
    target_dir = os.path.join(os.getcwd(), 'saved_models')
    # Create the output folder on first use; a no-op when it is already there
    os.makedirs(target_dir, exist_ok=True)

    # Save model and weights
    model_path = os.path.join(target_dir, '{}.h5'.format(name))
    model.save(model_path)
    print('Saved trained model at %s ' % model_path)

In [None]:
# Persist the trained baseline model; save_model writes it to
# <current working dir>/saved_models/baseline_model1.h5
save_model(baseline_model1, 'baseline_model1')

In [None]:
# Make sure to save the model you trained to /kaggle/working! 
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import smart_resize
# Submission template: supplies the image_id values that need a prediction
sample_sub = pd.read_csv('/kaggle/input/cassava-leaf-disease-classification/sample_submission.csv')

# Accumulator for the predicted labels, filled by the prediction loop
preds = []

## Load model. Use "load_weights" if you only save your model weights.
model = keras.models.load_model("./saved_models/baseline_model1.h5")

# Show the first rows of the template as the cell output
sample_sub.head()

In [None]:
# Inference must reproduce the preprocessing used by the training generator:
# images resized directly to target_size=(160, 120) and multiplied by 1/255.
test_image_dir = '/kaggle/input/cassava-leaf-disease-classification/test_images/'

# Start from an empty list so that re-running this cell cannot append a second
# set of predictions to an already filled list.
preds = []

for image in sample_sub.image_id:
    # Resize at load time, exactly as the training generator does
    img = keras.preprocessing.image.load_img(test_image_dir + image, target_size=(160, 120))
    # Scale pixel values to [0, 1]; the model never saw raw 0-255 inputs
    img = img_to_array(img) / 255.
    # Add the batch dimension: (160, 120, 3) -> (1, 160, 120, 3)
    img = np.expand_dims(img, axis=0)

    # Now apply your model and save your prediction:
    prediction = model.predict(img)

    preds.append(np.argmax(prediction))

# One prediction per row of the submission template
assert len(preds) == len(sample_sub), 'number of predictions does not match the submission template'

my_submission = pd.DataFrame({'image_id': sample_sub.image_id, 'label': preds})
my_submission.to_csv('/kaggle/working/submission.csv', index=False)