# Import libs

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import shutil
import csv
import tensorflow as tf
import matplotlib.pyplot as plt
from PIL import Image
from random import random
from tensorflow.keras.activations import linear
from tensorflow.keras.activations import softmax
from tensorflow.keras.activations import tanh
from tensorflow.keras.layers import BatchNormalization, Activation
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.layers.experimental.preprocessing import RandomRotation, RandomFlip, Resizing
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.metrics import categorical_accuracy
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import Input
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.optimizers import Adam

# Config TPU

In [None]:
# try:
#     tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
#     print(f'Running on TPU {tpu.master()}')
# except ValueError:
#     tpu = None

# if tpu:
#     tf.config.experimental_connect_to_cluster(tpu)
#     tf.tpu.experimental.initialize_tpu_system(tpu)
#     strategy = tf.distribute.experimental.TPUStrategy(tpu)
# else:
#     strategy = tf.distribute.get_strategy()

# AUTO = tf.data.experimental.AUTOTUNE
# REPLICAS = strategy.num_replicas_in_sync
# print(f'REPLICAS: {REPLICAS}')

# Path variables

In [None]:
# Input location of the competition data and the working directory this notebook writes to.
origin = "../input/cassava-leaf-disease-classification"
destination = "../output/kaggle/working"

# Train / validation image trees; the "Clean + Resize + Load" cell fills them with
# one sub-folder per class.
dataset_base_folder = destination + "/dataset"
dataset_train_folder = f"{dataset_base_folder}/train"
dataset_val_folder = f"{dataset_base_folder}/val"
# Both split folders are created here. The second existence check used to test the
# train folder again, so the validation folder was never created by this cell.
os.makedirs(dataset_train_folder, exist_ok=True)
os.makedirs(dataset_val_folder, exist_ok=True)

# Class folder names: the strings "0" to "4".
destination_classes = [str(i) for i in range(5)]

# Source images, their resized copies, and the CSV that the load step reads
# (first column: file name, second column: class folder).
original_train_data_folder = origin + "/train_images"
resize_train_data_folder = destination + "/train_images_resized"
csv_file = origin + "/train.csv"

# Sample submission with an extra column holding the path of each test image.
sub_df = pd.read_csv(origin + '/sample_submission.csv')
sub_df['paths'] = origin + "/test_images/" + sub_df.image_id

# Threshold compared against random() when assigning an image to the validation split.
val_train_ratio = 0.2
# Image size used for resizing, and the matching (height, width, channels) model input.
target_size = (128, 128)
input_shape = (128, 128, 3)

# Clean + Resize + Load dataset

In [None]:
# Clean: make sure every class folder exists in both splits and contains no files.
for c in destination_classes:
    for split_root in (dataset_train_folder, dataset_val_folder):
        folder = f'{split_root}/{c}'
        if not os.path.exists(folder):
            # Nothing to empty: just create the folder.
            os.makedirs(folder)
            continue
        # Folder is left over from a previous run: delete each entry it contains.
        for filename in os.listdir(folder):
            os.remove(f'{folder}/{filename}')

# Resize: save a `target_size` copy of every original training image under the
# same file name in `resize_train_data_folder`.
os.makedirs(resize_train_data_folder, exist_ok=True)
for filename in os.listdir(original_train_data_folder):
    # The context manager closes the source file once the resized copy is written,
    # instead of leaving the handle to be released by garbage collection.
    with Image.open(original_train_data_folder + "/" + filename) as image:
        image.resize(target_size).save(resize_train_data_folder + '/' + filename)


# Load: map file name -> class folder from the first two CSV columns. Every row
# is stored, so a header row (if the file has one) ends up as an ordinary entry.
with open(csv_file) as fo:
    filename_to_class_folder = {row[0]: row[1] for row in csv.reader(fo)}

# Copy each resized image into <split>/<class>/<file name>.
for filename in os.listdir(resize_train_data_folder):
    # One random() draw per file: below `val_train_ratio` -> validation, else train.
    # NOTE(review): no seed is set, so the split differs between runs.
    destination_folder = dataset_val_folder if random() < val_train_ratio else dataset_train_folder
    class_folder = filename_to_class_folder[filename]

    shutil.copyfile(
        f'{resize_train_data_folder}/{filename}',
        f'{destination_folder}/{class_folder}/{filename}'
    )

# Declare functions

In [None]:
def compute_class_images_count(base_folder: str, class_name: str):
    """Return the number of directory entries in `<base_folder>/<class_name>`."""
    class_folder = f'{base_folder}/{class_name}'
    return len(os.listdir(class_folder))


def compute_all_classes_images_count(base_folder: str):
    """Return the summed entry count of every class folder under `base_folder`.

    The class folder names come from the module-level `destination_classes`.
    """
    total = 0
    for class_name in destination_classes:
        total += compute_class_images_count(base_folder, class_name)
    return total


def compute_train_images_count():
    """Return the number of entries across all class folders of the training split."""
    train_total = compute_all_classes_images_count(dataset_train_folder)
    return train_total


def compute_val_images_count():
    """Return the number of entries across all class folders of the validation split."""
    val_total = compute_all_classes_images_count(dataset_val_folder)
    return val_total

def create_dataset_iterator(base_folder: str, size: int, augment: bool = True):
    """Build an endlessly repeating NumPy batch iterator over an image folder tree.

    Images are read with `ImageDataGenerator.flow_from_directory` one image per
    generator batch, always scaled by 1/255. The first `size` generator batches are
    kept, re-batched to the module-level `batch_size` (looked up when this function
    is called), cached, repeated, and prefetched.

    Args:
        base_folder: Directory handed to `flow_from_directory`.
        size: Number of single-image generator batches to keep.
        augment: When True (the default, and the previous behaviour) random rotation,
            zoom, flips, shear and shifts are applied. Pass False to only rescale,
            for example for a validation set.

    Returns:
        The result of `as_numpy_iterator()` on the assembled `tf.data` pipeline.

    NOTE(review): `.cache()` comes after the generator, so the images produced
    during the first pass are the ones replayed on every later pass; the random
    augmentation is therefore not re-drawn per epoch.
    """
    if augment:
        augmentation_options = dict(
            preprocessing_function=None,
            rotation_range=45,
            zoom_range=0.2,
            horizontal_flip=True,
            vertical_flip=True,
            fill_mode='nearest',
            shear_range=0.1,
            height_shift_range=0.1,
            width_shift_range=0.1,
        )
    else:
        augmentation_options = {}

    def inner_func():
        # A new generator is built on every call so that the dataset can restart it.
        return ImageDataGenerator(
            rescale=1.0 / 255,
            **augmentation_options
        ).flow_from_directory(
                base_folder,
                target_size=target_size,
                batch_size=1)

    return (tf.data.Dataset.from_generator(
        inner_func,
        output_types=(tf.float32, tf.float32),
        output_shapes=((1, *target_size, 3), (1, len(destination_classes))))
            .take(size)
            .unbatch()
            .batch(batch_size)
            .cache()
            .repeat()
            .prefetch(2)
            .as_numpy_iterator())

# Declare variables

In [None]:
# Batching and schedule. `batch_size` is read by the dataset iterator and by the
# training function; `ref_batch_size` is not referenced in the visible cells.
batch_size = 256
ref_batch_size = 1024
epochs = 100

# Regularisation used by the hidden-layer builders.
dropout = 0.2

# SGD settings; only the commented-out SGD optimizer line refers to them.
lr = 0.05
momentum = 0.95

# Number of files currently present in the train / validation class folders.
train_size = compute_train_images_count()
val_size = compute_val_images_count()

# Training labels table with the `label` column cast to string.
train_labels = pd.read_csv(os.path.join(origin, "train.csv")).astype({'label': 'str'})

# Train models

In [None]:
def build_network_and_train_network(model, name, build_hidden_layers, dataset_train_it, dataset_val_it):
    """Complete `model` with layers, compile it, train it, save it and print its summary.

    Layers are appended in this order: Flatten, whatever `build_hidden_layers`
    adds, Flatten, and a softmax Dense layer with one unit per entry of the
    module-level `destination_classes`.

    Args:
        model: Model object that is mutated in place through `add`, `compile`,
            `fit`, `save` and `summary` (a Keras Sequential in this notebook).
        name: File stem; the trained model is saved to `name + '.h5'`.
        build_hidden_layers: Callable that receives `model` and adds hidden layers.
        dataset_train_it: Training input passed to `fit`; expected to yield
            ready-made batches.
        dataset_val_it: Validation input passed as `validation_data`.

    Returns:
        Whatever `model.fit` returns.

    Step counts are derived from the module-level `train_size`, `val_size` and
    `batch_size`; the number of epochs comes from the module-level `epochs`.
    """
    model.add(tf.keras.layers.Flatten())
    build_hidden_layers(model)
    # Hidden layers may leave a multi-dimensional output, so flatten again before the head.
    model.add(tf.keras.layers.Flatten())
    model.add(Dense(len(destination_classes), activation=softmax))
    model.compile(
#         optimizer=SGD(momentum=momentum, lr=lr),
        optimizer=Adam(),
        loss=categorical_crossentropy,
        metrics=[categorical_accuracy]
    )
    # `batch_size` is deliberately not passed to fit(): the iterators already yield
    # batches, and Keras documents that it must not be given for such inputs.
    result = model.fit(
        dataset_train_it,
        validation_data=dataset_val_it,
        steps_per_epoch=train_size // batch_size,
        validation_steps=val_size // batch_size,
        epochs=epochs
    )

    model.save(name + '.h5')
    model.summary()
    return result
def Linear(model):
    """Hidden-layer builder that adds nothing to `model`."""
    return None


def MLP(model):
    """Append three blocks of Dense(1024, relu) followed by Dropout(`dropout`).

    `dropout` is the module-level rate.
    """
    hidden_units = 1024
    for _ in range(3):
        model.add(tf.keras.layers.Dense(hidden_units, activation=tf.keras.activations.relu))
        model.add(tf.keras.layers.Dropout(dropout))


def CNN(model):
    """Append a Reshape to `input_shape` followed by five convolutional stages.

    Each stage is a run of 3x3 'same'-padded relu convolutions, then
    MaxPool2D(2, 2) and Dropout(`dropout`). Filters / convolutions per stage:
    128 x 3, 64 x 3, 32 x 2, 16 x 2, 8 x 2. The very first convolution is also
    given `input_shape`, even though the Reshape layer is added before it.
    """
    layers = tf.keras.layers
    relu = tf.keras.activations.relu
    # (filters, number of convolutions) for each stage, in order.
    stages = ((128, 3), (64, 3), (32, 2), (16, 2), (8, 2))

    model.add(layers.Reshape(input_shape))

    is_first_conv = True
    for filters, conv_count in stages:
        for _ in range(conv_count):
            extra_kwargs = {'input_shape': input_shape} if is_first_conv else {}
            model.add(layers.Conv2D(filters, (3, 3), padding='same', activation=relu, **extra_kwargs))
            is_first_conv = False
        model.add(layers.MaxPool2D(2, 2))
        model.add(layers.Dropout(dropout))
    

def VGG16(model):
    """Append the `CNN` layers and then the `MLP` layers to `model`."""
    for add_stage in (CNN, MLP):
        add_stage(model)
    
    
# A single Sequential instance: the training function appends layers to it in
# place, and the submission cell predicts with this same object.
model = tf.keras.models.Sequential()

# One training history per trained network. The other builders (Linear, MLP, CNN)
# can be trained by swapping the name / builder pair below.
all_logs = [
    build_network_and_train_network(
        model,
        'VGG16',
        VGG16,
        create_dataset_iterator(dataset_train_folder, train_size),
        create_dataset_iterator(dataset_val_folder, val_size),
    ),
]

# Show charts/results 

In [None]:
# Two figures: accuracy (training, then validation) and loss (training, then
# validation). Every entry of `all_logs` contributes one curve per history key.
metric_figures = (
    ('categorical_accuracy', 'val_categorical_accuracy'),
    ('loss', 'val_loss'),
)

for figure_keys in metric_figures:
    for history_key in figure_keys:
        for logs in all_logs:
            y_coords = logs.history[history_key]
            # x axis is the position of each recorded value in the history list.
            x_coords = list(range(len(y_coords)))
            plt.plot(x_coords, y_coords)

    plt.grid()
    plt.show()

# Submit

In [None]:
# Predict one label per test image listed in the sample submission and write
# the result to submission.csv.
preds = []
sample_sub = pd.read_csv(origin + '/sample_submission.csv')

for image in sample_sub.image_id:
    img = tf.keras.preprocessing.image.load_img(origin + '/test_images/' + image)
    img = tf.keras.preprocessing.image.img_to_array(img)
    # Use the shared `target_size` so the test size always follows the training size.
    # NOTE(review): training images were resized with PIL's Image.resize, while
    # smart_resize is a different resize routine - confirm the mismatch is acceptable.
    img = tf.keras.preprocessing.image.smart_resize(img, target_size)
    # The training generator scales pixels by 1/255; the model must see test
    # images on the same scale, otherwise its inputs are about 255 times too large.
    img = img * (1.0 / 255)
    img = np.expand_dims(img, 0)  # add the batch dimension expected by predict()
    prediction = model.predict(img)
    # Index of the highest score is stored as the label.
    preds.append(np.argmax(prediction))

my_submission = pd.DataFrame({'image_id': sample_sub.image_id, 'label': preds})
my_submission.to_csv('submission.csv', index=False)
my_submission