In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import tensorflow as tf
import tensorflow_addons as tfa
from kaggle_datasets import KaggleDatasets

In [None]:
# Pick the distribution strategy and the dataset locations in one place.
# TPU branch: resolve, connect and initialise the TPU, then read the data from
# the GCS paths reported by KaggleDatasets.
# Fallback branch: any ValueError raised inside the `try` (the original comment
# attributes it to "no TPU found") switches to MirroredStrategy and the local
# ../input directories.
try: # detect TPUs
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # TPU detection
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    CASSAVA_GCS_PATH = KaggleDatasets().get_gcs_path("cassava-leaf-disease-classification")
    MERGED_DATASET_PATH = KaggleDatasets().get_gcs_path("20192020-merged-tfrecords-512x512")
except ValueError: # no TPU found, detect GPUs
    strategy = tf.distribute.MirroredStrategy()
    CASSAVA_GCS_PATH = "../input/cassava-leaf-disease-classification"
    MERGED_DATASET_PATH = "../input/20192020-merged-tfrecords-512x512"

# Replica count is reused later to scale the global batch size.
print("Number of accelerators: ", strategy.num_replicas_in_sync)

In [None]:
# Glob pattern matching every TFRecord shard of the merged dataset, and the
# resolved list of shard paths.
train_tfrecords_dir = f"{MERGED_DATASET_PATH}/*.tfrec"
tfrec_filenames = tf.io.gfile.glob(train_tfrecords_dir)

In [None]:
# Schema used by `_parse_tfrec` to parse each serialized example.
feature_description = {
    # Encoded image bytes (decoded later in `_parse_image`).
    'image': tf.io.FixedLenFeature([], tf.string),
    # Identifier of the image; used below to build the train/validation split.
    'image_name': tf.io.FixedLenFeature([], tf.string),
    # Integer class id; falls back to 0 when the record carries no target.
    'target': tf.io.FixedLenFeature([], tf.int64, default_value=0), # using default value for test dataset
}

def _parse_tfrec(example_proto):
    """Parse one serialized example into ``(image, label, image_name)``.

    The record is parsed against the module-level ``feature_description``.
    The image is returned exactly as stored in the record; decoding happens
    in ``_parse_image``.
    """
    parsed = tf.io.parse_single_example(example_proto, feature_description)
    return parsed['image'], parsed['target'], parsed['image_name']

def _parse_image(image, label, image_name, training=False):
    """Decode an encoded image and build an ``(image, one_hot_label)`` pair.

    Args:
        image: scalar string tensor with the encoded image bytes.
        label: integer class id; one-hot encoded with depth 5.
        image_name: unused. Accepted so the function can be mapped directly
            over the ``(image, label, image_name)`` tuples from ``_parse_tfrec``.
        training: when True, ``tfa.image.gaussian_filter2d`` (library defaults)
            is applied before the float conversion.

    Returns:
        ``(image, label)`` where ``image`` is a float32 tensor of shape
        (512, 512, 3) passed through InceptionV3's ``preprocess_input`` and
        ``label`` is a one-hot vector of length 5.
    """
    # Force three channels and a single frame: without this a grayscale or
    # animated input decodes to a different element count and the fixed
    # reshape below raises.
    image = tf.io.decode_image(image, channels=3, expand_animations=False)
    # Pin the static shape that the model's Input layer is declared with.
    image = tf.reshape(image, (512, 512, 3))
    if training:
        image = tfa.image.gaussian_filter2d(image)
    image = tf.cast(image, tf.float32)
    image = tf.keras.applications.inception_v3.preprocess_input(image)

    return image, tf.one_hot(label, depth=5)

In [None]:
def get_image_name_labels(image, label, image_name):
    """Discard the image payload and return ``(image_name, label)``."""
    name_and_label = (image_name, label)
    return name_and_label

In [None]:
# Lightweight pass over every record that keeps only (image_name, label);
# the images are never decoded here.
all_dataset = (
    tf.data.TFRecordDataset(tfrec_filenames)
    .map(_parse_tfrec, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .map(get_image_name_labels, num_parallel_calls=tf.data.experimental.AUTOTUNE)
)
all_dataset

In [None]:
# Materialise (image_id, label) for every TFRecord entry so the split can be
# built with pandas. Ids stay as the raw values returned by `.numpy()`.
df = pd.DataFrame(
    [(name.numpy(), target.numpy()) for name, target in all_dataset],
    columns=["image_id", "label"],
)

# Competition train.csv, compared against the TFRecord contents below.
df2 = pd.read_csv(f"{CASSAVA_GCS_PATH}/train.csv")

In [None]:
# Row/column counts of the TFRecord-derived frame vs. train.csv.
df.shape, df2.shape

In [None]:
# Class distribution across all TFRecord entries.
df.label.value_counts()

In [None]:
# Number of train.csv ids that are absent from the TFRecords.
# The extension is removed with os.path.splitext: `str.strip(".jpg")` strips
# any leading/trailing '.', 'j', 'p' or 'g' characters rather than the suffix,
# which corrupts ids that start or end with one of those letters.
np.setdiff1d(
    df2.image_id.map(lambda x: os.path.splitext(x)[0].encode("utf-8")).values,
    df.image_id.values,
).shape

In [None]:
# Downsample class 3 to 3500 rows with a fixed seed.
df_cat_3 = df.loc[df['label'] == 3].sample(n=3500, random_state=0)
df_cat_3.shape

In [None]:
# Keep every row whose label is not 3; the downsampled class-3 rows are added
# back in the next cell.
df = df.loc[df['label'].ne(3)]
df.label.value_counts()

In [None]:
# Re-attach the downsampled class-3 rows. pd.concat replaces DataFrame.append,
# which is deprecated and no longer exists in pandas 2.x; the result is the same.
df = pd.concat([df, df_cat_3], ignore_index=True)
df.label.value_counts()

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# Seed the shuffle so the train/validation split is the same after every
# kernel restart (matches the random_state=0 used for the class-3 sampling).
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
# Only the first of the five folds is used.
train_idx, val_idx = next(iter(kf.split(df.image_id, df.label)))
train_image_ids = np.array(df['image_id'].iloc[train_idx])
val_image_ids = np.array(df['image_id'].iloc[val_idx])

In [None]:
# Number of image ids in the training split.
train_image_ids.shape

In [None]:
# Number of image ids in the validation split.
val_image_ids.shape

In [None]:
from tensorflow.keras.layers.experimental import preprocessing

# Random augmentations applied to whole training batches in the input pipeline.
# NOTE(review): in the training pipeline these layers run after
# inception_v3.preprocess_input — confirm RandomContrast behaves as intended
# on that value range.
data_augmentation = tf.keras.Sequential([
    preprocessing.RandomFlip("horizontal_and_vertical"),
    preprocessing.RandomRotation(0.4),
    preprocessing.RandomContrast(0.3),
    preprocessing.RandomTranslation(0.2, 0.2)
])

In [None]:
# Global batch size: 32 examples per replica.
BATCH_SIZE = 32 * strategy.num_replicas_in_sync
BATCH_SIZE

In [None]:
# Training pipeline: parse records -> keep ids in the training split ->
# shuffle -> decode/preprocess -> batch -> augment -> prefetch.
train_dataset = tf.data.TFRecordDataset(tfrec_filenames).map(_parse_tfrec, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Keep a record only when its id equals at least one entry of train_image_ids.
train_dataset = train_dataset.filter(lambda image, label, image_id: tf.reduce_any(tf.equal(image_id, train_image_ids)))

# Shuffle while the elements are still encoded bytes: the 1000-element buffer
# then holds compressed images instead of decoded 512x512x3 float32 tensors
# (about 3 MB each, i.e. roughly 3 GB for the whole buffer).
train_dataset = train_dataset.shuffle(1000)

# _parse_image runs with its default training=False, so no Gaussian blur is applied.
train_dataset = train_dataset.map(_parse_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)

train_dataset = train_dataset.batch(BATCH_SIZE)
# Augment whole batches; training=True forces the random layers to be active.
train_dataset = train_dataset.map(lambda images, labels: (data_augmentation(images, training=True), labels),  num_parallel_calls=tf.data.experimental.AUTOTUNE)

train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

train_dataset

In [None]:
# Validation pipeline: same parsing and preprocessing as training, restricted
# to the validation ids, with no shuffling and no augmentation.
valid_dataset = (
    tf.data.TFRecordDataset(tfrec_filenames)
    .map(_parse_tfrec, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .filter(lambda image, label, image_id: tf.reduce_any(tf.equal(image_id, val_image_ids)))
    .map(_parse_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

valid_dataset

In [None]:
# Preview up to 12 augmented images from the first training batch.
for images, labels in train_dataset.take(1):
    plt.figure(figsize=(10,10))
    # Start at index 0 (the previous 1..12 indexing skipped the first image and
    # needed a 13th), and never index past a short batch.
    for i in range(min(12, images.shape[0])):
        plt.subplot(3, 4, i + 1)
        # The pipeline outputs inception-preprocessed values in [-1, 1];
        # imshow expects floats in [0, 1], so rescale and clip any overshoot
        # introduced by the contrast augmentation.
        plt.imshow(np.clip((images[i].numpy() + 1.0) / 2.0, 0.0, 1.0))
        plt.axis("off")

In [None]:
# Build the backbone inside the strategy scope.
with strategy.scope():
    # InceptionV3 without its classification head, initialised from ImageNet
    # weights, for 512x512 RGB inputs.
    base_model = tf.keras.applications.InceptionV3(
        include_top=False,
        input_shape=(512,512,3),
        weights="imagenet"
    )

    # The whole backbone is fine-tuned; no layers are frozen.
    base_model.trainable = True

In [None]:
# Classification head on top of the backbone: global average pooling, two
# dense ReLU layers (1024, 512) with 0.2 dropout around them, and a 5-way
# softmax output.
with strategy.scope():
    inputs = tf.keras.layers.Input(shape=(512, 512, 3), dtype=tf.float32)

    x = base_model(inputs)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dropout(rate=0.2)(x)

    x = tf.keras.layers.Dense(units=1024, activation='relu')(x)
    x = tf.keras.layers.Dropout(rate=0.2)(x)

    x = tf.keras.layers.Dense(units=512, activation='relu')(x)
    x = tf.keras.layers.Dropout(rate=0.2)(x)

    outputs = tf.keras.layers.Dense(units=5, activation="softmax")(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)


In [None]:
# Compile inside the strategy scope. Categorical cross-entropy matches the
# one-hot labels produced by `_parse_image`.
with strategy.scope():
    model.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=['accuracy']
    )

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Stop after 5 epochs without improvement of the monitored quantity (callback
# default, not overridden here) and restore the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)

In [None]:
# Upper bound on training epochs; early stopping may end training sooner.
EPOCHS = 100

In [None]:
# Train on the training split and validate on the held-out split each epoch.
history = model.fit(train_dataset, validation_data=valid_dataset, epochs=EPOCHS, callbacks=[early_stopping])

In [None]:
# Re-bind `history` from the model's own `history` attribute.
# NOTE(review): presumably the same History object fit() returned above,
# which would make this cell redundant — confirm.
history=model.history

In [None]:
# Loss curves per epoch; the y-axis is capped at 2 so early spikes do not
# flatten the rest of the curve.
plt.plot(history.epoch, history.history['loss'], label="Training Loss")
plt.plot(history.epoch, history.history['val_loss'], label="Validation Loss")
plt.title("Loss Graph")
plt.ylim((0, 2))
plt.legend()
plt.show()

# Accuracy curves per epoch.
plt.plot(history.epoch, history.history['accuracy'], label="Training Accuracy")
plt.plot(history.epoch, history.history['val_accuracy'], label="Validation Accuracy")
plt.title("Accuracy Graph")
plt.legend()
plt.show()

In [None]:
# Loss and accuracy on the validation split.
model.evaluate(valid_dataset)

In [None]:
# True class ids, collected batch by batch from the one-hot validation labels.
actuals = [
    class_id
    for _, onehot_batch in valid_dataset
    for class_id in np.argmax(onehot_batch.numpy(), axis=1)
]

# Predicted class ids from a second pass over the same dataset; the two lists
# line up because valid_dataset is not shuffled.
preds = list(np.argmax(model.predict(valid_dataset), axis=1))

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix on the validation split: rows are true classes, columns
# are predicted classes, cells show integer counts.
sns.heatmap(confusion_matrix(actuals, preds), annot=True, fmt="d", cmap='Blues')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')

In [None]:
# Persist the trained model as a single HDF5 file, replacing any existing one.
model.save("./model.h5", overwrite=True, save_format='h5')

In [None]:
# Evaluation pipeline over every record in the TFRecords (no id filter), so it
# includes the training split and the class-3 rows dropped by the downsampling.
# Note: this re-binds `all_dataset`, which earlier held (image_name, label) pairs.
all_dataset = (
    tf.data.TFRecordDataset(tfrec_filenames)
    .map(_parse_tfrec, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .map(_parse_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

all_dataset

In [None]:
# Loss and accuracy over the full dataset. This includes the images the model
# was trained on, so it is not a held-out estimate.
model.evaluate(all_dataset)