# This notebook demonstrates the use of TFRecords, because TPU training is optimized for a TFRecord input pipeline

# Content
1. Setup
2. Data processing
3. Visualize sample batch
4. Build and train model
5. Make predictions
6. Tips for improvement


# Setup

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
AUTOTUNE = tf.data.experimental.AUTOTUNE
from kaggle_datasets import KaggleDatasets
import json
from functools import partial
print("TensorFlow version: ", tf.__version__)

In [None]:
# Connect to a TPU when one is attached; otherwise fall back to the default
# distribution strategy so the notebook still runs.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print("Device:", tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # `Exception` instead of a bare `except:` so that kernel interrupts
    # (KeyboardInterrupt / SystemExit) are not swallowed, and the reason for
    # the fallback is visible in the cell output.
    print("TPU not available, using the default strategy:", tpu_error)
    strategy = tf.distribute.get_strategy()
print('#Replicas: ', strategy.num_replicas_in_sync)

In [None]:
# Load the competition's train.csv and preview its first rows.
TRAIN_CSV_PATH = '../input/cassava-leaf-disease-classification/train.csv'
train_df = pd.read_csv(TRAIN_CSV_PATH)
train_df.head()

In [None]:
# Number of training rows for each label value 0..4 (used later for the class weights).
label_counts = np.bincount(train_df['label'])
n0, n1, n2, n3, n4 = label_counts

In [None]:
# Total number of rows in train.csv.
total = len(train_df)
total

In [None]:
# Rows per label value (most frequent first) -- shows how imbalanced the classes are.
train_df['label'].value_counts()

In [None]:
# Storage path of the attached Kaggle dataset; the TFRecord globs below are built from it.
GCS_PATH = KaggleDatasets().get_gcs_path()

# Data processing

In [None]:
# Number of examples per batch produced by get_dataset.
BATCH_SIZE = 64
# Spatial size used by decode_image's reshape and as the model's input shape.
IMAGE_SIZE = [512, 512]

In [None]:
# Split the labeled TFRecord files 70% / 20% / remainder into
# training / validation / evaluation file lists.
FILENAMES = tf.io.gfile.glob(GCS_PATH + "/train_tfrecords/*.tfrec")
num_labeled_files = len(FILENAMES)
train_split = int(0.7 * num_labeled_files)
val_split = int(0.2 * num_labeled_files)
val_end = train_split + val_split

TRAINING_FILENAMES = FILENAMES[:train_split]
VAL_FILENAMES = FILENAMES[train_split:val_end]
EVAL_FILENAMES = FILENAMES[val_end:]

TEST_FILENAMES = tf.io.gfile.glob(GCS_PATH + "/test_tfrecords/*.tfrec")

# Report how many files ended up in each group.
for description, file_list in (
    ("Total labeled files: ", FILENAMES),
    ("Train TFRecord Files:", TRAINING_FILENAMES),
    ("Validation TFRecord Files:", VAL_FILENAMES),
    ("Evalution TFRecord Files:", EVAL_FILENAMES),
    ("Test TFRecord Files:", TEST_FILENAMES),
):
    print(description, len(file_list))

## Get order of IDs of test data, so that we can create submission file accordingly

In [None]:
# Show the test TFRecord paths that were matched.
TEST_FILENAMES

In [None]:
# Schema of one serialized test example: two scalar string features.
# (See the note below this cell on how to find a feature description.)
test_image_feature_description = {
    "image": tf.io.FixedLenFeature([], tf.string),
    "image_name": tf.io.FixedLenFeature([], tf.string),
}


def _parse_image_function_test(example_proto):
    """Parse one serialized tf.train.Example with the test feature description.

    Returns the dictionary produced by tf.io.parse_single_example, keyed by
    "image" and "image_name".
    """
    parsed_example = tf.io.parse_single_example(
        example_proto, test_image_feature_description
    )
    return parsed_example


# Raw serialized records, then the same records parsed into feature dictionaries.
test_raw_dataset = tf.data.TFRecordDataset(TEST_FILENAMES)
test_parsed_image_dataset = test_raw_dataset.map(_parse_image_function_test)

To learn how to find the feature description of a TFRecord file, have a look at this [notebook](https://www.kaggle.com/senkmp/simple-eda-using-tf-records).

In [None]:
# Collect the test image names in the order the parsed test dataset yields them,
# so the submission rows can be matched to the predictions.
# A comprehension is used so no loop variable named `id` is left behind to
# shadow the builtin `id()` in later cells.
ids = [
    image_features['image_name'].numpy().decode("utf-8")
    for image_features in test_parsed_image_dataset
]
len(ids)

## Input pipeline

In [None]:
def decode_image(image):
    """Decode JPEG bytes into a float32 tensor of shape [*IMAGE_SIZE, 3].

    The pixel values are only cast to float32; no rescaling is applied here.
    """
    decoded = tf.image.decode_jpeg(image, channels=3)
    as_float = tf.cast(decoded, tf.float32)
    # The reshape gives the tensor a static shape of IMAGE_SIZE plus 3 channels.
    return tf.reshape(as_float, [*IMAGE_SIZE, 3])

def read_tfrecord(example, labeled):
    """Parse one serialized example and decode its image.

    Args:
        example: a serialized tf.train.Example.
        labeled: when true, the record is also expected to carry an int64
            "target" feature.

    Returns:
        (image, label) when `labeled` is true, where label is a one-element
        list holding the int64 target; otherwise just the image.
    """
    # Features shared by labeled and unlabeled records.
    tfrecord_format = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "image_name": tf.io.FixedLenFeature([], tf.string),
    }
    if labeled:
        tfrecord_format["target"] = tf.io.FixedLenFeature([], tf.int64)

    parsed = tf.io.parse_single_example(example, tfrecord_format)
    image = decode_image(parsed["image"])
    if not labeled:
        return image

    label = [tf.cast(parsed["target"], tf.int64)]
    return image, label

In [None]:
def load_dataset(filenames, labeled=True, order=True):
    """Build an unbatched dataset of decoded examples from TFRecord files.

    Args:
        filenames: TFRecord paths passed to tf.data.TFRecordDataset.
        labeled: forwarded to read_tfrecord; True yields (image, label) pairs,
            False yields images only.
        order: NOTE -- the flag is applied inverted. `experimental_deterministic`
            is set to `not order`, so order=True permits elements to arrive out
            of order, while order=False requests deterministic ordering.

    Returns:
        The mapped tf.data dataset.
    """
    read_options = tf.data.Options()
    read_options.experimental_deterministic = not order
    parse_example = partial(read_tfrecord, labeled=labeled)
    return (
        tf.data.TFRecordDataset(filenames)
        .with_options(read_options)
        .map(parse_example, num_parallel_calls=AUTOTUNE)
    )

def get_dataset(filenames, labeled=True, shuffle=True, order=True):
    """Build a batched, prefetched dataset from TFRecord files.

    Args:
        filenames: TFRecord paths, forwarded to load_dataset.
        labeled: forwarded to load_dataset.
        shuffle: when true, shuffle individual examples with a 2048-element buffer.
        order: forwarded to load_dataset unchanged.

    Returns:
        A tf.data dataset yielding batches of BATCH_SIZE examples.
    """
    dataset = load_dataset(filenames, labeled=labeled, order=order)
    if shuffle:
        dataset = dataset.shuffle(2048)
    # Batch first and prefetch last, so complete batches (not single examples)
    # are prepared in the background while the previous batch is consumed.
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    return dataset

In [None]:
# Only the training data needs shuffling. Loss and accuracy on the validation
# and evaluation sets do not depend on example order, so those datasets skip
# the 2048-image shuffle buffer (and the time and memory needed to fill it).
train_dataset = get_dataset(TRAINING_FILENAMES)
val_dataset = get_dataset(VAL_FILENAMES, shuffle=False)
eval_dataset = get_dataset(EVAL_FILENAMES, shuffle=False)

In [None]:
# Unlabeled, unshuffled test batches. order=False makes load_dataset set
# experimental_deterministic=True (the flag is applied as `not order`) --
# presumably so the predictions line up with the `ids` list collected earlier.
test_dataset = get_dataset(TEST_FILENAMES, labeled=False,shuffle=False,order=False)


# Visualize sample batch

In [None]:
# Load the mapping from label number (as a string key) to disease name.
labels_file = '../input/cassava-leaf-disease-classification/label_num_to_disease_map.json'
# The context manager closes the file even if json.load raises.
with open(labels_file) as f:
    labels = json.load(f)
labels

In [None]:
# Show the dataset's repr (element shapes and dtypes) as a quick sanity check.
train_dataset

In [None]:
# Pull a single (images, labels) batch from the training dataset for inspection.
image_batch, label_batch = next(iter(train_dataset))


In [None]:
def show_batch(image_batch, label_batch):
    """Plot up to the first 25 images of a batch in a 5x5 grid with their disease names.

    Args:
        image_batch: indexable batch of images; each image is divided by 255
            for display (assumes pixel values in [0, 255]).
        label_batch: indexable batch of labels; element 0 of each entry is the
            class number, looked up in `labels` by its string form.
    """
    plt.figure(figsize=(15, 15))
    # Cap at the batch length so a batch with fewer than 25 images does not
    # raise an IndexError.
    num_shown = min(25, len(image_batch))
    for n in range(num_shown):
        plt.subplot(5, 5, n + 1)
        plt.imshow(image_batch[n] / 255.0)
        class_number = label_batch[n][0]
        plt.title(labels[str(class_number)])
        plt.axis("off")
    plt.tight_layout()



# Convert the sampled tensors to NumPy arrays and plot them.
show_batch(image_batch.numpy(), label_batch.numpy())

# Build and train model

## Give them equal rights :)

In [None]:
# Weight each class inversely to its frequency: (1 / count) * total / 5,
# so the rarer a class is, the larger its weight.
weight_for_0, weight_for_1, weight_for_2, weight_for_3, weight_for_4 = [
    (1 / class_count) * total / 5
    for class_count in (n0, n1, n2, n3, n4)
]
print(weight_for_0, weight_for_1, weight_for_2, weight_for_3, weight_for_4)

In [None]:
# Map each class index (0..4) to its weight, in the form model.fit expects.
class_weight = dict(enumerate(
    (weight_for_0, weight_for_1, weight_for_2, weight_for_3, weight_for_4)
))

In [None]:

# Save the model to model.h5 only when the validation loss improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='model.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# Stop after 10 epochs without improvement and roll back to the best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=10,
)

In [None]:
  def make_model1():
    """Build and compile a 5-class classifier on a frozen, ImageNet-pretrained MobileNet.

    The model takes raw images of shape [*IMAGE_SIZE, 3] with pixel values in
    [0, 255] (as produced by decode_image) and applies MobileNet's input
    preprocessing internally.

    Returns:
        A compiled tf.keras.Model with a 5-way softmax output.
    """
    base_model = tf.keras.applications.MobileNet(
        input_shape=(*IMAGE_SIZE, 3), include_top=False, weights="imagenet"
    )

    # Freeze the backbone: only the new classification head is trained.
    base_model.trainable = False

    inputs = tf.keras.layers.Input([*IMAGE_SIZE, 3])

    # The pretrained MobileNet weights expect inputs scaled to [-1, 1]; the
    # input pipeline delivers raw [0, 255] floats, so rescale inside the model.
    x = tf.keras.applications.mobilenet.preprocess_input(inputs)
    # training=False keeps the backbone in inference mode.
    x = base_model(x, training=False)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)

    outputs = tf.keras.layers.Dense(5, activation="softmax")(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    # sparse_categorical_crossentropy is used instead of categorical_crossentropy
    # because the labels are integer class indices, not one-hot vectors.
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])

    return model

In [None]:
# NOTE: the string literal below is a disabled alternative (an EfficientNetB7 backbone);
# nothing inside it is executed. As the cell's last expression, the string itself is the cell output.
'''import efficientnet.efficientnet.tfkeras as efn 
  def make_model():
    base_model = efn.EfficientNetB7(
        input_shape=(*IMAGE_SIZE, 3), include_top=False, weights="imagenet"
    )

    base_model.trainable = False

    inputs = tf.keras.layers.Input([*IMAGE_SIZE, 3])
    
    x = base_model(inputs, training=False)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    
   
    outputs = tf.keras.layers.Dense(5, activation="softmax")(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])
    # used sparse_categorical_crossentropy instead of categorical_crossentropy because output is not one hot vector
    # categorical_accuracy == accuracy for this data

    return model'''

In [None]:
# Build and compile the model inside the strategy scope so it is created
# under the distribution strategy selected in the setup section.
with strategy.scope():
    model = make_model1()


In [None]:
# Train for at most 100 epochs; early_stopping_cb may end training sooner and
# checkpoint writes the best model (by validation loss) to model.h5.
# class_weight applies the per-class weights computed above.
history = model.fit(
    train_dataset,
    epochs=100,
    validation_data=val_dataset,
    callbacks=[early_stopping_cb, checkpoint],
    class_weight=class_weight
)

In [None]:
# Loss and accuracy on the evaluation split, which was not passed to model.fit.
model.evaluate(eval_dataset)

# Make predictions

In [None]:
# Softmax outputs for the test set: one row per image, one column per class (5 classes).
pred = model.predict(test_dataset)
pred

In [None]:
# Index of the highest-scoring class for each test image.
pred_classes = np.argmax(pred, axis=1)

In [None]:
# Preview the expected submission format. The original cell read train.csv
# here, which does not show the submission layout.
sample_df = pd.read_csv('../input/cassava-leaf-disease-classification/sample_submission.csv')
sample_df.head()

In [None]:
# Pair each test image name with its predicted class, in matching order.
submission_df = pd.DataFrame({
    'image_id': ids,
    'label': pred_classes,
})
submission_df.head()


In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv('submission.csv',index=False)

# Tips for improvement
1. Think about imbalanced classes
2. Data augmentation
3. Better model and fine tuning


**Let me know in the comments section if this was helpful ⬆️**