<a href="https://colab.research.google.com/github/simran0794/vit_plant_village/blob/main/plant_village.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup: upgrade TensorFlow and Keras via pip before the import cell runs.
!pip install --upgrade tensorflow
!pip install --upgrade keras

In [None]:
import os

os.environ["KERAS_BACKEND"] = "jax"  # @param ["tensorflow", "jax", "torch"]

import keras
import tensorflow as tf
import tensorflow_datasets as tfds

from keras import layers
from keras import ops
import pickle
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Carve a 90% / 10% train / evaluation partition out of the dataset's single
# 'train' split, and keep the metadata object for label information.
(ds_train, ds_test), ds_info = tfds.load(
    name='plant_village',
    split=['train[:90%]', 'train[90%:]'],
    as_supervised=True,
    with_info=True,
    shuffle_files=True,
)

print(ds_info)


In [None]:
# --- Optimiser / training schedule ---
learning_rate = 1e-3  # previously tried: 1e-4
weight_decay = 1e-4
batch_size = 16  # previously tried: 32
num_epochs = 10

# --- Input geometry ---
image_size = 72  # images are resized to image_size x image_size
patch_size = 8  # side length of each square patch
num_patches = (image_size // patch_size) ** 2  # patches per image

# --- Transformer encoder ---
projection_dim = 128
num_heads = 4  # previously tried: 6
transformer_layers = 4  # previously tried: 8
# Widths of the two dense layers inside every Transformer block's MLP.
transformer_units = [projection_dim * 2, projection_dim]

# --- Classification head ---
# Widths of the dense layers of the final classifier (second was previously 1024).
mlp_head_units = [2048, 512]


In [None]:
# Number of target classes, read from the 'label' feature of the dataset
# metadata (`ds_info`); used later as the width of the final Dense layer.
num_classes = ds_info.features['label'].num_classes
print(num_classes)

38


In [None]:
def preprocess(image, label):
    """Resize one image to the model's square input size and divide it by 255.

    Args:
        image: Image tensor from the dataset.
        label: Label paired with the image; returned unchanged.

    Returns:
        A `(image, label)` tuple with the resized, rescaled image.
    """
    target_size = (image_size, image_size)
    resized = tf.image.resize(image, target_size)
    return resized / 255.0, label

# Apply the preprocessing function and batch the datasets.
# The batch size comes from the `batch_size` hyperparameter (instead of a
# hard-coded 32) so the configured value is the one actually used; batching is
# done here because Keras takes the batch size from the dataset itself.
ds_train = ds_train.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
ds_train = ds_train.batch(batch_size).prefetch(tf.data.AUTOTUNE)


ds_test = ds_test.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
ds_test = ds_test.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Iterate over the dataset and check some samples
for images, labels in ds_train.take(1):
    print(images.shape)  # Expected: (batch_size, image_size, image_size, 3)
    print(labels.numpy())  # Print the labels of the batch

(32, 72, 72, 3)
[35 15 31 20  3  2 19 34 28 31 25 30  6 34 36 37 16 33 29 25 16 32 15 24
 35 37  6 12 15 18 30 15]


In [None]:
# Random augmentation pipeline (flip, slight rotation, zoom); it is applied to
# the model inputs inside `create_vit_classifier`.
data_augmentation = keras.Sequential(
    [
        layers.RandomFlip(mode="horizontal"),
        layers.RandomRotation(0.02),
        layers.RandomZoom(0.2, width_factor=0.2),
    ],
    name="data_augmentation",
)

In [None]:

def mlp(x, hidden_units, dropout_rate):
    """Apply a stack of GELU-activated Dense layers, each followed by Dropout.

    Args:
        x: Input tensor.
        hidden_units: Iterable with the width of each Dense layer, in order.
        dropout_rate: Rate given to the Dropout layer after every Dense layer.

    Returns:
        The tensor produced by the last Dropout layer, or `x` itself when
        `hidden_units` is empty.
    """
    out = x
    for width in hidden_units:
        out = layers.Dense(width, activation=keras.activations.gelu)(out)
        out = layers.Dropout(dropout_rate)(out)
    return out


In [None]:

class Patches(layers.Layer):
    """Layer that cuts a batch of images into flattened square patches.

    Args:
        patch_size: Side length, in pixels, of each square patch.
        **kwargs: Base `Layer` arguments (e.g. `name`, `trainable`, `dtype`).
            They are forwarded to the parent constructor so that the dict
            produced by `get_config()` can be passed back to the constructor.
    """

    def __init__(self, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.patch_size = patch_size

    def call(self, images):
        """Return patches reshaped to (batch, num_patches, patch_size**2 * channels)."""
        input_shape = ops.shape(images)
        batch_size = input_shape[0]
        height = input_shape[1]
        width = input_shape[2]
        channels = input_shape[3]
        # Only whole patches are counted along each spatial axis.
        num_patches_h = height // self.patch_size
        num_patches_w = width // self.patch_size
        patches = ops.image.extract_patches(images, size=self.patch_size)
        # Merge the two patch-grid axes into a single sequence axis.
        patches = ops.reshape(
            patches,
            (
                batch_size,
                num_patches_h * num_patches_w,
                self.patch_size * self.patch_size * channels,
            ),
        )
        return patches

    def get_config(self):
        """Return the base layer config extended with `patch_size`."""
        config = super().get_config()
        config.update({"patch_size": self.patch_size})
        return config

In [None]:

class PatchEncoder(layers.Layer):
    """Linearly project patches and add a learned position embedding.

    Args:
        num_patches: Number of patches per image; also the size of the
            position-embedding table.
        projection_dim: Output width of both the patch projection and the
            position embedding.
        **kwargs: Base `Layer` arguments (e.g. `name`, `trainable`, `dtype`).
            They are forwarded to the parent constructor so that the dict
            produced by `get_config()` can be passed back to the constructor.
    """

    def __init__(self, num_patches, projection_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_patches = num_patches
        # Kept so get_config() can report every required constructor argument.
        self.projection_dim = projection_dim
        self.projection = layers.Dense(units=projection_dim)
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def call(self, patch):
        """Return the projected patches plus the position embeddings."""
        # Shape (1, num_patches): one index per patch position, with a leading
        # axis of size 1 added in front.
        positions = ops.expand_dims(
            ops.arange(start=0, stop=self.num_patches, step=1), axis=0
        )
        projected_patches = self.projection(patch)
        encoded = projected_patches + self.position_embedding(positions)
        return encoded

    def get_config(self):
        """Return the base layer config extended with both constructor arguments."""
        config = super().get_config()
        config.update(
            {
                "num_patches": self.num_patches,
                "projection_dim": self.projection_dim,
            }
        )
        return config


In [None]:
# Summary of the configuration the model will be built with.
# `element_spec` is read directly from the dataset; its first entry describes
# the image batch, so dropping the leading (batch) axis gives the input shape.
print(f"Input Shape: {ds_train.element_spec[0].shape[1:]}")
print(f"Num Classes: {num_classes}")
print(f"Num Patches: {num_patches}")
print(f"Projection Dim: {projection_dim}")
print(f"Transformer Units: {transformer_units}")
print(f"Transformer Layer: {transformer_layers}")
print(f"MLP Head units: {mlp_head_units}")

Input Shape: (72, 72, 3)
Num Classes: 38
Num Pateches: 81
Projection Dim: 128
Transformer Units: [256, 128]
Transformer Layer: 4
MLP Head units: [2048, 512]


In [None]:

def create_vit_classifier():
    """Assemble the Vision Transformer classifier as a functional Keras model.

    The input shape is taken from `ds_train`; all sizes come from the
    module-level hyperparameters. The graph is: augmentation -> patch
    extraction -> patch encoding -> `transformer_layers` pre-norm Transformer
    blocks -> layer norm -> flatten -> MLP head -> `num_classes` logits.

    Returns:
        An uncompiled `keras.Model` whose outputs are unnormalised logits.
    """

    def transformer_block(tokens):
        """One block: self-attention and an MLP, each with a skip connection."""
        normed = layers.LayerNormalization(epsilon=1e-6)(tokens)
        attended = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(normed, normed)
        after_attention = layers.Add()([attended, tokens])
        hidden = layers.LayerNormalization(epsilon=1e-6)(after_attention)
        hidden = mlp(hidden, hidden_units=transformer_units, dropout_rate=0.1)
        return layers.Add()([hidden, after_attention])

    # Per-sample image shape: the dataset's image spec without the batch axis.
    image_shape = ds_train.take(1).element_spec[0].shape[1:]
    inputs = keras.Input(shape=image_shape)

    # Augment, split into patches, then embed the patches with positions.
    augmented = data_augmentation(inputs)
    patches = Patches(patch_size)(augmented)
    encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)

    for _ in range(transformer_layers):
        encoded_patches = transformer_block(encoded_patches)

    # Flatten all patch tokens into one vector per image, i.e. a
    # [batch_size, num_patches * projection_dim] tensor.
    representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
    representation = layers.Flatten()(representation)
    representation = layers.Dropout(0.5)(representation)

    # Classification head ending in one logit per class.
    features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.5)
    logits = layers.Dense(num_classes)(features)
    return keras.Model(inputs=inputs, outputs=logits)


In [None]:

def run_experiment(model):
    """Compile `model`, train it on `ds_train` and validate on `ds_test`.

    The model is compiled with AdamW, sparse categorical cross-entropy on
    logits, and top-1 / top-5 accuracy metrics. The weights of the epoch with
    the best `val_accuracy` are checkpointed and restored into `model` once
    training finishes.

    Args:
        model: Uncompiled Keras model that outputs logits.

    Returns:
        The `History` object returned by `model.fit`.
    """
    optimizer = keras.optimizers.AdamW(
        learning_rate=learning_rate, weight_decay=weight_decay
    )

    model.compile(
        optimizer=optimizer,
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[
            keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
            keras.metrics.SparseTopKCategoricalAccuracy(5, name="top-5-accuracy"),
        ],
    )

    checkpoint_filepath = "/tmp/checkpoint.weights.h5"
    checkpoint_callback = keras.callbacks.ModelCheckpoint(
        checkpoint_filepath,
        monitor="val_accuracy",
        save_best_only=True,
        save_weights_only=True,
    )

    # `batch_size` is intentionally not passed: the inputs are already-batched
    # datasets, and Keras documents that it must not be specified for them.
    history = model.fit(
        ds_train,
        validation_data=ds_test,
        epochs=num_epochs,
        callbacks=[checkpoint_callback],
    )

    # Leave the model holding its best validation weights rather than those of
    # the last epoch. Skipped when no epoch ran, since no checkpoint from this
    # run exists then.
    if history.history.get("val_accuracy"):
        model.load_weights(checkpoint_filepath)

    return history


# Build the model and train it. `history` is kept at module level because
# `plot_history` reads it as a global.
vit_classifier = create_vit_classifier()
history = run_experiment(vit_classifier)


def plot_history(item):
    """Plot the training and validation curves of one metric over the epochs.

    Reads the module-level `history` object, so training must have run first.

    Args:
        item: Key in `history.history` (e.g. "loss"); the matching validation
            series is looked up under "val_" + item.
    """
    for series_name in (item, "val_" + item):
        plt.plot(history.history[series_name], label=series_name)

    plt.xlabel("Epochs")
    plt.ylabel(item)
    plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14)
    plt.legend()
    plt.grid()
    plt.show()


# Plot the loss curves; uncomment the second call to also plot the
# "top-5-accuracy" metric recorded during training.
plot_history("loss")
# plot_history("top-5-accuracy")

NameError: name 'create_vit_classifier' is not defined

Epoch 1/10
1528/1528 ━━━━━━━━━━━━━━━━━━━━ 2770s 2s/step
 - accuracy: 0.2025 - loss: 3.2822 - top-5-accuracy: 0.4376 - val_accuracy: 0.5796 - val_loss: 1.5180 - val_top-5-accuracy: 0.8494

Epoch 2/10
1528/1528 ━━━━━━━━━━━━━━━━━━━━ 2777s 2s/step
 - accuracy: 0.5623 - loss: 1.5297 - top-5-accuracy: 0.8551 - val_accuracy: 0.7656 - val_loss: 0.7798 - val_top-5-accuracy: 0.9560

Epoch 3/10
1528/1528 ━━━━━━━━━━━━━━━━━━━━ 0s 2s/step
- accuracy: 0.7046 - loss: 1.0034 - top-5-accuracy: 0.9339


learning_rate = 0.0001
weight_decay = 0.0001
batch_size = 32
num_epochs = 10  # For real training, use num_epochs=100. 10 is a test value
image_size = 72  # We'll resize input images to this size
patch_size = 8  # Size of the patches to be extract from the input images
num_patches = (image_size // patch_size) ** 2
projection_dim = 128
num_heads = 6
transformer_units = [
    projection_dim * 2,
    projection_dim,
]  # Size of the transformer layers
transformer_layers = 8
mlp_head_units = [
    2048,
    1024,
]  # Size of the dense layers of the final classifier
