In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
import os

# Vision Transformer hyperparameters.
image_size = 128  # side length (pixels) every image is resized to
patch_size = 16  # side length (pixels) of one square patch
# Patches per side, squared: the image is tiled by a square grid of patches.
num_patches = (image_size // patch_size) * (image_size // patch_size)
projection_dim = 64  # width of each encoded patch vector
num_heads = 4  # attention heads per transformer block
# Widths of the two dense layers inside each transformer block's MLP.
transformer_units = [2 * projection_dim, projection_dim]
transformer_layers = 8  # number of stacked transformer blocks
mlp_head_units = [2048, 1024]  # dense layer widths of the classification head

# Define Data Paths
train_image_path = "/kaggle/input/fppd03/train"
test_image_path = '/kaggle/input/fppd03/test'
valid_image_path = "/kaggle/input/fppd03/val"

# Get class names.
# Only sub-directories count as classes, and they are sorted so that
# class_names[i] lines up with the index flow_from_directory assigns to that
# class (it infers classes from the sorted sub-directory names). A bare
# os.listdir() returns entries in arbitrary order and would also count stray
# files, inflating num_classes.
class_names = sorted(
    entry
    for entry in os.listdir(train_image_path)
    if os.path.isdir(os.path.join(train_image_path, entry))
)
num_classes = len(class_names)

# Data Augmentation and Data Generators
from tensorflow.keras.preprocessing.image import ImageDataGenerator

batch_size = 32

# Arguments shared by all three directory iterators: every split is resized to
# the same square shape, batched identically and uses one-hot labels.
_flow_kwargs = {
    "target_size": (image_size, image_size),
    "batch_size": batch_size,
    "class_mode": "categorical",
}

# Training split: pixel values are multiplied by 1/255 and the images are
# randomly zoomed and shifted; pixels exposed by a shift are filled with the
# nearest edge value.
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255,
    zoom_range=0.2,
    width_shift_range=0.2,
    height_shift_range=0.2,
    fill_mode="nearest",
)
train_generator = train_datagen.flow_from_directory(
    train_image_path, shuffle=True, **_flow_kwargs
)

# Test split: rescaling only, read in a fixed (unshuffled) order.
test_datagen = ImageDataGenerator(rescale=1.0 / 255)
test_data_generator = test_datagen.flow_from_directory(
    test_image_path, color_mode="rgb", shuffle=False, **_flow_kwargs
)

# Validation split: rescaling only.
validation_datagen = ImageDataGenerator(rescale=1.0 / 255)
validation_generator = validation_datagen.flow_from_directory(
    valid_image_path, shuffle=True, **_flow_kwargs
)


# Vision Transformer Components
def mlp(x, hidden_units, dropout_rate):
    """Apply a stack of GELU dense layers, each followed by dropout.

    Args:
        x: Input tensor fed to the first dense layer.
        hidden_units: Iterable of layer widths; one Dense + Dropout pair is
            created per entry, in order.
        dropout_rate: Rate passed to every Dropout layer.

    Returns:
        The output of the last Dropout layer, or ``x`` unchanged when
        ``hidden_units`` is empty.
    """
    out = x
    for width in hidden_units:
        dense = layers.Dense(width, activation=tf.nn.gelu)
        hidden = dense(out)
        out = layers.Dropout(dropout_rate)(hidden)
    return out

class Patches(layers.Layer):
    """Layer that cuts a batch of images into non-overlapping square patches.

    Args:
        patch_size: Side length, in pixels, of each square patch. It is also
            used as the stride, so consecutive patches do not overlap.
        **kwargs: Standard Keras layer arguments (e.g. ``name``, ``dtype``)
            forwarded to ``layers.Layer``. Accepting them lets the layer be
            rebuilt from the config returned by ``get_config``.
    """

    def __init__(self, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.patch_size = patch_size

    def call(self, images):
        """Extract patches and flatten each one into a vector.

        Args:
            images: Batch of images accepted by ``tf.image.extract_patches``.

        Returns:
            Tensor reshaped to ``[batch, -1, patch_dim]``, where ``patch_dim``
            is the last dimension produced by ``tf.image.extract_patches``.
        """
        batch_size = tf.shape(images)[0]
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding="VALID",  # incomplete patches at the border are dropped
        )
        # The flattened patch length is read from the static shape so the
        # reshape below keeps a known last dimension.
        patch_dim = patches.shape[-1]
        patches = tf.reshape(patches, [batch_size, -1, patch_dim])
        return patches

    def get_config(self):
        """Return the layer config, including ``patch_size``, for serialization."""
        config = super().get_config()
        config.update({"patch_size": self.patch_size})
        return config

class PatchEncoder(layers.Layer):
    """Layer that linearly projects patches and adds a learned position embedding.

    Args:
        num_patches: Size of the position-embedding table (number of distinct
            positions that can be embedded).
        projection_dim: Output width of both the dense projection and the
            position embedding.
        **kwargs: Standard Keras layer arguments (e.g. ``name``, ``dtype``)
            forwarded to ``layers.Layer``. Accepting them lets the layer be
            rebuilt from the config returned by ``get_config``.
    """

    def __init__(self, num_patches, projection_dim, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor arguments.
        self.num_patches = num_patches
        self.projection_dim = projection_dim
        self.projection = layers.Dense(units=projection_dim)
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def call(self, patch):
        """Return the projected patches plus their position embeddings.

        Args:
            patch: Tensor whose axis 1 indexes the patches of one image.

        Returns:
            ``projection(patch) + position_embedding(positions)``, where
            ``positions`` runs from 0 up to the size of axis 1 of ``patch``.
        """
        # Positions are derived from the input itself; they must stay below
        # num_patches to be valid indices into the embedding table.
        positions = tf.range(start=0, limit=tf.shape(patch)[1], delta=1)
        encoded = self.projection(patch) + self.position_embedding(positions)
        return encoded

    def get_config(self):
        """Return the layer config, including the constructor arguments."""
        config = super().get_config()
        config.update(
            {
                "num_patches": self.num_patches,
                "projection_dim": self.projection_dim,
            }
        )
        return config

# Build the Vision Transformer Model
def create_vit_classifier():
    """Assemble the Vision Transformer classifier as a Keras functional model.

    All sizes come from module-level settings: ``image_size``, ``patch_size``,
    ``num_patches``, ``projection_dim``, ``num_heads``, ``transformer_units``,
    ``transformer_layers``, ``mlp_head_units`` and ``num_classes``.

    Returns:
        An uncompiled ``keras.Model`` that maps an
        ``(image_size, image_size, 3)`` image to ``num_classes`` softmax
        probabilities.
    """
    image_input = layers.Input(shape=(image_size, image_size, 3))
    patch_sequence = Patches(patch_size)(image_input)
    tokens = PatchEncoder(num_patches, projection_dim)(patch_sequence)

    for _block in range(transformer_layers):
        # Attention sub-block: normalise, self-attend, add the residual.
        normed = layers.LayerNormalization()(tokens)
        attended = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim
        )(normed, normed)
        after_attention = layers.Add()([attended, tokens])

        # Feed-forward sub-block: normalise, MLP, add the residual.
        feed_forward = layers.LayerNormalization()(after_attention)
        feed_forward = mlp(
            feed_forward, hidden_units=transformer_units, dropout_rate=0.1
        )
        tokens = layers.Add()([feed_forward, after_attention])

    # Classification head: average over the patch axis, then an MLP.
    pooled = layers.LayerNormalization()(tokens)
    pooled = layers.GlobalAveragePooling1D()(pooled)
    pooled = layers.Dropout(0.5)(pooled)
    head = mlp(pooled, hidden_units=mlp_head_units, dropout_rate=0.5)
    class_probs = layers.Dense(num_classes, activation="softmax")(head)

    return keras.Model(inputs=image_input, outputs=class_probs)

# Compile the ViT model
vit_model = create_vit_classifier()
vit_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    loss=keras.losses.CategoricalCrossentropy(),  # matches the one-hot labels from class_mode='categorical'
    metrics=["accuracy"],
)

# Train the ViT model.
# batch_size is intentionally not passed to fit(): train_generator already
# yields batches (its size is set in flow_from_directory), and Keras documents
# that batch_size must not be specified for generator / Sequence inputs.
history = vit_model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=500,
)

2025-04-27 01:33:47.427315: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745717627.677739      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745717627.753413      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Found 1630 images belonging to 19 classes.
Found 2362 images belonging to 19 classes.
Found 2362 images belonging to 19 classes.


I0000 00:00:1745717641.696144      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1745717641.696844      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


Epoch 1/500


  self._warn_if_super_not_called()
I0000 00:00:1745717676.676740     113 service.cc:148] XLA service 0x783f3c002d70 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1745717676.677928     113 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1745717676.677949     113 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1745717679.902110     113 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 1/51[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m42:44[0m 51s/step - accuracy: 0.0000e+00 - loss: 3.0022

I0000 00:00:1745717695.484180     113 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 833ms/step - accuracy: 0.0962 - loss: 2.8271 - val_accuracy: 0.1283 - val_loss: 2.8165
Epoch 2/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 210ms/step - accuracy: 0.1492 - loss: 2.6987 - val_accuracy: 0.1655 - val_loss: 2.7843
Epoch 3/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 206ms/step - accuracy: 0.1845 - loss: 2.6108 - val_accuracy: 0.1770 - val_loss: 2.7320
Epoch 4/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 204ms/step - accuracy: 0.1905 - loss: 2.5962 - val_accuracy: 0.1685 - val_loss: 2.7136
Epoch 5/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 201ms/step - accuracy: 0.2157 - loss: 2.5715 - val_accuracy: 0.1969 - val_loss: 2.6686
Epoch 6/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 207ms/step - accuracy: 0.2187 - loss: 2.4832 - val_accuracy: 0.1956 - val_loss: 2.6296
Epoch 7/500
[1m51/51[0m [

In [2]:
# Score the trained model on the held-out test generator; evaluate() returns
# the loss followed by the compiled metric (accuracy).
test_metrics = vit_model.evaluate(test_data_generator)
test_loss, test_accuracy = test_metrics
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 205ms/step - accuracy: 0.7340 - loss: 1.9876
Test Accuracy: 74.85%


# 