<a href="https://colab.research.google.com/github/shishir-py/Classification-using-Deep-learning/blob/main/image_captioning_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

from PIL import Image

In [None]:
# One shared seed for both random number generators used by the notebook:
# TensorFlow's global seed and NumPy's legacy global generator.
seed = 111
tf.random.set_seed(seed)
np.random.seed(seed)

In [None]:
# Pipeline-wide hyper-parameters and constants.
IMAGE_SIZE = (299, 299)
VOCAB_SIZE = 10000
seq_length = 25
EMBED_DIM = 512
FF_DIM = 512
BATCH_SIZE = 64
EPOCHS = 30
AUTOTUNE = tf.data.AUTOTUNE
image_size = (500, 375)  # The target image size
# The random seed is deliberately not set again here. The seeding cell above
# already seeds NumPy and TensorFlow with `seed = 111`; a second
# `seed = 42` / `tf.random.set_seed(seed)` in this cell re-seeded only
# TensorFlow, leaving the two generators on different seeds and `seed`
# holding a value NumPy was never seeded with.


In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import tensorflow as tf

# Dataset location. The default is the original Google Drive layout; set the
# CAPTION_DATA_DIR environment variable to point the notebook at another copy
# of the dataset without editing the code.
DATA_DIR = os.environ.get(
    "CAPTION_DATA_DIR",
    "/content/drive/MyDrive/Laptop/Image captioning project/archive",
)
captions_file = os.path.join(DATA_DIR, "Copy of translated_nepali_captions.txt")
images_folder = os.path.join(DATA_DIR, "Images")


In [None]:
import os
import numpy as np

def load_captions_data(filename, seq_length=20, images_dir=None):
    """Read a captions file and group the usable captions by image path.

    Every useful line has the form
    ``<image name>#<caption number><whitespace><caption text>``; each image is
    repeated once per caption. Lines without a ``#`` separator (blank lines,
    headers, malformed rows) are skipped instead of raising.

    The caption number is removed before the caption is stored, so it neither
    shows up as a spurious first word (``"<start> 0 ..."``) nor counts towards
    the length filter.

    Args:
        filename: Path of the UTF-8 encoded captions file.
        seq_length: Maximum number of whitespace-separated words a caption may
            have. Captions with fewer than 5 words or more than ``seq_length``
            words are dropped.
        images_dir: Directory the image names are joined onto. When ``None``
            the module-level ``images_folder`` is used.

    Returns:
        A ``(caption_mapping, text_data)`` tuple. ``caption_mapping`` maps each
        image path ending in ``.jpg`` to its list of captions, each wrapped as
        ``"<start> ... <end>"``; ``text_data`` is the flat list of those same
        captions in file order.
    """
    if images_dir is None:
        images_dir = images_folder

    caption_mapping = {}
    text_data = []

    with open(filename, "r", encoding="utf-8") as caption_file:
        for line in caption_file:
            line = line.rstrip("\n")

            # Image name and caption are separated by the first "#".
            if "#" not in line:
                continue
            img_name, caption = line.split("#", maxsplit=1)

            # What follows the "#" starts with the caption number; drop it so
            # only the caption text remains.
            head_and_text = caption.split(maxsplit=1)
            if len(head_and_text) == 2 and head_and_text[0].isdigit():
                caption = head_and_text[1]
            caption = caption.strip()

            # Remove captions that are either too short or too long.
            tokens = caption.split()
            if len(tokens) < 5 or len(tokens) > seq_length:
                continue

            img_path = os.path.join(images_dir, img_name.strip())
            if not img_path.endswith(".jpg"):
                continue

            # Add a start and an end token to each caption.
            caption = "<start> " + caption + " <end>"
            text_data.append(caption)
            caption_mapping.setdefault(img_path, []).append(caption)

    return caption_mapping, text_data


def train_val_split(caption_data, train_size=0.8, shuffle=True):
    """Divide an image -> captions mapping into training and validation parts.

    Args:
        caption_data: Dictionary keyed by image name.
        train_size: Fraction of the images assigned to the training part.
        shuffle: When true, the image order is shuffled in place with
            ``np.random.shuffle`` before the cut is made.

    Returns:
        ``(train_data, valid_data)``: two dictionaries holding the first
        ``int(len(caption_data) * train_size)`` images and the remaining ones.
    """
    image_names = list(caption_data.keys())
    if shuffle:
        np.random.shuffle(image_names)

    cut = int(len(caption_data) * train_size)

    def pick(names):
        # Rebuild a sub-dictionary for the given image names, keeping order.
        return {name: caption_data[name] for name in names}

    return pick(image_names[:cut]), pick(image_names[cut:])




# Parse the captions file into {image path: captions} plus the flat caption list
caption_mapping, text_data = load_captions_data(captions_file)

# Hold out part of the images for validation
train_data, valid_data = train_val_split(caption_mapping)

# Report how many images ended up in each split
for _label, _split in (
    ("Number of training samples:", train_data),
    ("Number of validation samples:", valid_data),
):
    print(_label, len(_split))


Number of training samples: 6472
Number of validation samples: 1619


In [None]:
# Preview a single entry rather than dumping the whole mapping (thousands of
# images with several captions each) into the notebook output.
dict(list(train_data.items())[:1])

{'/content/drive/MyDrive/Laptop/Image captioning project/archive/Images/1423126855_6cd2a3956c.jpg': ['<start> 0 हरियो मैदानमा भेडाहरू लिएर एक पदयात्री पोज। <end>',
  '<start> 1 झोला बोकेको मानिस घाँसको ठूलो मैदानमा हिंड्दैछ। <end>',
  '<start> 2 झोला लगाएको खेतमा एकजना मानिस र उसको पछाडि भेडा र पहाडहरू छन्। <end>',
  '<start> 3 पैदल यात्राको झोला लगाएको मान्छे काठको पोलमा झुकेको छ <end>',
  '<start> 4 कालो शर्ट लगाएको मान्छे घाँसे मैदानमा पोलको छेउमा उभिरहेको छ। <end>'],
 '/content/drive/MyDrive/Laptop/Image captioning project/archive/Images/3606084228_6286a52875.jpg': ['<start> 0 एक साइकल यात्रीले आफ्नो बाइक हाम फालेको बेला पहाडको छेउमा फैलिएको भीडले हेरिरहेको छ। <end>',
  '<start> 1 एक डर्ट बाइकरले प्रतिस्पर्धा गर्दा दर्शकहरूले हेर्छन्। <end>',
  '<start> 2 सेतो ज्याकेट र हेलमेट लगाएको मानिसले आफ्नो बाइकमा चाल चलाउँछ। <end>',
  '<start> 3 एक जना मानिसले फोहोर साइकलमा मिडएयर स्टन्ट प्रदर्शन गर्दैछ जबकि पृष्ठभूमिमा मानिसहरू हेरिरहेका छन्। <end>',
  '<start> 4 माउन्टेन बाइक रेसरले आफ्नो

In [None]:
import os
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from PIL import Image
from tensorflow.keras.layers.experimental import preprocessing
import re
from tensorflow.keras.preprocessing.image import img_to_array, load_img

# Step 1: Tokenization

def tokenize_captions(captions):
    """Fit a Keras ``Tokenizer`` on ``captions`` and return it.

    The ``Tokenizer`` default filter list contains ``<`` and ``>``, which would
    turn the ``<start>`` / ``<end>`` markers added to every caption into the
    plain words ``start`` / ``end``. The filter list used here is the Keras
    default with the two angle brackets removed, so the markers stay intact.

    Args:
        captions: Iterable of caption strings.

    Returns:
        The fitted ``Tokenizer``.
    """
    filters_keeping_markers = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
    tokenizer = Tokenizer(filters=filters_keeping_markers)
    tokenizer.fit_on_texts(captions)
    return tokenizer

# Flatten {image: [captions]} into one list holding every training caption,
# then fit the tokenizer on it.
train_captions = []
for _image_captions in train_data.values():
    train_captions.extend(_image_captions)
tokenizer = tokenize_captions(train_captions)

In [None]:
# Step 2: Define Preprocessing Functions for Text
def custom_standardization(input_string):
    """Lower-case the input and delete every character listed in ``strip_chars``.

    ``strip_chars`` is looked up in module scope when the function runs, so it
    only has to exist by the time the function is first called.
    """
    strip_pattern = "[%s]" % re.escape(strip_chars)
    return tf.strings.regex_replace(tf.strings.lower(input_string), strip_pattern, "")

# Characters removed from captions during standardization. "<" and ">" are
# taken out of the list so the "<start>" / "<end>" markers survive.
# The backslash is spelled "\\": the earlier "\]" was an invalid escape
# sequence (a warning on current Python versions); the resulting string is
# unchanged.
# NOTE(review): the danda "।" that ends many captions is not in this list, so
# a word followed by it becomes a separate vocabulary entry. Confirm whether
# that is intended.
strip_chars = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
strip_chars = strip_chars.replace("<", "")
strip_chars = strip_chars.replace(">", "")

VOCAB_SIZE = 10000
seq_length = 20

# Uses the TextVectorization class imported at the top of the notebook rather
# than the deprecated `layers.experimental.preprocessing` alias.
vectorization = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=seq_length,
    standardize=custom_standardization,
)
vectorization.adapt(text_data)



In [None]:
# Step 3: Image Preprocessing

def preprocess_image(image_path, target_size):
    """Read one JPEG file and prepare it for a ResNet backbone.

    The file is decoded to 3 channels, resized to ``target_size`` and passed
    through ``tf.keras.applications.resnet.preprocess_input``.

    Returns:
        The preprocessed image tensor, or ``None`` (after printing a message)
        when TensorFlow reports the file as not found.
    """
    try:
        raw_bytes = tf.io.read_file(image_path)
        pixels = tf.image.resize(tf.image.decode_jpeg(raw_bytes, channels=3), target_size)
        # Apply the ResNet-specific input preprocessing
        return tf.keras.applications.resnet.preprocess_input(pixels)
    except tf.errors.NotFoundError:
        # A missing file is reported and signalled with None instead of raising
        print("Image file not found:", image_path)
        return None

def load_image_features(image_caption_mapping, target_size):
    """Preprocess every image and stack the results into one tensor.

    Args:
        image_caption_mapping: Either a mapping whose keys are image paths or
            any other iterable of image paths (for example a list).
        target_size: ``(height, width)`` forwarded to ``preprocess_image``.

    Returns:
        A tensor stacking the preprocessed images along a new first axis.
        Images for which ``preprocess_image`` returned ``None`` are left out,
        so the result can have fewer rows than there were paths.

    Raises:
        ValueError: If no image could be loaded at all.
    """
    image_features = []
    # Iterating a dict yields its keys, so plain iteration serves mappings and
    # lists of paths alike (the former `.keys()` call rejected lists).
    for image_path in image_caption_mapping:
        image = preprocess_image(image_path, target_size)
        if image is not None:
            image_features.append(image)

    if not image_features:
        raise ValueError("No images could be loaded from the given paths.")
    return tf.stack(image_features)


# Resolution every image is resized to before it is stacked
target_image_size = (224, 224)
# Training images are loaded first, validation images second
train_image_features, valid_image_features = [
    load_image_features(image_split, target_image_size)
    for image_split in (train_data, valid_data)
]


In [None]:
def extract_patches(images, patch_size, stride):
    """Cut square patches out of ``images`` with ``tf.image.extract_patches``.

    Args:
        images: Image batch passed through as the ``images`` argument.
        patch_size: Side length of each square patch.
        stride: Step between neighbouring patches, the same in both directions.

    Returns:
        The tensor produced by ``tf.image.extract_patches`` with ``'VALID'``
        padding and no dilation.
    """
    window = [1, patch_size, patch_size, 1]
    step = [1, stride, stride, 1]
    return tf.image.extract_patches(
        images=images,
        sizes=window,
        strides=step,
        rates=[1, 1, 1, 1],
        padding='VALID',
    )

def preprocess_patches(patches, target_size, channels=3):
    """Turn the output of ``extract_patches`` into a batch of patch images.

    ``tf.image.extract_patches`` stores each patch flattened in the last axis
    (``patch_rows * patch_cols * channels`` values). The previous code reshaped
    that tensor using ``target_size`` as the patch side and the flattened depth
    as the channel count, which made the following resize a no-op and does not
    match the number of elements in general. Here every patch is first restored
    to ``(side, side, channels)`` and then resized to ``target_size``.

    Args:
        patches: Tensor returned by ``extract_patches``; its last axis must
            have a statically known size.
        target_size: Side length each patch image is resized to.
        channels: Number of image channels packed into the last axis.

    Returns:
        Tensor of shape ``(total_patches, target_size, target_size, channels)``.

    Raises:
        ValueError: If the last axis is not ``side * side * channels`` for an
            integer ``side``.
    """
    depth = patches.shape[-1]
    patch_side = int(round((depth / channels) ** 0.5))
    if patch_side * patch_side * channels != depth:
        raise ValueError(
            "Last axis of size %s is not a square patch with %s channels" % (depth, channels)
        )

    # NOTE(review): one image per patch is produced; with a small stride this
    # is a very large batch. Confirm memory use on the full dataset.
    patch_images = tf.reshape(patches, shape=[-1, patch_side, patch_side, channels])
    return tf.image.resize(patch_images, size=(target_size, target_size))

# ResNet50 backbone shared by every call to compute_patch_embeddings; created
# on first use so the ImageNet weights are loaded only once.
_PATCH_BACKBONE = None


def compute_patch_embeddings(patches, embedding_dim):
    """Embed image patches with a ResNet50 backbone followed by a dense layer.

    Args:
        patches: Batch of images accepted by ``keras.applications.ResNet50``.
        embedding_dim: Output width of the dense projection.

    Returns:
        The flattened backbone features projected to ``embedding_dim``.
    """
    global _PATCH_BACKBONE
    if _PATCH_BACKBONE is None:
        # Building the network was previously repeated on every call.
        _PATCH_BACKBONE = keras.applications.ResNet50(include_top=False, weights='imagenet')

    features = _PATCH_BACKBONE(patches)
    embeddings = layers.Flatten()(features)
    # NOTE(review): this Dense layer is created anew on each call and is not
    # kept anywhere, so its weights are presumably never trained. Confirm.
    embeddings = layers.Dense(embedding_dim)(embeddings)
    return embeddings

def add_positional_embedding(embeddings, num_patches):
    """Add one position vector per index in ``range(num_patches)`` to ``embeddings``.

    The width of the position vectors is taken from the last axis of
    ``embeddings`` rather than from the module-level ``embedding_dim``, so the
    addition stays shape-compatible whatever that global currently holds.

    Args:
        embeddings: Tensor whose last axis is the embedding width and which
            broadcasts against a ``(num_patches, width)`` tensor.
        num_patches: Number of positions to embed.

    Returns:
        ``embeddings`` plus the position embeddings.
    """
    embedding_width = embeddings.shape[-1]
    position_embeddings = layers.Embedding(
        input_dim=num_patches, output_dim=embedding_width
    )(tf.range(num_patches))
    return embeddings + position_embeddings

# Step 1: Image Preprocessing
# Patch geometry, the size patches are resized to, and the embedding width
patch_size, stride = 16, 4
target_size, embedding_dim = 224, 256

# images -> patches -> resized patches -> backbone embeddings -> + positions
image_patches = extract_patches(train_image_features, patch_size, stride)
preprocessed_patches = preprocess_patches(image_patches, target_size)
patch_embeddings = compute_patch_embeddings(preprocessed_patches, embedding_dim)
image_embeddings = add_positional_embedding(
    patch_embeddings, preprocessed_patches.shape[0]
)



In [None]:
# Step 2: Text Preprocessing

def perform_word_embedding(text_data, vocab_size, embedding_dim, sequence_length=None):
    """Vectorize ``text_data`` and look the token ids up in a new embedding layer.

    Args:
        text_data: Strings to adapt the vectorizer on and then embed.
        vocab_size: Vocabulary cap of the vectorizer and number of rows of the
            embedding table.
        embedding_dim: Width of each embedding vector.
        sequence_length: Length every vectorized text is padded or truncated
            to. When ``None`` the module-level ``seq_length`` is used, which
            was the only option before this parameter existed.

    Returns:
        The embedded sequences, one ``embedding_dim``-wide vector per token
        position.
    """
    if sequence_length is None:
        sequence_length = seq_length

    # NOTE(review): a new vectorizer is adapted on whatever text is passed in
    # and a new Embedding layer is created per call, so token ids and vectors
    # are presumably not comparable between calls. Confirm this is intended.
    vectorization = TextVectorization(
        max_tokens=vocab_size,
        output_mode='int',
        output_sequence_length=sequence_length,
        standardize=custom_standardization
    )
    vectorization.adapt(text_data)

    model = keras.Sequential([
        layers.Embedding(vocab_size, embedding_dim),
        # Add more layers as needed
    ])
    embeddings = model(vectorization(text_data))
    return embeddings

# Step 2: Text Preprocessing
# Vocabulary cap, padded caption length and embedding width
vocab_size, seq_length, embedding_dim = 10000, 20, 256

text_embeddings = perform_word_embedding(
    text_data, vocab_size, embedding_dim
)



---



---



In [None]:
# Step 3: Transformer encoder and decoder definitions

# Define the Transformer Encoder
class ImageTransformerEncoder(layers.Layer):
    """A single Transformer encoder block applied to patch embeddings.

    The input is projected to ``embedding_dim``, position embeddings are
    added, and the result goes through self-attention and a feed-forward
    network, each followed by a residual connection and layer normalization.

    Args:
        num_patches: Number of positions the position embedding table holds.
        embedding_dim: Width of the projected embeddings; also used as the
            attention ``key_dim``.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        dropout_rate: Dropout applied to the embedded input.
        **kwargs: Forwarded to ``layers.Layer`` (for example ``name``).
    """

    def __init__(self, num_patches, embedding_dim, num_heads, ff_dim, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate

        self.positional_embedding = layers.Embedding(input_dim=num_patches, output_dim=embedding_dim)
        self.patch_embedding = layers.Dense(embedding_dim)
        self.dropout = layers.Dropout(dropout_rate)
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
        # Keras has no `AddNormalization` layer; "add & norm" is a residual
        # sum (done in `call`) followed by LayerNormalization.
        self.add_norm_1 = layers.LayerNormalization()
        self.feed_forward = keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embedding_dim),
        ])
        self.add_norm_2 = layers.LayerNormalization()

    def call(self, patch_embeddings, training=None):
        """Encode ``patch_embeddings``.

        Axis 1 of the input is used as the position axis and must have a
        statically known size.
        """
        positions = tf.range(patch_embeddings.shape[1])
        positional_embeddings = self.positional_embedding(positions)
        embeddings = self.patch_embedding(patch_embeddings) + positional_embeddings
        embeddings = self.dropout(embeddings, training=training)

        self_attention = self.attention(embeddings, embeddings)
        embeddings = self.add_norm_1(embeddings + self_attention)
        ff_output = self.feed_forward(embeddings)
        encoded_embeddings = self.add_norm_2(embeddings + ff_output)

        return encoded_embeddings

# Define the Transformer Decoder
class TextTransformerDecoder(layers.Layer):
    """A single Transformer decoder block producing vocabulary logits.

    The text input is embedded and position embeddings are added. It then goes
    through causally masked self-attention, cross-attention over the encoder
    output and a feed-forward network, each followed by a residual connection
    and layer normalization. A final dense layer maps to ``vocab_size`` logits.

    Args:
        vocab_size: Size of the word embedding table and of the output logits.
        embedding_dim: Embedding width; also used as the attention ``key_dim``.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        seq_length: Number of positions the position embedding table holds.
        dropout_rate: Dropout applied to the embedded input.
        **kwargs: Forwarded to ``layers.Layer`` (for example ``name``).
    """

    def __init__(self, vocab_size, embedding_dim, num_heads, ff_dim, seq_length, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.seq_length = seq_length
        self.dropout_rate = dropout_rate

        self.positional_embedding = layers.Embedding(input_dim=seq_length, output_dim=embedding_dim)
        self.word_embedding = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.dropout = layers.Dropout(dropout_rate)
        self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
        # Keras has no `AddNormalization` layer; "add & norm" is a residual
        # sum (done in `call`) followed by LayerNormalization.
        self.add_norm_1 = layers.LayerNormalization()
        self.attention_2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
        self.add_norm_2 = layers.LayerNormalization()
        self.feed_forward = keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embedding_dim),
        ])
        self.add_norm_3 = layers.LayerNormalization()
        self.linear = layers.Dense(vocab_size)

    def call(self, encoded_embeddings, text_embeddings, training=None):
        """Return logits over the vocabulary for every text position.

        ``text_embeddings`` is fed to the word ``Embedding`` layer, so despite
        its name it is presumably expected to hold integer token ids of shape
        ``(batch, length)``. Confirm against callers.
        """
        batch_size = tf.shape(text_embeddings)[0]
        length = tf.shape(text_embeddings)[1]

        positions = tf.range(length)
        positional_embeddings = self.positional_embedding(positions)
        embeddings = self.word_embedding(text_embeddings) + positional_embeddings
        embeddings = self.dropout(embeddings, training=training)

        # Lower-triangular mask: position i may attend to positions <= i only,
        # so a token cannot see the tokens it is supposed to predict.
        lower_triangle = tf.cast(
            tf.linalg.band_part(tf.ones((length, length)), -1, 0), tf.bool
        )
        causal_mask = tf.tile(lower_triangle[tf.newaxis, :, :], [batch_size, 1, 1])

        self_attention = self.attention_1(embeddings, embeddings, attention_mask=causal_mask)
        embeddings = self.add_norm_1(embeddings + self_attention)

        cross_attention = self.attention_2(embeddings, encoded_embeddings)
        embeddings = self.add_norm_2(embeddings + cross_attention)

        ff_output = self.feed_forward(embeddings)
        output_embeddings = self.add_norm_3(embeddings + ff_output)

        logits = self.linear(output_embeddings)

        return logits

# Steps 1 and 2 (image patch embeddings and text embeddings) are computed by
# the preprocessing cells above. Their results (`preprocessed_patches`,
# `image_embeddings`, `text_embeddings`, `embedding_dim`, `vocab_size`,
# `seq_length`) are reused here; the copy of those steps that used to sit in
# this cell ran the ResNet50 backbone and the word embedding a second time
# with identical settings.

# Step 3: Define the Transformer Encoder and Decoder
# NOTE(review): `num_patches` is read from axis 1 of `preprocessed_patches`;
# presumably it should be the number of patches per image. Confirm the axis.
num_patches = preprocessed_patches.shape[1]
num_heads = 8
ff_dim = 512
dropout_rate = 0.1

image_encoder = ImageTransformerEncoder(num_patches, embedding_dim, num_heads, ff_dim, dropout_rate)
text_decoder = TextTransformerDecoder(vocab_size, embedding_dim, num_heads, ff_dim, seq_length, dropout_rate)

# Step 4: Encode Images and Decode Text
encoded_image_embeddings = image_encoder(image_embeddings)
decoded_logits = text_decoder(encoded_image_embeddings, text_embeddings)




---



---



In [None]:
# Step 5: Define Loss Function and Metrics
# `from_logits=True` because the decoder's last layer is a Dense layer without
# an activation.
loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = keras.optimizers.Adam()
accuracy_metric = keras.metrics.SparseCategoricalAccuracy()

# Step 6: Define the Training Step
@tf.function
def train_step(image_embeddings, text_embeddings, captions):
    """Run one optimization step on a batch and return its loss.

    The encoder and decoder are run in training mode, the loss between
    ``captions`` and the decoder logits is back-propagated into the trainable
    variables of both layers, and ``accuracy_metric`` is updated with the same
    batch.
    """
    with tf.GradientTape() as tape:
        encoded = image_encoder(image_embeddings, training=True)
        logits = text_decoder(encoded, text_embeddings, training=True)
        loss_value = loss_object(captions, logits)

    # Collected after the forward pass, as before, so variables created while
    # the layers are being built are included.
    variables = image_encoder.trainable_variables + text_decoder.trainable_variables
    gradients = tape.gradient(loss_value, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    accuracy_metric(captions, logits)

    return loss_value

# Step 7: Training Loop
# NOTE(review): `train_features` is not defined anywhere in the visible
# notebook. It is presumably an iterable of
# (image_embeddings, text_embeddings, captions) batches; confirm and build it
# before running this cell.
num_epochs = 10

for epoch in range(num_epochs):
    epoch_loss = 0.0
    num_batches = 0
    accuracy_metric.reset_state()

    # The batch variables have their own names so the loop no longer
    # overwrites the module-level `image_embeddings` / `text_embeddings`.
    for batch, (batch_images, batch_text, batch_captions) in enumerate(train_features):
        loss = train_step(batch_images, batch_text, batch_captions)
        epoch_loss += loss
        num_batches += 1

        if batch % 100 == 0:
            print(f"Epoch {epoch + 1} Batch {batch} Loss {loss.numpy():.4f} Accuracy {accuracy_metric.result().numpy():.4f}")

    # max(..., 1) keeps the summary from failing when there were no batches
    # (the old `batch + 1` raised NameError in that case).
    print(f"Epoch {epoch + 1} Loss {epoch_loss / max(num_batches, 1):.4f} Accuracy {accuracy_metric.result().numpy():.4f}")

# Step 8: Evaluation
def evaluate(image_embeddings, text_embeddings):
    """Run encoder and decoder in inference mode and return the argmax ids.

    Returns:
        For every position, the index of the largest decoder logit along
        axis 2.
    """
    memory = image_encoder(image_embeddings, training=False)
    return tf.argmax(text_decoder(memory, text_embeddings, training=False), axis=2)

# NOTE(review): `valid_captions` is not defined anywhere in the visible
# notebook, and `valid_image_features` is the stack of preprocessed images
# returned by `load_image_features`, whereas the encoder is otherwise fed
# `image_embeddings`. Confirm the intended inputs before running this line.
predicted_captions = evaluate(valid_image_features, valid_captions)

# Step 9: Generate Captions
def generate_captions(image_paths, max_caption_length=None):
    """Produce caption text for the images at ``image_paths``.

    Args:
        image_paths: Iterable of image file paths. Duplicates are collapsed,
            keeping the first occurrence.
        max_caption_length: Number of decoding rounds. When ``None`` the
            module-level ``seq_length`` is used (the name
            ``max_caption_length`` was previously read as an undefined
            global).

    Returns:
        A flat list with the texts predicted in every decoding round, one
        entry per image and round.
    """
    if max_caption_length is None:
        max_caption_length = seq_length

    # A dict keyed by path is accepted by `load_image_features` whether it
    # calls `.keys()` or simply iterates its argument; a bare list is not.
    unique_paths = dict.fromkeys(image_paths)
    image_features = load_image_features(unique_paths, target_image_size)
    # NOTE(review): the raw preprocessed images are passed to the encoder
    # here, while the cell that builds the model passes it `image_embeddings`.
    # Confirm which input the encoder expects.
    encoded_image_embeddings = image_encoder(image_features, training=False)
    text_embeddings = perform_word_embedding(["<start>"] * len(unique_paths), vocab_size, embedding_dim)
    captions = []

    for _ in range(max_caption_length):
        logits = text_decoder(encoded_image_embeddings, text_embeddings, training=False)
        predicted_word_ids = tf.argmax(logits, axis=2)
        # NOTE(review): ids are decoded with the Keras `tokenizer`, which is
        # fitted separately from the TextVectorization vocabulary; presumably
        # the two index spaces differ. Confirm.
        predicted_words = tokenizer.sequences_to_texts(predicted_word_ids.numpy())
        captions.extend(predicted_words)

        text_embeddings = perform_word_embedding(predicted_words, vocab_size, embedding_dim)

    return captions

# NOTE(review): these look like placeholder file names; point them at real
# image files before running.
image_paths = ["image1.jpg", "image2.jpg", "image3.jpg"]
generated_captions = generate_captions(image_paths)

# Step 10: Save the Model
# `image_encoder` and `text_decoder` are `layers.Layer` subclasses, and Keras
# layers (unlike `keras.Model`) have no `.save()` method, so the former
# `.save("....h5")` calls could not work. The weights are written as
# TensorFlow checkpoints instead; restore them with
# `tf.train.Checkpoint(layer=<new instance>).read(<same prefix>)`.
tf.train.Checkpoint(layer=image_encoder).write("image_encoder_ckpt")
tf.train.Checkpoint(layer=text_decoder).write("text_decoder_ckpt")
