<a href="https://colab.research.google.com/github/sriraj0926/Capstone/blob/main/Capstone_Code_IC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow numpy nltk pillow



In [9]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [10]:
import os
import string
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, Add
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from nltk.translate.bleu_score import corpus_bleu
import nltk

nltk.download('punkt')

# Define paths
dataset_dir = '/content/gdrive/MyDrive/Capstone_datasets/flickr8k_unzipped/'  # Change this to the path where the dataset is located
images_dir = os.path.join(dataset_dir, 'Images')
captions_file = os.path.join(dataset_dir, 'captions.txt')

# Load the captions into a dictionary
def load_captions(file):
    captions = {}
    with open(file, 'r') as f:
        for line in f:
            tokens = line.strip().split()
            image_id, image_caption = tokens[0], tokens[1:]
            image_id = image_id.split('.')[0]
            image_caption = ' '.join(image_caption)
            if image_id not in captions:
                captions[image_id] = []
            captions[image_id].append(image_caption)
    return captions

captions = load_captions(captions_file)

# Clean the captions
def clean_captions(captions):
    table = str.maketrans('', '', string.punctuation)
    for key, desc_list in captions.items():
        for i in range(len(desc_list)):
            desc = desc_list[i]
            desc = desc.split()
            desc = [word.lower() for word in desc]
            desc = [w.translate(table) for w in desc]
            desc = [word for word in desc if len(word) > 1]
            desc = [word for word in desc if word.isalpha()]
            desc_list[i] = ' '.join(desc)

clean_captions(captions)

# Convert the captions to a vocabulary
def to_vocabulary(captions):
    all_captions = set()
    for key in captions.keys():
        [all_captions.update(d.split()) for d in captions[key]]
    return all_captions

vocabulary = to_vocabulary(captions)

# Save the captions to a file
def save_captions(captions, filename):
    lines = []
    for key, desc_list in captions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc)
    data = '\n'.join(lines)
    with open(filename, 'w') as f:
        f.write(data)

save_captions(captions, 'captions.txt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [43]:


import os
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, Add

# Load the dataset
def load_set(filename):
    with open(filename, 'r') as f:
        doc = f.read()
    dataset = set()
    for line in doc.split('\n'):
        if len(line) < 1:
            continue
        identifier = line.split('.')[0]
        dataset.add(identifier)
    return dataset

# Load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
    with open(filename, 'r') as f:
        doc = f.read()
    descriptions = {}
    for line in doc.split('\n'):
        tokens = line.split()
        image_id, image_desc = tokens[0], tokens[1:]
        if image_id in dataset:
            if image_id not in descriptions:
                descriptions[image_id] = []
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions[image_id].append(desc)
    return descriptions

# Extract features from each photo in the directory
# Modify data generator to yield separate inputs for image features and text sequences
def data_generator(descriptions, photos, tokenizer, max_length, vocab_size, batch_size):
    while True:
        keys = list(descriptions.keys())
        np.random.shuffle(keys)
        for key in keys:
            if key not in photos:
                continue
            photo = photos[key][0]  # Extract image feature
            for desc in descriptions[key]:
                # Tokenize and pad the text sequence
                seq = tokenizer.texts_to_sequences([desc])[0]
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    # Yield separate inputs for image features and text sequences
                    yield [photo, in_seq], out_seq

# Update model definition to accept separate inputs for image features and text sequences
def define_model(vocab_size, max_length, photo_feature_dim):
    # Image feature input
    inputs1 = Input(shape=(photo_feature_dim,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # Text sequence input
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # Combine image and text features
    decoder1 = Add()([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    # Define model
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Load the dataset
dataset_dir = "/content/gdrive/MyDrive/Capstone_datasets/flickr8k_unzipped/"
images_dir = "/content/gdrive/MyDrive/Capstone_datasets/flickr8k_unzipped/Images"

train = load_set(os.path.join(dataset_dir, 'Flickr_8k.trainImages.txt'))
test = load_set(os.path.join(dataset_dir, 'Flickr_8k.testImages.txt'))

# Load clean descriptions into memory
train_descriptions = load_clean_descriptions('captions.txt', train)

# Extract features from images
train_features = extract_features(images_dir)

# Tokenize the text
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1

# Calculate the maximum sequence length
max_len = max_length(train_descriptions)

# Define the model
photo_feature_dim = 4096  # Dimensionality of image features
model = define_model(vocab_size, max_len, photo_feature_dim)

# Define batch size
batch_size = 64

# Calculate the number of steps per epoch
steps_per_epoch = len(train_descriptions) // batch_size

# Train the model
epochs = 20
generator = data_generator(train_descriptions, train_features, tokenizer, max_len, vocab_size, batch_size)
model.fit(generator, epochs=epochs, steps_per_epoch=steps_per_epoch, verbose=1)
model.save('final_model.h5')


Epoch 1/20


ValueError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1150, in train_step
        y_pred = self(x, training=True)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/input_spec.py", line 253, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'model_36' (type Functional).
    
    Input 0 of layer "dense_30" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (None,)
    
    Call arguments received by layer 'model_36' (type Functional):
      • inputs=('tf.Tensor(shape=(None,), dtype=float32)', 'tf.Tensor(shape=(None,), dtype=int32)')
      • training=True
      • mask=None
