In [24]:
import string
import pickle
import numpy as np
from PIL import Image
import os
from pickle import dump, load
import numpy as np
import tqdm
from tensorflow import keras
import tensorflow as tf

from keras.applications.xception import Xception, preprocess_input
from keras.applications import InceptionV3

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import load_img, img_to_array
from keras.layers import Concatenate

from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, GRU

## Getting and performing data cleaning

In [12]:
def load_doc(filename):
    with open(filename, 'r') as file:
        return file.read()

def img_captions(filename):
    with open(filename, 'r') as file:
        lines = file.read().strip().split('\n')
    descriptions = {}
    for line in lines:
        img, caption = line.split('\t')
        img_key = img[:-2]

        if img_key not in descriptions:
            descriptions[img_key] = [caption]
        else:
            descriptions[img_key].append(caption)
    return descriptions

def preprocess_captions(captions):
    table = str.maketrans('', '', string.punctuation)

    for img, caps in captions.items():
        captions[img] = [
            ' '.join(filter(lambda w: w.isalpha() and len(w) > 1, word.translate(table).lower().replace("-", " ").split()))
            for word in caps
        ]
    return captions

def get_vocabulary(descriptions):
    return set(word for key in descriptions for d in descriptions[key] for word in d.split())

def save_descriptions(descriptions, filename):
    lines = []
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc)
    data = "\n".join(lines)
    with open(filename, "w") as file:
        file.write(data)

In [13]:
dataset_images = "data/Flicker8k_Dataset/"
filename = "data/Flickr8k.token.txt"
descriptions = all_img_captions(filename)
print("Length of descriptions =" , len(descriptions))

clean_descriptions = cleaning_text(descriptions)

vocabulary = text_vocabulary(clean_descriptions)
print("Length of vocabulary = ", len(vocabulary))

save_descriptions(clean_descriptions, "data/descriptions.txt")

Length of descriptions = 8092
Length of vocabulary =  8422


## Extracting the feature vector from all images

In [20]:
def extract_features(directory):
        model = InceptionV3(include_top=False, pooling='avg')
        features = {}
        for img in os.listdir(directory):
            img_path = os.path.join(directory, img)
            with Image.open(img_path) as image:
                image = image.resize((299, 299))
                image = np.expand_dims(image, axis=0)
                image = (image / 127.5) - 1.0
                feature = model.predict(image)
                features[img] = feature
        return features
features = extract_features(dataset_images)
dump(features, open("data/features.p","wb"))













































































In [21]:
features = load(open("data/features.p","rb"))

In [22]:
len(features)

8091

## Loading dataset for Training the model

In [25]:
#load the data 
def load_photos(filename):
    return load_doc(filename).split("\n")[:-1]

def load_clean_descriptions(filename, photos):
    """
    Load and clean descriptions from the specified file for the given photos.
    Returns:
    - A dictionary containing image filenames as keys and a list of cleaned descriptions as values.
    """
    file_content = load_doc(filename)
    descriptions = {}
    for line in file_content.split("\n"):
        words = line.split()
        if len(words) < 2:
            continue
        image, image_caption = words[0], words[1:]
        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            desc = '<start> ' + " ".join(image_caption) + ' <end>'
            descriptions[image].append(desc)
    return descriptions


def load_features(photos):
    all_features = pickle.load(open("data/features.p","rb"))
    features = {k:all_features[k] for k in photos}
    return features



filename = "data/Flickr_8k.trainImages.txt"

#train = loading_data(filename)
train_imgs = load_photos(filename)
train_descriptions = load_clean_descriptions("data/descriptions.txt", train_imgs)
train_features = load_features(train_imgs)

## Tokenizing the vocabulary 

In [26]:
train_descriptions

{'1000268201_693b08cb0e.jpg': ['<start> child in pink dress is climbing up set of stairs in an entry way <end>',
  '<start> girl going into wooden building <end>',
  '<start> little girl climbing into wooden playhouse <end>',
  '<start> little girl climbing the stairs to her playhouse <end>',
  '<start> little girl in pink dress going into wooden cabin <end>'],
 '1001773457_577c3a7d70.jpg': ['<start> black dog and spotted dog are fighting <end>',
  '<start> black dog and tri colored dog playing with each other on the road <end>',
  '<start> black dog and white dog with brown spots are staring at each other in the street <end>',
  '<start> two dogs of different breeds looking at each other on the road <end>',
  '<start> two dogs on pavement moving toward each other <end>'],
 '1002674143_1b742ab4b8.jpg': ['<start> little girl covered in paint sits in front of painted rainbow with her hands in bowl <end>',
  '<start> little girl is sitting in front of large painted rainbow <end>',
  '<sta

In [27]:
def dict_to_list(descriptions):
    return [d for key in descriptions for d in descriptions[key]]

def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

tokenizer = create_tokenizer(train_descriptions)
pickle.dump(tokenizer, open('data/tokenizer.p', 'wb'))
vocab_size = len(tokenizer.word_index) + 1
vocab_size

7318

In [29]:
print(len(train_imgs))
print(len(train_descriptions))
print(vocab_size)
print(len(train_features))

6000
6000
7318
6000


In [30]:
#calculate maximum length of descriptions
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)
    
max_length = max_length(descriptions)
max_length

33

##  Create Data generator

In [33]:
def data_generator(descriptions, features, tokenizer, max_length):
    while True:
        for key, description_list in descriptions.items():
            #retrieve photo features
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(tokenizer, max_length, description_list, feature)
            yield ([input_image, input_sequence], output_word)

def create_sequences(tokenizer, max_length, desc_list, feature):
    x1 = [] 
    x2 = []
    y = []
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            in_seq = seq[:i]
            out_seq = seq[i]
            x1.append(feature)
            x2.append(pad_sequences([in_seq], maxlen=max_length)[0])
            y.append(to_categorical([out_seq], num_classes=vocab_size)[0])
    return np.array(x1), np.array(x2), np.array(y)

#You can check the shape of the input and output for your model
[a,b],c = next(data_generator(train_descriptions, features, tokenizer, max_length))
a.shape, b.shape, c.shape
#((47, 2048), (47, 32), (47, 7577))

((47, 2048), (47, 33), (47, 7318))

## Defining the CNN-GRU model

In [34]:
# Features squeezed from 2048 to 256 nodes
input1 = Input(shape=(2048,))
dropout_1 = Dropout(0.4)(input1)
dense_1 = Dense(1024, activation="relu")(dropout_1)
dropout_2 = Dropout(0.4)(dense_1)
dense_2 = Dense(256, activation='relu')(dropout_2)

# GRU(Gated Recurrent Units) model
input2 = Input(shape=(max_length,))
embed_1 = Embedding(vocab_size, 256, mask_zero=True)(input2)
dropout_2 = Dropout(0.4)(embed_1)
gru_layer = GRU(256)(dropout_2)

# Merging both models
concat = Concatenate(axis=-1)([dense_2, gru_layer])
dense_comb_1= Dense(256, activation='relu')(concat)
dropout_final = Dropout(0.4)(dense_comb_1)
outputs = Dense(vocab_size, activation='softmax')(dropout_final)

model = Model(inputs=[input1, input2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')




## TRAINING SECTION

In [35]:
# train our model
print('Dataset: ', len(train_imgs))
print('Descriptions: train=', len(train_descriptions))
print('Photos: train=', len(train_features))
print('Vocabulary Size:', vocab_size)
print('Description Length: ', max_length)


Dataset:  6000
Descriptions: train= 6000
Photos: train= 6000
Vocabulary Size: 7318
Description Length:  33


In [36]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 2048)]               0         []                            
                                                                                                  
 dropout (Dropout)           (None, 2048)                 0         ['input_3[0][0]']             
                                                                                                  
 input_4 (InputLayer)        [(None, 33)]                 0         []                            
                                                                                                  
 dense (Dense)               (None, 1024)                 2098176   ['dropout[0][0]']             
                                                                                              

In [49]:
generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
model.fit_generator(generator, epochs=5, steps_per_epoch=len(train_descriptions), verbose=1)

Epoch 1/5


  model.fit_generator(generator, epochs=5, steps_per_epoch=len(train_descriptions), verbose=1)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x2752ff18610>