In [2]:
# import distutils
import numpy as np
from pickle import load 
from pickle import dump
from PIL import Image 
import os
import string
import tensorflow as tf 
import pandas as pd

In [10]:
def load_file_path(filename):
    with open(filename, 'r') as file:
        text = file.read()
    return text

def map_image_to_captions(filename):
    file  = load_file_path(filename)
    lines  = file.strip().splitlines()
    image_to_caption = {}
    for line in lines:
        img, caption = line.split(',', 1)
        img = img.strip()
        caption = caption.strip()

        if img not in image_to_caption:
            image_to_caption[img] = [caption]
        else:
            image_to_caption[img].append(caption)
    return image_to_caption

def clean_captions(image_to_caption):
    translation_table = str.maketrans('', '', string.punctuation)

    for img, caps in image_to_caption.items():
        for i in range(len(caps)):
            img_caption = caps[i].replace("-", " ")
            tokens = img_caption.split()
            cleaned_words = [
                word.lower().translate(translation_table) for word in tokens
                if len(word) > 1 and word.isalpha()
            ]
            caps[i] = ' '.join(cleaned_words)
    return image_to_caption


def create_vocab(image_to_caption):
    vocab = set()
    for img, caps in image_to_caption.items():
        for caption in caps:
            vocab.update(caption.split()) 

    return vocab


def save_img_to_captions(image_to_caption, filename_to_save):
    lines = []
    for img, caps in image_to_caption.items():
        for caption in caps:
            lines.append(img + '\t' + caption)

    data = "\n".join(lines) 
    with open(filename_to_save, "w") as file:
        file.write(data)


In [None]:
dataset_text = r'C:\Users\user\Desktop\ML\Image_Caption_Generator\flickr8k\captions.txt'

image_to_caption = map_image_to_captions(dataset_text)
print("length of dictionary = " ,len(image_to_caption))

clean_texts = clean_captions(image_to_caption)

vocabulary = create_vocab(clean_texts)
print('length of vocabulary =', len(vocabulary))

save_img_to_captions(clean_texts, 'img_caption.txt')

length of dictionary =  8092
length of vocabulary = 8405


## Feature Extraction

In [6]:
from tensorflow.keras.applications.xception import Xception #to get pre-trained model Xception
from tensorflow.keras.applications.xception import preprocess_input
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
# from keras.preprocessing.text import Tokenizer #for text tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import add
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense#Keras to build our CNN and LSTM
from tensorflow.keras.layers import LSTM, Embedding, Dropout, Reshape, concatenate, Bidirectional
from tqdm import tqdm

# from tqdm import tqdm_notebook as tqdm #to check loop progress
# tqdm().pandas()

In [5]:
model = Xception(include_top=False, pooling = 'avg')


In [None]:
def extract_features(directory):
    features = {}
    for pic in tqdm(os.listdir(directory)):
        file_path = os.path.join(directory,pic)

        img = Image.open(file_path)
        img = img.resize((299,299))
        img_array = np.expand_dims(np.array(img) / 127.5 - 1.0, axis=0)
        
        features_vector = model.predict(img_array, verbose=0)
        features[pic] = features_vector
    return features

In [None]:
dataset_images = r'C:\Users\user\Desktop\ML\Image_Caption_Generator\flickr8k\Images'
features = extract_features(dataset_images)

with open("features.p", "wb") as file:
    dump(features,file)

In [None]:
with open("features.p", "rb") as file:
    img_features = load(file)


## Tokenizing the Captions

In [3]:
df_tok = pd.read_csv(r'C:\Users\user\Desktop\ML\Image_Caption_Generator\img_caption.txt', delimiter='\t')
print(df_tok.head(6))

                       image                                            caption
0  1000268201_693b08cb0e.jpg  child in pink dress is climbing up set of stai...
1  1000268201_693b08cb0e.jpg                    girl going into wooden building
2  1000268201_693b08cb0e.jpg         little girl climbing into wooden playhouse
3  1000268201_693b08cb0e.jpg   little girl climbing the stairs to her playhouse
4  1000268201_693b08cb0e.jpg  little girl in pink dress going into wooden cabin
5  1001773457_577c3a7d70.jpg             black dog and spotted dog are fighting


In [10]:
captions_on_tok =df_tok['caption'].tolist() 
captions_on_tok = [caption for caption in captions_on_tok if isinstance(caption,str)]
print(captions_on_tok[0:5])

['child in pink dress is climbing up set of stairs in an entry way', 'girl going into wooden building', 'little girl climbing into wooden playhouse', 'little girl climbing the stairs to her playhouse', 'little girl in pink dress going into wooden cabin']


In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions_on_tok)
vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(caption.split()) for caption in captions_on_tok)
tokenizer.texts_to_sequences([captions_on_tok[1]])[0]

[16, 310, 61, 191, 114]

## Train-test split

In [12]:
images = df_tok['image'].unique().tolist()
nimages = len(images)

split_index = round(0.85*nimages)
train_images = images[:split_index]
val_images = images[split_index:]

train = df_tok[df_tok['image'].isin(train_images)]
test = df_tok[df_tok['image'].isin(val_images)]

train.reset_index(inplace=True,drop=True)
test.reset_index(inplace=True,drop=True)

In [13]:
def custom_data_generator(df, X_col, y_col, batch_size, directory, tokenizer, 
                          vocab_size, max_length, features, shuffle=True):
    data = df.copy()
    n = len(data)
    indices = np.arange(n)
    if shuffle:
        np.random.shuffle(indices)

    # Generator loop
    while True:
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            batch_indices = indices[start:end]
            batch = data.iloc[batch_indices]
            
            X1, X2, y = [], [], []

            # Loop through each image in the batch
            for _, row in batch.iterrows():
                image_id = row[X_col]
                feature = features[image_id][0]
                
                # Extract captions and process sequences
                captions = row[y_col] if isinstance(row[y_col], list) else [row[y_col]]
                for caption in captions:
                    seq = tokenizer.texts_to_sequences([caption])[0]
                    
                    for i in range(1, len(seq)):
                        in_seq, out_seq = seq[:i], seq[i]
                        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                        
                        X1.append(feature)
                        X2.append(in_seq)
                        y.append(out_seq)
            
            # Convert lists to numpy arrays for model compatibility
            X1, X2, y = np.array(X1), np.array(X2), np.array(y)
            
            yield (X1, X2), y

        # Shuffle data at the end of each epoch
        if shuffle:
            np.random.shuffle(indices)


In [14]:
input1 = Input(shape=(2048,))
input2 = Input(shape=(max_length,))

img_features = Dense(256, activation='relu')(input1)
img_features_reshaped = Reshape((1, 256), input_shape=(256,))(img_features)

sentence_features = Embedding(vocab_size, 256, mask_zero=False)(input2)
merged = concatenate([img_features_reshaped,sentence_features],axis=1)
sentence_features = LSTM(256)(merged)
x = Dropout(0.5)(sentence_features)
x = add([x, img_features])
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
output = Dense(vocab_size, activation='softmax')(x)

caption_generator_model = Model(inputs=[input1,input2], outputs=output)
caption_generator_model.compile(loss='categorical_crossentropy',optimizer='adam')

  super().__init__(**kwargs)


In [15]:
from tensorflow.keras.utils import plot_model 

In [16]:
plot_model(caption_generator_model)

You must install pydot (`pip install pydot`) for `plot_model` to work.
