<h1>Real-Time Video Captioning</h1>

<h3>Importing Libraries and Dependecies</h3>

In [55]:
import os
import time
import sys
import string
import pickle

import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input, decode_predictions
from keras.applications.xception import Xception

from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add, Concatenate, Flatten, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import pad_sequences

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


<h3>Data Extraction</h3>

In [46]:
df = pd.read_csv("captions.txt", nrows=1200)

In [47]:
df.head(6)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...
5,1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting


<h3>Data Exploratory Analysis</h3>

In [13]:
df.shape

(10000, 2)

In [5]:
df.columns

Index(['image', 'caption'], dtype='object')

In [6]:
df.dtypes

image      object
caption    object
dtype: object

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   image    40455 non-null  object
 1   caption  40455 non-null  object
dtypes: object(2)
memory usage: 632.2+ KB


In [14]:
df.describe()

Unnamed: 0,image,caption
count,10000,10000
unique,2000,9977
top,1000268201_693b08cb0e.jpg,Two dogs play in the grass .
freq,5,3


<h3>Data Preprocessing Steps</h3>

<h4>1. Validation and Cleansing</h4>

In [9]:
df.isna().sum()

image      0
caption    0
dtype: int64

In [8]:
df.duplicated().sum()

10

In [48]:
df.drop_duplicates(inplace=True)

In [10]:
df.duplicated().sum()

0

<h4>2. Image Resizing and Reshapping</h4>

In [49]:
images = []
for counter, img in enumerate(df['image']):
    image = cv2.imread(f"../dataset/images/{img}")
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    resized_image = cv2.resize(image_rgb, (224, 224))
    images.append(resized_image)
    
    if counter % 5000 == 0:
        print(f"Processing {counter} of {df['image'].shape[0]}...\n")

Processing 0 of 1200...



In [50]:
len(images)

1200

In [51]:
np.save("../dataset/images_array.npy", np.array(images))

In [52]:
images = np.load("../dataset/images_array.npy")

In [53]:
images.shape

(1200, 224, 224, 3)

<h4>3. Caption Normalization</h4>

In [22]:
df['caption'] = df['caption'].str.lower()

In [23]:
df.head(3)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,a child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,a girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,a little girl climbing into a wooden playhouse .


In [24]:
df['caption'] = df['caption'].str.translate(str.maketrans('', '', string.punctuation))

In [64]:
# Create tokenizer
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df['caption'])

# Add entries for special tokens
tokenizer.word_index['startseq'] = len(tokenizer.word_index) + 1
tokenizer.word_index['endseq'] = len(tokenizer.word_index) + 1

# Convert sentences to sequences
sequences = tokenizer.texts_to_sequences(df['caption'])

# Add start and end tokens to each sequence
sequences_with_start_end = [[tokenizer.word_index['startseq']] + seq + [tokenizer.word_index['endseq']] for seq in sequences]

# Pad sequences to a consistent length
padded_sequences = pad_sequences(sequences_with_start_end, padding='post')

In [90]:
(sequences_with_start_end)

[[1391,
  2,
  34,
  3,
  2,
  97,
  138,
  6,
  69,
  52,
  2,
  343,
  11,
  545,
  3,
  25,
  736,
  344,
  1392],
 [1391, 2, 15, 286, 42, 2, 132, 122, 1392],
 [1391, 2, 32, 15, 69, 42, 2, 132, 546, 1392],
 [1391, 2, 32, 15, 69, 4, 545, 22, 59, 546, 1392],
 [1391, 2, 32, 15, 3, 2, 97, 138, 286, 42, 2, 132, 737, 1392],
 [1391, 2, 16, 9, 7, 2, 738, 9, 19, 739, 1392],
 [1391, 2, 16, 9, 7, 2, 740, 178, 9, 29, 8, 139, 88, 5, 4, 133, 1392],
 [1391,
  2,
  16,
  9,
  7,
  2,
  17,
  9,
  8,
  21,
  741,
  19,
  547,
  14,
  139,
  88,
  3,
  4,
  65,
  1392],
 [1391, 12, 38, 11, 287, 548, 72, 14, 139, 88, 5, 4, 133, 1392],
 [1391, 12, 38, 5, 420, 421, 549, 139, 88, 1392],
 [1391,
  2,
  32,
  15,
  101,
  3,
  742,
  63,
  3,
  41,
  11,
  2,
  241,
  345,
  8,
  59,
  198,
  3,
  2,
  743,
  1392],
 [1391, 2, 32, 15, 6, 35, 3, 41, 11, 2, 64, 241, 345, 1392],
 [1391,
  2,
  46,
  15,
  3,
  4,
  45,
  89,
  8,
  744,
  3,
  41,
  11,
  2,
  17,
  550,
  8,
  2,
  345,
  5,
  109,
  1392],


In [126]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

1393

In [95]:
max_length = np.max([len(seq) for seq in padded_sequences])
max_length

31

In [128]:
# Function to load GloVe embeddings into a dictionary
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Provide the path to your downloaded GloVe file
glove_file_path = '../dataset/embed/glove.6B.100d.txt'  # Change the path and dimensionality accordingly

# Load GloVe embeddings
glove_embeddings = load_glove_embeddings(glove_file_path)

In [129]:
# Create an embedding matrix for your vocabulary
embedding_dim = 100  # Change the dimensionality based on your GloVe model

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        embedding_vector = glove_embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [130]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.27085999,  0.044006  , -0.02026   , ..., -0.4923    ,
         0.63687003,  0.23642001],
       ...,
       [-0.022675  ,  0.34329   ,  0.18151   , ...,  0.14631   ,
         0.65473002,  0.0028126 ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [131]:
embedding_matrix.shape

(1393, 100)

# Model Development

In [96]:
# img_model = Xception( include_top=False, pooling='avg' )

img_model = VGG16()
img_model = Model(inputs=img_model.inputs, outputs=img_model.layers[-2].output)

In [97]:
img_model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0   

In [54]:
img_features = img_model.predict(images)



In [82]:
# np.save("../dataset/img_features.npy", img_features)

In [83]:
img_features = np.load("../dataset/img_features.npy")

In [84]:
img_features.shape

(1200, 4096)

In [85]:
img_model.output.shape[1]

4096

In [132]:
image_input = Input(shape=img_model.output.shape[1],)
dropout_image = Dropout(rate=0.45) (image_input)
dense_image = Dense(256, activation="relu")(dropout_image)

caption_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length,
                           weights=[embedding_matrix], trainable=False)(caption_input)
dropout_caption = Dropout(rate=0.45) (embedding_layer)
lstm = LSTM(256, return_sequences=True)(dropout_caption)


merged_layer = add([dense_image, lstm])
decoder = Dense(256, activation='relu')(merged_layer)
output = Dense(vocab_size, activation='softmax')(lstm)

# Create the caption generation model
caption_model = Model(inputs=[image_input, caption_input], outputs=output)

caption_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

caption_model.summary()


Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_15 (InputLayer)       [(None, 31)]                 0         []                            
                                                                                                  
 embedding_5 (Embedding)     (None, 31, 100)              139300    ['input_15[0][0]']            
                                                                                                  
 dropout_7 (Dropout)         (None, 31, 100)              0         ['embedding_5[0][0]']         
                                                                                                  
 lstm_4 (LSTM)               (None, 31, 256)              365568    ['dropout_7[0][0]']           
                                                                                            

In [124]:
target = to_categorical(padded_sequences)
target.shape

(1200, 31, 1393)

In [133]:
caption_model.fit([img_features, padded_sequences], target, epochs=10, batch_size=64, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1d2b6040e10>

In [135]:
# with open("../dataset/caption_model.pkl", "wb") as model_file:
#     pickle.dump(caption_model, model_file)

In [136]:
model = None
with open("../dataset/caption_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)
    print(model.summary())

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_15 (InputLayer)       [(None, 31)]                 0         []                            
                                                                                                  
 embedding_5 (Embedding)     (None, 31, 100)              139300    ['input_15[0][0]']            
                                                                                                  
 dropout_7 (Dropout)         (None, 31, 100)              0         ['embedding_5[0][0]']         
                                                                                                  
 lstm_4 (LSTM)               (None, 31, 256)              365568    ['dropout_7[0][0]']           
                                                                                            

In [145]:
sample_img = cv2.imread("../dataset/sample_img/sample1.jpg")
img_rgb = cv2.cvtColor(sample_img, cv2.COLOR_BGR2RGB)
resized_img = cv2.resize(img_rgb, (224, 224))
final_img = np.expand_dims(resized_img, 0)

img_feat = img_model.predict(final_img)



In [147]:
img_feat.shape

(1, 4096)

In [186]:
caption_length = 50
fin_text = "startseq"

def convert_index_to_word(idx, tokenizer):
    for w, i in tokenizer.word_index.items():
         if i == idx:
                return w
    return None

pred = None
for idx in range(caption_length):
    seq = tokenizer.texts_to_sequences([fin_text])[0]
    pad_seq = pad_sequences([seq], maxlen=max_length)
    pred = caption_model.predict([img_feat, pad_seq], verbose=0)[0]
    p = np.argmax(pred[-1])
    pred_word = convert_index_to_word(p + 1, tokenizer)
    print(pred_word)
    fin_text = fin_text + ' ' + pred_word
    print(fin_text)

<OOV>
startseq <OOV>
<OOV>
startseq <OOV> <OOV>
<OOV>
startseq <OOV> <OOV> <OOV>
<OOV>
startseq <OOV> <OOV> <OOV> <OOV>
<OOV>
startseq <OOV> <OOV> <OOV> <OOV> <OOV>
<OOV>
startseq <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>
<OOV>
startseq <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>
<OOV>
startseq <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>
<OOV>
startseq <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>
<OOV>
startseq <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>
<OOV>
startseq <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>
<OOV>
startseq <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>
<OOV>
startseq <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>
<OOV>
startseq <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>
<OOV>
startseq <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>
<OOV>
startseq <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV