<h1>Real-Time Video Captioning</h1>

<h3>Importing Libraries and Dependecies</h3>

In [71]:
import string
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from keras.utils import pad_sequences

<h3>Data Extraction</h3>

In [5]:
df = pd.read_csv("captions.txt")

In [6]:
df.head(6)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...
5,1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting


<h3>Data Exploratory Analysis</h3>

In [7]:
df.shape

(40455, 2)

In [8]:
df.columns

Index(['image', 'caption'], dtype='object')

In [9]:
df.dtypes

image      object
caption    object
dtype: object

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   image    40455 non-null  object
 1   caption  40455 non-null  object
dtypes: object(2)
memory usage: 632.2+ KB


In [11]:
df.describe()

Unnamed: 0,image,caption
count,40455,40455
unique,8091,40201
top,1000268201_693b08cb0e.jpg,Two dogs playing in the snow .
freq,5,7


<h3>Data Preprocessing Steps</h3>

<h4>1. Validation and Cleansing</h4>

In [12]:
df.isna().sum()

image      0
caption    0
dtype: int64

In [13]:
df.duplicated().sum()

10

In [14]:
df.drop_duplicates(inplace=True)

In [15]:
df.duplicated().sum()

0

<h4>2. Image Resizing and Reshapping</h4>

In [16]:
for counter, img in enumerate(df['image']):
    image = cv2.imread(f"../dataset/images/{img}")
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    resized_image = cv2.resize(image_rgb, (200, 200))

<h4>3. Caption Normalization</h4>

In [17]:
df['caption'] = df['caption'].str.lower()

In [18]:
df.head(3)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,a child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,a girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,a little girl climbing into a wooden playhouse .


In [20]:
df['caption'] = df['caption'].str.translate(str.maketrans('', '', string.punctuation))

In [22]:
tokenized_caption = [word_tokenize(caption) for caption in df['caption']]

In [28]:
stop_words = stopwords.words('english')

In [31]:
filtered_caption = [[word for word in caption if word not in stop_words] for caption in tokenized_caption]

In [39]:
vocab = {}
for caption_tokens in filtered_caption:
    for token in caption_tokens:
        vocab[token] = vocab.get(token, 0) + 1

In [49]:
vocab_size = len(vocab)

In [50]:
vocab_size

8711

In [55]:
tokenized_tokens = [[vocab.get(word) for word in caption] for caption in filtered_caption]

In [57]:
len(tokenized_tokens)

40445

In [43]:
max_length = np.max([len(caption_tokens) for caption_tokens in filtered_caption])
max_length

21

In [62]:
padded_tokens = pad_sequences(tokenized_tokens, maxlen=max_length, padding="pre")

In [68]:
df = pd.DataFrame(padded_tokens)
pd.get_dummies(df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1545,735,348,502,108,109,1,50
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,3324,149,284,510
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1767,3324,502,284,6
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1767,3324,502,109,6
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1767,3324,735,348,149,284,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40440,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,7264,735,1806,200,751,482
40441,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,7264,751,502,282,1058
40442,0,0,0,0,0,0,0,0,0,0,...,0,1542,2672,1806,502,751,482,306,3,12
40443,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,751,124,2672,1806


In [69]:
padded_tokens.shape

(40445, 21)

In [72]:
ohe = OneHotEncoder()

In [78]:
encoded_df = ohe.fit_transform(df).toarray()

In [79]:
pd.DataFrame(encoded_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4031,4032,4033,4034,4035,4036,4037,4038,4039,4040
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40440,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40441,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40442,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40443,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
