# Starting Point

In [1]:
# Install necessary libraries 
!pip install transformers
!pip install torch torchvision
!pip install nltk
!pip install rouge-score
!pip install wer
!pip install gtts


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l- \ done
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=9012c9bfae888ba9c32e695a91b7acd70ae074f6674fe2a91616f24b14f101b3
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Collecting wer
  Downloading wer-0.1.0-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting eulxml (from wer)
  Downloading eulxml-1.1.3-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting ply>=3.8 (from eulxml->wer)
  Downloading ply-3.11-py2.py3-none-any.whl.metadata (844 bytes)
Downloading wer-0.1.0-py2.py3-none-any.whl (4.7 kB)
Downloading eulxml-1.1.3-p

In [None]:
import os
import random
import pickle
import warnings
import numpy as np
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, Bidirectional, BatchNormalization, RepeatVector, Dot, Activation, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Add
import matplotlib.image as mpimg
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from tensorflow.keras.utils import to_categorical, plot_model
from nltk.translate.bleu_score import corpus_bleu


from rouge_score import rouge_scorer

warnings.filterwarnings("ignore", category=FutureWarning)


import nltk

from gtts import gTTS
from IPython.display import Audio, display

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Directories

In [4]:
BASE_DIR = '/kaggle/input/flickr8k'
WORKING_DIR = '/kaggle/working'


# Load and Process Captions

In [5]:
with open(os.path.join(BASE_DIR, 'captions.txt'), 'r') as f:
    next(f)
    captions_doc = f.read()

mapping = {}
for line in tqdm(captions_doc.split('\n')):
    tokens = line.split(',')
    if len(line) < 2:
        continue
    image_id, caption = tokens[0], tokens[1:]
    image_id = image_id.split('.')[0]
    caption = " ".join(caption)
    if image_id not in mapping:
        mapping[image_id] = []
    mapping[image_id].append(caption)


100%|██████████| 40456/40456 [00:00<00:00, 691152.14it/s]


**Demo of og Caption**

In [6]:
with open(os.path.join(BASE_DIR, 'captions.txt'), 'r') as f:
    next(f)  
    for i, line in enumerate(f):
        print(line)  
        if i == 1:  
            break


1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .

1000268201_693b08cb0e.jpg,A girl going into a wooden building .



In [7]:
example_keys = list(mapping.keys())[:2]  
for key in example_keys:
    print(f"Image ID: {key}")
    for caption in mapping[key]:
        print(f"Caption: {caption}")
    print("-" * 50)  


Image ID: 1000268201_693b08cb0e
Caption: A child in a pink dress is climbing up a set of stairs in an entry way .
Caption: A girl going into a wooden building .
Caption: A little girl climbing into a wooden playhouse .
Caption: A little girl climbing the stairs to her playhouse .
Caption: A little girl in a pink dress going into a wooden cabin .
--------------------------------------------------
Image ID: 1001773457_577c3a7d70
Caption: A black dog and a spotted dog are fighting
Caption: A black dog and a tri-colored dog playing with each other on the road .
Caption: A black dog and a white dog with brown spots are staring at each other in the street .
Caption: Two dogs of different breeds looking at each other on the road .
Caption: Two dogs on pavement moving toward each other .
--------------------------------------------------


# Clean the Captions

In [8]:
def clean(mapping):
    for key, captions in mapping.items():
        for i in range(len(captions)):
            caption = captions[i]
            caption = caption.lower()
            caption = caption.replace('[^A-Za-z]', '')
            caption = caption.replace('\s+', ' ')
            caption = 'startseq ' + " ".join([word for word in caption.split() if len(word)>1]) + ' endseq'
            captions[i] = caption

clean(mapping)


In [9]:
example_keys = list(mapping.keys())[:2]  
for key in example_keys:
    print(f"Image ID: {key}")
    for caption in mapping[key]:
        print(f"Cleaned Caption: {caption}")
    print("-" * 50)  


Image ID: 1000268201_693b08cb0e
Cleaned Caption: startseq child in pink dress is climbing up set of stairs in an entry way endseq
Cleaned Caption: startseq girl going into wooden building endseq
Cleaned Caption: startseq little girl climbing into wooden playhouse endseq
Cleaned Caption: startseq little girl climbing the stairs to her playhouse endseq
Cleaned Caption: startseq little girl in pink dress going into wooden cabin endseq
--------------------------------------------------
Image ID: 1001773457_577c3a7d70
Cleaned Caption: startseq black dog and spotted dog are fighting endseq
Cleaned Caption: startseq black dog and tri-colored dog playing with each other on the road endseq
Cleaned Caption: startseq black dog and white dog with brown spots are staring at each other in the street endseq
Cleaned Caption: startseq two dogs of different breeds looking at each other on the road endseq
Cleaned Caption: startseq two dogs on pavement moving toward each other endseq
---------------------

# Tokenize the Captions

In [10]:
all_captions = []
for key in mapping:
    for caption in mapping[key]:
        all_captions.append(caption)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1


In [11]:
print("Before Tokenization:")
for i in range(2):
    print(all_captions[i]) 

print("\nAfter Tokenization:")
for i in range(2):
    sequence = tokenizer.texts_to_sequences([all_captions[i]])[0]  
    print(sequence)  


Before Tokenization:
startseq child in pink dress is climbing up set of stairs in an entry way endseq
startseq girl going into wooden building endseq

After Tokenization:
[1, 42, 3, 90, 172, 6, 119, 50, 393, 11, 394, 3, 27, 5146, 670, 2]
[1, 18, 316, 64, 196, 117, 2]


In [12]:
pickle.dump(tokenizer, open(os.path.join(WORKING_DIR, 'tokenizer.pkl'), 'wb'))


# Define Maximum Caption Length

In [14]:
max_length = max(len(caption.split()) for caption in all_captions)
print("Maximum Caption Length:",(max_length))

Maximum Caption Length: 35
