## Image Captioning - Pretrained Resnet50 with Attention

In [None]:
import os
from pathlib import Path
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zipfile
from PIL import Image
from collections import Counter

#PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

#Vision
import torchvision.transforms as T
import torchvision.models as models

#Torchtext
import torchtext; torchtext.disable_torchtext_deprecation_warning() #supress warning
from torchtext.vocab import vocab
#from torchtext.data.utils import get_tokenizer


Mounting Google Drive and extracting the zip file (only needed when running on Google Colab):

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

# with zipfile.ZipFile('/content/drive/MyDrive/archive.zip') as z_temp:
#     z_temp.extractall()

In [3]:
#Run if needed
# with zipfile.ZipFile('data/archive.zip') as z_temp:
#     z_temp.extractall(path='data/')

Load the captions file (`results.csv`) and display 7 random rows from it:

In [4]:
# Locate the project root. The notebook normally runs from a sub-folder of the
# project, so the root is the parent of the working directory. A later cell
# calls os.chdir(project_root), though, so when this cell is re-run the working
# directory may already BE the root -- in that case do not climb any higher.
working_dir = Path.cwd()
dataset_rel = Path('data') / 'flickr30k_images' / 'results.csv'
project_root = working_dir if (working_dir / dataset_rel).is_file() else working_dir.parent
data_path = project_root / dataset_rel

# The file is pipe-separated and its header names carry surrounding spaces,
# hence the explicit separator and the strip() on the column names.
image_captions = pd.read_csv(data_path, sep='|')
image_captions.columns = image_captions.columns.str.strip()

# Fixed seed so the preview shows the same 7 rows on every run
image_captions.sample(7, random_state=42)

Unnamed: 0,image_name,comment_number,comment
73272,3494034357.jpg,2,Two women are on the tennis court wearing ski...
115277,4755772591.jpg,2,A group takes in an area decorated in asian c...
123072,4876517154.jpg,2,A scruffy man wearing a nun 's habit smokes a...
77786,3589895574.jpg,1,A little girl in a sweatshirt jacket holding ...
99254,4445556418.jpg,4,Children in an African schoolhouse .
106545,4611790729.jpg,0,A group of people standing on a city street i...
2628,116626604.jpg,3,A kid laughing as he is sitting next to a big...


In [5]:
# Dimensions of the captions table as (number of rows, number of columns)
image_captions.shape

(158915, 3)

As we can see, our caption table has the shape (158915, 3), i.e. 158,915 rows and 3 columns:

**image_name:** This column holds the filenames of the images, such as 3756150099.jpg, 4014757090.jpg, etc. Each filename corresponds to a unique image in the dataset.

**comment_number:** This column is an identifier for the caption associated with each image. Since each image in Flickr30k may have multiple captions, comment_number distinguishes between them (e.g., 0, 1, 2, etc.).

**comment:** This column is for the actual caption text describing the content of the image.

In [6]:
# Device selection: use the first CUDA GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

device(type='cpu')

Here we use spaCy's "en_core_web_lg" model as the tokenizer:

In [7]:
# Load the 'en_core_web_lg' spaCy pipeline once; spacy_tokenizer below calls it on every text
nlp_model = spacy.load('en_core_web_lg')

def spacy_tokenizer(text):
    """Split ``text`` into lower-cased lemmas using the module-level ``nlp_model``.

    Tokens flagged as punctuation or whitespace are left out.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the lower-cased lemma of every remaining token, in order.
    """
    lemmas = []
    for tok in nlp_model(text):
        # Skip anything that carries no word content
        if tok.is_punct or tok.is_space:
            continue
        lemmas.append(tok.lemma_.lower())
    return lemmas

# Empty Counter; nothing in this cell fills it (presumably used further down the notebook -- TODO confirm)
counter = Counter()

#Sample for testing
# Sanity check: tokenize a single caption and print the resulting list of lemmas
sample_text = 'An older man in a dark suit is speaking into a microphone at a podium with an " AMBUC " banner .'
print(spacy_tokenizer(sample_text))


['an', 'old', 'man', 'in', 'a', 'dark', 'suit', 'be', 'speak', 'into', 'a', 'microphone', 'at', 'a', 'podium', 'with', 'an', 'ambuc', 'banner']


Using Torchtext to build the vocabulary for our captions:

In [None]:
#set path for saving location
project_root = Path().resolve().parent
os.chdir(project_root)

In [None]:
from src.vocab_builder import build_vocab, load_vocab

# Any caption that is not a string (e.g. a missing value) becomes an empty string
image_captions['comment'] = image_captions['comment'].map(
    lambda caption: caption if isinstance(caption, str) else ''
)

text_lines = list(image_captions['comment'])

# Pick one of the two lines below: build the vocabulary from the captions, or
# use load_vocab() instead (presumably restores a previously saved one).
# NOTE: this rebinds `vocab`, shadowing the `vocab` factory imported from
# torchtext.vocab at the top of the notebook.
vocab = build_vocab(text_lines, min_freq=2)
#vocab = load_vocab()


Vocabulary saved to saved_model/vocab.pth


In [18]:
# Index-to-token list of the vocabulary (assuming `vocab` is a torchtext Vocab)
words = vocab.get_itos()

# Show only a short preview: printing every token floods the cell output
preview = words[:20]
print(f"First {len(preview)} words in our vocab:", preview)

vocab_length = len(vocab)
print(f"Total number of words in vocab: {vocab_length}")

Total number of words in vocab: 9025
