In [1]:
import nltk
import re
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence

In [2]:
df_train = pd.read_csv("/home/rajib/dl_project/custom_captions_dataset/train.csv")
df_test = pd.read_csv("/home/rajib/dl_project/custom_captions_dataset/test.csv")
df_val = pd.read_csv("/home/rajib/dl_project/custom_captions_dataset/val.csv")

In [3]:
df_train.columns

Index(['Unnamed: 0', 'filename', 'caption'], dtype='object')

In [4]:
df_train.drop(['Unnamed: 0'],axis = 1)
df_test.drop(['Unnamed: 0'],axis = 1)
df_val.drop(['Unnamed: 0'],axis = 1)

Unnamed: 0,filename,caption
0,val_1.jpg,A group of men stand in the snow with skis. Th...
1,val_2.jpg,A small short haired brown dog walks down the ...
2,val_3.jpg,There are two elephants standing on the ground...
3,val_4.jpg,A very detailed hotel room. The bedding is blu...
4,val_5.jpg,Beautiful elephants are showed at a festival o...
...,...,...
941,val_942.jpg,A new stainless steel oven sits next to a refr...
942,val_943.jpg,A hard beige and gold color suitcase with a lo...
943,val_944.jpg,There is a boat with a sail on the shore of th...
944,val_945.jpg,Strips of an oily meat is layered over balls o...


In [5]:
df_test['caption'] = df_test['caption'].apply(lambda x :x.lower())
df_train['caption'] = df_train['caption'].apply(lambda x :x.lower())
df_val['caption'] = df_val['caption'].apply(lambda x :x.lower())

In [6]:
import string
exclude = string.punctuation
def remove_punc1(text):
    return text.translate(str.maketrans('', '', exclude))
    

In [7]:
df_test['caption'] = df_test['caption'].apply(remove_punc1)
df_train['caption'] = df_train['caption'].apply(remove_punc1)
df_val['caption'] = df_val['caption'].apply(remove_punc1)

## Vocabulary creation

In [8]:
corpus = df_test['caption'].tolist() + df_train['caption'].tolist() + df_val['caption'].tolist()

In [9]:
def tokenize_sentence(sentence):
    return word_tokenize(sentence)  

In [10]:
tokenized_corpus = [tokenize_sentence(x) for x in corpus]

In [11]:
vocabulary = set()
for tokens in tokenized_corpus:
    vocabulary.update(tokens)

In [12]:
word_to_idx, idx = {}, 4


word_to_idx['<PAD>'] = 0
word_to_idx['<START>'] = 1
word_to_idx['<UNK>'] = 2
word_to_idx['<END>'] = 3

for token in vocabulary:
    if token not in word_to_idx:
        word_to_idx[token] = idx
        idx += 1

In [13]:
len(word_to_idx)

9108

In [14]:
idx_to_word = {value:key for key,value in word_to_idx.items()}

In [15]:
caption_train, image_train = df_train['caption'].tolist(), df_train['filename'].tolist()

In [16]:
caption_test, image_test = df_test['caption'].tolist(), df_test['filename'].tolist()
caption_val, image_val = df_val['caption'].tolist(), df_val['filename'].tolist()

In [17]:
token_train = [tokenize_sentence(x) for x in caption_train ]
token_test = [tokenize_sentence(x) for x in caption_test ]
token_val = [tokenize_sentence(x) for x in caption_val ]

In [18]:
len(token_train[0])

69

In [19]:
max_len_train = max([len(seq) for seq in token_train])
# max_len_test = max([len(seq) for seq in token_test])
# max_len_val = max([len(seq) for seq in token_val])
# print(max_len_train, max_len_test, max_len_val)

In [20]:
padded_train = [seq + ['<PAD>']*(max_len_train - len(seq))for seq in token_train]
padded_test = [seq + ['<PAD>']*(max_len_train - len(seq))for seq in token_test]
padded_val = [seq + ['<PAD>']*(max_len_train - len(seq))for seq in token_val]


In [21]:
len(padded_test[0])

259

## Custom DataSet Creation

In [22]:

transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to 224x224
    transforms.ToTensor(),           # Convert images to PyTorch tensors
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize images
])



class CustomDataset(Dataset):
    
    def __init__(self, image_dir, image_filename_list, captions_list, transform = transform, word_to_idx = None):
        self.image_dir = image_dir
        self.captions_list = captions_list
        self.image_filename_list = image_filename_list
        self.transform = transform
        self.word_to_idx = word_to_idx
        

    def __len__(self):
        return len(self.captions_list)

    def __getitem__(self, idx):
        img_name = os.path.join(self.image_dir, self.image_filename_list[idx])
        image = Image.open(img_name).convert('RGB')
        if self.transform:
            image = self.transform(image)

        caption = [self.word_to_idx['<START>']] + [self.word_to_idx[word] if word in word_to_idx else self.word_to_idx['<UNK>'] for word in self.captions_list[idx]] + [self.word_to_idx['<END>']]
        return image, torch.tensor(caption)

In [23]:
image_dir = "/home/rajib/dl_project/custom_captions_dataset/train"
captions_list = padded_train
image_filename_list = image_train
dataset_train = CustomDataset(image_dir,image_filename_list, captions_list, word_to_idx=word_to_idx)
data_loader_train = DataLoader(dataset_train, batch_size= 16, shuffle= True)


In [24]:
img, caption = next(iter(data_loader_train))
print(img.shape, caption.shape)

torch.Size([16, 3, 224, 224]) torch.Size([16, 261])


In [25]:
image_dir = "/home/rajib/dl_project/custom_captions_dataset/test"
captions_list = padded_test
image_filename_list = image_test
dataset_test = CustomDataset(image_dir,image_filename_list, captions_list, word_to_idx=word_to_idx)
data_loader_test = DataLoader(dataset_test, batch_size= 16, shuffle= True)


image_dir = "/home/rajib/dl_project/custom_captions_dataset/val"
captions_list = padded_val
image_filename_list = image_val
dataset_val = CustomDataset(image_dir,image_filename_list, captions_list, word_to_idx=word_to_idx)
data_loader_val = DataLoader(dataset_val, batch_size= 16, shuffle= True)



## Model Architecture

In [26]:
class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet50(pretrained=True)
        modules = list(resnet.children())[:-1]  # Remove the last layer (fully connected layer)
        self.resnet = nn.Sequential(*modules)
        self.embed = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)
        
    def forward(self, images):
        features = self.resnet(images)
        features = features.view(features.size(0), -1)
        features = self.bn(self.embed(features))
        return features

In [27]:
cnn = EncoderCNN(100)
x = torch.randn([16, 3, 224, 224])
y = cnn(x)
print(y.shape)



torch.Size([16, 100])


In [28]:
print(y.unsqueeze(1).shape)
vocab_size = len(word_to_idx)


torch.Size([16, 1, 100])


In [29]:
embed = nn.Embedding(vocab_size, 100)
print(len(embed))

TypeError: object of type 'Embedding' has no len()

In [30]:
class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=20):
        """Set the hyper-parameters and build the layers."""
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.max_seg_length = max_seq_length
        
    def forward(self, features, captions):
        """Decode image feature vectors and generates captions."""
        embeddings = self.embed(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        packed = pack_padded_sequence(embeddings,lengths = [len(cap)-1 for cap in captions] ,batch_first=True) 
        hiddens, _ = self.lstm(packed)
        outputs = self.linear(hiddens[0])
        return outputs
    
    def sample(self, features, states=None):
        """Generate captions for given image features using greedy search."""
        sampled_ids = []
        inputs = features.unsqueeze(1)
        for i in range(self.max_seg_length):
            hiddens, states = self.lstm(inputs, states)          # hiddens: (batch_size, 1, hidden_size)
            outputs = self.linear(hiddens.squeeze(1))            # outputs:  (batch_size, vocab_size)
            _, predicted = outputs.max(1)                        # predicted: (batch_size)
            sampled_ids.append(predicted)
            inputs = self.embed(predicted)                       # inputs: (batch_size, embed_size)
            inputs = inputs.unsqueeze(1)                         # inputs: (batch_size, 1, embed_size)
        sampled_ids = torch.stack(sampled_ids, 1)                # sampled_ids: (batch_size, max_seq_length)
        return sampled_ids

In [31]:
device = device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [32]:
# Define hyperparameters
embed_size = 256
hidden_size = 512
num_layers = 1
max_seq_length = 256
learning_rate = 0.001
num_epochs = 10
vocab_size = len(word_to_idx)

# Initialize the encoder and decoder
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers, max_seq_length)
encoder = encoder.to(device)
decoder = decoder.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
parameters = list(encoder.parameters()) + list(decoder.parameters())
optimizer = torch.optim.Adam(parameters, lr=learning_rate)

encoder.train()
decoder.train()
# Training loop
for epoch in tqdm(range(num_epochs)):
    total_loss = 0
    for _,(images,captions) in enumerate(data_loader_train):
        # Move inputs to device
        images = images.to(device)
        captions = captions.to(device)
       
        
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        features = encoder(images)
        outputs = decoder(features, captions )
        
        # Calculate loss
        targets = pack_padded_sequence(captions, lengths =[len(cap)-1 for cap in captions], batch_first=True).data
        loss = criterion(outputs, targets)
        
        # Backward pass
        loss.backward()
        
        # Update weights
        optimizer.step()
        
        total_loss += loss.item()
    
    # Print average loss for the epoch
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(data_loader_train)}')


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10], Loss: 1.229399421528065
Epoch [2/10], Loss: 0.9193818374386047
Epoch [3/10], Loss: 0.8343743756829693
Epoch [4/10], Loss: 0.7736146593227067
Epoch [5/10], Loss: 0.7221614698457984
Epoch [6/10], Loss: 0.6765728505463574
Epoch [7/10], Loss: 0.6333060908916942
Epoch [8/10], Loss: 0.5911587264451235
Epoch [9/10], Loss: 0.5525189567378114
Epoch [10/10], Loss: 0.5159301804430658


In [33]:


# Assuming you have a DataLoader named data_loader_val
# Iterate over the validation dataset
with torch.no_grad():  # No need to track gradients during validation
    for images, captions in data_loader_val:
        # Move inputs to device
        images = images.to(device)
        captions = captions.to(device)
        
        # Forward pass
        features = encoder(images)
        sampled_ids = decoder.sample(features)
        sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)
        
        # Convert word_ids to words
        sampled_caption = []
        for x in sampled_ids:
            word = idx_to_word[x]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)
        
        # Print out the image and the generated caption
        print (sentence)
        # image = Image.open(images.cpu())
        # plt.imshow(np.asarray(image))



<START> a man is standing on a tennis court playing tennis he is wearing a white shirt and blue shorts he is holding a racket in his hand and is wearing a white shirt and blue shorts he is holding a white racket in his hand and he is wearing a white shirt and blue jean shorts he is standing in front of a white wall behind the man <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <P

In [60]:
with torch.no_grad():
    for images, captions in data_loader_val:
        index = captions[0].numpy()
        print(captions.shape)

        sampled_caption = []
        for x in index:
            word = idx_to_word[x]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)  
        print(sentence)      

    

torch.Size([16, 261])
<START> a plate of food is sitting on a wooden block there is a glass next to the plate there is a pile of carrots and a pickle on the plate <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PA