# Image Captioning Final Project

## Names: Kavya Angara, Saurabh Arora, Parthiv Borgohain, Rudraksh Garg, Pratik Gawli

### TPU Setup Attempt

In [None]:
# !pip install cloud-tpu-client==0.10 torch==1.12.0 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.12-cp37-cp37m-linux_x86_64.whl

# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

# import torch_xla
# import torch_xla.core.xla_model as xm

# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev

# import os 
# os.environ['LD_LIBRARY_PATH']='/kaggle/working'

# !echo $LD_LIBRARY_PATH
# !sudo ln -s /usr/local/lib/libmkl_intel_lp64.so /usr/local/lib/libmkl_intel_lp64.so.1
# !sudo ln -s /usr/local/lib/libmkl_intel_thread.so /usr/local/lib/libmkl_intel_thread.so.1
# !sudo ln -s /usr/local/lib/libmkl_core.so /usr/local/lib/libmkl_core.so.1

# !ldconfig
# !ldd /usr/local/lib/python3.7/dist-packages/torch/lib/libtorch.so

# import torch_xla
# import torch_xla.core.xla_model as xm

### Imports

In [None]:
# Data Libraries
import numpy as np
import pandas as pd

# Accessing files
import os

# Progress display
from tqdm import tqdm

# Sampling data
import random

# Plots
import matplotlib.pyplot as plt

# Vocab Counter
from collections import Counter

# Display and process images
from PIL import Image

# Deep learning libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# BLEU score calculations
from nltk.translate.bleu_score import sentence_bleu

### Google Colab connect to Google Drive files

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

## Set up image folders and caption file locations

In [None]:
# Kaggle input locations of the two caption datasets used in this notebook.
# The image directories keep their trailing slash because image file names are
# appended to them with plain string concatenation when the captions are loaded.
FLICKR_IMAGES_DIR = "/kaggle/input/flickr8k/Images/"
# Caption table for the Flickr8k images
FLICKR_LABEL_PATH = "/kaggle/input/flickr8k/captions.txt"
CUSTOM_IMAGES_DIR = "/kaggle/input/custom-images/Images/"
# Caption table for the custom images (read with explicit column names)
CUSTOM_LABEL_PATH = "/kaggle/input/merged-data/custom_csv_all.csv"


### Data Collection

In [None]:
# Load the custom-built crime captions. Column names are supplied explicitly,
# so the first row of the CSV is read as data rather than as a header.
custom_df = pd.read_csv(CUSTOM_LABEL_PATH, names=["image", "caption"])
# Lower-case every caption and expand each file name into a full image path
custom_df = custom_df.assign(
    caption=custom_df["caption"].str.lower(),
    image=CUSTOM_IMAGES_DIR + custom_df["image"],
)
# Drop rows whose image file is missing on disk
has_image_file = custom_df["image"].astype(str).map(os.path.exists).astype(bool)
custom_df = custom_df.loc[has_image_file, ["image", "caption"]]
# Drop captions made of a single word (or no words at all)
word_counts = custom_df["caption"].str.split().str.len()
custom_df = custom_df[word_counts > 1]
print(custom_df.shape)

In [None]:
# Read flickr dataset
flickr_df = pd.read_csv(FLICKR_LABEL_PATH)
# Normalise captions: lower-case, strip punctuation, trim surrounding whitespace.
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern as
# a literal string by default, which would silently leave all punctuation in
# place. The raw string avoids invalid-escape warnings for `\w` and `\s`.
flickr_df["caption"] = (
    flickr_df["caption"]
    .str.lower()
    .str.replace(r"[^\w\s]", "", regex=True)
    .str.strip()
)
# Get full paths to the images for each caption
flickr_df["image"] = FLICKR_IMAGES_DIR + flickr_df["image"]
# Only keep captions that we have image files for
flickr_has_file = flickr_df["image"].astype(str).map(os.path.exists).astype(bool)
flickr_df = flickr_df.loc[flickr_has_file, ["image", "caption"]]
# Keep captions that have more than 1 word only
flickr_df = flickr_df[flickr_df["caption"].str.split().str.len() > 1]
print(flickr_df.shape)

In [None]:
# Combine the crime dataset and the flickr dataset.
# `ignore_index=True` gives the combined frame a fresh 0..n-1 index; without it
# the row labels of the two source frames overlap and label-based lookups on
# `df` become ambiguous.
df = pd.concat([flickr_df, custom_df], ignore_index=True)

In [None]:
# Pull one random caption to spot-check the cleaning steps above;
# `.item()` unwraps the single sampled value so the cell shows the plain string
df["caption"].sample(n=1).item()

### Train-Validation-Test Split

In [None]:
# Split into train / test at the image level (80% of the unique images go to
# train), so that all captions of one image always land in the same split.
# A dedicated seeded generator makes the split reproducible between runs
# without touching the global `random` state.
TEST_SPLIT_SEED = 42
all_images = list(df["image"].unique())
train_images = random.Random(TEST_SPLIT_SEED).sample(all_images, int(len(all_images) * .8))
in_train = df["image"].isin(train_images)
train = df[in_train]
test = df[~in_train]

In [None]:
# Split the remaining training images into train / validation (80% / 20% of
# the unique images), again keeping every caption of an image in one split.
# A dedicated seeded generator makes the split reproducible between runs
# without touching the global `random` state.
VAL_SPLIT_SEED = 43
remaining_images = list(train["image"].unique())
train_images = random.Random(VAL_SPLIT_SEED).sample(remaining_images, int(len(remaining_images) * .8))
keep_for_training = train["image"].isin(train_images)
val = train[~keep_for_training]
train = train[keep_for_training]

In [None]:
# Report (rows, columns) of the train, validation and test splits, one per line
print(train.shape, val.shape, test.shape, sep="\n")

In [None]:
# Create a custom dataset that holds images and their captions
class CustomDataset(torch.utils.data.Dataset):
    """Image/caption dataset that encodes captions as fixed-length id lists.

    On construction the raw caption strings are replaced by token-id lists of
    exactly ``max_length`` entries: ``START``, the caption words (truncated if
    necessary), ``END``, then ``PAD`` up to the fixed length.

    Args:
        images: Sequence of image file paths, aligned with ``captions``.
        captions: Sequence of caption strings (whitespace-separated words).
        transforms: Callable applied to each PIL image in ``__getitem__``.
        min_threshold: Minimum number of occurrences (over all captions) for a
            word to receive its own id; rarer words map to ``UNKNOWN``.
        max_length: Length of every encoded caption, including START and END.
        word_to_idx: Optional existing vocabulary. When given, no vocabulary is
            built and the captions are encoded with this mapping instead, so
            that e.g. a validation set shares token ids with the training set.
            It must contain the ``PAD``, ``START``, ``UNKNOWN`` and ``END`` keys.

    Raises:
        ValueError: If ``max_length`` is smaller than 2 or if ``images`` and
            ``captions`` differ in length.
    """

    # Names of the reserved tokens; they never get a second id as regular words
    _SPECIAL_TOKENS = ("PAD", "START", "UNKNOWN", "END")

    def __init__(self, images, captions, transforms, min_threshold = 5, max_length = 30, word_to_idx = None):
        if max_length < 2:
            raise ValueError("max_length must be at least 2 to hold the START and END tokens")
        if len(images) != len(captions):
            raise ValueError("images and captions must have the same length")

        self.images = images
        self.captions = captions
        self.transform = transforms

        self.min_threshold = min_threshold
        self.max_length = max_length

        self.word_to_idx = None
        self.idx_to_word = None
        self.vocab_size = None

        if word_to_idx is None:
            self._build_vocab()
        else:
            # Copy so later changes to the caller's dict cannot alter this dataset
            self.word_to_idx = dict(word_to_idx)
            self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}
            self.vocab_size = len(self.word_to_idx)
        self._tokenize_captions()

    def __getitem__(self, index):
        """Return ``(transformed_image, caption_ids_tensor)`` for one sample."""
        # Decode inside a context manager so the file handle is always released
        with Image.open(self.images[index]) as raw_image:
            image = raw_image.convert("RGB")
        # Transform the image
        image = self.transform(image)

        # Convert tokenized caption to tensor
        token = torch.as_tensor(self.captions[index])

        return image, token

    def __len__(self):
        """Return the number of (image, caption) pairs."""
        return len(self.captions)

    def _build_vocab(self):
        """Build ``word_to_idx`` / ``idx_to_word`` from the raw captions.

        Ids: ``PAD`` = 0, ``START`` = 1, then every word that occurs at least
        ``min_threshold`` times (in order of first appearance), then
        ``UNKNOWN`` and ``END`` as the two highest ids.
        """
        # Frequency of each word over all captions. A Counter keeps keys in
        # first-seen order, which is what fixes the id assignment order below.
        word_counts = Counter(word for sentence in self.captions for word in sentence.split())

        vocab = {"PAD": 0, "START": 1}
        for word, count in word_counts.items():
            # Skipping reserved names keeps the id range contiguous even if a
            # caption happens to contain a word spelled like a special token
            if count >= self.min_threshold and word not in self._SPECIAL_TOKENS:
                vocab[word] = len(vocab)

        next_idx = len(vocab)
        vocab["UNKNOWN"] = next_idx
        vocab["END"] = next_idx + 1

        self.word_to_idx = vocab
        # Create inverse lookup table
        self.idx_to_word = {idx: word for word, idx in vocab.items()}
        self.vocab_size = len(vocab)

    def _tokenize_captions(self):
        """Replace ``self.captions`` (strings) with fixed-length id lists."""
        start_idx = self.word_to_idx["START"]
        end_idx = self.word_to_idx["END"]
        pad_idx = self.word_to_idx["PAD"]
        unknown_idx = self.word_to_idx["UNKNOWN"]
        # Two slots are reserved for START and END, the rest can hold words
        max_words = self.max_length - 2

        all_tokens = []
        for caption in self.captions:
            words = caption.split()[:max_words]
            tokens = [start_idx]
            # Out-of-vocabulary words are encoded as UNKNOWN
            tokens.extend(self.word_to_idx.get(word, unknown_idx) for word in words)
            tokens.append(end_idx)
            # Right-pad so every caption has exactly max_length ids
            tokens.extend([pad_idx] * (self.max_length - len(tokens)))
            all_tokens.append(tokens)

        self.captions = all_tokens

        

In [None]:
# Preprocessing / augmentation applied to every image: resize, random 224x224
# crop, random horizontal flip, tensor conversion and per-channel normalisation
image_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406),
                         (0.229, 0.224, 0.225))])

# Create datasets for train and validation
train_dataset = CustomDataset(list(train["image"]), list(train["caption"]), image_transforms)
val_dataset = CustomDataset(list(val["image"]), list(val["caption"]), image_transforms)

# The validation captions must be encoded with the TRAINING vocabulary. The
# constructor builds a separate vocabulary from the validation captions, whose
# token ids do not line up with the ids the model is trained on, which would make
# the validation loss meaningless. Re-encode the raw validation captions with
# the training mapping instead.
val_dataset.word_to_idx = train_dataset.word_to_idx
val_dataset.idx_to_word = train_dataset.idx_to_word
val_dataset.vocab_size = train_dataset.vocab_size
val_dataset.captions = list(val["caption"])
val_dataset._tokenize_captions()


In [None]:
# Show the number of entries in the training vocabulary (special tokens included)
train_dataset.vocab_size

In [None]:
# Training hyperparameters
# Fixed length (in tokens) of every encoded training caption
max_length = train_dataset.max_length
# Number of distinct token ids; passed to the LSTM as its vocabulary size
vocab_size = train_dataset.vocab_size
# Samples per batch for the train and validation dataloaders
batch_size = 32
# Number of passes over the training set
epochs = 10


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU;
# models and batches are moved to this device
device = "cuda" if torch.cuda.is_available() else "cpu"
device

### Create Models

In [None]:
# Create a CNN model that will define what is in the image
class CNN(nn.Module):
    """Image encoder: pretrained ResNet-50 followed by a small projection head.

    The 1000 ResNet-50 output scores pass through dropout and then two
    Linear + ReLU layers that reduce them to ``embedding_size`` features,
    which the caption LSTM consumes.

    Args:
        dropout: Dropout probability applied to the ResNet-50 output.
        embedding_size: Size of the feature vector returned by ``forward``.
    """

    def __init__(self, dropout=0.5, embedding_size = 256):
        super().__init__()
        intermediate_size = embedding_size * 2

        # Backbone: ResNet-50 with pretrained weights
        self.cnn = torchvision.models.resnet50(pretrained=True)
        # Regularisation between backbone and projection head
        self.dropout = nn.Dropout(dropout)
        # Projection head: 1000 -> 2 * embedding_size -> embedding_size
        head_layers = [
            nn.Linear(1000, intermediate_size),
            nn.ReLU(),
            nn.Linear(intermediate_size, embedding_size),
            nn.ReLU(),
        ]
        self.linears = nn.Sequential(*head_layers)

    def forward(self, img):
        """Return the ``embedding_size``-dimensional features of an image batch."""
        class_scores = self.cnn(img)
        regularised_scores = self.dropout(class_scores)
        return self.linears(regularised_scores)

# Create an LSTM model that will generate text to describe image
class LSTM(nn.Module):
    """Caption decoder: embedding -> LSTM -> MLP classifier over the vocabulary.

    The image features are used as both the initial hidden state and the
    initial cell state of the LSTM, so their last dimension must equal
    ``hidden_size``.

    Args:
        vocab_size: Number of token ids (size of the output layer).
        word_to_idx: Mapping from word to token id; must contain ``START`` and
            ``END``.
        idx_to_word: Inverse mapping, used to turn predicted ids into words.
        embedding_size: Size of the word embeddings (the LSTM input size).
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM is bidirectional.
        dropout: Dropout probability applied to the caption embeddings.
    """

    def __init__(self, vocab_size, word_to_idx, idx_to_word, embedding_size=256, hidden_size = 256, num_layers = 1, bidirectional = False, dropout=0.5):
        super().__init__()

        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word

        self.num_layers = num_layers
        # Embedding layer for looking up vocab and their embedding (id 0 is padding)
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size, padding_idx=0)
        # Main model
        self.lstm = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
        # The classifier consumes the LSTM output, whose width is hidden_size
        # per direction; this equals embedding_size only for the default sizes
        lstm_output_size = hidden_size * (2 if bidirectional else 1)
        # Predict next word
        self.classifier = nn.Sequential(
            nn.Linear(lstm_output_size, embedding_size*2),
            nn.ReLU(),
            nn.Linear(embedding_size*2, embedding_size*4),
            nn.ReLU(),
            nn.Linear(embedding_size*4, embedding_size*8),
            nn.ReLU(),
            nn.Linear(embedding_size*8, vocab_size)
        )
        # Reduce overfitting
        self.dropout = nn.Dropout(dropout)
        # Convert to probabilities (kept as an attribute; argmax decoding in
        # `predict` does not need it because softmax preserves the ordering)
        self.softmax = nn.Softmax(dim=-1)

    def _initial_state(self, features):
        """Repeat the image features into the (h0, c0) pair the LSTM expects."""
        num_directions = 2 if self.lstm.bidirectional else 1
        # nn.LSTM wants states of shape (num_layers * num_directions, batch, hidden)
        state = torch.stack([features] * (self.num_layers * num_directions), dim=0)
        return state, state

    def forward(self, features, caption):
        """Return next-token scores for every position of ``caption``.

        Args:
            features: Image features, one row per batch element.
            caption: Batch of token-id sequences (batch first).

        Returns:
            Tensor of unnormalised scores with the vocabulary as last dimension.
        """
        # Get the embedding of the caption
        embedded = self.dropout(self.embedding(caption))
        # Run caption through the LSTM, starting from the image features
        lstm_out, _ = self.lstm(embedded, self._initial_state(features))
        # Return vector of next most likely word values
        return self.classifier(lstm_out)

    @torch.no_grad()
    def predict(self, features, max_length=30):
        """Greedily decode a caption for a single image.

        Args:
            features: Image features for exactly one image (batch size 1).
            max_length: Maximum number of tokens to generate.

        Returns:
            The predicted words joined by spaces. The ``END`` token is included
            as the last word when decoding stopped because it was produced.
        """
        hidden = self._initial_state(features)
        end_idx = self.word_to_idx["END"]
        # Start with start token
        token = self.word_to_idx["START"]
        output = []
        # Until we produce the end token or reach the maximum length
        while token != end_idx and len(output) < max_length:
            # Create the input on the same device as the features instead of
            # relying on a module-level `device` variable
            step_input = torch.tensor([[token]], device=features.device)
            step_out, hidden = self.lstm(self.embedding(step_input), hidden)
            # Most likely next word
            token = torch.argmax(self.classifier(step_out), dim=-1).item()
            output.append(token)

        # Convert each token to a readable word and join into one string
        return " ".join(self.idx_to_word[token] for token in output)


### Prediction Function

In [None]:
# Deterministic preprocessing for inference: the same resize and normalisation
# as the training pipeline, but with a centre crop and no random flip so that
# the same image always produces the same caption (and BLEU score).
inference_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406),
                         (0.229, 0.224, 0.225))])


def predict_caption(image_file, data_set, show_actual=False, show_prediction = False, show_image=False):
    """Caption one image with the global ``cnn`` / ``lstm`` models.

    The models are used in whatever train/eval mode the caller left them in.

    Args:
        image_file: Path of the image to caption.
        data_set: DataFrame with ``image`` and ``caption`` columns from which
            the reference captions of ``image_file`` are looked up.
        show_actual: Print the first reference caption.
        show_prediction: Print the predicted caption.
        show_image: Display the image in the notebook.

    Returns:
        Tuple ``(references, prediction)``: a list with one word list per
        reference caption, and the predicted word list without the END token.
    """
    # Open image file; decode inside the context manager so the handle is closed
    with Image.open(image_file) as raw_image:
        input_image = raw_image.convert("RGB")

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        input_tensor = inference_transforms(input_image).unsqueeze(0).to(device)
        # Get the predictions from cnn of what is in the image
        features = cnn(input_tensor)
        # Generate text using LSTM of image caption
        pred_caption = lstm.predict(features)

    # Drop the trailing END token, but only if decoding actually ended with it;
    # a caption cut off at the length limit ends with a real word
    pred_words = pred_caption.split()
    if pred_words and pred_words[-1] == "END":
        pred_words = pred_words[:-1]

    # Get the true value of the caption from data
    actual_caption = data_set.loc[data_set["image"] == image_file, "caption"].values

    # Flags for output
    if show_prediction:
        print("PREDICTED: " + " ".join(pred_words))
    if show_actual:
        if len(actual_caption) > 0:
            print("ACTUAL: " + actual_caption[0])
        else:
            # The image has no caption in data_set; report it instead of crashing
            print("ACTUAL: <no caption available>")
    if show_image:
        display(input_image)

    # Get words in list for BLEU score calculations
    actual_caption_words = [reference.split() for reference in actual_caption]
    return actual_caption_words, pred_words

# Training

In [None]:
# Define CNN and LSTM model
cnn = CNN(dropout=.2).to(device)
lstm = LSTM(vocab_size = vocab_size, word_to_idx = train_dataset.word_to_idx, idx_to_word = train_dataset.idx_to_word, dropout=.2).to(device)

# Define learning rate
lr = 0.001

# Define optimizer and loss function.
# Only the LSTM parameters are optimised; the CNN (including its projection
# layers) is not passed to the optimizer and therefore stays fixed.
optimizer = optim.Adam(lstm.parameters(), lr=lr, weight_decay=1e-6)
# Captions are right-padded to a fixed length, so most target positions are PAD.
# Ignoring them keeps the loss focused on real words and the END token instead
# of being dominated by trivially predictable padding.
loss_func = nn.CrossEntropyLoss(ignore_index=train_dataset.word_to_idx["PAD"])

In [None]:
# Create Dataloaders
# Training batches are reshuffled every epoch
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Validation needs no shuffling; a fixed order keeps the per-batch validation
# loss trace comparable from one epoch to the next
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Show number of batches in each dataloader
len(train_dataloader), len(val_dataloader)

In [None]:
# Function for calculating bleu score for model performance
def get_bleu_scores(dataset):
    """Predict a caption for every image in ``dataset`` and score it with BLEU.

    Args:
        dataset: DataFrame with ``image`` and ``caption`` columns (one row per
            reference caption).

    Returns:
        DataFrame with one row per unique image and the columns ``image``,
        ``pred`` (predicted caption string), ``caption`` (list of reference
        caption strings) and ``bleu`` (sentence-level BLEU score).
    """
    # Get all images in dataset
    dataset_images = dataset['image'].unique()
    # For each image, get the predicted caption
    preds = [" ".join(predict_caption(image, dataset)[1]) for image in dataset_images]

    pred_df = pd.DataFrame({"image": dataset_images, "pred": preds})

    # Attach the list of reference captions of each image
    references = dataset.groupby("image")["caption"].apply(list).reset_index()
    pred_df = pd.merge(pred_df, references, on="image", how="left")

    # sentence_bleu expects token lists: a list of tokenised references and a
    # tokenised hypothesis. Passing the raw strings would make it iterate over
    # characters and report a character-level score instead of word-level BLEU.
    pred_df["bleu"] = [
        sentence_bleu([reference.split() for reference in refs], pred.split())
        for refs, pred in zip(pred_df["caption"], pred_df["pred"])
    ]

    return pred_df

In [None]:
# Number of training batches between example predictions
print_every = 100

# Track loss of each batch
train_step_losses = []
val_step_losses = []

# Track loss of each epoch
train_epoch_losses = []
val_epoch_losses = []

# Track BLEU of each epoch
train_bleu_scores = []
val_bleu_scores = []

# Track training BLEU of each batch, together with the global training step at
# which each value was measured (needed to plot it against the right x values)
train_batch_bleu_scores = []
train_batch_bleu_steps = []

# Count of number of batches ran
global_train_steps = 0
global_val_steps = 0

# Flag for calculating training BLEU for batches
calc_bleu_after_batch = False

# Get random flickr image in training set that will be tested every 100 batches
static_img_1 = train[train["image"].str.contains('flickr8k')].sample(n=1)["image"].values[0]
static_img_2 = train[train["image"].str.contains('flickr8k')].sample(n=1)["image"].values[0]

# Get random crime image in training set that will be tested every 100 batches
static_custom_img = train[train["image"].str.contains('custom-image')].sample(n=1)["image"].values[0]

# Flag for output
first_batch = True

# For each epoch
for epoch in range(1, epochs+1):
    # Create progress bar
    progress_train = tqdm(train_dataloader)
    # Track loss
    total_train_loss = 0
    # Do not train cnn (it is not part of the optimizer); train lstm
    cnn.eval()
    lstm.train()
    # For each batch in training
    for batch_num, (img, caption) in enumerate(progress_train):
        # Zero out gradients
        optimizer.zero_grad()

        # Send image and caption to cpu or cuda
        img = img.to(device)
        caption = caption.to(device)

        # Get the output of cnn. The CNN is never updated, so no autograd graph
        # is recorded for it: this avoids back-propagating through the whole
        # ResNet and piling up gradients on CNN parameters that are never zeroed.
        with torch.no_grad():
            features = cnn(img)
        # Send output of cnn and all but the last token
        input_caption = caption[:, :-1]
        output = lstm(features, input_caption)
        # Move the vocabulary dimension to position 1, as the loss expects
        output = output.permute(0, 2, 1)
        # All but first token
        target = caption[:, 1:]
        # Get the loss
        loss = loss_func(output, target)

        # Back Prop
        loss.backward()

        optimizer.step()

        # For TPU
        # xm.optimizer_step(optimizer, barrier=True)

        # Add to progress bar
        batch_loss = loss.item()
        progress_train.set_description(desc="Epoch " + str(epoch) + " - Train Loss: %.5f" % batch_loss)
        total_train_loss += batch_loss
        train_step_losses.append(batch_loss)
        global_train_steps += 1

        # Print examples of predictions every `print_every` batches
        if (batch_num+1) % print_every == 0:
            # Switch off dropout in the LSTM while previewing
            lstm.eval()
            with torch.no_grad():
                # Get random image
                rand_img = train.sample(n=1)["image"].values[0]
                # Run predictions to see progress
                predict_caption(static_img_1, train, show_prediction=True, show_image=first_batch, show_actual=first_batch)

                print("------------")
                predict_caption(static_img_2, train, show_prediction=True,show_image=first_batch, show_actual=first_batch)

                print("------------")
                predict_caption(static_custom_img, train, show_prediction=True,show_image=first_batch, show_actual=first_batch)

                print("------------")
                predict_caption(rand_img,train, show_prediction=True, show_image=True, show_actual=True)

                print("------------")
                if calc_bleu_after_batch:
                    train_batch_pred_df = get_bleu_scores(train)
                    avg_train_batch_bleu = train_batch_pred_df["bleu"].mean()
                    train_batch_bleu_scores.append(avg_train_batch_bleu)
                    train_batch_bleu_steps.append(global_train_steps)
                    print("Avg Training BLEU Score After Batch:", avg_train_batch_bleu)

            # Do not display image of static pictures again in batches after the first one
            first_batch = False
            # Back to training mode for the next batch
            lstm.train()

    # Get average loss over epoch
    avg_train_loss = total_train_loss / len(train_dataloader)
    train_epoch_losses.append(avg_train_loss)
    print("Avg Training Loss on Epoch " + str(epoch) + ": " + str(avg_train_loss))

    # Validate model performance
    cnn.eval()
    lstm.eval()
    # Everything below is evaluation only, so no autograd graphs are recorded
    with torch.no_grad():
        # Get BLEU score of training set
        print("Running in-sample predictions...")
        train_pred_df = get_bleu_scores(train)
        avg_train_bleu = train_pred_df["bleu"].mean()
        train_bleu_scores.append(avg_train_bleu)
        print("Epoch Avg Training BLEU Score:", avg_train_bleu)

        # Same forward pass as in training, without the optimisation step
        print("Running Validation on Epoch "+ str(epoch))
        progress_val = tqdm(val_dataloader)
        total_val_loss = 0
        for batch_num, (img, caption) in enumerate(progress_val):
            img = img.to(device)
            caption = caption.to(device)
            features = cnn(img)
            input_caption = caption[:, :-1]
            output = lstm(features, input_caption)
            output = output.permute(0, 2, 1)
            val_loss = loss_func(output, caption[:, 1:])

            batch_val_loss = val_loss.item()
            progress_val.set_description(desc="Epoch " + str(epoch) + " - Val Loss: %.5f" % batch_val_loss)

            total_val_loss += batch_val_loss
            val_step_losses.append(batch_val_loss)
            global_val_steps += 1

        # Get BLEU score of validation set
        print("Running out-sample val predictions...")
        val_pred_df = get_bleu_scores(val)
        avg_val_bleu = val_pred_df["bleu"].mean()
        val_bleu_scores.append(avg_val_bleu)
        print("Avg Val BLEU Score:", avg_val_bleu)

        # Get two random validation images to see the progress of model at end of epoch
        rand_img = val.sample(n=1)["image"].values[0]
        predict_caption(rand_img, val, show_prediction=True, show_image=True, show_actual=True)

        rand_img = val.sample(n=1)["image"].values[0]
        predict_caption(rand_img, val, show_prediction=True, show_image=True, show_actual=True)

    avg_val_loss = total_val_loss / len(val_dataloader)
    val_epoch_losses.append(avg_val_loss)

    print("Avg Val Loss on Epoch " + str(epoch) + ": " + str(avg_val_loss))

    # Print model performance graphs...was not able to run tensorboard on Kaggle
    plt.plot(list(range(global_train_steps)),train_step_losses)
    plt.xlabel('training batches')
    plt.ylabel('train loss')
    plt.title("Batch Training Loss")
    plt.show()
    plt.clf()

    if calc_bleu_after_batch:
        # BLEU is only measured every `print_every` batches, so it is plotted
        # against the steps at which it was recorded rather than every step
        plt.plot(train_batch_bleu_steps,train_batch_bleu_scores)
        plt.xlabel('training batches')
        plt.ylabel('BLEU score')
        plt.title("Avg Train BLEU")
        plt.show()
        plt.clf()

    plt.plot(list(range(global_val_steps)),val_step_losses)
    plt.xlabel('val batches')
    plt.ylabel('val loss')
    plt.title("Batch Val Loss")
    plt.show()
    plt.clf()

    epoch_axis = list(range(1,epoch+1))

    plt.plot(epoch_axis,train_epoch_losses, label='avg training loss')
    plt.plot(epoch_axis,val_epoch_losses, label='avg val loss')
    plt.xlabel('number of epochs')
    plt.ylabel('loss')
    plt.title("Train v Val Loss")
    plt.legend()
    plt.show()
    plt.clf()

    plt.plot(epoch_axis,train_bleu_scores, label='avg training bleu')
    plt.plot(epoch_axis,val_bleu_scores, label='avg val bleu')
    plt.xlabel('number of epochs')
    plt.ylabel('BLEU')
    plt.title("Avg Train v Val BLEU")
    plt.legend()
    plt.show()
    plt.clf()
    
    

In [None]:
# Save the models to the Kaggle working directory.
# The whole module objects are saved here (not just their state_dicts).
# NOTE(review): loading such files presumably needs the CNN / LSTM class
# definitions to be available under the same names -- confirm before reuse.
torch.save(cnn, '/kaggle/working/cnn.pkl')
torch.save(lstm, '/kaggle/working/lstm.pkl')


### Results

In [None]:
# Load in saved models from a Kaggle input dataset.
# NOTE(review): the loaded models are bound to `cnn_temp` / `lstm_temp`, but the
# evaluation cells in this notebook use the global `cnn` / `lstm`, so these
# loaded copies do not appear to be used -- confirm which models should be
# evaluated.
cnn_temp = torch.load("/kaggle/input/final-cnn-lstm-models/cnn (2).pkl")
lstm_temp = torch.load('/kaggle/input/final-cnn-lstm-models/lstm (2).pkl')

In [None]:
# Put both networks in evaluation mode (this disables their dropout layers)
# before computing the final predictions
cnn.eval()
lstm.eval()

In [None]:
# Get the prediction results for each dataset: every call captions each unique
# image of the split and returns one row per image with its BLEU score
train_pred_df = get_bleu_scores(train)
val_pred_df = get_bleu_scores(val)
test_pred_df = get_bleu_scores(test)

In [None]:
# Visualize results with random image
def pull_random_pred(pred_df, crime=True,min_bleu_thresh = 0.0, max_bleu_thresh = 1.0):
    """Show one random prediction whose BLEU score lies in a given range.

    Args:
        pred_df: DataFrame with ``image``, ``pred``, ``caption`` and ``bleu``
            columns.
        crime: If True, only consider images whose path contains
            ``custom-images`` (the custom crime data).
        min_bleu_thresh: Lowest BLEU score to consider (inclusive).
        max_bleu_thresh: Highest BLEU score to consider (inclusive).

    Returns:
        None. Prints the file, prediction, reference captions and BLEU score
        and displays the image; prints a notice when no row matches.
    """
    # If we want to look at custom crime data only
    if crime:
        pull_df = pred_df[pred_df["image"].str.contains("custom-images")]
    else:
        pull_df = pred_df.copy()

    # Get desired bleu score for analysis
    in_range = (pull_df["bleu"] >= min_bleu_thresh) & (pull_df["bleu"] <= max_bleu_thresh)
    pull_df = pull_df[in_range]

    # Sampling from an empty selection would raise, so report it and stop
    if pull_df.empty:
        print("No predictions with BLEU between", min_bleu_thresh, "and", max_bleu_thresh)
        return None

    # Get random record
    row = pull_df.sample(n=1)
    image_path = row["image"].item()

    # Display output
    print("file:", image_path)
    # Context manager releases the file handle once the image has been shown
    with Image.open(image_path) as image:
        display(image)
    print("prediction:",row["pred"].item())
    # Print the reference captions as a flat list
    print("actuals:", row["caption"].item())
    print("BLEU Score", row["bleu"].item())
    print("\n")

In [None]:
# Show one random test prediction, restricted to the custom crime images
pull_random_pred(test_pred_df, crime=True)

In [None]:
# Show one random training prediction from any image source
pull_random_pred(train_pred_df, crime=False)

In [None]:
# Show one random validation prediction from any image source
pull_random_pred(val_pred_df, crime=False)

In [None]:
# Histogram of train BLEU scores
train_pred_df.bleu.hist(bins=10)

In [None]:
# Stats of train BLEU scores
train_pred_df.bleu.describe()

In [None]:
# Histogram of val BLEU scores
val_pred_df.bleu.hist(bins=10)

In [None]:
# Stats of val BLEU scores
val_pred_df.bleu.describe()

In [None]:
# Histogram of test BLEU scores
test_pred_df.bleu.hist(bins=10)

In [None]:
# Stats of test BLEU scores
test_pred_df.bleu.describe()

In [None]:
# Bad test captions: five random test predictions with BLEU between 0.0 and 0.3
for _ in range(5):
    pull_random_pred(test_pred_df, crime=False, min_bleu_thresh = 0.0, max_bleu_thresh=0.3)

In [None]:
# Decent test captions: five random test predictions with BLEU between 0.3 and 0.7
for _ in range(5):
    pull_random_pred(test_pred_df, crime=False, min_bleu_thresh = 0.3, max_bleu_thresh=0.7)

In [None]:
# Good test captions: five random test predictions with BLEU between 0.7 and 1.0
for _ in range(5):
    pull_random_pred(test_pred_df, crime=False, min_bleu_thresh = 0.7, max_bleu_thresh=1.0)

In [None]:
# Crime test captions: five random test predictions on the custom crime images,
# over the full BLEU range
for _ in range(5):
    pull_random_pred(test_pred_df, crime=True)