In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, models
from collections import Counter
from skimage import io, transform
from torch.nn.utils.rnn import pack_padded_sequence
import matplotlib.pyplot as plt # for plotting
import numpy as np
from time import time
import collections
import pickle
import os
import nltk
import re

  from .collection import imread_collection_wrapper


In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

/bin/bash: nvidia-smi: command not found


In [3]:
!pip install psutil
try:
  from psutil import virtual_memory
  ram_gb = virtual_memory().total / 1e9
  print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

  if ram_gb < 20:
    print('Not using a high-RAM runtime')
  else:
    print('You are using a high-RAM runtime!')
except Exception as e:
  print(e)

No module named 'psutil'


In [4]:
class Rescale(object):
    """Rescale the image in a sample to a given size.
    Args:
        output_size (tuple or int): Desired output size. If tuple, output is
            matched to output_size. If int, smaller of image edges is matched
            to output_size keeping aspect ratio the same.
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def __call__(self, image):
        h, w = image.shape[:2]
        #print("TA RESCALE INPUT", image.shape)
        if isinstance(self.output_size, int):
            if h > w:
                new_h, new_w = self.output_size * h / w, self.output_size
            else:
                new_h, new_w = self.output_size, self.output_size * w / h
        else:
            new_h, new_w = self.output_size

        new_h, new_w = int(new_h), int(new_w)
        img = transform.resize(image, (new_h, new_w))
        #print("TA RESCALE OUTPUT", image.shape)
        return img

In [5]:
os.getcwd()

'/home/prakank/IIT Delhi/3rd_year/Sem5/COL774_Machine_Learning/COL774-Machine-Learning-Assignments/Assignment-4'

In [6]:
class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, image):
        # numpy image: H x W x C
        # torch image: C X H X W
        image = image.transpose((2, 0, 1))
        return torch.tensor(image)

In [7]:
IMAGE_RESIZE = (256, 256)
device = "cuda" if torch.cuda.is_available() else "cpu"
img_transform = transforms.Compose([Rescale(IMAGE_RESIZE), ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), transforms.RandomRotation(degrees = (90,90))])

# 'train': transforms.Compose([
#     transforms.RandomHorizontalFlip(),
#     transforms.RandomRotation(degrees = (90,90)),
#     transforms.RandomRotation(degrees = (180,180)),
#     transforms.RandomRotation(degrees = (270,270)),
#     transforms.RandomVerticalFlip(p=1),
#     transforms.ToTensor(),
#     transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
# ])

print("Current device set to {}".format(device))
# DIR = '/content/drive/MyDrive/data/train_data_main/'
# DIR = '/Users/pratyushsaini/Documents/Semester 5/COL774/Assignment-4/'
DIR = '/home/prakank/IIT Delhi/3rd_year/Sem5/COL774_Machine_Learning/COL774-Machine-Learning-Assignments/Assignment-4/'

Current device set to cpu


In [8]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize
phase = "Train"

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/prakank/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/prakank/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
class CaptionsPreprocessing:
    """Preprocess the captions, generate vocabulary and convert words to tensor tokens

    Args:
        captions_file_path (string): captions tsv file path
    """
    def __init__(self, captions_file_path):
        self.captions_file_path = captions_file_path
        self.raw_captions_dict = self.read_raw_captions()
        self.captions_dict = self.process_captions()
        self.vocab = self.generate_vocabulary()
    def read_raw_captions(self):
        # Dictionary with raw captions list keyed by image ids (integers)
        captions_dict = {}
        with open(self.captions_file_path, 'r', encoding='utf-8') as f:
            for img_caption_line in f.readlines():
                img_captions = img_caption_line.strip().split('\t')
                image_path = DIR + img_captions[0]
                
                image_path = '/home/prakank/IIT Delhi/3rd_year/Sem5/COL774_Machine_Learning/COL774-Machine-Learning-Assignments/Assignment-4/data/' + img_captions[0]
                
                if os.path.exists(image_path):
                    captions_dict[img_captions[0]] = img_captions[1].lower()
                    
                # if len(captions_dict) == 5000:
                #     break
                    
                    
        return captions_dict

    def process_captions(self):
        """
        Use this function to generate dictionary and other preprocessing on captions
        """

        raw_captions_dict = self.raw_captions_dict

        # Do the preprocessing here
        # Can remove the stopwords and gibberish in the caption
        stop_words = stopwords.words('english')
        self.allowedLength = 7
        punctuation = list(string.punctuation)

        for key, value in raw_captions_dict.items():
            cleaned_caption = re.sub('[^A-Za-z0-9]+', ' ', value) #Extra space removal
            tokens = word_tokenize(cleaned_caption)
            # cleaned_tokens = [token for token in tokens if token not in stop_words and token not in punctuation] # Remove stopwords and punctuation
            cleaned_tokens = [token for token in tokens if token not in punctuation] # Remove stopwords and punctuation
            
            # cleaned_caption = "[START] " + " ".join(cleaned_tokens) + " [END]"
            # cleaned_caption = " ".join(cleaned_tokens)
            cleaned_caption = " ".join(cleaned_tokens) + " [END]"

            raw_captions_dict[key] = cleaned_caption        

        captions_dict = raw_captions_dict

        return captions_dict

    def generate_vocabulary(self):
        """
        Use this function to generate dictionary and other preprocessing on captions
        """

        captions_dict = self.captions_dict
        vocabulary = {}
        max_caption = 0
        idx = 1
        index_to_word = {}
        for key, value in captions_dict.items():
            val = value.split()
            max_caption = max(max_caption, len(val))

            for i in val:
                if i not in vocabulary.keys():
                    vocabulary[i] = idx
                    index_to_word[idx] = i
                    idx+=1
        self.max_caption = max_caption
        self.max_caption = (self.allowedLength+2)
        
        index_to_word[0] = "NIL"
        self.index_to_word = index_to_word
        # Generate the vocabulary
        print("Size of Vocabulary = {}".format(len(vocabulary)))
        return vocabulary


    def get_captions(self, tensor_tokens):
        caption = [self.index_to_word[int(x)] for x in tensor_tokens]
        return " ".join(caption)

    def captions_transform(self, img_caption):
        """
        Use this function to generate tensor tokens for the text captions
        Args:
            img_caption_list: List of captions for a particular image
        """
        vocab = self.vocab

        caption = img_caption.split(" ")
        
        
        # print(img_caption, caption)

        caption_mapped = np.zeros(self.max_caption)
        for i in range(len(caption)):
            try: caption_mapped[i] = self.vocab[caption[i]]
            except: print(img_caption, caption, i)

        # caption_mapped = np.zeros((self.max_caption, len(self.vocab)))
        # for i in range(len(caption)):
        #     val = np.zeros(len(self.vocab))
        #     val[self.vocab[caption[i]]] = 1
        #     caption_mapped[i,:] = val 

        #captions_mapped = np.argmax(captions_mapped, axis = 1)
        
        return torch.LongTensor(caption_mapped)

# Set the captions tsv file path

# CAPTIONS_FILE_PATH = '/content/drive/MyDrive/data/train_text.tsv'
# CAPTIONS_FILE_PATH = '/Users/pratyushsaini/Documents/Semester 5/COL774/Assignment-4/Train_text.tsv'

BASE_DIR = '/home/prakank/IIT Delhi/3rd_year/Sem5/COL774_Machine_Learning/COL774-Machine-Learning-Assignments/Assignment-4/'
CAPTIONS_FILE_PATH = os.path.join(BASE_DIR, 'data', 'train_text.tsv')

captions_preprocessing_obj = CaptionsPreprocessing(CAPTIONS_FILE_PATH)
# embedding_dim = 256

Size of Vocabulary = 7469


In [10]:
ls

Assignment-4.pdf                         q1_new.ipynb
[0m[01;34mdata[0m/                                    q2_prakank.ipynb
data_extraction.ipynb                    q2_prakank_new.ipynb
[01;34mMachine-Learning-Assignments-master[0m/     q2_temp.ipynb
Machine-Learning-Assignments-master.zip  seq2seq_attention.pdf
out1.ipynb                               starter_code.ipynb
q1.ipynb


In [11]:
class ImageCaptionsDataset(Dataset):

    def __init__(self, img_dir, captions_dict, img_transform=None, captions_transform=None):
        """
        Args:
            img_dir (string): Directory with all the images.
            captions_dict: Dictionary with captions list keyed by image paths (strings)
            img_transform (callable, optional): Optional transform to be applied
                on the image sample.

            captions_transform: (callable, optional): Optional transform to be applied
                on the caption sample (list).
        """
        self.img_dir = img_dir
        self.captions_dict = captions_dict
        self.img_transform = img_transform
        self.captions_transform = captions_transform

        self.image_ids = list(captions_dict.keys())

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        img_name = self.image_ids[idx]
        image = io.imread(img_name)
        captions = self.captions_dict[img_name]

        if self.img_transform:
            image = self.img_transform(image)

        if self.captions_transform:
            captions = self.captions_transform(captions)

        sample = {'image': image, 'captions': captions}

        return sample

In [12]:
def collate_fn(batch):
  res = {}

  res['image'] = [sample['image'].unsqueeze(0) for sample in batch] 
  res['image'] = torch.cat((res['image']), dim=0)

  res['captions'] = [sample['captions'] for sample in batch]
  res['captions'] = torch.nn.utils.rnn.pad_sequence(res['captions'], batch_first=True)

  return res

In [13]:
#ENCODER
class Encoder(nn.Module):
    def __init__(self, embed_dim, trainCNN = False):
        super(Encoder, self).__init__()
        self.trainCNN = trainCNN

        # if not torch.cuda.is_available():
        self.inception = torchvision.models.inception_v3(pretrained=True, aux_logits = False)
        # else:
        #     self.inception = torchvision.models.inception_v3(pretrained=True, aux_logits = False).cuda()

        self.bn = nn.BatchNorm2d(embed_size, momentum=0.01)
        
        self.inception.fc = nn.Linear(in_features=self.inception.fc.in_features, out_features=embed_dim, bias = True)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        
        # self.inception.fc.weight.data.normal_(0., 0.2)
        # self.inception.fc.bias.data.fill_(0)

    def forward(self, x):
        #print("Forward feeding")
        features = self.inception(x)
        #print("Resnet module op", x.shape)
        for name, param in self.inception.named_parameters():
            if "fc.weight" in name or "fc.bias" in name:
               param.requires_grad = True
            else:
                param.requires_grad = self.trainCNN
        return (self.dropout((self.relu(features))))

class AttentionBlock(nn.Module):
    def __init__(self, embed_dim, lstm_hidden_size, vocab_size, enc_dim=256):
        super(AttentionBlock, self).__init__()
        self.hidden_lin = nn.Linear(lstm_hidden_size, lstm_hidden_size)
        self.tanh = nn.Tanh()
        self.img_lin = nn.Linear(embed_dim, lstm_hidden_size)
        self.vocab_size = vocab_size
        self.softmax = nn.Softmax(dim=1)
        self.concat_lin = nn.Linear(lstm_hidden_size, 1)
        # self.alpha_map = nn.Linear()
        
    #     Attention 0 torch.Size([32, 256]) torch.Size([32, 512])
    #     Attention Hidden: torch.Size([32, 1, 512])
    #     Attention Img_s: torch.Size([32, 512])
    #     Attention att_: torch.Size([32, 32, 512])
    #     Attention e_: torch.Size([32, 32])
    
    def forward(self,image_features, hidden_state):
        
        hidden_h = self.hidden_lin(hidden_state).unsqueeze(1)
        
        # print('Attention Hidden:',hidden_h.shape)
        
        img_s = self.img_lin(image_features)
        
        # print('Attention Img_s:',img_s.shape)
        
        att_ = self.tanh(img_s + hidden_h)
        
        # print('Attention att_:',att_.shape)
        
        e_ = self.concat_lin(att_).squeeze(2)
        
        # print('Attention e_:',e_.shape)
        
        alpha = self.softmax(e_)
        context_vec = (image_features * alpha.unsqueeze(2)).sum(1)
        
        # print('Attention alpha: ', alpha.shape)
        return context_vec, alpha

class Decoder(nn.Module):
    def __init__(self, embed_dim, lstm_hidden_size, vocab_size, wordEmbeddingFilename=None, lstm_layers=1, enc_dim=256):
        super(Decoder, self).__init__()
        self.lstm_hidden_size = lstm_hidden_size
        self.vocab_size = vocab_size
        
        print("VOCAB SIZE = ", self.vocab_size)
        
        self.lstm = nn.LSTM(input_size = embed_dim, hidden_size = lstm_hidden_size,
                            num_layers = lstm_layers, batch_first = True)
        
        self.lstmCell = nn.LSTMCell(embed_dim+embed_dim, lstm_hidden_size)
        
        self.attention = AttentionBlock(embed_dim, lstm_hidden_size, self.vocab_size)
        
        self.linear = nn.Linear(lstm_hidden_size, self.vocab_size)
        #self.embed = nn.Embedding.from_pretrained(init_weights)

        self.embed = nn.Embedding(self.vocab_size, embed_dim)
        self.embed.weight.data.uniform_(-0.1, 0.1)
        
        self.act= nn.Tanh()
        
        self.h = nn.Linear(embed_dim, lstm_hidden_size)
        self.c = nn.Linear(embed_dim, lstm_hidden_size)

        # self.embed = self.load_pre_trained(wordEmbeddingFilename)
        # self.embed = nn.Embedding.from_pretrained(self.vocab_size,embed_dim,padding_idx=0)
        
        self.sigmoid = nn.Sigmoid()
        self.gate = nn.Linear(lstm_hidden_size, embed_dim)

        self.dropout = nn.Dropout(0.2)
        self.linear.weight.data.uniform_(-0.1, 0.1)
        self.linear.bias.data.fill_(0)
        
        self.out = nn.Linear(lstm_hidden_size, self.vocab_size)

    def load_pre_trained(self, filename):
        # import gensim
        # from gensim.models.wrappers import FastText
        # model = FastText.load_fasttext_format('wiki.simple')
        # nn.Embedding.from_pretrained()
        pass
    
    def forward(self, img_feat, captions):
        # print('h started ...',img_feat.shape, img_feat.mean(dim=0).shape)
        
        # print(captions.shape)
        
        # h = self.act(self.h(img_feat.mean(dim=0)))
        h = self.act(self.h(img_feat))
        
        # print('h computed')
        
        # c = self.act(self.c(img_feat.mean(dim=0)))
        c = self.act(self.c(img_feat))
        
        
        # print('hc computed',h.shape,c.shape)
        
        max_len = captions_preprocessing_obj.max_caption
        embedding = self.embed(captions)
        
        # print('embedding generated', embedding.shape)
        
        out_matrix = torch.zeros(img_feat.shape[0], max_len ,self.vocab_size)
        alpha_matrix = torch.zeros(img_feat.shape[0], max_len ,img_feat.shape[1])
        
        # print('Out Matrix:', out_matrix.shape)
        # print('Alpha matrix:', alpha_matrix.shape)
        
        if torch.cuda.is_available():
            out_matrix = out_matrix.cuda()
            alpha_matrix = alpha_matrix.cuda()
        
        for i in range(max_len):
            # print('Attention',i, img_feat.shape, h.shape)
            context, alpha = self.attention(img_feat, h)
            
            # print('Context:{}, Alpha:{}'.format(context.shape, alpha.shape))
            gate_out = self.sigmoid(self.gate(h))
            
            # print('Gate:',gate_out.shape)
            
            context_gate = context * gate_out 
            in_ = torch.cat([embedding[:,i], context_gate],dim=1)
            
            # print('Context gate:',context_gate.shape)
            # print('in_shape:',in_.shape)
            
            # in_ -> 32*512 i.e. batch_szie * lstm_hidden_size
            # h,c -> batch_szie * lstm_hidden_size
            
            
            # h started ... torch.Size([32, 300]) torch.Size([300])
            # h computed
            # hc computed torch.Size([32, 512]) torch.Size([32, 512])
            # embedding generated torch.Size([32, 9, 300])
            # Out Matrix: torch.Size([32, 9, 2603])
            # Alpha matrix: torch.Size([32, 9, 300])
            # Attention 0 torch.Size([32, 300]) torch.Size([32, 512])
            # Context:torch.Size([32, 300]), Alpha:torch.Size([32, 32])
            # Gate: torch.Size([32, 300])
            # Context gate: torch.Size([32, 300])
            # in_shape: torch.Size([32, 600])
            # input has inconsistent input_size: got 600 expected 556
            
            
            # Lstm done
            # Dropout done
            # Out: torch.Size([32, 2511])
            # Alpha: torch.Size([32, 32])
            # The expanded size of the tensor (256) must match the existing size (32) at non-singleton dimension 1.  Target sizes: [32, 256].  Tensor sizes: [32, 32]
            
            
            
            h,c = self.lstmCell(in_, (h,c))
            
            # print('Lstm done, h:',h.shape,'c',c.shape)
            
            h = self.dropout(h)
            
            # print('Dropout done, h', h.shape)
            
            #h,c = self.lstm(in_, (h,c))
            out = self.out(h)
            
            # print('Out:',out.shape)
            # print('Alpha:',alpha.shape)
            
            out_matrix[:,i,:]=out
            # alpha_matrix[:,i,:]=alpha
            
        return out_matrix
        
    # def forward(self, image_features, image_captions):
    #     image_features = image_features.unsqueeze(1)
    #     embeddings = self.dropout(self.embed(image_captions))
    #     #print(embeddings.shape, image_features.shape)
    #     embeddings = torch.cat((image_features, embeddings[:,:-1]), dim = 1)
    #     #embeddings = torch.cat((image_features, embeddings), dim = 1)
    #     hiddens, _ = self.lstm(embeddings)
    #     outputs = self.linear(hiddens)
        
    #     return outputs

In [14]:
class ImageCaptionsNet(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(ImageCaptionsNet, self).__init__()              
        self.Encoder = Encoder(embed_dim = embed_size)
        self.Decoder = Decoder(embed_size, hidden_size, vocab_size, num_layers)    
        

    def forward(self, img_batch, cap_batch):
        x = self.Encoder(img_batch)
        
        #x = x.long().numpy()
        # print(x.shape, "hihi")

        out = self.Decoder(x, cap_batch)
        return out

device = "cuda" if torch.cuda.is_available() else "cpu"

In [15]:
# Pratyush
# IMAGE_DIR = '/Users/pratyushsaini/Documents/Semester 5/COL774/Assignment-4'


# For google colab
# BASE_DIR = '/content/drive/MyDrive/'
# os.chdir(os.path.join(BASE_DIR,'data','train_data_main'))


# Prakhar
os.chdir(os.path.join(BASE_DIR, 'data'))
IMAGE_DIR = os.path.join(BASE_DIR, 'data')



import os
# import gensim



embed_size = 300
hidden_size = 512
num_layers = 9

vocab_size = len(captions_preprocessing_obj.vocab)
net = ImageCaptionsNet(embed_size, hidden_size, vocab_size, num_layers)

if torch.cuda.is_available():
    net = net.to(torch.device("cuda:0"))
else:
    net = net.to(torch.device("cpu"))


# Creating the Dataset
train_dataset = ImageCaptionsDataset(
    IMAGE_DIR, captions_preprocessing_obj.captions_dict, img_transform=img_transform,
    captions_transform=captions_preprocessing_obj.captions_transform
)
print("Train Dataset loaded")
# Define your hyperparameters
NUMBER_OF_EPOCHS = 1
LEARNING_RATE_D = 5e-2
LEARNING_RATE_E = 5e-2
BATCH_SIZE = 100
NUM_WORKERS = 0 # Parallel threads for dataloading

loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimizer_decoder = optim.Adam(list(net.Decoder.parameters()), lr=LEARNING_RATE_D, betas=(0.9, 0.999))
optimizer_encoder = optim.Adam(list(net.Encoder.parameters()), lr=LEARNING_RATE_E, betas=(0.9, 0.999))
print("Optimizer loaded")
# Creating the DataLoader for batching purposes
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, collate_fn = collate_fn)
print("Train Loader loaded")
torch.backends.cudnn.benchmark = True
start = time()
loss_list = []

# encoder_arch.load_state_dict(torch.load('/content/drive/My Drive/ML_Assignment4/encode-state1-scratch.pkl'))

VOCAB SIZE =  7469
Train Dataset loaded
Optimizer loaded
Train Loader loaded


In [45]:
for epoch in range(NUMBER_OF_EPOCHS):
    print("Epoch {}".format(epoch+1))
    iteration = 0
    for batch_idx, sample in enumerate(train_loader):
        net.Encoder.zero_grad()
        net.Decoder.zero_grad()
        optimizer_decoder.zero_grad()
        optimizer_encoder.zero_grad()
        image_batch, captions_batch = sample['image'], sample['captions']

        #If GPU training required
        image_batch = image_batch.float()
        #captions_batch = captions_batch.float()

        if torch.cuda.is_available():
            image_batch, captions_batch = image_batch.cuda(), captions_batch.cuda()
        
        # if (iteration == 2):
        #     break
        # output_captions = net(image_batch, captions_batch)

        try:
            output_captions = net(image_batch, captions_batch)
        except Exception as e:
            print(e)
            print("---Error {}".format(batch_idx))
            break

        #print(output_captions.shape, captions_batch.shape)
        if not torch.cuda.is_available():
            loss = loss_function(output_captions.reshape(-1, output_captions.shape[2]), captions_batch.reshape(-1))
        else:
            loss = loss_function(output_captions.reshape(-1, output_captions.shape[2]), captions_batch.reshape(-1))

        loss_list.append(loss.item())
        loss.backward()
        optimizer_encoder.step()
        optimizer_decoder.step()
        
        print("Iteration: {}, Loss: {}, TimeElapsed: {}Min".format(iteration+1, round(loss.item(), 2), round((time()-start)/60,2), ))
        iteration+=1

        if (iteration%5 == 0):
            torch.save(net.state_dict(), './encode-state{}_{}-scratch.pkl'.format(str(epoch+1),str(iteration)))
            # torch.save(decode_arch.state_dict(), './decode-state{}-scratch.pkl'.format(str(epoch+1) + str(iteration)))
        if iteration == 50:
            break

Epoch 1
Iteration: 1, Loss: 8.91, TimeElapsed: 1.31Min
Iteration: 2, Loss: 7.17, TimeElapsed: 1.61Min
Iteration: 3, Loss: 13.72, TimeElapsed: 1.98Min
Iteration: 4, Loss: 18.41, TimeElapsed: 2.35Min
Iteration: 5, Loss: 21.06, TimeElapsed: 2.76Min
Iteration: 6, Loss: 24.01, TimeElapsed: 3.08Min


In [16]:
# net.load_state_dict(torch.load('/content/drive/MyDrive/data/train_data_main/encode-state1-scratch.pkl'))
net.load_state_dict(torch.load(os.path.join(BASE_DIR, 'data', 'encode-state1_6-scratch.pkl')))
net.eval()

ImageCaptionsNet(
  (Encoder): Encoder(
    (inception): Inception3(
      (Conv2d_1a_3x3): BasicConv2d(
        (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (Conv2d_2a_3x3): BasicConv2d(
        (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (Conv2d_2b_3x3): BasicConv2d(
        (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
      (Conv2d_3b_1x1): BasicConv2d(
        (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine

## Prediction

In [28]:
def caption_image(net, image, cap_obj, max_length = 10):
    result_cap = []
    #print("initial img ", image.shape)
    with torch.no_grad():
        x = net.Encoder(image)
        x = x.unsqueeze(0)
        #x = self.Encoder(image)
        states = None
        
        h = net.Decoder.act(net.Decoder.h(x))
        c = net.Decoder.act(net.Decoder.c(x))

        h = h.squeeze(0)
        c = c.squeeze(0)
        
        prev_word = 'start'
        
        freq = {}

        for _ in range(max_length):
            
            # print("X new = ", x.shape) # 1, 1, 300
            
            prev_word_idx = 0
            if prev_word in cap_obj.vocab:
                prev_word_idx = cap_obj.vocab[prev_word]
            
            # print(self.Decoder.embed( torch.tensor(np.asarray(prev_word_idx))).shape )
            
            if torch.cuda.is_available():
                embedding_pre_word = ((net.Decoder.embed( torch.tensor(np.asarray(prev_word_idx)).cuda() )).unsqueeze(0)).unsqueeze(0)
            else:
                embedding_pre_word = ((net.Decoder.embed( torch.tensor(np.asarray(prev_word_idx)) )).unsqueeze(0)).unsqueeze(0)
            
            # print(embedding_pre_word.shape)
            
            x = torch.cat([x, embedding_pre_word],dim=2)
            
            x = x.squeeze(0)
            
            # hiddens, states = self.Decoder.lstm(x, states)
            
            
            # print('Before LSTMcell:',x.shape,h.shape,c.shape)
            
            h, s = net.Decoder.lstmCell(x, (h,c))
            
            # print("Hiddens = ", h.shape)
            # print('States:', s.shape)
            
            output = net.Decoder.linear(h)

            # print('Output:',output.shape)
            # output_temp = sorted(output.squeeze(0), reverse=True)

            predicted_ = (output.squeeze(0)).argsort()
            # print(predicted_)
            #predicted_ = np.argsort(np.max(output, axis = 0))

            word_index = -1
            while (cap_obj.index_to_word[int(predicted_[word_index])] == '[START]'):
            #or (cap_obj.index_to_word[int(predicted_[word_index])] == '[END]'):
                word_index-=1
            predicted = predicted_[word_index]
            
            prev_word = cap_obj.index_to_word[int(predicted)]
            
            # print(predicted_[-3:])
            #print("OT, PD", output.shape, predicted.shape)

            # 1 * 7356
            result_cap.append(int(predicted))
            x = net.Decoder.embed(predicted).unsqueeze(0)
            x = x.unsqueeze(0)
            
            if prev_word in freq:
                freq[prev_word] += 1
            else:
                freq[prev_word] = 1

            if (prev_word == '[END]') or freq[prev_word] == 3:
                break
            
            if len(result_cap) > 2 and result_cap[-1] == result_cap[-2] and result_cap[-3] == result_cap[-2]:
                result_cap.pop()
                break
            
    return [cap_obj.index_to_word[int(idx)] for idx in result_cap]

In [29]:
class TestDatasetLoader(Dataset):
    
    def __init__(self, img_dir, img_transform):
        """
        Args:
            img_dir (string): Directory with all the test images.            
            img_transform (callable, optional): Optional transform to be applied
                on the image sample.
        """
        self.img_dir = img_dir
        self.img_transform = img_transform
        
        self.image_ids = ['test_data/test' + str(i) + '.jpg' for i in range(1, 5001)]
        
    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        img_name = self.image_ids[idx]
        image = io.imread(img_name)
        
        if self.img_transform:
            image = self.img_transform(image)
        angle_in_degrees = 45

        #output = torch.from_numpy(ndimage.rotate(alpha, angle_in_degrees, reshape=False))
        # sample = {
        #     'top': image,
        #     'left': torch.from_numpy(ndimage.rotate(image, 90, reshape=False)),
        #     'bottom': torch.from_numpy(ndimage.rotate(image, 180, reshape=False)),
        #     'right': torch.from_numpy(ndimage.rotate(image, 270, reshape=False))
        #     }
        sample = {}
        sample['image'] = image # 3* 256 * 256
        sample['image_id'] = img_name
        return sample

In [30]:
# a = np.zeros(1)
# a = a.astype('int')
# net.Decoder.embed(torch.tensor(a)).shape
BASE_DIR
os.getcwd()

'/home/prakank/IIT Delhi/3rd_year/Sem5/COL774_Machine_Learning/COL774-Machine-Learning-Assignments/Assignment-4/data'

In [31]:
# TEST_IMAGE_DIR = '/Users/pratyushsaini/Documents/Semester 5/COL774/Assignment-4'
# TEST_IMAGE_DIR = os.path.join(BASE_DIR, 'data')

# Google colab
os.chdir(os.path.join(BASE_DIR,'data'))
TEST_IMAGE_DIR = os.path.join(BASE_DIR, 'data')

test_img_transform = transforms.Compose([Rescale(IMAGE_RESIZE), ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Applied sequentially

# Creating the Dataset
test_dataset = TestDatasetLoader(TEST_IMAGE_DIR, img_transform=test_img_transform)

test_loader  = DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=NUM_WORKERS)

#output_caption = net.predict(device, test_loader)

In [32]:
pred_caps = {}
for batch_idx, sample in enumerate(test_loader):
        #print(, batch_idx)
        
        # print(sample)
        
        image = sample['image']

        if torch.cuda.is_available():
            image = image.cuda()
        
        image_name = sample['image_id']
        #print("ld", image.shape)
        image = image.float()
        #print("Lolxd" , image.shape)
        caption_pred = caption_image(net, image, captions_preprocessing_obj, max_length = 10)
        #print(np.asarray(caption_pred).shape)
        caption_pred = " ".join(caption_pred)
        cap = caption_pred.replace("[START]","").replace("[END]","")
        print("Image_idx ", batch_idx,": ", caption_pred)
        # print("Image_idx ", batch_idx)
        # print("Predicted",batch_idx, pred_cap)

        # print(image_name)

        # print(type(caption_pred),type(cap))

        pred_caps[image_name[0]] = cap
        
        if batch_idx == 200:
                break

Image_idx  0 :  driving south chair carrier chair led chair
Image_idx  1 :  driving south chair carrier chair led chair
Image_idx  2 :  driving south chair led chair led chair
Image_idx  3 :  driving south carrier chair led chair led chair
Image_idx  4 :  authority twenty carrier chair led chair led chair
Image_idx  5 :  cartoon south carrier chair led chair led chair
Image_idx  6 :  electrical south chair carrier chair led chair
Image_idx  7 :  driving south carrier chair led chair led chair
Image_idx  8 :  authority south carrier chair led chair led chair
Image_idx  9 :  driving south capes chair led chair led chair
Image_idx  10 :  electrical south carrier chair led chair led chair
Image_idx  11 :  electrical south carrier chair led chair led chair
Image_idx  12 :  driving south carrier chair led chair led chair
Image_idx  13 :  authority south carrier chair led chair led chair
Image_idx  14 :  driving south chair carrier chair led chair
Image_idx  15 :  driving south chair led chai

In [None]:
# Image_idx  0 :  bicyclers redheaded beverages guests
# Image_idx  1 :  bicyclers pressure beverages guests
# Image_idx  2 :  fedora beverages
# Image_idx  3 :  blesses guests beverages
# Image_idx  4 :  treads 8 beverages
# Image_idx  5 :  torn beverages
# Image_idx  6 :  bicyclers beverages
# Image_idx  7 :  bicyclers guests beverages
# Image_idx  8 :  yawing suits
# Image_idx  9 :  game suits guests
# Image_idx  10 :  bicyclers beverages
# Image_idx  11 :  tubing beverages
# Image_idx  12 :  participating beverages
# Image_idx  13 :  drawings 8 guests
# Image_idx  14 :  bicyclers 8 guests
# Image_idx  15 :  blesses beverages
# Image_idx  16 :  soaking purplish beverages
# Image_idx  17 :  motor 8 beverages
# Image_idx  18 :  mucky suits beverages
# Image_idx  19 :  toes 8 beverages
# Image_idx  20 :  blesses guests beverages

In [None]:
import csv
with open('test_text.tsv', 'w') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    for img in sorted(pred_caps, key=lambda x:int( (x[14:])[:len(x[14:])-4]) ):
        writer.writerow([img,pred_caps[img]])

In [None]:
sc = torch.tensor(1221)
bc = (sc.detach().numpy())
print(bc,type(bc))
captions_preprocessing_obj.index_to_word[int(sc)]

[0 3 1 2 5 4]


In [None]:
a = np.asarray([1, 3, 4, 1, 10, 9])
print(a.argsort())