In [1]:
import torch
import torch.nn as nn
from torchvision.models import resnet50
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.optim as optim
from torchvision import transforms

import json
import re
import string
import nltk
from nltk.tokenize import word_tokenize
import gensim
import pandas as pd
import cv2
from PIL import Image
import numpy as np
from tensorboardX import SummaryWriter

import os.path

torch.random.manual_seed(1)



<torch._C.Generator at 0x201402f60b0>

In [2]:
#paths
image_path = '../Dataset/MSCOCO/image/'
train_json = '../Dataset/MSCOCO/annotations_trainval2014/annotations/captions_train2014.json'
test_json = '../Dataset/MSCOCO/annotations_trainval2014/annotations/captions_val2014.json'
word2VecBin_path = '../../../Downloads/GoogleNews-vectors-negative300.bin'  

In [3]:
isgpu = torch.cuda.is_available()
if isgpu:
    device = 'cuda'
else:
    device = 'cpu'

#Tensorboard writer
writer = SummaryWriter()

In [None]:
#Dataset

def create_dataset_pd(train_json, test_json):
    train_dataset = json.load(open(train_json, 'r'))
    test_dataset = json.load(open(test_json, 'r'))

    td_img = pd.DataFrame(test_dataset['images'])
    td_ann = pd.DataFrame(test_dataset['annotations'])
    test_p = pd.merge(td_img, td_ann, left_on='id', right_on='image_id')
    test_p.drop_duplicates('image_id', inplace=True)

    train_img = pd.DataFrame(train_dataset['images'])
    train_ann = pd.DataFrame(train_dataset['annotations'])
    train_p = pd.merge(train_img, train_ann, left_on='id', right_on='image_id')
    train_p.drop_duplicates('image_id', inplace=True)
    dataset = train_p.append(test_p, ignore_index=True)
    dataset.drop(columns=['coco_url', 'date_captured',  'flickr_url', 'height',
           'id_x', 'license', 'width', 'id_y'], inplace=True)
    return dataset

dataset = create_dataset_pd(train_json, test_json)

pd_train_dataset = dataset.iloc[:-10000,:]
pd_val_dataset = dataset.iloc[-10000:-5000,:]
pd_test_dataset = dataset.iloc[-5000:,:]

In [12]:
assert pd_train_dataset.shape[0] == 113287

In [6]:
#preprocessing        
word2Vec = gensim.models.KeyedVectors.load_word2vec_format(word2VecBin_path, binary=True)

#Remove this 
train_dataset = json.load(open(train_json, 'r'))
test_dataset = json.load(open(test_json, 'r'))

def preprocess(x):
    #lower
    x = x.lower()
    #x = re.sub(r'\d+','', x) #remove numbers
    x = x.translate(str.maketrans('', '', string.punctuation))#remove punctuation
    x = x.strip() #whitespace
    return word_tokenize(x)
#idx zero - reverse for zero padding
idx=1
vocab = set()
word2idx= {}
for dataset in (train_dataset, test_dataset):
    for entry in dataset['annotations']:
        text = entry['caption']
        for word in preprocess(text):
            if  word not in word2idx and word in word2Vec:
                word2idx[word] = idx
                vocab.add(word)
                idx +=1
                #print(min(list(word2idx.values())))

weight_matrix = torch.zeros((len(vocab)+1, 300)) # weight matrix's first entry will be for zero index
for word in word2idx.keys():
    weight_matrix[word2idx[word]] = torch.from_numpy(word2Vec[word])

In [17]:
#Image Data preprocessing

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, image_path, dataset, imgSize, transforms=None):
        super(CustomDataset, self).__init__()
        self.imgSize = imgSize
        self.transforms = transforms
        self.image_path = image_path
#         image = pd.DataFrame(dataset['images'])
#         ann = pd.DataFrame(dataset['annotations'])
        
#         self.data = pd.merge(left=image, right=ann, left_on='id', right_on='image_id')
#         self.data.drop(columns=['coco_url', 'date_captured',  'flickr_url', 'height',
#            'id_x', 'license', 'width', 'id_y'], inplace=True)
#         self.data.drop_duplicates('image_id', inplace=True)
#         self.data.index = range(len(self.data))
        self.data = dataset
        
    def __getitem__(self, index):
        
        #Get Image
        path = self.image_path + 'train2014/' + self.data.loc[index]['file_name']
        if (not os.path.isfile(path)):
            path = self.image_path + 'val2014/' + self.data.loc[index]['file_name']
        img = cv2.imread(path)
        img_w, img_h = img.shape[1], img.shape[0]
        w, h = 256,256
        new_w = round(img_w * min(w/img_w, h/img_h))
        new_h = round(img_h * min(w/img_w, h/img_h))
        resized_image = cv2.resize(img, (new_w,new_h), interpolation = cv2.INTER_CUBIC)
        if transforms is not None:
            img = self.transforms(Image.fromarray(resized_image)) #Input shoudl be PIL Image
        #img = cv2.resize(resized_image, self.imgSize)
        img = np.asarray(img)
        img = img[:,:,::-1].transpose((2,0,1)).copy()        
        img = torch.from_numpy(img).float().div(255.0)
        
        #Get textIndexs for embedding layer
        caption = self.data.loc[index]['caption']
        words = preprocess(caption)[:32]
        txtIndex = [0 for _ in range(32)] #32- paper 
        for w in range(len(words)):
            word = words[w]
            if word in vocab:
                txtIndex[w]  = word2idx[word]
        wordIndex = torch.LongTensor(txtIndex)   
        
        return img, wordIndex, index         #label = index
    
    def __len__(self):
        return self.data.shape[0]

#transformation applied
transformations = transforms.Compose([#transforms.Resize((224,224), interpolation=3),
                                       transforms.RandomHorizontalFlip(p=0.5),
                                       transforms.RandomResizedCrop(224,interpolation=3),
                                       ])

train_ds = CustomDataset(image_path, pd_train_dataset, (224,224), transformations)
trainLoader = DataLoader(train_ds, shuffle=True, batch_size=4)

# eval_ds = CustomDataset(dataset, (224,224), transformations)
# evalLoader = DataLoader(eval_ds, shuffle=True, batch_size=4)

In [8]:
#Model
class ImageCNN(nn.Module):
    
    def __init__(self, stageI=True):
        super(ImageCNN, self).__init__()
        re = resnet50(pretrained=True)
        #remove the last classification layer
        self.resnet = nn.Sequential(*list(re.children())[:-1])
#         if isgpu:
#             self.resnet.to('cuda')
        #During StageI training -> resnet weights are fixed with pretrained weights
        if stageI: 
            for weights in self.resnet.parameters():
                weights.requires_grad_(False)
        
        self.fc = nn.Linear(2048, 2048)
        self.bn = nn.BatchNorm1d(2048)
        self.relu = nn.ReLU()
        self.dropOut = nn.Dropout(0.8)
        
    def forward(self, x):
        x = self.resnet(x) # (N,2048,1,1)
        x = torch.flatten(x,1) # (N,2048)
        x = self.relu(self.bn(self.fc(x)))
        x = self.relu(self.bn(self.fc(x))) # As per their MATLAB implementation
        return self.dropOut(x)

##Test Model -> Input (N,3,224,224) and Output (N,2048)
# net1 = ImageCNN()
# x1= torch.rand((2,3,224,224))
# y1 = net1(x1)
# print(y1.shape)

In [9]:
#textCNN -> The input should be the output of word2vec (N,300,32,1)
class BasicBlockText(nn.Module):
    
    def __init__(self, input_channel, intermediate_channel):
        super(BasicBlockText, self).__init__()
        self.bbConv1 = nn.Conv2d(input_channel, intermediate_channel, 
                                 kernel_size=(1,1), stride=(1,1), 
                                 padding=(0,0), bias=False)
        self.bbBatchNorm1 = nn.BatchNorm2d(intermediate_channel)
        self.relu = nn.ReLU()
        
        self.bbConv2 = nn.Conv2d(intermediate_channel, intermediate_channel, 
                                 kernel_size=(1,2), stride=(1,1), 
                                 padding=(0,1),bias=False, dilation = (1,2))
        self.bbBatchNorm2 = nn.BatchNorm2d(intermediate_channel)
        
        self.bbConv3 = nn.Conv2d(intermediate_channel, input_channel, kernel_size=(1,1), stride=(1,1), padding=(0,0), bias=False)
        self.bbBatchNorm3 = nn.BatchNorm2d(input_channel)
        
    def forward(self, x):
        identity  = x
        out = self.relu(self.bbBatchNorm1(self.bbConv1(x)))
        out = self.relu(self.bbBatchNorm2(self.bbConv2(out)))
        out = self.bbBatchNorm3(self.bbConv3(out))
        
        out += identity
        return self.relu(out)

        
class textCNN(nn.Module):
    
    def __init__(self, input_channel): #300
        super(textCNN, self).__init__()
        self.relu = nn.ReLU()
        #self.word2Vec = nn.Embedding.from_pretrained(self.__load_word2vec(path))
        
        self.emb_layer,_,_ = self.create_emb_layer(weight_matrix)
        #--------First CNN block----------------
        self.b1Conv1_0 = nn.Conv2d(input_channel, 128, 
                                 kernel_size=(1,1), stride=(1,1), 
                                 padding=(0,0), bias=False)
        self.b1bn1_0 = nn.BatchNorm2d(128)
        
        self.b1Conv2 = nn.Conv2d(128, 128, 
                                 kernel_size=(1,2), stride=(1,1), 
                                 padding=(0,1), bias=False, dilation=(1,2))
        self.b1bn2 = nn.BatchNorm2d(128)
        
        self.b1Conv3 = nn.Conv2d(128, 256, 
                                 kernel_size=(1,1), stride=(1,1), 
                                 padding=(0,0), bias=False)
        self.b1bn3 = nn.BatchNorm2d(256)
        
        #input here too
        self.b1Conv1_1 = nn.Conv2d(input_channel, 256,
                               kernel_size=(1,1), stride=(1,1), 
                               padding=(0,0), bias=False)
        self.b1bn1_1 = nn.BatchNorm2d(256)
        
        #Adding first basicblock (in matlab code i=2:3)
        self.layer1  = self.__make_layer(input_channel=256, intermediate_channel=64, 
                                     num_blocks=2)
        
        #--------Second CNN block----------------
        self.b2Conv1_0 = nn.Conv2d(256, 512, 
                                 kernel_size=(1,1), stride=(1,1), 
                                 padding=(0,0), bias=False)
        self.b2bn1_0 = nn.BatchNorm2d(512)
        
        self.b2Conv2 = nn.Conv2d(512, 512, 
                                 kernel_size=(1,2), stride=(2,2), 
                                 padding=(0,1), bias=False, dilation= (1,2))
        self.b2bn2 = nn.BatchNorm2d(512)
        
        self.b2Conv3 = nn.Conv2d(512, 512, 
                                 kernel_size=(1,1), stride=(1,1), 
                                 padding=(0,0), bias=False)
        self.b2bn3 = nn.BatchNorm2d(512)
        
        self.b2Conv1_1 = nn.Conv2d(256, 512, 
                                 kernel_size=(1,1), stride=(2,2), 
                                 padding=(0,0), bias=False)
        self.b2bn1_1 = nn.BatchNorm2d(512)
        
        #Add second basicblock (in matlb i = 2:4)
        self.layer2 =  self.__make_layer(input_channel=512, intermediate_channel=128, 
                                     num_blocks=3)
        
        #--------Third CNN block----------------
        self.b3Conv1_0 = nn.Conv2d(512, 1024, 
                                 kernel_size=(1,1), stride=(1,1), 
                                 padding=(0,0), bias=False)
        self.b3bn1_0 = nn.BatchNorm2d(1024)
        
        self.b3Conv2 = nn.Conv2d(1024, 1024, 
                                 kernel_size=(1,2), stride=(2,2), 
                                 padding=(0,1), bias=False, dilation= (1,2))
        self.b3bn2 = nn.BatchNorm2d(1024)
        
        self.b3Conv3 = nn.Conv2d(1024, 1024, 
                                 kernel_size=(1,1), stride=(1,1), 
                                 padding=(0,0), bias=False)
        self.b3bn3 = nn.BatchNorm2d(1024)
        
        self.b3Conv1_1 = nn.Conv2d(512, 1024, 
                                 kernel_size=(1,1), stride=(2,2), 
                                 padding=(0,0), bias=False)
        self.b3bn1_1 = nn.BatchNorm2d(1024)
        
        #Add third basicblock (in matlb i = 2:6)
        self.layer3 =  self.__make_layer(input_channel=1024, intermediate_channel=256, 
                                     num_blocks=5)
        
        #------------------
        self.b4Conv1_0 = nn.Conv2d(1024, 2048, 
                                 kernel_size=(1,1), stride=(1,1), 
                                 padding=(0,0), bias=False)
        self.b4bn1_0 = nn.BatchNorm2d(2048)
        
        self.b4Conv2 = nn.Conv2d(2048, 2048, 
                                 kernel_size=(1,2), stride=(1,1), 
                                 padding=(0,1), bias=False, dilation= (1,2))
        self.b4bn2 = nn.BatchNorm2d(2048)
        
        self.b4Conv3 = nn.Conv2d(2048, 2048, 
                                 kernel_size=(1,1), stride=(1,1), 
                                 padding=(0,0), bias=False)
        self.b4bn3 = nn.BatchNorm2d(2048)
        
        self.b4Conv1_1 = nn.Conv2d(1024, 2048, 
                                 kernel_size=(1,1), stride=(1,1), 
                                 padding=(0,0), bias=False)
        self.b4bn1_1 = nn.BatchNorm2d(2048)
        
        #------
        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.fc1 = nn.Linear(2048,2048)
        self.fc1_bn = nn.BatchNorm1d(2048)
        self.dropout = nn.Dropout(0.8)
        
    def __load_word2vec(self, path):
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
    
    def create_emb_layer(self, weights_matrix, non_trainable=False):
        num_embeddings, embedding_dim = weights_matrix.shape
        emb_layer = torch.nn.Embedding(num_embeddings, embedding_dim)
        emb_layer.load_state_dict({'weight': weights_matrix})
        if non_trainable:
            emb_layer.weight.requires_grad = False

        return emb_layer, num_embeddings, embedding_dim

       
    def __make_layer(self, input_channel, intermediate_channel, num_blocks):
        layers = []
        for i in range(num_blocks):
            layers.append(BasicBlockText(input_channel, intermediate_channel))
        return nn.Sequential(*layers)  

    def forward(self, x): #input x: (N, 32)
        x = self.emb_layer(x)# output-> (4,32,300)
        x = x.permute(0,2,1).unsqueeze(2)
        #--------------------
        identity = x
        out = self.relu(self.b1bn1_0(self.b1Conv1_0(x)))
        out = self.relu(self.b1bn2(self.b1Conv2(out)))
        out = self.b1bn3(self.b1Conv3(out))
        #print(out.shape)
        out2 = self.b1bn1_1(self.b1Conv1_1(identity))
        #print(out2.shape)
        out2 +=out
        out2 = self.relu(out2) # (N, 256, 1,32)
        #Add 2 basicblock
        out2 = self.layer1(out2) # (N,256,1,32)
        #------------------
        identity = out2
        out3 = self.relu(self.b2bn1_0(self.b2Conv1_0(out2)))
        out3 = self.relu(self.b2bn2(self.b2Conv2(out3)))
        out3 = self.b2bn3(self.b2Conv3(out3))
        
        out4 = self.b2bn1_1(self.b2Conv1_1(identity))
        out4 += out3
        out4 = self.relu(out4)
        #Add 3 basicblocks
        out4 = self.layer2(out4)## (N,512,1,16)
        #-------------------------------
        identity = out4
        out5 = self.relu(self.b3bn1_0(self.b3Conv1_0(out4)))
        out5 = self.relu(self.b3bn2(self.b3Conv2(out5)))
        out5 = self.b3bn3(self.b3Conv3(out5))
        
        out6 = self.b3bn1_1(self.b3Conv1_1(identity))
        out6 += out5
        out6 = self.relu(out6)
        #Add 5 basicblocks
        out6 = self.layer3(out6)## (N,1024,1,8)
        #---------------------------------------
        identity = out6
        out7 = self.relu(self.b4bn1_0(self.b4Conv1_0(out6)))
        out7 = self.relu(self.b4bn2(self.b4Conv2(out7)))
        out7 = self.b4bn3(self.b4Conv3(out7))
        
        out8 = self.b4bn1_1(self.b4Conv1_1(identity))
        out8 += out7
        out8 = self.relu(out8)
        #-------------
        out8 = self.avgpool(out8)
        out8 = torch.flatten(out8,1)
        out8 = self.dropout(self.relu(self.fc1_bn(self.fc1(out8))))
        return out8
        
##Test TextCNN
# net2 = textCNN(300, '../../../Downloads/GoogleNews-vectors-negative300.bin')
# x2 = torch.rand(2,300,1,32)
# y2 = net2(x2)
# print(y2.shape) # (N,2048)

In [13]:
#implement weight sharing class
class Model(nn.Module):
    def __init__(self, output, stageI=True):
        super(Model, self).__init__()
        self.weights = torch.rand((output, 2048)).to(device) #out_features, in_features
        self.imageCNN = ImageCNN(stageI)
        self.textCNN =  textCNN(300)
        
    def forward(self, img, txt):
        img_out = self.imageCNN(img) # (N,2048)
        txt_out = self.textCNN(txt) # (N,2048)
        return F.linear(img_out, self.weights), F.linear(txt_out, self.weights)

##Test whole model
# net = Model()
# img = torch.rand((2,3,224,224))
# txt = torch.rand((2,300,1,32))
# fc_img, fc_txt = net(img, txt)
# print(fc_img.shape) #(N,113287)
# print(fc_txt.shape) #(N,113287)

#define net
net = Model(output=113287).to(device)

In [14]:
#define loss
criteria = nn.CrossEntropyLoss()

#optimizer
optim = optim.Adam(net.parameters(), lr = 0.001)

In [21]:
#training
net.train()
epochs = 250
loss = 0.0
for e in range(epochs):
    for i, (img, txt, label) in enumerate(trainLoader):
        print(img.shape, txt.shape, label) #-> test successful
        if isgpu:
            img = img.to(device)
            txt = txt.to(device)
            label = label.to(device)
        img_out, txt_out = net(img, txt)
        #print(img_out.shape, txt_out.shape)
        
        optim.zero_grad()
        imageLoss = criteria(img_out, label)
        textLoss = criteria(txt_out, label)
        loss =  imageLoss + textLoss
        
        # Zero gradients, perform a backward pass, and update the weights.
        loss.backward()
        optim.step()
        
        writer.add_scalar('data/imageLoss', imageLoss.item(), i)
        writer.add_scalar('data/textLoss', textLoss.item(), i)
        if (i%1000 ==0):
            print('Epoch {} loss at {} iteration: {}'.format(e, i, loss.item()))
#         if (i%200):
#             net.eval()
#             eval(net)
#             net.train()
        
# export scalar data to JSON for external processing
writer.export_scalars_to_json("./all_scalars.json")
writer.close()   

torch.Size([4, 3, 224, 224]) torch.Size([4, 32]) tensor([35585, 57501, 22780, 54645])
Epoch 0 loss at 0 iteration: 83.27410888671875
torch.Size([4, 3, 224, 224]) torch.Size([4, 32]) tensor([57239, 67748, 81313, 42985])
torch.Size([4, 3, 224, 224]) torch.Size([4, 32]) tensor([ 45831, 106536,  95553,  19594])
torch.Size([4, 3, 224, 224]) torch.Size([4, 32]) tensor([ 1953, 63891, 59227, 86525])
torch.Size([4, 3, 224, 224]) torch.Size([4, 32]) tensor([ 75057,  36998,  59745, 101981])
torch.Size([4, 3, 224, 224]) torch.Size([4, 32]) tensor([ 82328,  68578,  63715, 110933])
torch.Size([4, 3, 224, 224]) torch.Size([4, 32]) tensor([57622, 99999, 74712,  4696])
torch.Size([4, 3, 224, 224]) torch.Size([4, 32]) tensor([ 66289,  34882, 111510, 111927])
torch.Size([4, 3, 224, 224]) torch.Size([4, 32]) tensor([34226, 55159, 47663, 12227])
torch.Size([4, 3, 224, 224]) torch.Size([4, 32]) tensor([85977, 60351, 39941, 66207])
torch.Size([4, 3, 224, 224]) torch.Size([4, 32]) tensor([ 50494,  78235,   21

KeyboardInterrupt: 

In [19]:
x = torch.rand((4,32,300))

In [None]:
x.permute(0,2,1).unsqueeze(2).shape

In [None]:
train_dataset['annotations'], test_dataset['annotations']

In [None]:
test_p.shape

In [None]:
train_p.shape

In [None]:
82783+40504