In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.init as init
import numpy as np
from torch.autograd import Variable
from torch.nn.utils import clip_grad_value_
import os
import glob
from PIL import Image
from torchvision import models, transforms
import matplotlib.pyplot as plt
import numpy as np
from torch.autograd import Variable
import operator
import torchvision.models as models

In [2]:
#get the pretrained VGG pretrained 
vgg16 = models.vgg16(pretrained=True)

#make vgg16 model layers as non trainable
for layer in vgg16.parameters():
    layer.requires_grad= False

features= list(vgg16.classifier.children())[:-1] #drop the last layer
#features.extend([nn.Linear(totalFilters,16)])    #add the linear layer
vgg16.classifier= nn.Sequential(*features)       #build the model


In [3]:
def collate(samples):
    #print(samples[1])
    img,label= zip(*samples)
    
    #sort sequences based on length
    seqLengths= [len(seq) for seq in label]
    maxSeqLength= max(seqLengths)
    sortedList= sorted(zip(list(img),label,seqLengths),key= lambda x:-x[2])
    img,label,seqLengths= zip(*sortedList)
    images = img[0].unsqueeze(0)
    for i in range(1,len(img)):
        images = torch.cat((images,img[i].unsqueeze(0)),dim=0)
    print(images.size())
    
    #create tensor with padded sequences
    paddedSeqs= torch.LongTensor(len(img),maxSeqLength)
    paddedSeqs.fill_(0)
    for (i,seq) in enumerate(label):
        paddedSeqs[i,:len(seq)]= seq
    return images,paddedSeqs,seqLengths
    

In [4]:
from torch.utils.data.dataset import Dataset, TensorDataset
class myDataset(Dataset):
    
    def __init__(self, pathToImages,pathToLabels,transform=None):
        self.pathToImages= pathToImages
        self.pathToLabels= pathToLabels
        imageTensor= []
        labelTensor= []
        pathToImages= self.pathToImages+'*jpg'
                
        #convert all the images to Tensor
        images = glob.glob("./JPEGImages/*jpg")
        for image in images:
            img= Image.open(image).convert('RGB')
            trans= transforms.Resize((224,224))
            trans1= transforms.ToTensor()
            tensorImg= trans1(trans(img))
            imageTensor.append(tensorImg)
                
        file = open(self.pathToLabels,"r")
        #convert all the labels to tensor
        for line in file:
            line = line.replace("\t"," ")
            line = line.replace("\n","")
            
            #line = line[11:] #get only from the label
            newLine = line.split()
            length = len(newLine)
            tensorLab= []
            
            for i in range(1,length):
                tensorLab.append(int(newLine[i]))
            [x+1 for x in tensorLab] #increment the label
            tensorLab.insert(0,1)    #add 1 for the sos
            tensorLab.append(22)     #add 22 for the eos
            tensorLab= np.asarray(tensorLab) #convert list to numpyArray
            tensorLab= torch.from_numpy(tensorLab) #convert numpyArray to tensor
            labelTensor.append(tensorLab)
        
        self.images= imageTensor
        self.labels= labelTensor
        self.transform= transform
        

        
    def __len__(self):
        #print("The total images in Dataset:__len__")
        return len(self.images)
    
    def __getitem__(self, index):
        img= self.images[index]
        label= self.labels[index]
        return img,label
                       
        
      
       

In [5]:
#calling the customize dataset class
myOwn= myDataset("./JPEGImages/","./data.txt")


In [6]:
#class to predict the class
class Network(torch.nn.Module):
    def __init__(self,inputSize,embeddingDim,hiddenSize,num_of_layer):
        super(Network,self).__init__()
        self.embedding= nn.Embedding(inputSize,embeddingDim,padding_idx=0)
        self.lstm= nn.LSTM(embeddingDim,hiddenSize,num_of_layer,batch_first=True)
        self.vggLinear = nn.Linear(4096,embeddingDim)
        self.linear1= nn.Linear(hiddenSize,embeddingDim)
        self.activation1 = nn.ReLU()
        self.FinalLinear= nn.Linear(embeddingDim,23)
        self.activation2= nn.LogSoftmax(dim = -1)
    
    def forward(self,inputs,h_0,c_0,vgginput,originaLength):
#         embeddding = (self.embedding(inputs)).view(1,-1)
       
        embedding= self.embedding(inputs)
#        embedding = embedding.permute(1,0,2) #permute the batch size index
        packed= torch.nn.utils.rnn.pack_padded_sequence(embedding,originaLength, batch_first=True)
        batch_size = packed[1]
        output,_ = self.lstm(packed,(h_0,c_0))
        unpack,_ = torch.nn.utils.rnn.pad_packed_sequence(output,batch_first=True)
        print(unpack.size())
        unpack = unpack.permute(1,0,2)
        probs = []
        for i in range(len(inputs[1])):
            original_data = unpack[i,:batch_size[i],:]
            original_activations = vgginput[:batch_size[i],:]
            out = self.vggLinear(original_activations) + self.linear1(original_data)         
            output = self.activation1(out)
            output = self.FinalLinear(output)
            output = self.activation2(output)
            probs.append(output)
        return probs,batch_size
        
        
        

In [7]:
#model parameteres
vocab_size= 23
embedding_Dim= 16
hiddenNodes= 512
numLayers= 1
model = Network(vocab_size,embedding_Dim,hiddenNodes,numLayers)
lossCriterion= nn.NLLLoss()
optimizer= torch.optim.Adam(model.parameters(),lr= 0.01)

In [8]:
trainLoader= torch.utils.data.DataLoader(myOwn,batch_size=32,shuffle=True,num_workers=0,collate_fn=collate)

In [9]:
#initalize the h_0 and c_0
batchSize= 32

h_0= torch.zeros(1,batchSize,hiddenNodes) 
c_0= torch.zeros(1,batchSize,hiddenNodes)


In [None]:
for epoch in range(1):
    for index,(img,paddedSeqs,seqLengths) in (enumerate(trainLoader)):
        seqLengths= list(map(int,seqLengths))
        
        cnnOut = vgg16(img)
        
        model.zero_grad()
        
        loss= 0
        
        #forward propagation 
        logProbs,batch_size= model.forward(paddedSeqs,h_0,c_0,cnnOut,seqLengths)
        
        for i in range(batch_size.size(0)):
            loss += lossCriterion(logProbs[i],paddedSeqs[:batch_size[i].item(),i])
        #backward propagation
        loss.backward()
        nn.utils.clip_grad_value_(model.parameters(),10)
        
        print('batch loss {}'.format(loss.cpu().item()/batch_size.size(0)))
        #update
        optimizer.step()
        
        #update
        optimizer.step()


        

torch.Size([32, 3, 224, 224])
torch.Size([32, 10, 512])
batch loss 3.141581153869629
torch.Size([32, 3, 224, 224])
torch.Size([32, 10, 512])
batch loss 8.66195068359375
torch.Size([32, 3, 224, 224])
torch.Size([32, 15, 512])
batch loss 4.996309916178386
torch.Size([32, 3, 224, 224])
torch.Size([32, 13, 512])
batch loss 3.9961770864633412
torch.Size([32, 3, 224, 224])
torch.Size([32, 17, 512])
batch loss 3.077191072351792
torch.Size([32, 3, 224, 224])
torch.Size([32, 17, 512])
batch loss 3.161558712230009
torch.Size([32, 3, 224, 224])
torch.Size([32, 15, 512])
batch loss 3.0747754414876303
torch.Size([32, 3, 224, 224])
torch.Size([32, 11, 512])
batch loss 3.100066445090554
torch.Size([32, 3, 224, 224])
torch.Size([32, 11, 512])
batch loss 3.097578915682706
torch.Size([32, 3, 224, 224])
torch.Size([32, 17, 512])
batch loss 3.067233814912684
torch.Size([32, 3, 224, 224])
torch.Size([32, 28, 512])
batch loss 2.977923257010324
torch.Size([32, 3, 224, 224])
torch.Size([32, 14, 512])
batch lo