In [60]:
from PIL import Image
import numpy as np
import pandas as pd
import torch.nn.functional as F
import torchvision.transforms as transforms


class nyu_dataset():
    def __init__(self,filename,type_of_data):
        self.file=pd.read_csv(filename)
        images_path=self.file["Images"]
        depths_path=self.file["Depth"]
        images_path_2=self.file["Images"]
        depths_path_2=self.file["Depth"]
        
        
        self.length=len(self.file)
        ratio=0.75
        training_set_size=int(ratio*self.length)
        
        if type_of_data=="train":
            self.images_path=list(images_path[0:training_set_size])                 #3/4 is train data
            self.depths_path=list(depths_path[0:training_set_size])
        elif type_of_data=="validation":
            self.images_path=list(images_path_2[training_set_size:])                  #1/4 is validation data
            self.depths_path=list(depths_path_2[training_set_size:])
        
    def __len__(self):
        return len(self.images_path)
    
    def __getitem__(self,idx):
        # print(self.images_path)
        image_path=self.images_path[idx]
        depth_path=self.depths_path[idx]
        image=Image.open(image_path)
        depth=Image.open(depth_path)
        
        transform_image = transforms.Compose([
        transforms.Resize((228, 304)),
        transforms.ToTensor(),
        ])
        
        transform_depth = transforms.Compose([
        transforms.Resize((55, 74)),
        transforms.ToTensor(),
        ])
        
        
        image_tensor=transform_image(image)
        depth_tensor=transform_depth(depth)
        depth_tensor = depth_tensor/1000-1
        
        item={'Image':image_tensor,'Depth':depth_tensor}
        
        return item

In [78]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torchvision.transforms as transforms

output_height=55
output_width=74
epoch=10

total_number_of_pixels=output_height*output_width


In [79]:
size_of_batch=8
#training using test data set because training dataset has large data and takes long time to train
#just change test to train to use training dataset
training_data_loading=torch.utils.data.DataLoader(nyu_dataset("nyu2_test.csv","train"),batch_size=size_of_batch)
validation_data_loading=torch.utils.data.DataLoader(nyu_dataset("nyu2_test.csv","validation"),batch_size=size_of_batch)

In [80]:
import torch.nn as nn
import torch.nn.functional as F

class coarse_network(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.c1=nn.Conv2d(in_channels=3,out_channels=96,kernel_size=(11,11),stride=4)
        self.c2=nn.Conv2d(in_channels=96,out_channels=256,kernel_size=(5,5))
        self.c3=nn.Conv2d(in_channels=256,out_channels=384,kernel_size=(3,3))
        self.c4=nn.Conv2d(in_channels=384,out_channels=384,kernel_size=(3,3))
        self.c5=nn.Conv2d(in_channels=384,out_channels=256,kernel_size=(3,3))   
        self.fc1=nn.Linear(12800,4096)
        self.fc2=nn.Linear(4096,4070)   #output should be 74*55=4070
        self.pool=nn.MaxPool2d(2)
        self.dropout=nn.Dropout() 
        self._init_weights(coarse_network)
    
    
    def _init_weights(self,module):                    #https://wandb.ai/wandb_fc/tips/reports/How-to-Initialize-Weights-in-PyTorch--VmlldzoxNjcwOTg1#:~:text=One%20of%20the%20most%20popular,in%20a%20custom%20PyTorch%20model.&text=This%20code%20snippet%20initializes%20all,all%20the%20biases%20to%20zero.
            if isinstance(module,nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                module.bias.data.fill(0.001)
            elif isinstance(module,nn.Conv2d):
                module.weight.data,normal(0,0.01)   #Because in imagenet classification it is used(mentioned in research paper) uses this initilization
                module.bias.data.zero()

    def forward(self,x):
        #all layers are relu activated except layer 7 which is linear and there is a dropout after layer 6(mentioned in research paper)
        x=F.relu(self.c1(x))
        x=self.pool(x)
        x=F.relu(self.c2(x))
        x=self.pool(x)
        x=F.relu(self.c3(x))
        x=F.relu(self.c4(x))
        x=F.relu(self.c5(x))
        x=x.view(x.size(0),-1)  #we the next layer is a fully connected layer
        x=F.relu(self.fc1(x))   #default value of p=0.5 (Bernouli distribution), kind of regularization
        x=self.fc2(x)     
        x=self.dropout(x)         #this layer have linear acativation (identity function)
        x=x.view(-1,1,55,74)
        return x
        
class fine_network(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.c1=nn.Conv2d(in_channels=3,out_channels=63,kernel_size=(9,9),stride=2)
        self.c2=nn.Conv2d(in_channels=64,out_channels=64,kernel_size=(5,5),padding=2)
        self.c3=nn.Conv2d(in_channels=64,out_channels=1,kernel_size=(5,5),padding=2)
        self.pool=nn.MaxPool2d(2)
        self._init_weights(fine_network)   
    
    def _init_weights(self,module):                    #https://wandb.ai/wandb_fc/tips/reports/How-to-Initialize-Weights-in-PyTorch--VmlldzoxNjcwOTg1#:~:text=One%20of%20the%20most%20popular,in%20a%20custom%20PyTorch%20model.&text=This%20code%20snippet%20initializes%20all,all%20the%20biases%20to%20zero.
            if isinstance(module,nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                module.bias.data.fill(0.001)
            elif isinstance(module,nn.Conv2d):
                module.weight.data,normal(0,0.01)   #Because in imagenet classification it is used(mentioned in research paper) uses this initilization
                module.bias.data.zero()
            
    def forward(self,x,y):
        x=F.relu(self.c1(x))
        x=self.pool(x)
        x=torch.cat((x,y),1)
        x=F.relu(self.c2(x))
        x=self.c3(x)
        return x
                

In [81]:
def training_loss_function(pred,actual):
#     di=torch.log(pred)-torch.log(actual), log values are giving some undefined values so used without log and normalized the depth map
    di=pred-actual
    di_square=torch.pow(di,2)
#     print(di)
#     print(di_square.shape)
    n=total_number_of_pixels
    loss=(torch.sum(di_square,(1,2,3))/n)-(0.5)*(torch.pow(torch.sum(di,(1,2,3)),2)/(n*n))
#     print(loss)
    return loss.mean()

In [82]:
def scale_invariant_loss_function(pred,actual):  #same as training loss except the factor lambda=0.5
#     di=torch.log(pred)-torch.log(actual)
    di=pred-actual
    di_square=torch.pow(di,2)
    n=total_number_of_pixels
    loss=(torch.sum(di_square,(1,2,3))/n)-(torch.pow(torch.sum(di,(1,2,3)),2)/(n*n))
    return loss.mean()

In [83]:
#optimizers
coarse_model=coarse_network()
fine_model=fine_network()
coarse_network_optimizer=optim.Adam(coarse_model.parameters(),lr=0.001, betas=(0.9, 0.999), eps=1e-08)  #from official documentation of adam optimizer
fine_network_optimizer=optim.Adam(fine_model.parameters(),lr=0.001, betas=(0.9, 0.999), eps=1e-08)

In [67]:
def training_coarse_network(coarse_model,training_data_loading,coarse_network_optimizer):
    coarse_model.train()
    training_coarse_loss=0
    for i, batch in enumerate(training_data_loading):
        image=torch.tensor(batch['Image'].type(torch.FloatTensor),requires_grad=True)
        depth=torch.tensor(batch['Depth'].type(torch.FloatTensor),requires_grad=True)
        coarse_network_optimizer.zero_grad()             #Sets the gradients of all optimized torch.Tensors to zero
        pred=coarse_network().forward(image)
        actual=depth
        loss=training_loss_function(pred,actual)
        loss.backward()
        coarse_network_optimizer.step()
        training_coarse_loss+=loss.item()                #we need as a standard python number of item is used
    training_coarse_loss=(training_coarse_loss/(i+1))        #taking avg over batches
    return training_coarse_loss
        
    

In [68]:
def training_fine_network(coarse_model,fine_model,training_data_loading,fine_network_optimizer):
    fine_model.train()
    coarse_model.eval()
    training_fine_loss=0
    for i, batch in enumerate(training_data_loading):
        image=torch.tensor(batch['Image'].type(torch.FloatTensor),requires_grad=True)
        depth=torch.tensor(batch['Depth'].type(torch.FloatTensor),requires_grad=True)
        fine_network_optimizer.zero_grad()            #Sets the gradients of all optimized torch.Tensors to zero
        coarse_network_output=coarse_network().forward(image)
        pred=fine_network().forward(image,coarse_network_output)
        actual=depth
        loss=training_loss_function(pred,actual)
        loss.backward()
        fine_network_optimizer.step()
        training_fine_loss+=loss.item()                 #we need as a standard python number of item is used
    training_fine_loss=(training_fine_loss/(i+1))           #taking avg over batches
    return training_fine_loss

In [69]:
def coarse_network_validation(coarse_model,validation_data_loading):
    coarse_model.eval()
    coarse_validation_loss=0
    scale_invariant_loss=0
    for i, batch in enumerate(validation_data_loading):
        image=torch.tensor(batch['Image'].type(torch.FloatTensor),requires_grad=False)
        depth=torch.tensor(batch['Depth'].type(torch.FloatTensor),requires_grad=False)
        pred=coarse_network().forward(image)
        actual=depth
        coarse_validation_loss+=training_loss_function(pred,actual).item()
        scale_invariant_loss+=scale_invariant_loss_function(pred,actual).item()
    coarse_validation_loss=(coarse_validation_loss/(i+1))
    scale_invariant_loss=(scale_invariant_loss/(i+1))
    loss=[coarse_validation_loss,scale_invariant_loss]
    return loss
    

In [84]:
def fine_network_validation(fine_model,validation_data_loading):
    fine_model.eval()
    fine_validation_loss=0
    scale_invariant_loss=0
    for i, batch in enumerate(validation_data_loading):
        image=torch.tensor(batch['Image'].type(torch.FloatTensor),requires_grad=False)
        depth=torch.tensor(batch['Depth'].type(torch.FloatTensor),requires_grad=False)
        coarse_network_output=coarse_network().forward(image)
        pred=fine_network().forward(image,coarse_network_output)
        actual=depth
        fine_validation_loss+=training_loss_function(pred,actual).item()
        scale_invariant_loss+=scale_invariant_loss_function(pred,actual).item()
    fine_validation_loss=(fine_validation_loss/(i+1))
    scale_invariant_loss=(scale_invariant_loss/(i+1))
    loss=[fine_validation_loss,scale_invariant_loss]
    return loss
    

In [85]:
coarse_losses_after_each_epoch={}                 # {training loss: [validation loss,scale_invariant_loss]}
for i in range(epoch):
    coarse_training_loss=training_coarse_network(coarse_model,training_data_loading,coarse_network_optimizer)
    coarse_validation_loss=coarse_network_validation(coarse_model,validation_data_loading)
    coarse_losses_after_each_epoch[i]=[coarse_training_loss,coarse_validation_loss]
print(coarse_losses_after_each_epoch)

  image=torch.tensor(batch['Image'].type(torch.FloatTensor),requires_grad=True)
  depth=torch.tensor(batch['Depth'].type(torch.FloatTensor),requires_grad=True)
  depth=torch.tensor(batch['Depth'].type(torch.FloatTensor),requires_grad=False)


{0: [2.634761837220961, [4.098639153298878, 1.6470668741634913]], 1: [2.634897409908233, [4.098531172389076, 1.6469523466768718]], 2: [2.634840085621803, [4.098591861270723, 1.6470208494436174]], 3: [2.6348747236113392, [4.09861637864794, 1.6470301236425127]], 4: [2.634872505741735, [4.098510305086772, 1.6470812672660464]], 5: [2.6347786370785005, [4.098704570815677, 1.647116822855813]], 6: [2.6348652248421023, [4.098535770461673, 1.6470766039121718]], 7: [2.634898086228678, [4.098387774967012, 1.6469320371037437]], 8: [2.6349206326469297, [4.0986424798057195, 1.6470782274291629]], 9: [2.6348916638282036, [4.098745698020572, 1.6470425497917902]]}


In [86]:
fine_losses_after_each_epoch={}       # {training loss: [validation loss,scale_invariant_loss]}
for i in range(epoch):
    fine_training_loss=training_fine_network(coarse_model,fine_model,training_data_loading,fine_network_optimizer)
    fine_validation_loss=fine_network_validation(fine_model,validation_data_loading)
    fine_losses_after_each_epoch[i]=[fine_training_loss,fine_validation_loss]
print(fine_losses_after_each_epoch)          #we can draw graph for each epoch values

  image=torch.tensor(batch['Image'].type(torch.FloatTensor),requires_grad=True)
  depth=torch.tensor(batch['Depth'].type(torch.FloatTensor),requires_grad=True)


{0: [2.6317163369347973, [4.079009061767941, 1.6469448393299466]], 1: [2.631667755303844, [4.113340911411104, 1.6480932505357833]], 2: [2.674490776273512, [4.114491797628856, 1.6497645747093928]], 3: [2.654037401560814, [4.120282184510004, 1.6492914387157984]], 4: [2.617448898092393, [4.096834443864369, 1.6468416580132075]], 5: [2.653712598066176, [4.061759239151364, 1.6438066987764268]], 6: [2.650643456847437, [4.088529785474141, 1.644479391120729]], 7: [2.619757235530884, [4.075116464069912, 1.6445517979917073]], 8: [2.635171913331555, [4.130638185001555, 1.6519657671451569]], 9: [2.6428898558501275, [4.056015281450181, 1.641001162074861]]}


In [87]:
#we can use this code to see the predicition image, here i am using validation set only
for i, batch in enumerate(validation_data_loading):
    image=torch.tensor(batch['Image'].type(torch.FloatTensor),requires_grad=False)
    depth=torch.tensor(batch['Depth'].type(torch.FloatTensor),requires_grad=False)
    coarse_network_output=coarse_network().forward(image)
    pred=fine_network().forward(image,coarse_network_output)
    tensor=pred[1]
transform = transforms.ToPILImage()
img=transform(tensor)
img.show()

  image=torch.tensor(batch['Image'].type(torch.FloatTensor),requires_grad=False)
  depth=torch.tensor(batch['Depth'].type(torch.FloatTensor),requires_grad=False)
