In [1]:
!pwd

/content


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd drive/MyDrive/

/content/drive/MyDrive


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy as np
import os
import shutil
import glob
import argparse
import numpy as np
import random
import plotly
import plotly.figure_factory as ff
import torch
from torch.utils.data import Dataset, DataLoader
import torch.utils.data.distributed
from torchvision import transforms
import time
import numpy as np
from PIL import Image
import os
import random
import cv2
import matplotlib.pyplot as plt
import torch.nn.functional as F

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Data Preprocessing


In [5]:
#Taken inspiration from - Data processing for nyudepth dataset - referenced  https://arxiv.org/pdf/2210.09071.pdf for preprocessing
class NewDataLoader(torch.utils.data.Dataset):
  def __init__(self, args, mode,transform=None,filename=None):

    self.do_kb_crop = False
    self.input_height = 480
    self.input_width = 640
    self.do_random_rotate = True
    self.degree = 2.5
    self.transform = transform
    self.filenames = None

    filename = filename
    with open(filename, 'r') as f:
        self.filenames = f.readlines()

  def __len__(self):
        return len(self.filenames)

  def __getitem__(self,idx):

    if mode =='train':

      sample_path = self.filenames[idx]
      rgb_file = sample_path.split()[0]
      depth_file = sample_path.split()[1]

      image_path = rgb_file
      depth_path = depth_file
      image = Image.open(image_path)
      depth_gt = Image.open(depth_path)


      if self.input_height == 480:
        depth_gt = np.array(depth_gt)
        # print("Depth gt",depth_gt)
        valid_mask = np.zeros_like(depth_gt)
        valid_mask[45:472, 43:608] = 1
        depth_gt[valid_mask==0] = 0
        depth_gt = Image.fromarray(depth_gt)
      else:
        depth_gt = depth_gt.crop((43, 45, 608, 472))
        image = image.crop((43, 45, 608, 472))

      if self.do_random_rotate is True:
                random_angle = (random.random() - 0.5) * 2 * self.degree
                image = self.rotate_image(image, random_angle)
                depth_gt = self.rotate_image(depth_gt, random_angle, flag=Image.NEAREST)
                # depth_gt.show()
            
      image = np.asarray(image, dtype=np.float32) / 255.0
      depth_gt = np.asarray(depth_gt, dtype=np.float32)
      depth_gt = np.expand_dims(depth_gt, axis=2)

      depth_gt = depth_gt / 1000.0
      img, depth = image, depth_gt

      
      H, W = img.shape[0], img.shape[1]
      a, b, c, d = random.uniform(0,1), random.uniform(0,1), random.uniform(0,1), random.uniform(0,1)
      l, u = int(a*W), int(b*H)
      w, h = int(max((W-a*W)*c*0.75, 1)), int(max((H-b*H)*d*0.75, 1))
      depth_copied = np.repeat(depth, 3, axis=2)
      M = np.ones(img.shape)
      M[l:l+h, u:u+w, :] = 0
      img = M*img + (1-M)*depth_copied
      image = img.astype(np.float32)


      if image.shape[0] != self.input_height or image.shape[1] != self.input_width:
          image, depth_gt = self.random_crop(image, depth_gt, self.input_height, self.input_width)
      image, depth_gt = self.train_preprocess(image, depth_gt)
      sample = {'image': image, 'depth': depth_gt}

      if self.transform:
          sample = self.transform(sample)
      
      return sample

  def rotate_image(self, image, angle, flag=Image.BILINEAR):
        result = image.rotate(angle, resample=flag)
        return result

  def train_preprocess(self, image, depth_gt):
        # Random flipping
        do_flip = random.random()
        if do_flip > 0.5:
            image = (image[:, ::-1, :]).copy()
            depth_gt = (depth_gt[:, ::-1, :]).copy()
    
        # Random gamma, brightness, color augmentation
        do_augment = random.random()
        if do_augment > 0.5:
            image = self.augment_image(image)
    
        return image, depth_gt
  def augment_image(self, image):
        # gamma augmentation
        gamma = random.uniform(0.9, 1.1)
        image_aug = image ** gamma

        brightness = random.uniform(0.75, 1.25)
        image_aug = image_aug * brightness
        colors = np.random.uniform(0.9, 1.1, size=3)
        white = np.ones((image.shape[0], image.shape[1]))
        color_image = np.stack([white * colors[i] for i in range(3)], axis=2)
        image_aug *= color_image
        image_aug = np.clip(image_aug, 0, 1)

        return image_aug


In [6]:
class ToTensor(object):
    def __init__(self, mode):
        self.mode = mode
        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    def to_tensor(self, pic):
        if isinstance(pic, np.ndarray):
            img = torch.from_numpy(pic.transpose((2, 0, 1)))
            return img

    def __call__(self, sample):
        image = sample['image']
        image = self.to_tensor(image)
        image = self.normalize(image)
        depth = sample['depth']
        if self.mode == 'train':
            depth = self.to_tensor(depth)
            return {'image': image, 'depth': depth}

Data Loading


In [7]:
from torch.utils.data import Dataset, DataLoader
def preprocessing_transforms(mode):
    return transforms.Compose([
        ToTensor(mode=mode)
    ])
mode = 'train'
batch_size = 512
train_sampler = None
train_file = 'train_values.txt'
test_file = 'test_values.txt'
training_samples = NewDataLoader([], mode, transform=preprocessing_transforms(mode),filename=train_file)
testing_samples = NewDataLoader([], mode, transform=preprocessing_transforms(mode),filename=test_file)
train_data = DataLoader(training_samples, batch_size,
                                   shuffle=(train_sampler is None),
                                   num_workers=1,
                                   pin_memory=True,
                                   sampler=train_sampler)

Loss Implementations


In [8]:
#used to calculate the silog loss
class silog_loss(nn.Module):
    def __init__(self, vf):
        super(silog_loss, self).__init__()
        self.vf = vf #variance_focus

    def forward(self, depth_est, depth_gt,mask):
        d = torch.log(depth_est[mask]) - torch.log(depth_gt[mask])
        return torch.sqrt((d ** 2).mean() - self.vf * (d.mean() ** 2)) * 10.0


In [9]:
#used to calculate the huber loss
class HuberLoss(nn.Module):
    def __init__(self, delta=1.0):
        super(HuberLoss, self).__init__()
        self.delta = delta

    def forward(self, y_pred, y_true):
        residual = torch.abs(y_true - y_pred)
        condition = residual < self.delta
        loss = torch.where(condition, 0.5 * residual ** 2, self.delta * residual - 0.5 * self.delta ** 2)
        return torch.mean(loss)

In [10]:
#used to calculate the berlu loss
def berhu_loss(pred, target, threshold=0.2):
    diff = torch.abs(target - pred)
    delta = threshold * torch.max(target).item()
    mask = (diff < delta).float()
    loss = mask * (diff ** 2 / delta) 
    loss+= (1 - mask) * (diff - 0.5 * delta)
    return torch.mean(loss)

Error computations

In [11]:
def compute_errors(gt, pred):

    abs_diff = np.abs(gt - pred)
    # d1_error = abs_diff > (1.25)

    d1 = 0
    rms = (gt - pred) ** 2
    rms = np.sqrt(torch.mean(rms))
    log_rms = (np.log(gt) - np.log(pred)) ** 2
    log_rms = np.sqrt(torch.mean(log_rms))

    abs_rel = torch.mean(np.abs(gt - pred) / gt)
    sq_rel = torch.mean(((gt - pred) ** 2) / gt)
    
    err = np.log(pred) - np.log(gt)
    err = np.abs(np.log10(pred) - np.log10(gt))
    log10 = torch.mean(err)
    # print("RMS",log10)

    return [ abs_rel, log10, rms, sq_rel, log_rms, d1]

Defining the Base ResNet model

In [12]:
import torch.nn as nn
#used layer norm instead of batch norm due to batch_size =1 
class ResNetBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResNetBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.LayerNorm(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.LayerNorm(out_channels)
        self.residual_change_conv  = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)
        self.residual_change_ln =  nn.LayerNorm(out_channels)

        self.change_channels = nn.Conv2d(64, 3, kernel_size=1)

        self.conv_operation = nn.Conv2d(in_channels=3,out_channels=1,kernel_size=1).to(device)

    def forward(self, x):
        residual = x
        x = self.conv1(x)
        x = x.permute(0,2,3,1)

        x = self.bn1(x)
        x = x.permute(0,3,1,2)
        x = self.relu(x)
        x = self.conv2(x)
        x = x.permute(0,2,3,1)

        x = self.bn2(x)
        x = x.permute(0,3,1,2)
        residual = self.residual_change_conv(residual)
        residual = residual.permute(0,2,3,1)
        # print("Hi",x.shape)
        residual = self.residual_change_ln(residual)
        residual = residual.permute(0,3,1,2)
        x += residual
        x = self.relu(x)
        out = self.change_channels(x)
        out = self.conv_operation(out)
        return out


Training and Validation 

In [None]:
# Training parameters
variance_focus = 0.05
learning_rate = 0.01
num_epochs = 10
# batch_size = len(training_samples)

torch.manual_seed(42)
#declaring the resnet model
model = ResNetBlock(in_channels=3,out_channels=64).to(device)
conv_operation = nn.Conv2d(in_channels=3,out_channels=1,kernel_size=1).to(device)

num_params = sum([np.prod(p.size()) for p in model.parameters()])
print("== Total number of parameters: {}".format(num_params))
num_params_update = sum([np.prod(p.shape) for p in model.parameters() if p.requires_grad])
print("== Total number of learning parameters: {}".format(num_params_update))

optimizer = torch.optim.Adam([{'params': model.parameters()}],lr=learning_rate)

#loss
silog_criterion = silog_loss(vf=variance_focus)
huber_loss = HuberLoss(delta=0.5)


#epoch runs
for epoch in range(num_epochs):
  checkpoint_path = "checkpoint_resnet"+str(epoch)+".pt"
  train_loss = 0
  #running for training_samples
  for train_val in training_samples:
    #putting model in train mode
    model.train()
    #initializing optimizer to zero
    optimizer.zero_grad()
    
    #converting image and target depth to torch variable
    image = torch.autograd.Variable(train_val['image']) #gpu
    depth_gt = torch.autograd.Variable(train_val['depth'])
    # print("DEPTHHHH",depth_gt.shape)

    #unsqueeze to get correct structure
    image = image.unsqueeze(0).to(device)
    depth_gt = depth_gt.unsqueeze(0).to(device)

    #passing through model for training
    resnet_image = model(image)
    depth_est = resnet_image

    #conv operation for channel updation
    # depth_est = conv_operation(depth_est)

    #done to remove 0 log error
    mask =  (depth_est>0.1) & (depth_gt>0.1) 
    
    #losses
    # l1 = silog_criterion.forward(depth_est,depth_gt,mask.to(torch.bool))
    l1 = huber_loss(depth_est[mask],depth_gt[mask])
    # l1 = berhu_loss(depth_est,depth_gt)

    #updating loss
    train_loss+=l1
    l1.backward()
    optimizer.step()
    # print("Train loss",l1)
  # model.load_state_dict(torch.load('resnet_just/checkpoint12.pt',map_location=torch.device('cpu')))
  #turning to eval mode
  val_loss = 0.0
  val_acc = 0.0
  # Iterate over the validation dataset

  #intialize the error values
  rae ,log10 , rms, sql_rel, log_rms, d1 = 0,0,0,0,0,0
  with torch.no_grad():
    for val in testing_samples:
        image = torch.autograd.Variable(val['image'])
        depth_gt = torch.autograd.Variable(val['depth']) #gpu
        image = image.unsqueeze(0).to(device)
        depth_gt = depth_gt.unsqueeze(0).to(device)
        resnet_image = model.eval()(image)
        depth_est = resnet_image
        # depth_est = conv_operation(resnet_image)
        mask =  (depth_est>0.1) & (depth_gt>0.1)
        # loss = silog_criterion.forward(depth_est, depth_gt,mask.to(torch.bool))
        loss = huber_loss(depth_est[mask],depth_gt[mask])
        # loss = berhu_loss(depth_est,depth_gt)
        # print(loss)
        val_loss += loss.item() 
        # Compute the errors
        erros_measures = compute_errors(depth_est[mask.to(torch.bool)].cpu(),depth_gt[mask.to(torch.bool)].cpu())
        rae += erros_measures[0]
        log10 += erros_measures[1]
        rms += erros_measures[2]
        # sql_rel += erros_measures[3]
        log_rms += erros_measures[4]
        # d1 += erros_measures[5]
    print("Average error measures for val dataset:")
    print("Absolute error {:.4f} Log10 {:.4f} RMS {:.4f} log_rms {:.4f}".format(rae/len(testing_samples),log10/len(testing_samples),rms/len(testing_samples),log_rms/len(testing_samples)))
  
  # Compute the average validation loss and accuracy
  val_loss /= len(testing_samples)
  train_loss /=len(training_samples)

  #save checkpoint for each epoch
  torch.save(model.state_dict(),checkpoint_path)
    
  # Print the epoch number, training loss, and validation loss and accuracy
  print("Epoch {}: train_loss {:.4f} val_loss {:.4f}".format(epoch+1, train_loss, val_loss))
    

Testing Code

In [17]:
# Training parameters
variance_focus = 0.05
learning_rate = 0.01
num_epochs = 20
# batch_size = len(training_samples)
torch.manual_seed(42)
#declaring the resnet model
model = ResNetBlock(in_channels=3,out_channels=64).to(device)
conv_operation = nn.Conv2d(in_channels=3,out_channels=1,kernel_size=1).to(device)

num_params = sum([np.prod(p.size()) for p in model.parameters()])
print("== Total number of parameters: {}".format(num_params))
num_params_update = sum([np.prod(p.shape) for p in model.parameters() if p.requires_grad])
print("== Total number of learning parameters: {}".format(num_params_update))

optimizer = torch.optim.Adam([{'params': model.parameters()}],lr=learning_rate)

#loss
silog_criterion = silog_loss(vf=variance_focus)
huber_loss = HuberLoss(delta=0.5)

model.load_state_dict(torch.load('checkpoint_resnet.pt',map_location=torch.device('cpu')))

for param in model.parameters():
    param.requires_grad = False

print(model)
model.eval()
val_loss = 0.0
rae ,log10 , rms, sql_rel, log_rms, d1 = 0,0,0,0,0,0
with torch.no_grad():
  for val in testing_samples:
      image = torch.autograd.Variable(val['image'])
      depth_gt = torch.autograd.Variable(val['depth']) #gpu
      image = image.unsqueeze(0).to(device)
      depth_gt = depth_gt.unsqueeze(0).to(device)
      # print(image.shape)
      resnet_image = model(image)
      depth_est = resnet_image
      # depth_est = conv_operation(resnet_image)
      mask =  (depth_est>0.1) & (depth_gt>0.1)
      # print(mask)
      # loss = silog_criterion.forward(depth_est, depth_gt,mask.to(torch.bool))
      loss = huber_loss(depth_est[mask],depth_gt[mask])
      # loss = berhu_loss(depth_est,depth_gt)
      val_loss += loss.item()
      # print(val_loss) 
      # Compute the errors
      erros_measures = compute_errors(depth_est[mask.to(torch.bool)].cpu(),depth_gt[mask.to(torch.bool)].cpu())
      rae += erros_measures[0]
      log10 += erros_measures[1]
      rms += erros_measures[2]
      # sql_rel += erros_measures[3]
      log_rms += erros_measures[4]
      # d1 += erros_measures[5]
  print("Average error measures for val dataset:")
  print("Absolute error {:.4f} Log10 {:.4f} RMS {:.4f} log_rms {:.4f}".format(rae/len(testing_samples),log10/len(testing_samples),rms/len(testing_samples),log_rms/len(testing_samples)))

# Compute the average validation loss and accuracy
val_loss /= len(testing_samples)
# train_loss /=len(training_samples)
  
# Print the epoch number, training loss, and validation loss and accuracy
print(" val_loss {:.4f}".format( val_loss))
  

== Total number of parameters: 39367
== Total number of learning parameters: 39367
ResNetBlock(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  (relu): ReLU(inplace=True)
  (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  (residual_change_conv): Conv2d(3, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (residual_change_ln): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  (change_channels): Conv2d(64, 3, kernel_size=(1, 1), stride=(1, 1))
  (conv_operation): Conv2d(3, 1, kernel_size=(1, 1), stride=(1, 1))
)
Average error measures for val dataset:
Absolute error 0.4183 Log10 0.1582 RMS 1.2006 log_rms 0.4339
 val_loss 0.3679
