# Depth Estimation

## Connect with Google Drive

In [None]:
# Mount Google Drive under /content/gdrive so the dataset and log paths used
# later in this notebook resolve. `force_remount=True` is passed so the call
# remounts even when a previous mount exists.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

## Parameters

In [None]:
# Learning rate handed to the Adam optimizer of the Lightning module.
LEARNING_RATE = 1e-3
# Number of samples per batch for every DataLoader in this notebook.
BATCH_SIZE = 16
# CSV read by NYUDataset (image path in column 0, depth-map path in column 1).
# NOTE(review): this points at the *test* split yet is also used for training
# and validation below - confirm this is intended.
ANNOTATIONS_FILE_PATH = '/content/gdrive/MyDrive/data/nyu_samples/nyu2_test.csv'
# Directory the paths from the CSV are joined onto.
IMAGE_FOLDER_PATH = '/content/gdrive/MyDrive/data/nyu_samples/nyu2_test'
# When True the image pipeline also normalizes and resizes to 224x224;
# no random augmentation is applied anywhere in this notebook.
DATA_AUGMENTATION = True
# Upper bound on training epochs passed to the Lightning Trainer.
MAX_EPOCHS = 10



## Dependencies


In [None]:
# Install PyTorch Lightning into the notebook runtime (imported below as `pl`).
!pip install pytorch_lightning

In [None]:
import os
import math
import pandas as pd
import cv2 as cv
import copy as cp
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torchvision import transforms
from torchvision.models import efficientnet_b0
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import loggers as pl_loggers

## Creating Dataset

In [None]:
class NYUDataset(Dataset):
    """Dataset of image / depth-map pairs listed in a CSV annotations file.

    Column 0 of the CSV holds the image path and column 1 the depth-map
    path; both are joined onto ``img_dir``.

    Args:
        annotations_file: Path of the CSV file listing the pairs.
        img_dir: Directory the CSV paths are relative to.
        transform: Optional callable applied to the RGB image.
        target_transform: Optional callable applied to the depth map.
    """

    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):
        # NOTE(review): read_csv treats the first row as a header by default;
        # confirm the annotations file has one, otherwise the first pair is
        # consumed as column names and never served as a sample.
        self.img_labels = pd.read_csv(annotations_file)
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of rows (pairs) in the annotations file."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Load pair ``idx`` and return ``(image, mask)``.

        The mask is always float32: a tensor if ``target_transform`` produced
        a tensor, otherwise a NumPy array.
        """
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
        mask_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 1])

        # Context managers release the file handles immediately; convert() and
        # copy() return fully loaded images that stay valid after closing.
        with Image.open(img_path) as img_file:
            image = img_file.convert('RGB')
        with Image.open(mask_path) as mask_file:
            mask = mask_file.copy()

        if self.transform is not None:
            image = self.transform(image)
        if self.target_transform is not None:
            mask = self.target_transform(mask)

        if isinstance(mask, torch.Tensor):
            mask = mask.to(torch.float32)
        else:
            # PIL images have no ``asarray`` method; NumPy does the conversion.
            mask = np.asarray(mask, dtype="float32")
        return image, mask


In [None]:
def visualize_depth_map(data, index=None):
  """Plot base images next to their depth maps.

  Args:
    data: Indexable / iterable collection of ``(image, depth)`` pairs.
    index: When given (including ``0``), only that pair is shown; when
      ``None`` the first 20 pairs of ``data`` are shown.
  """
  # `is not None` rather than truthiness, so that index 0 is a valid request.
  if index is not None:
    img, depth = visualize_img(data[index])
    figure, axs = plt.subplots(1, 2, figsize=(10,5))
    figure.suptitle(f'Image with depth map, index {index}')
    axs[0].set_title("Base image")
    axs[0].imshow(img)
    axs[1].set_title("Depth Map")
    axs[1].imshow(depth)
    return

  max_pairs = 20
  print("Show 20 first image pairs")
  figure, axs = plt.subplots(max_pairs, 2, figsize=(10, max_pairs*5))
  # zip with range() stops after `max_pairs` items without an explicit break.
  for i, imgs in zip(range(max_pairs), data):
    img, depth = visualize_img(imgs)
    axs[i, 0].set_title(f"Base image with index: {i}")
    axs[i, 0].imshow(img)
    axs[i, 1].set_title("Depth Map")
    axs[i, 1].imshow(depth)
      
def visualize_img(imgs):
  """Turn a normalized ``(image, depth)`` pair into something plottable.

  Args:
    imgs: Pair ``(img, depth)`` where ``img`` is a channel-first tensor whose
      first three channels are R, G, B. The values are assumed to lie in
      [-1, 1], i.e. to come from ``Normalize(0.5, 0.5)`` - TODO confirm when
      a different image pipeline is used.

  Returns:
    ``(img, depth)``: ``img`` as a height x width x 3 ``uint8`` array and
    ``depth`` as a copy of the input depth map.
  """
  img, depth = cp.deepcopy(imgs)
  channels_first = img.detach().cpu().numpy()
  # Channel-last layout is what matplotlib's imshow expects for RGB data.
  channels_last = np.transpose(channels_first[:3], (1, 2, 0))
  # Inverse of Normalize(0.5, 0.5): [-1, 1] -> [0, 255]. Clipping prevents
  # uint8 wrap-around for values that fall slightly outside that range.
  pixels = np.clip((channels_last + 1) * 127.5, 0, 255)
  return pixels.astype("uint8"), depth

In [None]:
# Image preprocessing pipeline. ToTensor is always applied; normalization with
# mean/std 0.5 per channel and the resize to 224x224 are added only when
# DATA_AUGMENTATION is set.
# NOTE(review): with the flag off the images keep their original size and
# value range - confirm the rest of the notebook copes with that.
trans = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        transforms.Resize((224, 224)),
    ]
    if DATA_AUGMENTATION
    else [
        transforms.ToTensor(),
    ]
)
def visual_mask_fn(x):
  """Convert a depth image into a 56x56 tensor.

  The input is turned into an array, divided by 10000 and resized to 56x56
  with OpenCV's default interpolation.
  NOTE(review): 10000 is presumably the maximum raw depth value - confirm
  against the dataset encoding.
  """
  scaled_depth = np.array(x) / 10000
  return torch.tensor(cv.resize(scaled_depth, (56, 56)))
  

## BiFPN (decoder for the depth estimation task)

In [None]:
class BiFPN(nn.Module):
    """Bidirectional feature pyramid block fusing three feature maps.

    The block takes three feature maps ``[P4, P5, P6]`` where P5 has half and
    P6 a quarter of P4's spatial size (the maps are combined through x2
    upsampling and 2x2 max-pooling). Every input is first projected to
    ``W_bifpn`` channels, then fused top-down (P6 -> P5 -> P4) and bottom-up
    (P4 -> P5 -> P6) using learnable, normalized scalar weights.

    Args:
        fpn_sizes: Channel counts of the ``(P4, P5, P6)`` inputs.
    """

    def __init__(self, fpn_sizes):
        super().__init__()

        P4_channels, P5_channels, P6_channels = fpn_sizes
        self.W_bifpn = 64
        width = self.W_bifpn

        # ---- top-down path -------------------------------------------------
        self.p6_td_conv = self._projection_conv(P6_channels, width)
        self.p6_td_conv_2 = self._depthwise_conv(width)
        self.p6_td_act = nn.ReLU()
        self.p6_td_conv_bn = nn.BatchNorm2d(width)
        self.p6_td_w1 = self._fusion_weight()

        self.p5_td_conv = self._projection_conv(P5_channels, width)
        self.p5_td_conv_2 = self._depthwise_conv(width)
        self.p5_td_act = nn.ReLU()
        self.p5_td_conv_bn = nn.BatchNorm2d(width)
        self.p5_td_w1 = self._fusion_weight()
        self.p5_td_w2 = self._fusion_weight()
        self.p6_upsample = nn.Upsample(scale_factor=2, mode='nearest')

        self.p4_td_conv = self._projection_conv(P4_channels, width)
        self.p4_td_conv_2 = self._depthwise_conv(width)
        self.p4_td_act = nn.ReLU()
        self.p4_td_conv_bn = nn.BatchNorm2d(width)
        self.p4_td_w1 = self._fusion_weight()
        self.p4_td_w2 = self._fusion_weight()
        self.p5_upsample = nn.Upsample(scale_factor=2, mode='nearest')

        # ---- bottom-up (output) path ---------------------------------------
        self.p4_out_conv = self._depthwise_conv(width)
        self.p4_out_act = nn.ReLU()
        self.p4_out_conv_bn = nn.BatchNorm2d(width)
        self.p4_out_w1 = self._fusion_weight()
        self.p4_out_w2 = self._fusion_weight()

        self.p5_out_conv = self._depthwise_conv(width)
        self.p5_out_act = nn.ReLU()
        self.p5_out_conv_bn = nn.BatchNorm2d(width)
        self.p5_out_w1 = self._fusion_weight()
        self.p5_out_w2 = self._fusion_weight()
        self.p5_out_w3 = self._fusion_weight()
        self.p4_downsample = nn.MaxPool2d(kernel_size=2)

        self.p6_out_conv = self._depthwise_conv(width)
        self.p6_out_act = nn.ReLU()
        self.p6_out_conv_bn = nn.BatchNorm2d(width)
        self.p6_out_w1 = self._fusion_weight()
        self.p6_out_w2 = self._fusion_weight()
        self.p6_out_w3 = self._fusion_weight()
        self.p5_downsample = nn.MaxPool2d(kernel_size=2)

    @staticmethod
    def _projection_conv(in_channels, out_channels):
        """Return the 3x3 convolution mapping an input to the BiFPN width."""
        return nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, bias=True, padding=1)

    @staticmethod
    def _depthwise_conv(channels):
        """Return a 3x3 depthwise convolution (one group per channel)."""
        return nn.Conv2d(channels, channels, kernel_size=3, stride=1, groups=channels, bias=True, padding=1)

    @staticmethod
    def _fusion_weight():
        """Return a scalar fusion weight initialised to 1.

        ``nn.Parameter`` (instead of a bare tensor) registers the weight with
        the module, so it is returned by ``parameters()``, stored in the
        ``state_dict`` and moved together with the module.
        """
        return nn.Parameter(torch.tensor(1.0))

    @staticmethod
    def _fuse(weights, features, epsilon=0.0001):
        """Return the normalized weighted sum of ``features``.

        Weights go through ReLU so the denominator cannot become negative
        while training; ``epsilon`` avoids a division by zero.
        """
        weights = [torch.relu(w) for w in weights]
        weighted_sum = sum(w * f for w, f in zip(weights, features))
        return weighted_sum / (sum(weights) + epsilon)

    def forward(self, inputs):
        """Fuse ``[P4, P5, P6]`` and return ``[P4_out, P5_out, P6_out]``.

        Every returned map has ``W_bifpn`` channels and the spatial size of
        the corresponding input.
        """
        P4, P5, P6 = inputs

        # Top-down: start at the coarsest level and push information up.
        P6_td_inp = self.p6_td_conv(P6)
        P6_td = self.p6_td_conv_2(self._fuse([self.p6_td_w1], [P6_td_inp]))
        P6_td = self.p6_td_conv_bn(self.p6_td_act(P6_td))

        P5_td_inp = self.p5_td_conv(P5)
        P5_td = self.p5_td_conv_2(self._fuse(
            [self.p5_td_w1, self.p5_td_w2],
            [P5_td_inp, self.p6_upsample(P6_td)]))
        P5_td = self.p5_td_conv_bn(self.p5_td_act(P5_td))

        P4_td_inp = self.p4_td_conv(P4)
        P4_td = self.p4_td_conv_2(self._fuse(
            [self.p4_td_w1, self.p4_td_w2],
            [P4_td_inp, self.p5_upsample(P5_td)]))
        P4_td = self.p4_td_conv_bn(self.p4_td_act(P4_td))

        # Bottom-up: combine projected input, top-down result and the finer
        # output level (max-pooled to the current resolution).
        P4_out = self.p4_out_conv(self._fuse(
            [self.p4_out_w1, self.p4_out_w2],
            [P4_td_inp, P4_td]))
        P4_out = self.p4_out_conv_bn(self.p4_out_act(P4_out))

        # All three weights take part in the normalization here.
        P5_out = self.p5_out_conv(self._fuse(
            [self.p5_out_w1, self.p5_out_w2, self.p5_out_w3],
            [P5_td_inp, P5_td, self.p4_downsample(P4_out)]))
        P5_out = self.p5_out_conv_bn(self.p5_out_act(P5_out))

        P6_out = self.p6_out_conv(self._fuse(
            [self.p6_out_w1, self.p6_out_w2, self.p6_out_w3],
            [P6_td_inp, P6_td, self.p5_downsample(P5_out)]))
        P6_out = self.p6_out_conv_bn(self.p6_out_act(P6_out))

        return [P4_out, P5_out, P6_out]

## EfficientNet (encoder for the depth estimation task)



In [None]:
class EfficientNet(nn.Module):
  """Encoder built from the early stages of a pretrained EfficientNet-B0.

  The ``features`` sequence of the backbone is cut six stages before its end:
  everything before the cut forms ``layer1``, the stage at the cut is
  ``layer2`` and the one after it is ``layer3``.
  """

  def __init__(self):
    super().__init__()
    stages = efficientnet_b0(pretrained=True).features
    cut = len(stages) - 6
    self.layer1 = stages[:cut]
    self.layer2 = stages[cut]
    self.layer3 = stages[cut + 1]

  def get_features(self, x):
    """Return the outputs of ``layer1``, ``layer2`` and ``layer3`` for ``x``.

    The layers are applied in sequence, each one consuming the previous
    layer's output.
    """
    shallow = self.layer1(x)
    middle = self.layer2(shallow)
    deep = self.layer3(middle)
    return shallow, middle, deep

## Custom Model (based on EfficientDet)

In [None]:
class CustomEfficientDet(nn.Module):
  """EfficientNet encoder followed by three stacked BiFPN blocks.

  The three maps produced by the last BiFPN block are brought to a common
  resolution, concatenated and reduced to one channel by a 3x3 convolution.
  """

  def __init__(self):
    super().__init__()
    self.encoder = EfficientNet()
    # The first block receives the encoder channel counts; later blocks
    # operate on the 64-channel maps every BiFPN block emits.
    self.decoder_1 = BiFPN([24, 40, 80])
    self.decoder_2 = BiFPN([64, 64, 64])
    self.decoder_3 = BiFPN([64, 64, 64])
    self.upsample_4 = nn.Upsample(scale_factor=4, mode='nearest')
    self.upsample_2 = nn.Upsample(scale_factor=2, mode='nearest')
    self.final_convolution = nn.Conv2d(in_channels=64*3, out_channels=1, kernel_size=3, padding="same")

  def forward(self, x):
    """Return the single-channel prediction for the image batch ``x``."""
    pyramid = list(self.encoder.get_features(x))
    for decoder in (self.decoder_1, self.decoder_2, self.decoder_3):
      pyramid = decoder(pyramid)
    fused = self.concatenate_bifpn_features(pyramid)
    return self.final_convolution(fused)

  def concatenate_bifpn_features(self, out_bfpn3):
    """Concatenate the three BiFPN outputs along the channel axis.

    The second map is upsampled x2 and the third x4 so that both match the
    resolution of the first one.
    """
    finest = out_bfpn3[0]
    middle_up = self.upsample_2(out_bfpn3[1])
    coarsest_up = self.upsample_4(out_bfpn3[2])
    return torch.cat([finest, middle_up, coarsest_up], dim=1)

## Create interface for training using lightning module

In [None]:
class LightningEfficientDet(pl.LightningModule):
    """Lightning wrapper that trains ``CustomEfficientDet`` with an MSE loss.

    Args:
        batch_size: Batch size used by the train and validation loaders.
        learning_rate: Learning rate of the Adam optimizer.
    """

    def __init__(self, batch_size=4, learning_rate=1e-3):
        super().__init__()
        self.batch_size = batch_size
        self.efficient_det = CustomEfficientDet()
        self.loss = nn.MSELoss()
        # Per-batch validation losses of the epoch currently being validated.
        self.validation_step_losses = []
        self.learning_rate = learning_rate

    def forward(self, x):
        """Return the network prediction for the image batch ``x``."""
        return self.efficient_det(x)

    def _compute_loss(self, batch):
        """Return the MSE between the prediction and the target of ``batch``."""
        x, y = batch
        prediction = self(x)
        # The network emits (B, 1, H, W) while the dataset yields (B, H, W)
        # targets. Without the explicit channel axis MSELoss would broadcast
        # the pair to (B, B, H, W) and compare every prediction with every
        # target of the batch.
        if y.dim() == prediction.dim() - 1:
            y = y.unsqueeze(1)
        return self.loss(prediction, y)

    def training_step(self, train_batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._compute_loss(train_batch)
        # self.log replaces the 'log' entry of the returned dict, which
        # current Lightning versions no longer forward to the logger.
        self.log('train_loss', loss)
        return {'loss': loss}

    def validation_step(self, val_batch, batch_idx):
        """Compute the validation loss for one batch and remember it."""
        loss = self._compute_loss(val_batch)
        self.validation_step_losses.append(loss)
        return {'batch_val_loss': loss}

    def on_validation_epoch_end(self) -> None:
        """Log the mean validation loss of the epoch as ``val_loss``."""
        if not self.validation_step_losses:
            return
        avg_loss = torch.stack(self.validation_step_losses).mean()
        self.log("val_loss", avg_loss)
        # Reset, otherwise later epochs would average over all earlier epochs
        # (and the sanity-check batches) as well.
        self.validation_step_losses.clear()

    def prepare_data(self):
        """Create the training and validation datasets.

        NOTE(review): both datasets read the same annotations file, so the
        validation loss is measured on the training samples - split the data
        once separate annotation files are available.
        """
        self.train_dataset = NYUDataset(annotations_file=ANNOTATIONS_FILE_PATH, img_dir=IMAGE_FOLDER_PATH, transform=trans, target_transform=visual_mask_fn)
        self.val_dataset = NYUDataset(annotations_file=ANNOTATIONS_FILE_PATH, img_dir=IMAGE_FOLDER_PATH, transform=trans, target_transform=visual_mask_fn)

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return torch.utils.data.DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=2, shuffle=True)

    def val_dataloader(self):
        """Return the validation loader (fixed order, no shuffling)."""
        return torch.utils.data.DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=2, shuffle=False)

    def configure_optimizers(self):
        """Return Adam plus a plateau scheduler that monitors ``val_loss``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)
        return {
            'optimizer': optimizer,
            # ReduceLROnPlateau needs the metric it reacts to; Lightning reads
            # it from the scheduler configuration.
            'lr_scheduler': {'scheduler': scheduler, 'monitor': 'val_loss'},
        }

## Training

In [None]:
# Stop training once `val_loss` has not improved for `patience` epochs.
# The class is reached through the `pl` alias because the bare name
# `pytorch_lightning` is never bound by this notebook's imports.
# NOTE(review): patience=10 equals MAX_EPOCHS=10, so early stopping cannot
# trigger with the current parameters - confirm the intended values.
early_stop_callback = pl.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.0,
    patience=10,
    verbose=False,
    mode='min',
)

# Keep only the checkpoint with the lowest `val_loss`.
checkpoint_callback = ModelCheckpoint(
    filename='train',
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min',
)

# TensorBoard event files go to Google Drive.
tb_logger = pl_loggers.TensorBoardLogger('/content/gdrive/MyDrive/data/record', name='project')
model = LightningEfficientDet(batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE)
trainer = pl.Trainer(
    callbacks=[early_stop_callback, checkpoint_callback],
    max_epochs=MAX_EPOCHS,
    logger=tb_logger,
)

In [None]:
# Run training; the data loaders come from the module's own
# train_dataloader / val_dataloader methods, so none are passed here.
trainer.fit(model)

## Tests

In [None]:
# Show the TensorBoard UI inside the notebook, reading the directory the
# TensorBoardLogger above writes to.
%load_ext tensorboard
%tensorboard --logdir '/content/gdrive/MyDrive/data/record'

In [None]:
# Dataset and loader for a qualitative check of the trained model.
# NOTE(review): this reads the same annotations file as training, and
# shuffle=True makes every run show a different batch.
data = NYUDataset(annotations_file=ANNOTATIONS_FILE_PATH, img_dir=IMAGE_FOLDER_PATH, transform=trans, target_transform=visual_mask_fn)
dataloader =  DataLoader(data, batch_size=BATCH_SIZE, num_workers=2,shuffle=True)

In [None]:
img_test, mask = next(iter(dataloader))
# Inference mode: BatchNorm must use its running statistics (and must not
# update them), and no autograd graph is needed for a forward pass.
model.eval()
with torch.no_grad():
    # Move the batch to wherever the model currently lives.
    out = model(img_test.to(model.device))[0][0]
# First sample, single output channel, scaled by 1000 for display.
out_img = out.detach().cpu().numpy()*1000

In [None]:
# Show the first image of the batch next to the model output for it.
figure, axs = plt.subplots(1, 2, figsize=(10,5))
figure.suptitle('Image with depth map')
# visualize_img turns the channel-first tensor into a uint8 RGB image;
# indexing img_test[0][0] would display a single colour channel only.
base_img, _ = visualize_img((img_test[0], mask[0]))
axs[0].set_title("Base image")
axs[0].imshow(base_img)
axs[1].set_title("Mask")
axs[1].imshow(out_img, cmap="gray")