# **RESTORING OCCLUDED OBJECTS: FROM DETECTION TO 3D RECONSTRUCTION**

****

# **AIM:**
To implement a comprehensive vision pipeline that:
- Detects objects using a pre-trained YOLO model.
- Segments objects using Segment Anything Model (SAM).
- Applies Super-Resolution to segmented masks using Real-ESRGAN.
- Performs Inpainting using an Autoencoder on the super-resolved masks.
- Enhances inpainted results again using Real-ESRGAN.

**Install necessary libraries and dependencies**

In [12]:
# Detection backend (Ultralytics YOLO).
!pip install -qq ultralytics
# Segment Anything is installed straight from its GitHub repository.
!pip install -qq 'git+https://github.com/facebookresearch/segment-anything.git'
# Helper packages; only supervision is version-pinned here.
!pip install -qq jupyter_bbox_widget roboflow dataclasses-json supervision==0.23.0
# NOTE(review): setuptools is removed and immediately reinstalled - presumably to
# repair a setuptools left inconsistent by the installs above; confirm it is still needed.
!pip uninstall -qq setuptools -y && pip install -qq setuptools

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m978.8/978.8 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m151.5/151.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.2/85.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.8/66.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m220.7/220.7 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m774.9/774.9 kB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all t

**Setup Real-ESRGAN environment**

In [1]:
# Remove any already-installed realesrgan package before installing from the clone below.
!pip uninstall -y -qq realesrgan
# Fetch the Real-ESRGAN sources (cloned relative to the current working directory).
!git clone -qq https://github.com/xinntao/Real-ESRGAN.git
!pip install -qqq basicsr
!pip install -r /kaggle/working/Real-ESRGAN/requirements.txt
# setup.py is run from inside the repository, in development ("develop") mode.
%cd Real-ESRGAN
!python /kaggle/working/Real-ESRGAN/setup.py develop
# Return to the Kaggle working directory for the rest of the notebook.
%cd /kaggle/working//
# NOTE(review): torchvision is pinned last, so it overrides whatever version the
# steps above pulled in - presumably for basicsr compatibility; confirm the pin
# matches the installed torch version.
!pip install -qqq torchvision==0.12.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.5/172.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m81.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m256.2/256.2 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for basicsr (setup.py) ... [?25l[?25hdone
Collecting facexlib>=0.2.5 (from -r /kaggle/working/Real-ESRGAN/requirements.txt (line 2))
  Downloading facexlib-0.3.0-py3-none-any.whl.metadata (4.6 kB)
Collecting gfpgan>=1.3.5 (from -r /kaggle/working/Real-ESRGAN/requirements.txt (line 3))
  Downloading gfpga

# **Import Libraries**

In [13]:
import os
import torch
import cv2
import numpy as np
from PIL import Image
import shutil
from einops import rearrange
# CUDA_LAUNCH_BLOCKING=1 is a debugging aid: it asks CUDA to run kernels
# synchronously so an error is reported at the call that caused it.
os.environ.update(CUDA_LAUNCH_BLOCKING='1')
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# **Load Pretrained Models**

**Load YOLOv11n Object Detection Model**

In [14]:
from ultralytics import YOLO
# Fine-tuned detector weights shipped with the Kaggle dataset of pre-trained models.
YOLO_WEIGHTS_PATH="/kaggle/input/cs299-pre-trained-models/yolo11n_best.pt"
object_detection_model=YOLO(YOLO_WEIGHTS_PATH)

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


**Load Segment Anything Model (SAM)**

In [15]:
from segment_anything import sam_model_registry,SamPredictor
# ViT-H checkpoint for Segment Anything.
CHECKPOINT_PATH = "/kaggle/input/cs299-pre-trained-models/sam_vit_h_4b8939.pth"
# Build the 'vit_h' variant from the checkpoint, then move it to the selected device.
sam=sam_model_registry['vit_h'](checkpoint=CHECKPOINT_PATH)
sam=sam.to(device=device)
# The predictor wraps the model for prompt-based (here: box-prompted) prediction.
segmentation_model=SamPredictor(sam)

**Load Autoencoder for Inpainting**

In [16]:
import torch.nn as nn
from torchvision import transforms
from einops import rearrange

class Autoencoder(nn.Module):
    """Convolutional autoencoder used for inpainting.

    The encoder stacks five stride-2 convolutions (3 -> 64 -> 128 -> 256 ->
    512 -> 512 channels), each followed by ReLU. The decoder mirrors it with
    five stride-2 transposed convolutions back to 3 channels, ending in a
    Sigmoid so outputs lie in [0, 1]. With kernel 4, stride 2 and padding 1,
    every encoder layer halves an even spatial size and every decoder layer
    doubles it.

    The layer order inside ``encoder`` and ``decoder`` determines the
    state-dict keys (``encoder.0.weight`` ...), so it must not be changed if
    existing checkpoints are to keep loading.
    """

    def __init__(self):
        super().__init__()
        # Shared geometry for every (transposed) convolution in the network.
        conv_kwargs = dict(kernel_size=4, stride=2, padding=1)

        encoder_widths = (3, 64, 128, 256, 512, 512)
        encoder_layers = []
        for in_ch, out_ch in zip(encoder_widths, encoder_widths[1:]):
            encoder_layers.append(nn.Conv2d(in_ch, out_ch, **conv_kwargs))
            encoder_layers.append(nn.ReLU())
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_widths = (512, 512, 256, 128, 64, 3)
        final_stage = len(decoder_widths) - 2
        decoder_layers = []
        for stage, (in_ch, out_ch) in enumerate(zip(decoder_widths, decoder_widths[1:])):
            decoder_layers.append(nn.ConvTranspose2d(in_ch, out_ch, **conv_kwargs))
            # Only the last stage squashes to [0, 1]; the others use ReLU.
            decoder_layers.append(nn.Sigmoid() if stage == final_stage else nn.ReLU())
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the latent feature map and decode it back."""
        latent = self.encoder(x)
        return self.decoder(latent)
        
# Preprocessing applied to every image before it is fed to the autoencoder:
# resize to 128x128, then convert to a float tensor.
inpainting_transformer=transforms.Compose([transforms.Resize((128,128)),transforms.ToTensor()])

# Build the network and restore the trained weights.
# Note: the model itself is not moved to `device` here.
AUTOENCODER_WEIGHTS_PATH="/kaggle/input/cs299-pre-trained-models/autoencoder_epoch_500.pth"
inpainting_model=Autoencoder()
inpainting_model.load_state_dict(torch.load(AUTOENCODER_WEIGHTS_PATH,map_location=device))

**Load Real-ESRGAN for super-resolution**

In [17]:
import torch
from basicsr.archs.rrdbnet_arch import RRDBNet

import sys
sys.path.append('/kaggle/working/Real-ESRGAN')

from realesrgan import RealESRGANer
from PIL import Image
import numpy as np

# The checkpoint loaded below is RealESRGAN_x4plus, i.e. a 4x model, so both the
# network and the wrapper are declared with scale=4. The previous values
# (1000 / 10000) only worked because tiling is disabled and the requested
# outscale never equals the declared scale; with tile>0 the wrapper sizes its
# output buffer from `scale`, which would be unusable at 10000.
superresolution_model=RRDBNet(num_in_ch=3,num_out_ch=3,num_feat=64,num_block=23,num_grow_ch=32,scale=4)

superresolution_upscaler=RealESRGANer(
    scale=4,
    model_path='/kaggle/input/cs299-pre-trained-models/RealESRGAN_x4plus.pth',
    model=superresolution_model,
    # tile=0 disables tiled inference, so tile_pad has no effect in this configuration.
    tile=0,tile_pad=100,
    pre_pad=0,half=False
)

# **Define Processing Functions**

**Object Detection using YOLO**

In [18]:
def object_detection_function(model,image_path,detected,cropped,box_folder):
    """Run object detection on one image and save one set of files per detection.

    For the i-th detection (1-based) three files are written:
      * ``detected/<i>.jpg``   - the full image with the box drawn in green,
      * ``cropped/<i>.jpg``    - the image region inside the box,
      * ``box_folder/<i>.txt`` - the box as ``"x1 y1 x2 y2"`` (pixel coordinates).

    Boxes are clamped to the image bounds; a box with no area after clamping
    is skipped entirely, which leaves a gap in the numbering.

    Args:
        model: Detector called as ``model(image_path)``; its first result must
            expose ``.boxes`` whose items provide ``.xyxy``.
        image_path: Path of the image to process.
        detected: Existing directory for the annotated full images.
        cropped: Existing directory for the cropped detections.
        box_folder: Existing directory for the bounding-box text files.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    image=cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image=cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
    height,width=image.shape[:2]

    results=model(image_path)[0]

    for index,box in enumerate(results.boxes,start=1):
        x1,y1,x2,y2=(int(value) for value in box.xyxy[0].tolist())

        # Clamp to the image: a negative start would make the crop slice wrap around.
        x1,x2=max(0,x1),min(width,x2)
        y1,y2=max(0,y1),min(height,y2)
        if x2<=x1 or y2<=y1:
            # Zero-area box: nothing to crop, and saving an empty image would fail.
            continue

        # Draw on a copy so every detection is visualised on a clean image.
        annotated=image.copy()
        cv2.rectangle(annotated,(x1,y1),(x2,y2),(0,255,0),5)
        Image.fromarray(annotated).save(os.path.join(detected,f'{index}.jpg'),format='JPEG')

        crop=image[y1:y2,x1:x2]
        Image.fromarray(crop).save(os.path.join(cropped,f'{index}.jpg'),format='JPEG')

        with open(os.path.join(box_folder,f'{index}.txt'),'w') as f:
            f.write(f"{x1} {y1} {x2} {y2}\n")

**Segmentation using SAM**

In [19]:
def segmentation_function(model,image_path,segmented,box_folder):
    """Segment every previously detected box and save the masked crops.

    Each ``*.txt`` file in ``box_folder`` is expected to hold one line
    ``"x1 y1 x2 y2"``. The box is used as the prompt for ``model.predict``;
    all returned masks are merged (logical OR), pixels outside the merged mask
    are set to black, and the result is cropped to the box and saved as
    ``segmented/<name>.png``.

    Files that are not ``.txt``, lines that do not contain exactly four
    fields, and boxes with no area inside the image are skipped.

    Args:
        model: Predictor exposing ``set_image(image)`` and
            ``predict(box=..., multimask_output=...)``.
        image_path: Path of the full image the boxes refer to.
        segmented: Existing directory for the output PNG files.
        box_folder: Directory containing the bounding-box text files.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    image=cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image=cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
    height,width=image.shape[:2]
    model.set_image(image)

    # Sorted so the processing order does not depend on the filesystem.
    for box_name in sorted(os.listdir(box_folder)):
        if not box_name.endswith('.txt'):
            continue
        with open(os.path.join(box_folder,box_name),'r') as f:
            fields=f.readline().split()
        if len(fields)!=4:
            continue
        x1,y1,x2,y2=map(int,fields)

        # Keep the box inside the image so the crop slice below cannot wrap around.
        x1,x2=max(0,x1),min(width,x2)
        y1,y2=max(0,y1),min(height,y2)
        if x2<=x1 or y2<=y1:
            continue

        masks,_,_=model.predict(
            box=np.array([x1,y1,x2,y2]),
            multimask_output=True
        )
        # Union of all candidate masks, computed without modifying the returned array.
        mask=np.asarray(masks,dtype=bool).any(axis=0)

        masked_image=np.zeros_like(image)
        masked_image[mask]=image[mask]
        masked_image=masked_image[y1:y2,x1:x2]

        save_name=os.path.splitext(box_name)[0]+'.png'
        Image.fromarray(masked_image).save(os.path.join(segmented,save_name))

**Inpainting using Autoencoder**

In [20]:
def inpainting_function(model,segmented,inpainted,transformer):
    """Run the inpainting model on every image in a folder.

    Each file in ``segmented`` is opened, converted to RGB, passed through
    ``transformer`` and then through ``model`` (in eval mode, without
    gradients). The output is scaled from [0, 1] to 8-bit and saved under the
    same file name in ``inpainted``. The output resolution is whatever the
    model produces for the transformed input.

    Args:
        model: ``torch.nn.Module`` mapping a ``(1, 3, H, W)`` tensor to an
            image tensor of the same layout.
        segmented: Directory containing the input images.
        inpainted: Existing directory for the results.
        transformer: Callable turning a PIL image into a ``(3, H, W)`` tensor.
    """
    model.eval()
    # Run on whatever device the model lives on, so the function keeps working
    # whether or not the caller moved the model to the GPU.
    first_param=next(model.parameters(),None)
    model_device=first_param.device if first_param is not None else torch.device("cpu")

    # Sorted so the processing order does not depend on the filesystem.
    for file_name in sorted(os.listdir(segmented)):
        # The context manager closes the file; convert("RGB") guarantees the
        # three channels the first convolution expects (PNG may be RGBA / grey).
        with Image.open(os.path.join(segmented,file_name)) as input_data:
            transformed_image=transformer(input_data.convert("RGB")).unsqueeze(0)
        transformed_image=transformed_image.to(model_device)

        with torch.no_grad():
            output=model(transformed_image)

        output=output.squeeze(0).detach().cpu().numpy()
        output=rearrange(output,'c h w-> h w c')
        output=(output.clip(0,1)*255).astype(np.uint8)
        Image.fromarray(output).save(os.path.join(inpainted,file_name))

**Super-resolution using Real-ESRGAN**

In [21]:
def superresolution_function(upscaler,inpainted_folder,superresolution_folder,outscale=10):
    """Upscale every image in a folder with Real-ESRGAN.

    ``RealESRGANer.enhance`` follows the OpenCV convention: it takes a BGR
    array and returns a BGR array. PIL yields RGB, so the channels are
    reversed before the call and reversed back afterwards; otherwise the
    network would process the image with red and blue swapped.

    Images are converted to RGB on load, so any alpha channel is dropped.

    Args:
        upscaler: Object exposing ``enhance(image, outscale=...)`` that
            returns ``(output_array, extra)``.
        inpainted_folder: Directory containing the input images.
        superresolution_folder: Existing directory for the upscaled images
            (saved under the same file names).
        outscale: Final magnification relative to the input size. Defaults to
            10, the value previously hard-coded here.
    """
    # Sorted so the processing order does not depend on the filesystem.
    for file_name in sorted(os.listdir(inpainted_folder)):
        with Image.open(os.path.join(inpainted_folder,file_name)) as source:
            rgb=np.array(source.convert("RGB"))

        # RGB -> BGR for the upscaler; made contiguous because the reversed
        # view has negative strides.
        bgr=np.ascontiguousarray(rgb[:,:,::-1])
        output,_=upscaler.enhance(bgr,outscale=outscale)

        # BGR -> RGB again before handing the result to PIL.
        output=np.ascontiguousarray(output[:,:,::-1])
        Image.fromarray(output).save(os.path.join(superresolution_folder,file_name))

# **Run the full pipeline on Test Samples**

**Define Input Image**

In [26]:
# Input images to run through the pipeline; each entry is processed independently
# and gets its own output folder named after the image file stem.
files=['/kaggle/input/dataset-image-label-visiblemask-occludedmask/images/train/2301.jpg']

In [27]:
for file in files:
    image_path=os.path.join(file)

    # Results for one image live in a folder named after the image file stem,
    # created relative to the current working directory.
    output_path=os.path.splitext(os.path.basename(image_path))[0]

    #Define folder paths
    detected_folder_path=os.path.join(output_path,"detected_object")
    cropped_folder_path=os.path.join(output_path,"cropped_object")
    segmented_folder_path=os.path.join(output_path,"segmented_object")
    inpainted_folder_path=os.path.join(output_path,"inpainted_object")
    superresolution_folder_path_1=os.path.join(output_path,"superresoluted_object_1")
    superresolution_folder_path_2=os.path.join(output_path,"superresoluted_object_2")
    bbox_folder_path=os.path.join(output_path,"bbox")

    #Create output folders
    for stage_folder in (
        detected_folder_path,
        cropped_folder_path,
        segmented_folder_path,
        inpainted_folder_path,
        superresolution_folder_path_1,
        superresolution_folder_path_2,
        bbox_folder_path,
    ):
        os.makedirs(stage_folder,exist_ok=True)

    #Run pipeline: detect -> segment -> upscale -> inpaint -> upscale again
    object_detection_function(object_detection_model,image_path,detected_folder_path,cropped_folder_path,bbox_folder_path)
    segmentation_function(segmentation_model,image_path,segmented_folder_path,bbox_folder_path)
    superresolution_function(superresolution_upscaler,segmented_folder_path,superresolution_folder_path_1)
    inpainting_function(inpainting_model,superresolution_folder_path_1,inpainted_folder_path,inpainting_transformer)
    superresolution_function(superresolution_upscaler,inpainted_folder_path,superresolution_folder_path_2)

    #Zip Results: writes "<output_path>.zip" containing the whole output folder
    shutil.make_archive(output_path,"zip",output_path)


image 1/1 /kaggle/input/dataset-image-label-visiblemask-occludedmask/images/train/2301.jpg: 480x640 15 items, 16.4ms
Speed: 1.6ms preprocess, 16.4ms inference, 1.8ms postprocess per image at shape (1, 3, 480, 640)
