In [26]:
%cd /content/drive/MyDrive/CV project/

/content/drive/MyDrive/CV project


In [None]:
#importing libraries
import torch
import torchvision
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from PIL import Image
from pathlib import Path
import yaml
import torchvision.transforms as T
from pathlib import Path

device = 'cuda' if torch.cuda.is_available() else 'cpu'

PROJECT_DIR = Path('/content')
# ---------------------------------------------
# 4. Detect dataset folder
# ---------------------------------------------
YOLO_DATA = None
for p in PROJECT_DIR.rglob('dataset'):
    if (p / 'images').is_dir() and (p / 'labels').is_dir() and (p / 'data.yaml').is_file():
        YOLO_DATA = p
        break
if YOLO_DATA is None:
    raise FileNotFoundError(f"Could not locate dataset under {PROJECT_DIR}")
print(f"Dataset folder: {YOLO_DATA}")
IMAGES_DIR = YOLO_DATA / 'images'
LABELS_DIR = YOLO_DATA / 'labels'

Dataset folder: /content/drive/MyDrive/CV project/dataset


In [None]:
#defining transformations
transforms =  T.Compose([
    T.Resize((640, 640)),
    T.ColorJitter(brightness=0.4, saturation=0.7, hue=0.015),
    T.RandomHorizontalFlip(p=0.5),
    T.RandomRotation(degrees=10),
    T.RandomAffine(
        degrees=0,
        translate=(0.1, 0.1),
        scale=(0.7, 1.3)
    ),
    T.ToTensor()
])

In [None]:
#Using a converter class to convert the dataset from YOLO format to FRCNN format
class YoloToFRCNNDataset(torch.utils.data.Dataset):
    def __init__(self, image_dir, class_names, transform=None):
        self.image_dir = Path(image_dir)
        self.transform = transform
        self.class_names = class_names
        self.images = list(self.image_dir.glob("*.jpg"))
        if not self.images:  # If no .jpg images are found
          self.images = list(self.image_dir.glob("*.png"))

    def __getitem__(self, idx):
        img_path = self.images[idx]
        txt_path = Path(str(img_path).replace('images/', 'labels/')).with_suffix(".txt")

        img = Image.open(img_path).convert("RGB")
        w, h = img.size

        boxes = []
        labels = []
        with open(txt_path, 'r') as f:
            for line in f:
                cls_id, x_c, y_c, w_box, h_box = map(float, line.strip().split())
                x1 = (x_c - w_box / 2) * w
                y1 = (y_c - h_box / 2) * h
                x2 = (x_c + w_box / 2) * w
                y2 = (y_c + h_box / 2) * h
                boxes.append([x1, y1, x2, y2])
                labels.append(int(cls_id))  # +1 to offset background

        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)
        target = {'boxes': boxes, 'labels': labels}

        if self.transform:
            img = self.transform(img)

        return img, target

    def __len__(self):
        return len(self.images)

In [None]:
#function to parse the yaml file
def parse_yaml(yaml_path):
    with open(yaml_path, 'r') as f:
        data = yaml.safe_load(f)

    train_img_dir = Path(YOLO_DATA / data['train'])
    val_img_dir = Path(YOLO_DATA / data['val'])
    test_img_dir = Path(YOLO_DATA / data['test'])
    class_names = data['names']
    num_classes = len(class_names)

    return train_img_dir, val_img_dir, test_img_dir, class_names, num_classes

In [None]:
#Initializing the model, optimizer and loss function
model = fasterrcnn_resnet50_fpn(pretrained=True)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, 6)

# Optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:02<00:00, 81.7MB/s]


In [None]:
#Splittig the dataset into train, val and test
yaml_path = str(YOLO_DATA / 'data.yaml')
train_dir, val_dir, test_dir, class_names, num_classes = parse_yaml(yaml_path)

train_dataset = YoloToFRCNNDataset(train_dir, class_names, transform=transforms)
val_dataset = YoloToFRCNNDataset(val_dir, class_names, transform=transforms)
test_dataset = YoloToFRCNNDataset(test_dir, class_names, transform=transforms)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))

model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [None]:
#Training base model (just aerial images)
num_epochs = 50
losses_t, losses_v = [], []

for epoch in range(num_epochs):
    model.train()
    epoch_train_loss = 0.0

    for images, targets in train_loader:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        epoch_train_loss += losses.item()

    # Save average losses for the epoch
    avg_train_loss = epoch_train_loss / len(train_loader)
    losses_t.append(avg_train_loss)

    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}")

Epoch 1, Train Loss: 9.6978
Epoch 2, Train Loss: 9.1107
Epoch 3, Train Loss: 9.0796
Epoch 4, Train Loss: 8.8920
Epoch 5, Train Loss: 8.3850
Epoch 6, Train Loss: 8.6744
Epoch 7, Train Loss: 8.2145
Epoch 8, Train Loss: 8.1314
Epoch 9, Train Loss: 8.1135
Epoch 10, Train Loss: 8.0025
Epoch 11, Train Loss: 8.1219
Epoch 12, Train Loss: 8.0387
Epoch 13, Train Loss: 8.0334
Epoch 14, Train Loss: 8.2204
Epoch 15, Train Loss: 8.1426
Epoch 16, Train Loss: 7.9169
Epoch 17, Train Loss: 7.7161
Epoch 18, Train Loss: 7.7994
Epoch 19, Train Loss: 7.8939
Epoch 20, Train Loss: 7.8363
Epoch 21, Train Loss: 7.7522
Epoch 22, Train Loss: 7.8734
Epoch 23, Train Loss: 7.9252
Epoch 24, Train Loss: 7.8455
Epoch 25, Train Loss: 7.7524
Epoch 26, Train Loss: 7.7585
Epoch 27, Train Loss: 7.6260
Epoch 28, Train Loss: 7.8711
Epoch 29, Train Loss: 7.9087
Epoch 30, Train Loss: 7.9040
Epoch 31, Train Loss: 7.6460
Epoch 32, Train Loss: 7.7267
Epoch 33, Train Loss: 7.6989
Epoch 34, Train Loss: 7.7101
Epoch 35, Train Loss: 7

In [None]:
#Initializing model for depth maps
model = fasterrcnn_resnet50_fpn(pretrained=True)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, 6)

# Optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

yaml_path = str(YOLO_DATA / 'data_depth.yaml')
train_dir, val_dir, test_dir, class_names, num_classes = parse_yaml(yaml_path)

train_dataset = YoloToFRCNNDataset(train_dir, class_names, transform=transforms)
val_dataset = YoloToFRCNNDataset(val_dir, class_names, transform=transforms)
test_dataset = YoloToFRCNNDataset(test_dir, class_names, transform=transforms)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=lambda x: tuple(zip(*x)))

model.to(device)



FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [None]:
#Training depth map model
num_epochs = 50
d_losses_t, d_losses_v = [], []

for epoch in range(num_epochs):
    model.train()
    epoch_train_loss = 0.0

    for images, targets in train_loader:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        epoch_train_loss += losses.item()

    # Save average losses for the epoch
    avg_train_loss = epoch_train_loss / len(train_loader)
    d_losses_t.append(avg_train_loss)

    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}")

Epoch 1, Train Loss: 9.6718
Epoch 2, Train Loss: 8.9320
Epoch 3, Train Loss: 8.2426
Epoch 4, Train Loss: 8.5950
Epoch 5, Train Loss: 8.2581
Epoch 6, Train Loss: 8.4066
Epoch 7, Train Loss: 8.2478
Epoch 8, Train Loss: 8.0439
Epoch 9, Train Loss: 8.0475
Epoch 10, Train Loss: 8.0128
Epoch 11, Train Loss: 8.2037
Epoch 12, Train Loss: 8.0242
Epoch 13, Train Loss: 8.1445
Epoch 14, Train Loss: 8.2781
Epoch 15, Train Loss: 8.1037
Epoch 16, Train Loss: 8.1376
Epoch 17, Train Loss: 8.0207
Epoch 18, Train Loss: 8.0564
Epoch 19, Train Loss: 8.1636
Epoch 20, Train Loss: 7.8436
Epoch 21, Train Loss: 7.9494
Epoch 22, Train Loss: 8.0735
Epoch 23, Train Loss: 8.2293
Epoch 24, Train Loss: 7.9264
Epoch 25, Train Loss: 7.9673
Epoch 26, Train Loss: 8.0747
Epoch 27, Train Loss: 8.0379
Epoch 28, Train Loss: 7.8545
Epoch 29, Train Loss: 7.8844
Epoch 30, Train Loss: 8.0296
Epoch 31, Train Loss: 7.9845
Epoch 32, Train Loss: 7.9844
Epoch 33, Train Loss: 7.9367
Epoch 34, Train Loss: 8.0148
Epoch 35, Train Loss: 8

In [None]:
#loss isn't decreasing fast enough
#it is taking around a day to train for 50 epochs
