In this project, we will use the Resnet that we trained to annotate images.
The image annotations consist of bounding boxes which envelope a single object and labels for each object in the bounding box. Our training set is given by COCO 2014.

Our first task is to modify the the neural network that we trained in file 1 to be adapted for the much smaller list of classes in COCO. COCO consists of 80 categories:

person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic_light, fire_hydrant, stop_sign, parking_meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag, tie, suitcase, frisbee, skis, snowboard, sports_ball, kite, baseball_bat, baseball_glove, skateboard, surfboard, tennis_racket, bottle, wine_glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot, hot_dog, pizza, donut, cake, chair, couch, potted_plant, bed, dining_table, toilet, tv, laptop, mouse, remote, keyboard, cell_phone, microwave, oven, toaster, sink, refrigerator, book, clock, vase, scissors, teddy_bear, hair_drier, toothbrush

In order to facilitate the smaller class, set we we freeze the hidden layers of the network and learn the fully connected output layer.


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.downsample = downsample

    def forward(self, x):
        identity = x
        if self.downsample is not None:
            identity = self.downsample(x)

        out = self.conv1(x)
        out = self.bn1(out)
        out = F.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        out += identity
        out = F.relu(out)

        return out

class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=1000):
        super(ResNet, self).__init__()
        self.in_channels = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride=1):
        downsample = None
        if stride != 1 or self.in_channels != out_channels * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * block.expansion),
            )

        layers = []
        layers.append(block(self.in_channels, out_channels, stride, downsample))
        self.in_channels = out_channels * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_channels, out_channels))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

def resnet18(num_classes=1000):
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes)
    

In [24]:
import subprocess
from pathlib import Path
import torch

# Get Git repository root dynamically
repo_root = subprocess.check_output(['git', 'rev-parse', '--show-toplevel'], text=True).strip()

# Define path relative to Git repo root
weights_path = Path(repo_root) / "resnet18_weights.pth"

#weights_path = '/home/ryan/Deep_Learning/resnet18_weights.pth'
print(f"Loading weights from: {weights_path}")

num_coco_classes = 80  # Adjust for your dataset
model = ResNet(BasicBlock, [2, 2, 2, 2], num_classes=num_coco_classes)

# Load the checkpoint, but ignore incompatible layers
checkpoint = torch.load(weights_path, map_location='cpu')

# Remove the fc layer weights from the loaded checkpoint
del checkpoint['fc.weight']
del checkpoint['fc.bias']

# Load the model ignoring the fc layer
model.load_state_dict(checkpoint, strict=False)

print("Model loaded, but fc layer was ignored due to size mismatch.")

# Define new fully connected layer with correct number of classes
model.fc = nn.Linear(model.fc.in_features, num_coco_classes)

# Move model to the appropriate device (GPU if available)
device = torch.device("cpu")
model = model.to(device)

print("Model loaded and transferred to:", device)


# Freeze all layers except the last block and fc
for name, param in model.named_parameters():
    if 'layer4' not in name and 'fc' not in name:
        param.requires_grad = False

for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Training: {name}")
    else:
        print(f"Frozen: {name}")

Loading weights from: /home/ryan/Deep_Learning/resnet18_weights.pth
Model loaded, but fc layer was ignored due to size mismatch.
Model loaded and transferred to: cpu
Frozen: conv1.weight
Frozen: bn1.weight
Frozen: bn1.bias
Frozen: layer1.0.conv1.weight
Frozen: layer1.0.bn1.weight
Frozen: layer1.0.bn1.bias
Frozen: layer1.0.conv2.weight
Frozen: layer1.0.bn2.weight
Frozen: layer1.0.bn2.bias
Frozen: layer1.1.conv1.weight
Frozen: layer1.1.bn1.weight
Frozen: layer1.1.bn1.bias
Frozen: layer1.1.conv2.weight
Frozen: layer1.1.bn2.weight
Frozen: layer1.1.bn2.bias
Frozen: layer2.0.conv1.weight
Frozen: layer2.0.bn1.weight
Frozen: layer2.0.bn1.bias
Frozen: layer2.0.conv2.weight
Frozen: layer2.0.bn2.weight
Frozen: layer2.0.bn2.bias
Frozen: layer2.0.downsample.0.weight
Frozen: layer2.0.downsample.1.weight
Frozen: layer2.0.downsample.1.bias
Frozen: layer2.1.conv1.weight
Frozen: layer2.1.bn1.weight
Frozen: layer2.1.bn1.bias
Frozen: layer2.1.conv2.weight
Frozen: layer2.1.bn2.weight
Frozen: layer2.1.bn2.b

  checkpoint = torch.load(weights_path, map_location='cpu')


In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import CocoDetection
import cv2
import numpy as np
import matplotlib.pyplot as plt
from pycocotools.coco import COCO
from torchvision.ops import box_iou
import os



# Path to the COCO dataset
coco_root = "/home/ryan/Documents/COCO/train2014"  # Update with your dataset path
coco_ann = "/home/ryan/Documents/COCO/annotations/instances_train2014.json"  # Update annotation path

# Define image transformations
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load the COCO dataset
full_dataset = CocoDetection(root=coco_root, annFile=coco_ann, transform=transform)

# Split dataset into 80% training and 20% validation
total_size = len(full_dataset)
train_size = int(0.8 * total_size)
val_size = total_size - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=4)


# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Function to generate proposals using Selective Search
def get_selective_search_proposals(image_path):
    ss = cv2.ximgproc.segmentation.createSelectiveSearchSegmentation()
    image = cv2.imread(image_path)
    ss.setBaseImage(image)
    ss.switchToSelectiveSearchQuality()  # Higher quality proposals
    rects = ss.process()
    return rects[:200]  # Limit number of proposals

# Function to compute IoU
def compute_iou(true_boxes, pred_boxes):
    true_boxes = torch.tensor(true_boxes, dtype=torch.float32)
    pred_boxes = torch.tensor(pred_boxes, dtype=torch.float32)
    return box_iou(true_boxes, pred_boxes)

# Function to validate the model
def validate_model():
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for img, target in val_loader:
            img = img.to(device)
            gt_labels = [ann['category_id'] for ann in target]
            outputs = model(img)
            labels = torch.tensor(gt_labels, dtype=torch.long).to(device)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
    return val_loss / len(val_loader)

# Convert COCO bbox format [x, y, w, h] -> [x_min, y_min, x_max, y_max]
def coco_to_xyxy(box):
    x_min, y_min, width, height = box
    x_max = x_min + width
    y_max = y_min + height
    return [x_min, y_min, x_max, y_max]


def train_model(num_epochs=10, confidence_threshold=0.0, save_dir="/home/ryan/Documents/Deep_Learning"):
    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        for img, target in train_loader:  # Process images one at a time
            img = img.to(device)  # Move image to GPU
            img_id = target[0]['image_id'].item()  # Convert tensor to integer
            img_path = full_dataset.coco.loadImgs(img_id)[0]['file_name']
            img_full_path = f"{coco_root}/{img_path}"

            # Generate bounding box proposals using Selective Search
            proposals = get_selective_search_proposals(img_full_path)

            # Get ground truth boxes and labels
            gt_boxes = [ann['bbox'] for ann in target]
            gt_labels = [ann['category_id'] for ann in target]

            accumulated_loss = 0.0  # Accumulate loss for proposals

            optimizer.zero_grad()  # Zero gradients before processing proposals

            min_size = 20  # Minimum acceptable height/width

            accumulated_loss = torch.tensor(0.0, device=device)  # Initialize as a tensor

            print('seen all proposals')
            for (x, y, w, h) in proposals:
                # Ensure bounding box coordinates are valid and not too small
                if w <= min_size or h <= min_size or x < 0 or y < 0 or x + w > img.shape[3] or y + h > img.shape[2]:
                    continue

                # Crop and resize the region of interest (ROI)
                roi = img[:, :, y:y + h, x:x + w]
                roi_resized = torch.nn.functional.interpolate(roi, size=(224, 224), mode='bilinear', align_corners=False).to(device)

                gt_boxes_xyxy = [coco_to_xyxy(bbox) for bbox in gt_boxes]
                pred_box_xyxy = coco_to_xyxy([x, y, w, h])

                iou = compute_iou(gt_boxes_xyxy, [pred_box_xyxy])

                # Assign correct label based on IoU
                print(iou.max())
                if iou.max() > 0.5:
                    label = gt_labels[iou.argmax().item()]
                    print(label)
                else:
                    continue 

                labels = torch.tensor([label], dtype=torch.long).to(device)

                # Forward pass through the model
                outputs = model(roi_resized)
                probs = torch.softmax(outputs, dim=1)
                confidence, pred_label = torch.max(probs, 1)
                print(pred_label)
                # Skip low-confidence proposals
                if confidence.item() < confidence_threshold:
                    continue
                loss = criterion(outputs, labels)
                print(outputs, labels)
                # Accumulate loss across all high-confidence proposals
                accumulated_loss += loss

            if accumulated_loss > 0:
                accumulated_loss.backward()  # Backpropagate accumulated loss
                optimizer.step()  # Update model after processing all proposals

        # Perform validation
        val_loss = validate_model()
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {running_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}")

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            save_path = os.path.join(save_dir, f'resnet50_best.pth')
            torch.save(model.state_dict(), save_path)
            print(f"New best model saved to {save_path}")
        else:
            print("Early stopping, validation loss did not improve.")
            break

    print("Training complete.")

# Train the model
train_model()


loading annotations into memory...
Done (t=6.69s)
creating index...
index created!
seen all proposals
tensor(0.0051)
tensor(0.2049)
tensor(0.0264)
tensor(0.0488)
tensor(0.0652)
tensor(0.0642)
tensor(0.1515)
tensor(0.0470)
tensor(0.1032)
tensor(0.1157)
tensor(0.0448)
tensor(0.0331)
tensor(0.1548)
tensor(0.)
tensor(0.1023)
tensor(0.0101)
tensor(0.0414)
tensor(0.2362)
tensor(0.0153)
tensor(0.0064)
tensor(0.0211)
tensor(0.0073)
tensor(0.0459)
tensor(0.5779)
tensor([6])
tensor([28])
tensor([[-0.0218, -0.1217, -0.1882, -0.1074, -0.2919,  0.0997,  0.1103,  0.0212,
         -0.1436,  0.1472,  0.0996, -0.0359, -0.0186, -0.1882,  0.0265, -0.2571,
          0.0514,  0.1464, -0.0256, -0.1701, -0.1303,  0.1301,  0.1774,  0.0331,
          0.0193, -0.1946,  0.2867, -0.0316,  0.3479,  0.1909,  0.2999,  0.1118,
         -0.0158, -0.1441, -0.1471, -0.2155,  0.0043,  0.0791,  0.3419, -0.1553,
         -0.0833,  0.0590,  0.3118, -0.0643,  0.0057,  0.2070,  0.2763,  0.2574,
          0.1620,  0.0859,  0.0

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
