In [None]:
import torch
from torch.utils.data import Dataset,DataLoader
import pandas as pd
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from torchvision.transforms import transforms
import torchvision
import torch.nn as nn
from math import sqrt
from torchvision.ops import nms,roi_pool
from tqdm.auto import tqdm

In [None]:
class VOC_dataset(Dataset):
    '''
        Dataset that pairs an image with the bounding boxes stored in its label file.

        Each item is a tuple (img,final) where
            img - The (optionally transformed) image
            final - A list of lists of the form [class_number,x_center,y_center,width,height],
                    with the four box values multiplied by img_size
    '''
    def __init__(self,file_dir,img_dir,label_dir,img_size,transforms=None,max_samples=10):
        '''
            Inputs:
                file_dir - Path of the file containing 2 columns
                           1st - image file name, 2nd - Label file name
                img_dir - Path of the images folder
                label_dir - Path of the labels folder
                            Each label file contains class_number, x_center, y_center, width, height
                img_size - Factor the box values of every label are multiplied by
                transforms - Optional callable applied to the opened image
                max_samples - Upper bound on the reported dataset length.
                              Defaults to 10, which is the cap that used to be hard-coded;
                              pass None to use every row of the annotation file
        '''
        self.annotations=pd.read_csv(file_dir)
        self.img_dir=img_dir
        self.label_dir=label_dir
        self.img_size=img_size
        self.transforms=transforms
        self.max_samples=max_samples
    def _read_labels(self,labels_path):
        '''
            Parses one label file into a list of [class_number,x_center,y_center,width,height].
            Blank lines are skipped and any run of whitespace separates the fields.
        '''
        final=[]
        #The context manager closes the file even if a line fails to parse
        with open(labels_path,"r") as file_ref:
            for line in file_ref:
                fields=line.split()
                if not fields:
                    continue
                example=[float(field) for field in fields]
                final.append([int(example[0])]+[item*self.img_size for item in example[1:]])
        return final
    def __getitem__(self,idx):
        img=Image.open(path.join(self.img_dir,self.annotations.iloc[idx,0]))
        if (self.transforms):
            img=self.transforms(img)
        labels_path=path.join(self.label_dir,self.annotations.iloc[idx,1])
        return img,self._read_labels(labels_path)
    def __len__(self):
        #Never report more items than the annotation file has rows
        total=len(self.annotations)
        if self.max_samples is None:
            return total
        return min(self.max_samples,total)

In [None]:
def my_collate(batch):
    '''
        Collate function that stacks the images but keeps the boxes as plain lists,
        because every image can have a different number of boxes.
        Input:
            batch - A list of tuples representing (image,bboxes)
        Output:
            A two element list [images,bboxes]
                where,
                    images - The images stacked along a new first dimension
                    bboxes - A list holding the bboxes entry of every sample
    '''
    image_list=[]
    bbox_lists=[]
    for sample in batch:
        image_list.append(sample[0])
        bbox_lists.append(sample[1])
    return [torch.stack(image_list),bbox_lists]

In [None]:
#Hyper-parameters shared by the data pipeline, the network and the losses
settings={
    #Side length the images are resized to; label coordinates are multiplied by it
    "img_size": 800,
    #Number of highest scoring boxes kept before NMS in proposal_layer
    "pre_nms_top_n": 12000,
    #IoU threshold handed to nms in proposal_layer
    "nms_threshold": 0.7,
    #Maximum number of proposals kept after NMS
    "post_nms_top_n": 2000,
    #IoU thresholds used by anchor_target_layer to label anchors as foreground / background
    "pos_threshold": 0.7,
    "neg_threshold": 0.3,
    #Size of the RPN mini-batch; half is reserved for foreground, half for background
    "num_RPN_samples": 256,
    #IoU thresholds used by proposal_target_layer to label proposals
    "fg_threshold": 0.5,
    "bg_threshold_hi": 0.5,
    "bg_threshold_lo": 0.1,
    #Number of proposals sampled for the classifier (half foreground, half background)
    "num_classification_samples": 1000, #Change
    #Output height and width of roi_pool
    "pool_size": 7,
    #Subsampling ratio between the image and the feature map
    "ss_ratio": 16,
    #Number of object classes; the class id num_classes is used for background
    "num_classes": 20,
    #Feature map size the anchors are generated for
    #(expected to equal img_size/ss_ratio - TODO confirm against the feature extractor)
    "feature_map_height": 50,
    "feature_map_width": 50,
    #Number of channels of the feature map fed to the RPN and the classifier
    "feature_map_depth": 512,
    #Height/width ratios and scales of the anchors placed at every feature map cell
    "anchor_ratios": [0.5,1,2],
    "anchor_scales": [8,16,32],
    #beta of the SmoothL1Loss and weight of the regression term, for the RPN loss
    "beta_rpn_loss": 1, #Change
    "lambd_rpn_loss": 1, #Change
    #Same two values for the classifier loss
    "beta_classifier_loss": 1, #Change
    "lambd_classifier_loss": 1, #Change
    #Learning rate of the Adam optimizer
    "lr": 0.001 #Change
}

In [None]:
#The training loop only reads the boxes of the first sample of every batch,
#so the batch size has to stay at 1
batch_size=1
#Per-channel mean and standard deviation used to normalise the images
#(presumably the ImageNet statistics the pretrained VGG16 expects - TODO confirm)
mean_list=[0.485,0.456,0.406]
std_list=[0.229, 0.224, 0.225]

#Convert to a tensor, normalise, then resize to a square of side img_size
composed=transforms.Compose([transforms.ToTensor(),
                             transforms.Normalize(mean_list,std_list),
                             transforms.Resize((settings["img_size"],settings["img_size"]))])

#Arguments: annotation csv, images folder, labels folder, label scaling factor, image transform
train_set=VOC_dataset("../input/pascalvoc-yolo/100examples.csv",
                      "../input/pascalvoc-yolo/images",
                      "../input/pascalvoc-yolo/labels",
                      settings["img_size"],
                      composed)
#my_collate keeps the per-image box lists unstacked
train_loader=DataLoader(train_set,batch_size=batch_size,shuffle=False,collate_fn=my_collate)

In [None]:
def plot_image(image,bboxes,mean_tensor,std_tensor,classes):
    '''
        Shows a normalised image together with its bounding boxes.
        Inputs:
            image - A normalised image tensor laid out as (channels x height x width)
            bboxes - A list of lists of the form [class_number,x_center,y_center,width,height]
            mean_tensor,std_tensor - Tensors used to undo the normalisation (image*std+mean)
            classes - A sequence mapping class_number to the text drawn next to the box
    '''
    denormalized=image*std_tensor+mean_tensor
    #imshow wants the channels last
    hwc_image=np.transpose(np.array(denormalized),(1,2,0))
    plt.imshow(hwc_image)
    axes=plt.gca()
    for bbox in bboxes:
        x_center,y_center,width,height=bbox[1:5]
        #Rectangle is anchored at the top left corner, the labels store the centre
        left=x_center-width/2
        top=y_center-height/2
        outline=matplotlib.patches.Rectangle((left,top),width,height,
                                             linewidth=1,edgecolor='r',facecolor='none')
        axes.add_patch(outline)
        plt.text(left,top,classes[int(bbox[0])],backgroundcolor='r')

In [None]:
#Check out the training data

num_classes=20
#Shaped (3,1,1) so they broadcast over a (3 x H x W) image
mean_tensor=torch.tensor(mean_list).view(-1,1,1)
std_tensor=torch.tensor(std_list).view(-1,1,1)
#Index = class number found in the label files; the last entry is the background class
classes=('aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
         'bus', 'car', 'cat', 'chair','cow',
         'diningtable', 'dog', 'horse', 'motorbike', 'person',
         'pottedplant','sheep', 'sofa', 'train', 'tvmonitor',
         'background')

#idx can have values [0,batch_size)
idx=0
#The built-in next() works on every iterator; the .next() method is a Python 2 idiom
#that recent DataLoader iterators do not provide
example=next(iter(train_loader))
image=example[0][idx]
bboxes=example[1][idx]
plot_image(image,bboxes,mean_tensor,std_tensor,classes)

In [None]:
def anchors_loc(x,y,anchor_ratios,anchor_scales,ss_ratio):
    '''
        Builds every anchor centred on one feature map cell.
        Inputs:
            x,y - Coordinate from the feature map
            anchor_ratios - A list of height/width ratios
            anchor_scales - A list of scales
            ss_ratio - Subsampling ratio, for VGG16 it is 16
        Output:
            anchors - len(anchor_scales)*len(anchor_ratios) x 4
                where,
                    Each row is of the form [x_ctr,y_ctr,w,h] in image coordinates and
                    the rows are ordered ratio first, scale second
    '''
    #Centre of the cell in image coordinates. Written as (x+0.5)*ss_ratio so that
    #odd subsampling ratios are not truncated by an integer division
    x_ctr=(x+0.5)*ss_ratio
    y_ctr=(y+0.5)*ss_ratio
    anchors=[]
    for ratio in anchor_ratios:
        for scale in anchor_scales:
            #h/w equals ratio while the area stays (ss_ratio*scale)**2
            h=ss_ratio*scale*sqrt(ratio)
            w=ss_ratio*scale*sqrt(1/ratio)
            anchors.append([x_ctr,y_ctr,w,h])
    return torch.tensor(anchors)

In [None]:
def anchor_generation(h,w,device,anchor_ratios=[0.5,1,2],anchor_scales=[8,16,32],ss_ratio=16):
    '''
        Generates the anchors of every feature map cell in one vectorised pass.
        Inputs: 
            h,w - Height,width of the feature map
            device - Device the returned tensor is moved to
            anchor_ratios - A list of height/width ratios (read only, never modified)
            anchor_scales - A list of scales (read only, never modified)
            ss_ratio - Subsampling ratio, for VGG16 it is 16
        Output:
            anchors - A tensor of shape (h*w*num_anchors x 4)
                where,
                    num_anchors - len(anchor_ratios)*len(anchor_scales)
                    Each row is of the form [x_ctr,y_ctr,w,h] in image coordinates and
                    the rows are ordered by y, then x, then ratio, then scale
    '''
    #Width and height only depend on (ratio,scale), so compute them once instead of per cell
    shapes=torch.tensor([[ss_ratio*scale*sqrt(1/ratio),ss_ratio*scale*sqrt(ratio)]
                         for ratio in anchor_ratios
                         for scale in anchor_scales],dtype=torch.float32).reshape(-1,2)
    num_anchors=shapes.shape[0]
    #Cell centres in image coordinates
    x_ctrs=(torch.arange(w,dtype=torch.float32)+0.5)*ss_ratio
    y_ctrs=(torch.arange(h,dtype=torch.float32)+0.5)*ss_ratio
    anchors=torch.zeros((h,w,num_anchors,4))
    #Broadcast the centres over the grid and the shapes over every cell
    anchors[...,0]=x_ctrs.view(1,w,1)
    anchors[...,1]=y_ctrs.view(h,1,1)
    anchors[...,2:]=shapes
    return anchors.reshape(-1,4).to(device)

In [None]:
def bbox_transform_inv(anchors,rpn_bboxes):
    '''
        Applies regression coefficients to anchors and returns the resulting boxes as corners.
        Inputs:
            anchors - A tensor of shape (N x 4), each row of the form [x_ctr,y_ctr,w,h]
            rpn_bboxes - A tensor of shape (N x 4), each row of the form [dx,dy,dw,dh]
        Output:
            A tensor of shape (N x 4), each row of the form [x_tl,y_tl,x_br,y_br]
    '''
    ctr_x,ctr_y,anchor_w,anchor_h=anchors[:,0],anchors[:,1],anchors[:,2],anchors[:,3]
    d_x,d_y,d_w,d_h=rpn_bboxes[:,0],rpn_bboxes[:,1],rpn_bboxes[:,2],rpn_bboxes[:,3]
    #Centres move by a fraction of the anchor size, sizes are scaled exponentially
    new_ctr_x=d_x*anchor_w+ctr_x
    new_ctr_y=d_y*anchor_h+ctr_y
    new_w=torch.exp(d_w)*anchor_w
    new_h=torch.exp(d_h)*anchor_h
    #Centre/size -> top left and bottom right corners
    left=new_ctr_x-0.5*new_w
    top=new_ctr_y-0.5*new_h
    right=left+new_w
    bottom=top+new_h
    return torch.stack([left,top,right,bottom],dim=1)

In [None]:
def clip_bboxes(transformed,img_size,device):
    '''
        Clips boxes to the image, modifying the given tensor in place.
        Inputs: 
            transformed - A tensor of shape (N x 4), each row of the form [x_tl,y_tl,x_br,y_br]
            img_size - A integer value representing width and height of the input image
            device - Unused, kept so existing callers keep working
        Output:
            Clipped_bboxes - The same tensor object as transformed, with every
                             coordinate limited to the range [0,img_size]
    '''
    #Every coordinate needs both bounds: a box that lies completely outside the image
    #would otherwise end up with its top left corner beyond its bottom right corner
    transformed[:,:4]=transformed[:,:4].clamp(min=0,max=img_size)
    return transformed

In [None]:
def select_anchors(anchors,img_size):
    '''
        Keeps the boxes that lie inside the image.
        Inputs: 
            anchors - A tensor of shape (-1 x 4), each row of the form [x_tl,y_tl,x_br,y_br]
            img_size - An integer
        Outputs:
            selected_anchors - A tensor of shape (-1 x 4)
            a_idx - A tensor of shape [-1] 
                where,
                    Every value is the index of selected anchors
    '''
    x_tl,y_tl,x_br,y_br=anchors[:,0],anchors[:,1],anchors[:,2],anchors[:,3]
    #Top left corner must not be negative, bottom right corner must be strictly below img_size
    inside=(x_tl>=0)&(y_tl>=0)&(x_br<img_size)&(y_br<img_size)
    (a_idx,)=torch.where(inside)
    return anchors[a_idx,:],a_idx

In [None]:
def proposal_layer(anchors,rpn_scores,rpn_bboxes,img_size,pre_nms_top_n,nms_threshold,post_nms_top_n,device,mode='train'):
    '''
        Turns the RPN output into a list of region proposals.
        Inputs: 
            anchors - A tensor of shape (h*w*num_anchors x 4)
                where,
                    num_anchors - len(anchor_ratios)*len(anchor_scales)
            rpn_scores - A tensor of shape (-1 x 1)
            rpn_bboxes - Similar to anchors but contains regression coefficients
            img_size - An integer representing width and height of the input image
            pre_nms_top_n - Number of best scoring boxes that enter NMS
            nms_threshold - IoU threshold used by NMS
            post_nms_top_n - Maximum number of boxes kept after NMS
            device - Passed on to clip_bboxes
            mode - Currently not used
        Output:
            scores - A tensor of shape (-1,)
            proposals - A tensor of shape (-1 x 4), each row of the form [x_tl,y_tl,x_br,y_br]
    '''
    #Decode the regression coefficients and keep the boxes inside the image
    decoded=bbox_transform_inv(anchors,rpn_bboxes)
    boxes_in_image=clip_bboxes(decoded,img_size,device)
    #Rank every box by its score and keep the best pre_nms_top_n
    ranked_scores,ranked_idx=rpn_scores.view(-1).sort(descending=True)
    top_idx=ranked_idx[:pre_nms_top_n]
    top_scores=ranked_scores[:pre_nms_top_n]
    candidates=boxes_in_image[top_idx,:]
    #Suppress overlapping boxes, then cap the number of survivors
    keep=nms(candidates,top_scores,nms_threshold)
    keep=keep[:min(keep.shape[0],post_nms_top_n)]
    return top_scores[keep],candidates[keep,:]

In [None]:
def tl_br(anchors):
    '''
        Converts boxes from centre/size form to corner form.
        Inputs: 
            anchors - A tensor of shape (N x 4), each row of the form [x_ctr,y_ctr,w,h]
        Output:
            A tensor of shape (N x 4), each row of the form [x_tl,y_tl,x_br,y_br]
    '''
    ctr_x,ctr_y,width,height=anchors[:,0],anchors[:,1],anchors[:,2],anchors[:,3]
    left=ctr_x-width/2
    top=ctr_y-height/2
    right=left+width
    bottom=top+height
    return torch.stack([left,top,right,bottom],dim=1)

In [None]:
def get_iou_matrix(anchors,gt_bboxes,device):
    '''
        Computes the intersection over union of every box with every ground truth box.
        Inputs: 
            anchors - A tensor of shape (N x 4)
                where,
                    each row in the tensor is of the form [x_tl,y_tl,x_br,y_br]
            gt_bboxes - A list of lists
                where,
                    Every inner list is of the form [class_number,x_ctr,y_ctr,width,height]
            device - Device the result and the temporary tensors are created on
        Output:
            iou_matrix - A tensor of shape (anchors.shape[0] x len(gt_bboxes))
    '''
    box_left,box_top=anchors[:,0],anchors[:,1]
    box_right,box_bottom=anchors[:,2],anchors[:,3]
    #The area of the boxes does not depend on the ground truth box
    box_area=(box_right-box_left)*(box_bottom-box_top)
    iou_matrix=torch.zeros((anchors.shape[0],len(gt_bboxes)),device=device)
    zero_tensor=torch.tensor([0],device=device)
    for column,(_,gt_x_ctr,gt_y_ctr,gt_w,gt_h) in enumerate(gt_bboxes):
        #Ground truth box as corners
        gt_left=torch.tensor(gt_x_ctr-gt_w/2,device=device)
        gt_top=torch.tensor(gt_y_ctr-gt_h/2,device=device)
        gt_right=gt_left+gt_w
        gt_bottom=gt_top+gt_h
        #Overlapping rectangle; width and height are floored at 0 when the boxes are disjoint
        overlap_w=torch.minimum(box_right,gt_right)-torch.maximum(box_left,gt_left)
        overlap_h=torch.minimum(box_bottom,gt_bottom)-torch.maximum(box_top,gt_top)
        intersection=torch.maximum(zero_tensor,overlap_w)*torch.maximum(zero_tensor,overlap_h)
        gt_area=(gt_right-gt_left)*(gt_bottom-gt_top)
        union=box_area+gt_area-intersection
        #The small constant avoids a division by zero for degenerate boxes
        iou_matrix[:,column]=intersection/(union+1e-8)
    return iou_matrix

In [None]:
def convert_select_anchors(anchors,img_size):
    '''
        Converts the anchors to corner form and drops every anchor whose
        top left or bottom right corner is outside the image.
        Inputs: 
            anchors - A tensor of shape (h*w*num_anchors x 4), each row of the form [x_ctr,y_ctr,w,h]
                where,
                    num_anchors - len(anchor_ratios)*len(anchor_scales)
            img_size - An integer
        Outputs:
            selected_anchors - A tensor of shape (-1 x 4), each row of the form [x_tl,y_tl,x_br,y_br]
            a_idx - A tensor of shape [-1] 
                where,
                    Every value is the index of selected anchors
    '''
    #select_anchors applies exactly the inside-the-image test needed here
    return select_anchors(tl_br(anchors),img_size)

In [None]:
def _fill_foreground_rows(actual,anchor_idx,gt_idx,tensor_gt,ctr_anchors,rpn_scores,rpn_bboxes):
    '''
        Marks the rows anchor_idx of actual as foreground and fills in the predicted score,
        the regression targets towards the ground truth boxes gt_idx and the predicted coefficients.
        ctr_anchors must be of the form [x_ctr,y_ctr,w,h].
    '''
    actual[anchor_idx,0]=1
    actual[anchor_idx,1]=rpn_scores[anchor_idx,0]
    #Targets are the inverse of bbox_transform_inv: centre offsets relative to the anchor
    #size and log ratios of the sizes
    actual[anchor_idx,2]=(tensor_gt[gt_idx,1]-ctr_anchors[anchor_idx,0])/ctr_anchors[anchor_idx,2]
    actual[anchor_idx,3]=(tensor_gt[gt_idx,2]-ctr_anchors[anchor_idx,1])/ctr_anchors[anchor_idx,3]
    actual[anchor_idx,4]=torch.log(tensor_gt[gt_idx,3]/ctr_anchors[anchor_idx,2])
    actual[anchor_idx,5]=torch.log(tensor_gt[gt_idx,4]/ctr_anchors[anchor_idx,3])
    actual[anchor_idx,6:]=rpn_bboxes[anchor_idx,:]

def anchor_target_layer(anchors,gt_bboxes,pos_threshold,neg_threshold,num_samples,img_size,rpn_scores,rpn_bboxes,device):
    '''
        Labels the anchors as foreground / background and builds the RPN training samples.
        Inputs: 
            anchors - A tensor of shape (h*w*num_anchors x 4), each row of the form [x_ctr,y_ctr,w,h]
                where,
                    num_anchors - len(anchor_ratios)*len(anchor_scales)
            gt_bboxes - A list of lists
                where,
                    Every inner list is of the form [class_number,x_ctr,y_ctr,width,height]
            pos_threshold - A floating point value for classifying anchor boxes as foreground
            neg_threshold - A floating point value for classifying anchor boxes as background
            num_samples - An integer that represents the size of the mini-batch
            img_size - A integer that represents the width and height of the input image
            rpn_scores - A tensor of shape (-1 x 1)
            rpn_bboxes - A tensor of shape (-1 x 4) with the predicted regression coefficients
            device - Device the intermediate tensors are created on
        Outputs:
            foreground_samples - A tensor of shape (-1 x 10)
                where,
                    Each row is of the form [1,foreground_prob,gt_tx,gt_ty,gt_tw,gt_th,tx,ty,tw,th]
            background_samples - A tensor of shape (-1 x 2)
                where,
                        Each row is of the form [0,foreground_prob]
            At most num_samples//2 rows are returned for each of the two; the first
            matching anchors are taken, no random sampling is done.
    '''
    #Convert list to tensor; the reshape keeps it 2-D when there are no ground truth boxes
    tensor_gt=torch.tensor(gt_bboxes,dtype=torch.float,device=device).reshape(-1,5)
    #num_class foreground and num_class background samples
    num_class_samples=num_samples//2
    #Remove anchors that are outside the image (s_anchors is in corner form, used for the IoU)
    s_anchors,s_idx=convert_select_anchors(anchors,img_size)
    #The regression targets need the same anchors in [x_ctr,y_ctr,w,h] form,
    #because that is the form bbox_transform_inv decodes the coefficients with
    s_ctr_anchors=anchors[s_idx,:]
    #Get rpn_scores only for the selected anchors
    s_rpn_scores=rpn_scores[s_idx,:]
    #Get rpn_bboxes only for selected anchors
    s_rpn_bboxes=rpn_bboxes[s_idx,:]
    #Create a frame to store class information and actual regression coefficients
    #A first column of -1 means the anchor is ignored
    actual=-1*torch.ones((s_anchors.shape[0],10),device=device)
    if tensor_gt.shape[0]>0 and s_anchors.shape[0]>0:
        #Get the overlap information of every selected anchor with every gt bbox
        iou_matrix=get_iou_matrix(s_anchors,gt_bboxes,device)
        #Type A anchors: the anchor(s) with the highest overlap for each gt bbox.
        #A gt bbox that no anchor overlaps must not turn every zero-IoU anchor into a foreground
        max_overlap_values=torch.max(iou_matrix,axis=0)[0].unsqueeze(0)
        anchor_idx,gt_idx=torch.where(torch.logical_and(iou_matrix==max_overlap_values,max_overlap_values>0))
        _fill_foreground_rows(actual,anchor_idx,gt_idx,tensor_gt,s_ctr_anchors,s_rpn_scores,s_rpn_bboxes)
        #Type B anchors: anchors whose best overlap exceeds pos_threshold
        s_anchor_overlaps=torch.max(iou_matrix,axis=1)[0].unsqueeze(-1)
        anchor_idx,gt_idx=torch.where(torch.logical_and(s_anchor_overlaps==iou_matrix,s_anchor_overlaps>pos_threshold))
        _fill_foreground_rows(actual,anchor_idx,gt_idx,tensor_gt,s_ctr_anchors,s_rpn_scores,s_rpn_bboxes)
        #Background anchors: low overlap with every gt bbox and not already a foreground,
        #otherwise a type A anchor with a small IoU would lose its positive label again
        background_mask=torch.logical_and(s_anchor_overlaps.squeeze(-1)<neg_threshold,actual[:,0]!=1)
    else:
        #Without ground truth boxes every selected anchor is background
        background_mask=torch.ones(s_anchors.shape[0],dtype=torch.bool,device=device)
    anchor_idx=torch.where(background_mask)[0]
    #Construct actual tensor for backgrounds
    actual[anchor_idx,0]=0
    actual[anchor_idx,1]=s_rpn_scores[anchor_idx,0]
    #Create indices for foreground and background samples
    (pos_idx,)=torch.where(actual[:,0]==1)
    (neg_idx,)=torch.where(actual[:,0]==0)
    pos=min(pos_idx.shape[0],num_class_samples)
    neg=min(neg_idx.shape[0],num_class_samples)
    pos_idx=pos_idx[:pos]
    neg_idx=neg_idx[:neg]
    foreground_samples=actual[pos_idx,:]
    background_samples=actual[neg_idx,:2]
    return foreground_samples,background_samples

In [None]:
'''
anchors=torch.tensor([[2,2,2,2],[3,3,2,2],[-3,2,2,2],[501,0,3,4],[4,6,2,2],[2,2,2,2]])
gt_bboxes=[[0,2,2,2,2],[1,3,3,2,2]]
pos_threshold=0.7
neg_threshold=0.2
num_samples=256
img_size=500
rpn_scores=torch.tensor([[0.8],[0.44],[0.3],[0.2],[0.7],[0.9]])
rpn_bboxes=torch.tensor([[1,2,3,4],[4,3,2,1],[9,8,7,6],[3,5,8,9],[6,7,8,9],[1,8,0,1]],dtype=torch.float)
fs,bs=anchor_target_layer(anchors,gt_bboxes,pos_threshold,neg_threshold,num_samples,img_size,rpn_scores,rpn_bboxes,'cpu')
print(fs)
print(bs)
'''

In [None]:
class RPN(nn.Module):
    '''
        Region proposal head: a 3x3 convolution followed by two 1x1 convolutions,
        one for the objectness scores and one for the box regression coefficients.
    '''
    def __init__(self,in_channels,num_anchors):
        '''
            Inputs:
                in_channels - Number of channels of the incoming feature map
                num_anchors - Number of anchors per feature map cell
        '''
        super().__init__()
        hidden_channels=512
        self.conv=nn.Conv2d(in_channels,hidden_channels,kernel_size=3,padding=1)
        #One score per anchor
        self.s1=nn.Conv2d(hidden_channels,num_anchors,kernel_size=1)
        #Four regression coefficients per anchor
        self.s2=nn.Conv2d(hidden_channels,4*num_anchors,kernel_size=1)
        self.relu=nn.ReLU()
    def forward(self,x):
        '''
            Input:
                x - Feature map of shape (batch x in_channels x h x w)
            Outputs:
                scores - Sigmoid scores of shape (batch x num_anchors x h x w)
                offsets - Raw coefficients of shape (batch x 4*num_anchors x h x w)
        '''
        hidden=self.conv(x)
        hidden=self.relu(hidden)
        objectness=self.s1(hidden).sigmoid()
        return objectness,self.s2(hidden)

In [None]:
def weights_init(m):
    '''
        Initialiser meant to be used with nn.Module.apply.
        Convolution weights are drawn from a normal distribution (mean 0, std 0.001)
        and their biases are set to zero; every other module is left untouched.
        Input:
            m - A module
    '''
    if isinstance(m,nn.Conv2d):
        nn.init.normal_(m.weight,mean=0,std=0.001)
        #A convolution created with bias=False has no bias tensor to reset
        if m.bias is not None:
            nn.init.zeros_(m.bias)

In [None]:
def convert_to_center(proposals):
    '''
        Converts boxes from corner form to centre/size form.
        Input:
            proposals - A tensor of shape (N x 4), each row of the form [x_tl,y_tl,x_br,y_br]
        Output:
            A tensor of shape (N x 4), each row of the form [x_ctr,y_ctr,w,h]
    '''
    left,top,right,bottom=proposals[:,0],proposals[:,1],proposals[:,2],proposals[:,3]
    ctr_x=(left+right)/2
    ctr_y=(top+bottom)/2
    width=right-left
    height=bottom-top
    return torch.stack([ctr_x,ctr_y,width,height],dim=1)

In [None]:
def proposal_target_layer(proposals,gt_bboxes,fr_threshold,bg_threshold_hi,bg_threshold_lo,batch_size,num_classes,device):
    '''
        Labels the proposals and builds the training samples of the classification network.
        Inputs:
            proposals - Transformed and clipped boxes of the form [x_tl,y_tl,x_br,y_br], 
                shape - (-1 x 4)
            gt_bboxes - A list of lists
                where,
                    Every inner list is of the form [class_number,x_ctr,y_ctr,width,height]
            fr_threshold - threshold for foreground samples (Default: 0.5)
            bg_threshold_hi - Upper threshold for background samples (Default: 0.5)
            bg_threshold_lo - Lower threshold for background samples (Default: 0.1)
            batch_size - Number of samples to create for Classification network (Default: 128)
            num_classes - An integer value, also used as the class number of background samples
            device - Device the intermediate tensors are created on
        Outputs:
            samples - A tensor of shape (-1 x 5), each row of the form [class_number,tx,ty,tw,th],
                      foreground rows first, background rows after them
            ROIs - A tensor of shape (-1 x 5), each row of the form [0,x_tl,y_tl,x_br,y_br],
                   in the same row order as samples. The leading 0 is the index of the
                   image inside the batch, which is the layout roi_pool expects
    '''
    num_class_samples=batch_size//2
    gt_tensor=torch.tensor(gt_bboxes,dtype=torch.float,device=device)
    #Columns: [label (1 fg / 0 bg / -1 ignored),class_number,tx,ty,tw,th]
    samples=-1*torch.ones((proposals.shape[0],6),device=device)
    iou_matrix=get_iou_matrix(proposals,gt_bboxes,device)
    #Best overlapping gt bbox of every proposal
    max_overlaps,gt_idx=torch.max(iou_matrix,axis=1)
    #Find foreground samples
    fg_idx=torch.where(max_overlaps>fr_threshold)[0]
    samples[fg_idx,:]=0
    samples[fg_idx,0]=1
    associated_gts_fg=gt_idx[fg_idx]
    samples[fg_idx,1]=gt_tensor[associated_gts_fg,0]
    #find background samples
    bg_idx=torch.where(torch.logical_and(max_overlaps<bg_threshold_hi,max_overlaps>bg_threshold_lo))[0]
    samples[bg_idx,:]=0
    associated_gts_bg=gt_idx[bg_idx]
    samples[bg_idx,1]=num_classes
    #Concatenate indices
    indices=torch.cat([fg_idx,bg_idx],axis=0)
    associated_gts=torch.cat([associated_gts_fg,associated_gts_bg],axis=0)
    #Convert proposals
    converted_proposals=convert_to_center(proposals)
    #Calculate regression coefficients
    samples[indices,2]=(gt_tensor[associated_gts,1]-converted_proposals[indices,0])/converted_proposals[indices,2]
    samples[indices,3]=(gt_tensor[associated_gts,2]-converted_proposals[indices,1])/converted_proposals[indices,3]
    samples[indices,4]=torch.log(gt_tensor[associated_gts,3]/converted_proposals[indices,2])
    samples[indices,5]=torch.log(gt_tensor[associated_gts,4]/converted_proposals[indices,3])
    #Create foreground and background samples
    fg_samples_idx=torch.where(samples[:,0]==1)[0]
    bg_samples_idx=torch.where(samples[:,0]==0)[0]
    num_fgs=min(fg_samples_idx.shape[0],num_class_samples)
    num_bgs=min(bg_samples_idx.shape[0],num_class_samples)
    final_fgs_idx=fg_samples_idx[:num_fgs]
    final_bgs_idx=bg_samples_idx[:num_bgs]
    rois=torch.cat([proposals[final_fgs_idx,:],proposals[final_bgs_idx,:]],axis=0)
    #All proposals belong to the single image of the batch, so the batch index column is 0
    #for every row. It is created on the device of the proposals so the concatenation
    #also works when they live on the GPU
    batch_column=torch.zeros((rois.shape[0],1),dtype=torch.float,device=rois.device)
    rois_with_batch=torch.cat([batch_column,rois],axis=1)
    return torch.cat([samples[final_fgs_idx,1:],samples[final_bgs_idx,1:]],axis=0),rois_with_batch

In [None]:
'''
proposals=torch.tensor([[1,1,3,3],[2,1,4,3],[5,1,6,3],[0,1,3,3]])
gt_bboxes=[[1,2,2,2,2]]
samples,ROI=proposal_target_layer(proposals,gt_bboxes,0.5,0.5,0.1,128,20,'cpu')
print(samples)
print(ROI)
'''

In [None]:
def create_labels_with_mask(samples,num_classes,device):
    '''
        Spreads the regression targets of every sample into the four columns of its class.
        Input:
            samples - A tensor of shape - (-1 x 5), each row of the form [class_number,tx,ty,tw,th]
            num_classes - Number of object classes; rows with class_number==num_classes
                          (background) end up with all-zero targets and an all-zero mask
            device - Device the outputs are created on
        Outputs:
            gts - A tensor of shape - (-1 x 1+4*num_classes)
                where,
                    1st value is the class number and the others are the actual regression
                    coefficients, stored at columns [1+4*class_number,1+4*class_number+4)
            mask - A tensor of shape - (-1 x 4*num_classes)
                where,
                    the four columns belonging to the class of the row are 1, the rest 0
    '''
    class_idx=samples[:,0].type(torch.long)
    num_rows=samples.shape[0]
    #One extra block of four columns catches the background class; it is cut off on return
    gts=torch.zeros((num_rows,1+4*(num_classes+1)),device=device)
    mask=torch.zeros((num_rows,4*(num_classes+1)),device=device)
    gts[:,0]=class_idx
    #arange always yields integer indices, also for zero rows
    rows=torch.arange(num_rows,device=device)
    start_idx=class_idx*4
    for coefficient in range(4):
        gts[rows,start_idx+1+coefficient]=samples[:,1+coefficient]
        mask[rows,start_idx+coefficient]=1
    return gts[:,:4*num_classes+1],mask[:,:4*num_classes]

In [None]:
class Classifier(nn.Module):
    '''
        Detection head applied to the pooled regions: two fully connected layers
        followed by a class score layer and a box regression layer.
    '''
    def __init__(self,in_channels,pool_size,num_classes):
        '''
            Inputs:
                in_channels - Number of channels of a pooled region
                pool_size - Height and width of a pooled region
                num_classes - Number of object classes (background gets one extra score)
        '''
        super().__init__()
        hidden_units=4096
        #Number of values of one flattened region
        self.flat=in_channels*pool_size*pool_size
        self.lin1=nn.Linear(self.flat,hidden_units)
        self.lin2=nn.Linear(hidden_units,hidden_units)
        self.relu=nn.ReLU()
        self.score_layer=nn.Linear(hidden_units,num_classes+1)
        self.offset_layer=nn.Linear(hidden_units,4*num_classes)
    def forward(self,x):
        '''
            Input:
                x - Pooled regions, reshaped to (-1 x in_channels*pool_size*pool_size)
            Outputs:
                class_scores - Raw scores of shape (-1 x num_classes+1)
                offsets - Regression coefficients of shape (-1 x 4*num_classes)
        '''
        flattened=x.view(-1,self.flat)
        hidden=self.relu(self.lin1(flattened))
        hidden=self.relu(self.lin2(hidden))
        return self.score_layer(hidden),self.offset_layer(hidden)

In [None]:
class ClassifierLoss(nn.Module):
    '''
        Loss of the classification network: cross entropy on the class scores plus
        lambd times a smooth L1 loss on the masked regression coefficients.
    '''
    def __init__(self,beta,lambd):
        '''
            Inputs:
                beta - beta of the SmoothL1Loss
                lambd - Weight of the regression term
        '''
        super().__init__()
        self.CEL=nn.CrossEntropyLoss()
        self.SmoothL1L=nn.SmoothL1Loss(beta=beta)
        self.lambd=lambd
    def forward(self,gts,mask,class_scores,offsets):
        '''
            Inputs:
                gts - A tensor whose 1st column is the class number and whose other
                      columns are the regression targets
                mask - A tensor with 1 at the columns of offsets that belong to the class of the row
                class_scores - Raw class scores, one row per sample
                offsets - Predicted regression coefficients, same shape as mask
            Output:
                A scalar tensor
        '''
        #The class numbers are stored as floats inside gts, but CrossEntropyLoss
        #needs integer class indices as its target
        class_targets=gts[:,0].long()
        class_loss=self.CEL(class_scores,class_targets)
        #Only the coefficients of the true class contribute; the targets are 0 everywhere else
        offset_loss=self.SmoothL1L(offsets*mask,gts[:,1:])
        return class_loss+self.lambd*offset_loss

In [None]:
class Network(nn.Module):
    '''
        Region proposal network plus classification head, operating on a
        feature map that was computed outside of this module.
    '''
    def __init__(self,settings,num_anchors,device):
        '''
            Inputs:
                settings - Dictionary with the hyper-parameters read below
                num_anchors - Number of anchors per feature map cell
                device - Device the anchors and all intermediate tensors are created on
        '''
        super().__init__()
        self.RPN=RPN(settings["feature_map_depth"],num_anchors)
        self.RPN.apply(weights_init)
        self.img_size=settings["img_size"]
        self.pre_nms_top_n=settings["pre_nms_top_n"]
        self.nms_threshold=settings["nms_threshold"]
        self.post_nms_top_n=settings["post_nms_top_n"]
        self.pos_threshold=settings["pos_threshold"]
        self.neg_threshold=settings["neg_threshold"]
        self.num_RPN_samples=settings["num_RPN_samples"]
        self.fg_threshold=settings["fg_threshold"]
        self.bg_threshold_hi=settings["bg_threshold_hi"]
        self.bg_threshold_lo=settings["bg_threshold_lo"]
        self.num_classification_samples=settings["num_classification_samples"]
        self.pool_size=settings["pool_size"]
        self.ss_ratio=settings["ss_ratio"]
        self.num_classes=settings["num_classes"]
        self.classifier=Classifier(settings["feature_map_depth"],settings["pool_size"],
                                   self.num_classes)
        #Anchors are fixed for a given feature map size, so they are generated once.
        #Rows are ordered by y, then x, then anchor
        self.generated_anchors=anchor_generation(settings["feature_map_height"],
                                                 settings["feature_map_width"],
                                                 device,
                                                 settings["anchor_ratios"],
                                                 settings["anchor_scales"],
                                                 self.ss_ratio)
        self.device=device
    def forward(self,feature_map,gt_bboxes):
        '''
            Inputs:
                feature_map - A tensor of shape (1 x feature_map_depth x h x w);
                              a single image is assumed because gt_bboxes holds the boxes of one image
                gt_bboxes - A list of lists of the form [class_number,x_ctr,y_ctr,width,height]
            Outputs:
                foreground_samples,background_samples - Inputs of the RPN loss
                gts,mask,class_scores,final_offsets - Inputs of the classifier loss
        '''
        rpn_scores,rpn_bboxes=self.RPN(feature_map)
        #The RPN returns (batch x channels x h x w). Move the channels last before flattening
        #so that row i of the scores / coefficients belongs to row i of the anchors,
        #which are ordered by y, then x, then anchor
        rpn_scores=rpn_scores.permute(0,2,3,1).reshape(-1,1)
        rpn_bboxes=rpn_bboxes.permute(0,2,3,1).reshape(-1,4)
        foreground_samples,background_samples=anchor_target_layer(self.generated_anchors,gt_bboxes,
                                                                  self.pos_threshold,self.neg_threshold,
                                                                  self.num_RPN_samples,self.img_size,
                                                                  rpn_scores,rpn_bboxes,self.device)
        _,proposals=proposal_layer(self.generated_anchors,rpn_scores,
                                   rpn_bboxes,self.img_size,
                                   self.pre_nms_top_n,
                                   self.nms_threshold,self.post_nms_top_n,
                                   self.device)
        samples,variable_sized_ROIs=proposal_target_layer(proposals,gt_bboxes,
                                              self.fg_threshold,self.bg_threshold_hi,
                                              self.bg_threshold_lo,self.num_classification_samples,
                                              self.num_classes,self.device)
        gts,mask=create_labels_with_mask(samples,self.num_classes,self.device)
        #spatial_scale maps image coordinates onto the feature map, which is
        #ss_ratio times smaller than the image, hence the reciprocal
        fix_sized_ROIs=roi_pool(feature_map,variable_sized_ROIs,
                                (self.pool_size,self.pool_size),
                                1.0/self.ss_ratio)
        class_scores,final_offsets=self.classifier(fix_sized_ROIs)
        return foreground_samples,background_samples,gts,mask,class_scores,final_offsets
    def inference(self):
        #Not implemented yet
        pass

In [None]:
class RPN_loss(nn.Module):
    '''
        Loss of the region proposal network: binary cross entropy on the foreground
        probabilities plus lambd times a smooth L1 loss on the regression
        coefficients of the foreground samples.
    '''
    def __init__(self,beta,lambd,device):
        '''
            Inputs:
                beta - beta of the SmoothL1Loss
                lambd - Weight of the regression term
                device - Device the target scores are created on
        '''
        super().__init__()
        self.lambd=lambd
        self.smooth_l1=nn.SmoothL1Loss(beta=beta)
        self.BCELoss=nn.BCELoss()
        self.device=device
    def forward(self,foreground_samples,background_samples):
        '''
            Inputs:
                foreground_samples - A tensor of shape (-1 x 10), each row of the form
                                     [1,foreground_prob,gt_tx,gt_ty,gt_tw,gt_th,tx,ty,tw,th]
                background_samples - A tensor of shape (-1 x 2), each row of the form [0,foreground_prob]
            Output:
                A scalar tensor. A term without samples contributes 0 instead of NaN
        '''
        foreground_scores=torch.ones(foreground_samples.shape[0],device=self.device)
        background_scores=torch.zeros(background_samples.shape[0],device=self.device)
        gt_scores=torch.cat([foreground_scores,background_scores],axis=0)
        pred_scores=torch.cat([foreground_samples[:,1].view(-1),background_samples[:,1].view(-1)],axis=0)
        #The mean over zero elements is NaN, so empty terms are replaced by an empty sum,
        #which is 0 and keeps dtype, device and autograd history
        if pred_scores.numel()>0:
            loss_cls=self.BCELoss(pred_scores,gt_scores)
        else:
            loss_cls=pred_scores.sum()
        if foreground_samples.shape[0]>0:
            #Columns 6: hold the predictions, columns 2:6 the targets
            loss_coords=self.smooth_l1(foreground_samples[:,6:],foreground_samples[:,2:6])
        else:
            loss_coords=foreground_samples.sum()
        return loss_cls+self.lambd*loss_coords

In [None]:
#Use the GPU when one is available, otherwise fall back to the CPU
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
#Number of anchors placed at every feature map cell
num_anchors=len(settings["anchor_ratios"])*len(settings["anchor_scales"])

feature_extractor=torchvision.models.vgg16(pretrained=True).features[:30] #Used till conv5_3
#The feature extractor is kept in eval mode and is not handed to the optimizer below
feature_extractor.eval()
feature_extractor=feature_extractor.to(device)
net=Network(settings,num_anchors,device).to(device)
get_rpn_loss=RPN_loss(settings["beta_rpn_loss"],settings["lambd_rpn_loss"],device)
get_classifier_loss=ClassifierLoss(settings["beta_classifier_loss"],settings["lambd_classifier_loss"])
#Only the parameters of net (RPN and classifier) are optimised
optimizer=torch.optim.Adam(net.parameters(),lr=settings["lr"])

In [None]:
#Number of batches per epoch, used to average the loss
num_examples=len(train_loader)
num_epochs=1

for epoch in range(num_epochs):
    avg_loss=0
    for x,y in tqdm(train_loader,leave=False):
        #The loader yields one list of boxes per image; the batch holds a single image
        gt_bboxes=y[0]
        #The feature extractor is not trained (its parameters are not in the optimizer),
        #so no autograd graph is needed for it. Without no_grad every backward pass would
        #also compute gradients for the VGG weights and keep accumulating them
        with torch.no_grad():
            feature_map=feature_extractor(x.to(device))
        fp_result=net(feature_map,gt_bboxes)
        #fp_result = (foreground_samples,background_samples,gts,mask,class_scores,final_offsets)
        rpn_loss=get_rpn_loss(fp_result[0],fp_result[1])
        classifier_loss=get_classifier_loss(fp_result[2],fp_result[3],fp_result[4],fp_result[5])
        loss=rpn_loss+classifier_loss
        optimizer.zero_grad()
        loss.backward()
        avg_loss+=loss.item()/num_examples
        optimizer.step()
    print(f"Epoch {epoch+1}, Average loss: {avg_loss:.5f}")