In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.datasets as datasets
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.cuda import amp

from src.pretrain_detnet import detnet59_fpn
from src.RCNN import MaskRCNN, RPN
from src.pascal_voc import VocDetectionData, VOC_CLASSES
from src.criterion import RPNLoss
import os

import cv2

from matplotlib import pyplot as plt

%load_ext autoreload
%autoreload 2

In [2]:
cwd = os.getcwd()
pretrained_path = os.path.join(cwd, "saved_model_working/pretrained_backbone.pth.tar")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
rpn_net = RPN(pretrained_path).to(device)
criterion = RPNLoss().to(device)
optimizer = torch.optim.SGD(rpn_net.parameters(), lr=0.001)
schedular = torch.optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.1)

In [4]:
data = VocDetectionData(root=cwd, size=800)

Initializing dataset


In [5]:
train_loader = DataLoader(data, batch_size=1, shuffle=True, pin_memory=True)

In [8]:
scaler = amp.GradScaler()
losses = []
rpn_net.train()
for epoch in range(50):
    print("epoch " + str(epoch))
    for i, data in enumerate(train_loader):
        image, labels, bboxes, fname = data
        image = image.to(device)
        bboxes = bboxes.to(device)

        with amp.autocast():
            rpn_reg_scores, anchors, rpn_bboxes_xywh, rpn_bboxes_xyxy, rpn_obj_scores = rpn_net(image)
            loss = criterion(rpn_reg_scores, rpn_obj_scores, rpn_bboxes_xyxy, bboxes.detach()[0], anchors.detach())

        losses.append(loss.detach().item())
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        schedular.step()

        if i % 1000 == 0:
            print("itr: " + str(i) + " loss: " + str(loss))

        del loss
        del rpn_reg_scores
        del anchors
        del rpn_bboxes_xywh
        del rpn_bboxes_xyxy
        del rpn_obj_scores
        del image
        del labels
        del bboxes
        del fname

    

epoch 0
itr: 0 loss: tensor(0.7017, device='cuda:0', grad_fn=<AddBackward0>)


KeyboardInterrupt: 

In [9]:
losses

[0.7017409205436707,
 0.7115809321403503,
 0.6995367407798767,
 0.7034119367599487,
 0.7005923390388489,
 0.7067487835884094,
 0.6956576108932495,
 0.6968466639518738,
 0.6942546963691711,
 0.6952702403068542,
 0.6929062604904175,
 0.6918022632598877,
 0.7049766182899475,
 0.6861869096755981,
 0.7184505462646484,
 0.6982505321502686,
 0.7006296515464783,
 0.6996737718582153,
 0.6976303458213806,
 0.7139388918876648,
 0.6975721716880798,
 0.6930006742477417,
 0.6977785229682922,
 0.6904575228691101,
 0.6994780898094177,
 0.6983123421669006,
 0.7040213346481323,
 0.6985630393028259,
 0.6977686882019043,
 0.702555775642395,
 0.6891095042228699,
 0.6998506784439087,
 0.702961802482605,
 0.6995626091957092,
 0.6954851746559143,
 0.6984736919403076,
 0.6990086436271667,
 0.6985742449760437,
 0.7100406885147095,
 0.7083916068077087,
 0.6976839303970337,
 0.6938945055007935,
 0.7018881440162659]

In [None]:
def draw_bboxes(image, label, bbox):
    image = image.permute(1, 2, 0).numpy()
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    print(bbox)
    for i in range(bbox.shape[0]):
        box = bbox[i]
        p1 = (int(box[0] * image.shape[1]), int(box[1] * image.shape[0]))
        p2 = (int(box[2] * image.shape[1]), int(box[3] * image.shape[0]))
        cv2.rectangle(image, p1, p2, color=[128, 0, 0], thickness=2)
        text_size, baseline = cv2.getTextSize(VOC_CLASSES[label[i]], cv2.FONT_HERSHEY_SIMPLEX, 0.4, 1)
        p1 = (p1[0], p1[1] - text_size[1])
        cv2.rectangle(image, (p1[0] - 2 // 2, p1[1] - 2 - baseline), (p1[0] + text_size[0], p1[1] + text_size[1]),
                    [128, 0, 0], -1)

        cv2.putText(image, VOC_CLASSES[label[i]], (p1[0], p1[1] + baseline), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1, 8)

    plt.figure(figsize = (15,15))
    plt.imshow(image)

In [None]:
detection_data = datasets.VOCDetection(root=cwd, year="2012", image_set="trainval", transform=transforms.ToTensor())

In [None]:
detection_data[0][1]["annotation"]["object"]

[{'name': 'tvmonitor',
  'pose': 'Frontal',
  'truncated': '0',
  'occluded': '0',
  'bndbox': {'xmin': '34', 'ymin': '11', 'xmax': '448', 'ymax': '293'},
  'difficult': '0'}]

In [None]:
a = torch.rand(4, 2)
print(a)
b = a > 0.8
print(b)
for i in range(2):
    if b[:, i].sum() == 0:
        b[a[:, i].argmax(), i] = True
print(b)
print(b.nonzero().shape)

tensor([[0.6592, 0.3873],
        [0.9675, 0.4224],
        [0.7821, 0.7219],
        [0.1726, 0.4306]])
tensor([[False, False],
        [ True, False],
        [False, False],
        [False, False]])
tensor([[False, False],
        [ True, False],
        [False,  True],
        [False, False]])
torch.Size([2, 2])


device(type='cpu')

In [None]:
a = torch.rand(2, 3, 4)
print(a)
print(a.view(-1, 2))
b = torch.rand(12, 2)
print()
print(b.view(2, 3, 4))


tensor([[[0.8293, 0.7149, 0.7835, 0.0978],
         [0.2139, 0.0152, 0.1582, 0.1765],
         [0.3522, 0.1909, 0.5115, 0.7426]],

        [[0.8155, 0.4853, 0.4972, 0.6358],
         [0.1679, 0.9749, 0.3284, 0.6902],
         [0.8101, 0.2171, 0.9477, 0.4416]]])
tensor([[0.8293, 0.7149],
        [0.7835, 0.0978],
        [0.2139, 0.0152],
        [0.1582, 0.1765],
        [0.3522, 0.1909],
        [0.5115, 0.7426],
        [0.8155, 0.4853],
        [0.4972, 0.6358],
        [0.1679, 0.9749],
        [0.3284, 0.6902],
        [0.8101, 0.2171],
        [0.9477, 0.4416]])
tensor([[0.4901, 0.2967],
        [0.8392, 0.2936],
        [0.1382, 0.7315],
        [0.8575, 0.0733],
        [0.8191, 0.6934],
        [0.8408, 0.7207],
        [0.2457, 0.7487],
        [0.3968, 0.3466],
        [0.6786, 0.2178],
        [0.7180, 0.5348],
        [0.8606, 0.9914],
        [0.7695, 0.0661]])
tensor([[[0.4901, 0.2967, 0.8392, 0.2936],
         [0.1382, 0.7315, 0.8575, 0.0733],
         [0.8191, 0.6934, 