In [1]:
%matplotlib inline

import os
import random
import re
import shlex
import sys

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import yaml

# Seed every RNG this notebook relies on (Python, NumPy, PyTorch) with the same value.
seed=0
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(seed)

# Optional cuDNN trade-off, left disabled (note: `cudnn` is not imported in this notebook).
## reproducible but slower
#cudnn.benchmark, cudnn.deterministic = False, True
## faster but less reproducible
#cudnn.benchmark, cudnn.deterministic = True, False

# Kept as the final expression so the cell still echoes the torch.Generator.
torch.manual_seed(seed)

<torch._C.Generator at 0x1bfeb605c70>

In [2]:

%load_ext autoreload
%autoreload 2

sys.path.append('../../../')
from video_processing.yolov7.parameter_parser import parser
from video_processing.yolov7.models.model import Model
from video_processing.yolov7.train.utils import setup_optimizer, labels_to_class_weights
from video_processing.yolov7.dataset.coco_dataset import LoadImagesAndLabels
from video_processing.yolov7.utils.general import one_cycle, check_image_size
from video_processing.yolov7.dataset.anchors import check_anchor_matching

In [3]:
# Dataset and output roots. Forward slashes are deliberate: they work on Windows and
# survive the POSIX-style tokenisation below (backslashes would be read as escapes).
data_dirpath='D:/data/coco'
result_dirpath='D:/results/yolov7'

# Command line for the project's argument parser, written as it would be typed in a shell.
argument=f"""
--data-dirpath {data_dirpath}/coco --output-dirpath {result_dirpath} 
--worker 1 --device cpu --batch-size 2 --data coco.yaml --img 1280 1280 --cfg yolov7-w6.yaml
--weights ''  --name yolov7-w6 --hyp hyp.scratch.p6.yaml 
--n-training-data 100 --n-val-data 20 --correct-exif
"""
# shlex.split honours shell quoting, so `--weights ''` becomes an empty string.
# A plain str.split() would hand argparse the two literal quote characters instead.
args=parser.parse_args(shlex.split(argument))

# Fall back to the CPU when CUDA is unavailable or `--device cpu` was requested.
device=torch.device('cpu' if not torch.cuda.is_available() or args.device=='cpu' else 'cuda')
print(device, args.batch_size)

# hyperparameters (SafeLoader: plain YAML types only, no arbitrary object construction)
with open(args.hyp) as f:
    hyp=yaml.load(f, Loader=yaml.SafeLoader)

cpu 2


In [4]:
# nc: number of object classes the detection heads predict.
nc=80
# Build the network from the config file, then place it on `device`.
# Moving the model before any optimizer is created is the safer order.
model=Model(args.cfg, ch=3, nc=nc, anchors=hyp.get('anchors'))
model=model.to(device)


In IAxDetect nl: 4 na: 3
In IAxDetect anchors: torch.Size([4, 3, 2]) 4x3x2
In IAxDetect anchor_grid: torch.Size([4, 1, 3, 1, 1, 2]) 4x1x3x1x1x2


In [5]:
# Dataset description file; its 'train' entry is handed to the dataset below.
with open(args.data) as f:
    data_dict=yaml.load(f, Loader=yaml.SafeLoader)

# Training dataset (augmentation on) and its loader; only the training split is built here.
train_dataset=LoadImagesAndLabels(
    data_dirpath=args.data_dirpath,
    image_paths=data_dict['train'],
    img_size=args.img_size[0],
    augment=True,
    hyp=hyp,
    n_data=args.n_training_data,
    correct_exif=args.correct_exif,
)
train_loader=torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=args.batch_size,
    num_workers=1,
    pin_memory=True,
    collate_fn=LoadImagesAndLabels.collate_fn,
)

In dataset.coco_dataset.__init__ save cache to D:\data\coco\coco\labels\train2017.cache cache_path.is_file() True


In [6]:
import torch.nn.functional as F
from video_processing.yolov7.loss.module import ComputeLossAuxOTA

In [7]:

# model.gr blends a fixed objectness of 1 with the prediction/ground-truth IoU when the
# objectness target is built: target_objectness = (1-gr) + gr*iou.
model.gr=1.
loss_module=ComputeLossAuxOTA(
    model,
    cls_pw=hyp['cls_pw'],
    obj_pw=hyp['obj_pw'],
    label_smoothing=args.label_smoothing,
)
# hyper-parameter entries, printed in the order obj, cls, box
print(*(hyp[key] for key in ('obj', 'cls', 'box')))


0.7 0.3 0.05


In [8]:
# Pull one batch and summarise it: image shape/dtype/value range, then the
# per-column minimum and maximum of the target rows (two formatted lists).
imgs, targets, paths=next(iter(train_loader))
print('imgs ', imgs.shape, imgs.dtype, imgs.min().item(), imgs.max().item())
print('targets ', targets.shape, targets.dtype,
      *([f'{value:.2f}' for value in extremum.tolist()]
        for extremum in (targets.min(0).values, targets.max(0).values)))

imgs  torch.Size([2, 3, 1280, 1280]) torch.uint8 0 154
targets  torch.Size([4, 6]) torch.float32 ['0.00', '0.00', '0.50', '0.40', '0.06', '0.06'] ['1.00', '32.00', '0.67', '0.71', '1.00', '0.60']


In [9]:
# Scale the integer image batch to floats in [0, 1] and move it to `device`.
# The guard keeps the cell idempotent: `imgs` is rebound in place, so without it a
# second run of this cell would divide the already-normalised batch by 255 again.
if not imgs.is_floating_point():
    imgs=imgs.to(device, non_blocking=True).float() / 255.0

# Forward pass. The print below shows a list of per-level tensors for this config.
predictions = model(imgs)
print(type(predictions), type(predictions[0]), len(predictions), [p.shape for p in predictions])


<class 'list'> <class 'torch.Tensor'> 8 [torch.Size([2, 3, 160, 160, 85]), torch.Size([2, 3, 80, 80, 85]), torch.Size([2, 3, 40, 40, 85]), torch.Size([2, 3, 20, 20, 85]), torch.Size([2, 3, 160, 160, 85]), torch.Size([2, 3, 80, 80, 85]), torch.Size([2, 3, 40, 40, 85]), torch.Size([2, 3, 20, 20, 85])]


In [11]:
# Reference result from the packaged loss module; later cells re-derive it step by step.
loss_module(
    predictions,
    targets,
    images=imgs,
    matching_threshold=hyp['anchor_t'],
    box_weight=hyp['box'],
    obj_weight=hyp['obj'],
    cls_weight=hyp['cls'],
)

(tensor([2.4002], grad_fn=<MulBackward0>),
 tensor([0.1461, 0.9509, 0.1031, 1.2001]),
 tensor([2.9214, 1.3585, 0.3436, 1.2001]))

In [127]:
# Shapes of everything that feeds the loss. Only a cell's last expression is echoed,
# so the intermediate summaries are printed explicitly instead of being discarded.
print(len(predictions), [p.shape for p in predictions])
print(len(targets), [t.shape for t in targets])
imgs.shape

8 [torch.Size([2, 3, 160, 160, 85]), torch.Size([2, 3, 80, 80, 85]), torch.Size([2, 3, 40, 40, 85]), torch.Size([2, 3, 20, 20, 85]), torch.Size([2, 3, 160, 160, 85]), torch.Size([2, 3, 80, 80, 85]), torch.Size([2, 3, 40, 40, 85]), torch.Size([2, 3, 20, 20, 85])]


torch.Size([2, 3, 1280, 1280])

In [14]:
predictions (list[Tensor]): list of 2*NL tensors of shape BxAxHxWxO, where NL is the number of levels (the factor 2 covers the outputs of the main and auxiliary heads)
targets (list[Tensor]): list of Ntx6 targets per level, where Nt is the number of targets, which may vary per level; the 6 columns are image-index,
    class-index, x, y, w, h in normalized space relative to image width and height. (x, y) is the box center
imgs (Tensor[float]): BxCxHxW

torch.Size([2, 3, 1280, 1280])

In [13]:

# Step-by-step version of the target matching done inside the loss module's
# `__call__(self, predictions, targets, images)`.
# NOTE(review): find_5_positive / determine_matching_targets are not imported in the
# cells shown here; they must already be defined in the kernel for this cell to run.

# Auxiliary head: candidate (target, anchor, grid-cell) matches in grid units, then the
# final per-level assignment. Original call for reference:
# bs_aux, as_aux_, gjs_aux, gis_aux, targets_aux, anchors_aux = self.build_targets2(p[:self.nl], targets, imgs)
indices4aux, anch4aux=find_5_positive(
    prediction=predictions[:loss_module.nl], targets=targets, anchors=loss_module.anchors,
    matching_threshold=hyp['anchor_t'], inside_grid_cell=1.)
bs_aux, as_aux_, gjs_aux, gis_aux, targets_aux, anchors_aux=determine_matching_targets(
    prediction=predictions[:loss_module.nl], targets=targets, indices=indices4aux, anch=anch4aux,
    stride=loss_module.stride, image_size=imgs.shape[2], n_classes=loss_module.nc)

# Main head: same procedure with inside_grid_cell=.5 instead of 1.
indices4main, anch4main=find_5_positive(
    prediction=predictions[:loss_module.nl], targets=targets, anchors=loss_module.anchors,
    matching_threshold=hyp['anchor_t'], inside_grid_cell=.5)
# `predictions` (not the undefined name `pred`) is the model output to match against.
# CAUTION: this rebinds `targets` from the raw batch targets to the matched per-level
# result, so the batch cell must be re-run before this cell is executed again.
bs, as_, gjs, gis, targets, anchors=determine_matching_targets(
    prediction=predictions[:loss_module.nl], targets=targets, indices=indices4main, anch=anch4main,
    stride=loss_module.stride, image_size=imgs.shape[2], n_classes=loss_module.nc)

In [19]:
# One 1D tensor per level holding the grid size as (W, H, W, H), i.e. XYXY order.
# Predictions are BxAxHxWxO, so W is shape[3] and H is shape[2].
feature_grid_resolution=[
    torch.tensor((level_out.shape[3], level_out.shape[2])*2, device=device)
    for level_out in predictions[:loss_module.nl]
]
print(feature_grid_resolution)

[tensor([160, 160, 160, 160]), tensor([80, 80, 80, 80]), tensor([40, 40, 40, 40]), tensor([20, 20, 20, 20])]


In [123]:
# Manual re-computation of the loss, level by level, for the main and auxiliary heads.
# Relies on the matching results (bs/as_/gjs/gis/targets/anchors and their *_aux twins)
# and on feature_grid_resolution from the preceding cells.
# NOTE(review): box_regression and multilabel_classification_loss are not imported in the
# cells shown here; they are assumed to be defined in the kernel already.
box_loss=torch.zeros(1, device=device)
class_loss=torch.zeros(1, device=device)
objectness_loss=torch.zeros(1, device=device) # objectness
for level in range(loss_module.nl):
    # debug separator, one line per level
    print(level, '-'*100)
    pred_l=predictions[level] # BxAxHxWxO prediction for level l
    pred_aux_l=predictions[loss_module.nl+level] # BxAxHxWxO prediction from auxiliary head for level l
    # image-index, anchor-index, grid-j, grid-i
    b, a, gj, gi=bs[level], as_[level], gjs[level], gis[level] # all 1D long indices
    b_aux,a_aux, gj_aux, gi_aux=bs_aux[level], as_aux_[level], gjs_aux[level], gis_aux[level] # all 1D long indices
    # objectness targets start at zero everywhere; only matched cells are filled in below
    target_objectness=torch.zeros_like(pred_l[...,0], device=device) # BxAxHxW
    target_objectness_aux=torch.zeros_like(pred_aux_l[...,0], device=device) # BxAxHxW
    
    n_targets=b.shape[0]
    if n_targets>0:
        # predictions corresponding to targets
        positive_pred_l=pred_l[b,a,gj,gi] # n_targets x O
        # target boxes are scaled from normalized xywh to grid units before regression
        iou, iou_loss=box_regression(positive_pred_l[:,:4], target_boxes=targets[level][:,2:]*feature_grid_resolution[level][None], 
               grid_cell=torch.stack([gi, gj], dim=1), anchors=anchors[level])
        box_loss+=iou_loss

        # BxAxHxW target objectness blending iou and fixed objectness of 1
        # (iou is detached so the objectness target carries no gradient)
        target_objectness[b,a,gj,gi]=(1.-loss_module.gr)+loss_module.gr*iou.detach().clamp(min=0.).type(target_objectness.dtype)

        # classification
        if loss_module.nc>1: # only for multiple classes
            class_loss+=multilabel_classification_loss(predictions=positive_pred_l[:,5:], target_class_indices=targets[level][:,1].long(), 
                                           pos_weight=loss_module.class_positive_weight, pos_value=loss_module.positive_class, 
                                           neg_value=loss_module.negative_class)
    # same three terms for the auxiliary head, each weighted by 0.25
    n_aux=b_aux.shape[0] # number of targets for auxiliary head
    if n_aux>0:
        positive_pred_aux_l=pred_aux_l[b_aux, a_aux, gj_aux, gi_aux]
        iou_aux, iou_aux_loss=box_regression(positive_pred_aux_l[:,:4], target_boxes=targets_aux[level][:,2:]*feature_grid_resolution[level][None], 
                       grid_cell=torch.stack([gi_aux, gj_aux], dim=1), anchors=anchors_aux[level])
        box_loss+=0.25*iou_aux_loss

        # objectness target
        target_objectness_aux[b_aux, a_aux, gj_aux, gi_aux]=(1.-loss_module.gr) + \
        loss_module.gr*iou_aux.detach().clamp(0).type(target_objectness_aux.dtype)

        if loss_module.nc>1:
            class_loss+=0.25*multilabel_classification_loss(predictions=positive_pred_aux_l[:,5:], target_class_indices=targets_aux[level][:,1].long(), 
                               pos_weight=loss_module.class_positive_weight, pos_value=loss_module.positive_class, 
                               neg_value=loss_module.negative_class)
    # objectness losses: computed over every cell of the level (matched or not),
    # then scaled by the per-level balance factor
    obj_main_loss=F.binary_cross_entropy_with_logits(input=pred_l[...,4], target=target_objectness, pos_weight=loss_module.object_positive_weight)
    obj_aux_loss=F.binary_cross_entropy_with_logits(input=pred_aux_l[...,4], target=target_objectness_aux, pos_weight=loss_module.object_positive_weight)
    objectness_loss+=loss_module.balance[level]*(obj_main_loss+0.25*obj_aux_loss)
    

# weighted sum of the three terms
loss=loss_module.box_weight*box_loss+loss_module.cls_weight*class_loss+loss_module.obj_weight*objectness_loss
# echoed result: (total scaled by batch size, detached [weighted box, weighted obj, weighted cls, total])
float(imgs.shape[0])*loss, torch.cat((loss_module.box_weight*box_loss, loss_module.obj_weight*objectness_loss, loss_module.cls_weight*class_loss, loss)).detach()

0 ----------------------------------------------------------------------------------------------------
1 ----------------------------------------------------------------------------------------------------
2 ----------------------------------------------------------------------------------------------------
3 ----------------------------------------------------------------------------------------------------


(tensor([2.4615], grad_fn=<MulBackward0>),
 tensor([0.1696, 0.9511, 0.1101, 1.2307]))

In [124]:
# Unweighted box / objectness / class losses followed by the weighted total, as one detached 1D tensor.
torch.cat([term.detach() for term in (box_loss, objectness_loss, class_loss, loss)])

tensor([3.3916, 1.3587, 0.3669, 1.2307])

In [103]:
        self.object_positive_weight=torch.tensor([obj_pw], device=device)
        # self.BCEcls=nn.BCEWithLogitsLoss(pos_weight=torch.tensor([cls_pw], device=device))
        # self.BCEobj=nn.BCEWithLogitsLoss(pos_weight=torch.tensor([obj_pw], device=device))

(torch.Size([1, 80]), torch.int64)

In [112]:


# Stand-alone evaluation of the main-head classification loss, using the same call as
# the per-level loop; `level` and `positive_pred_l` hold whatever that loop left behind.
multilabel_classification_loss(
    predictions=positive_pred_l[:,5:],
    target_class_indices=targets[level][:,1].long(),
    pos_weight=loss_module.class_positive_weight,
    pos_value=loss_module.positive_class,
    neg_value=loss_module.negative_class,
)

tensor(0.0815, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

In [113]:
# inspect the current value of the classification-loss accumulator
class_loss

tensor([0.0815], grad_fn=<AddBackward0>)

In [98]:
        self.class_positive_weight=cls_pw
        self.object_positive_weight=obj_pw

nn.BCEWithLogitsLoss(pos_weight=torch.tensor([cls_pw], device=device))
torch.nn.functional.binary_cross_entropy_with_logits(input, target, weight=None, size_average=None, reduce=None, reduction='mean', pos_weight=None)

True True tensor(0.6744, grad_fn=<DivBackward0>) tensor([0.6744], grad_fn=<AddBackward0>)


In [99]:
            # here we do not need to multiply p_obj and p_cls since we only compute the classification loss
            # for positive samples (objects exist)
            target_class_indices=targets[level][:,1].long() # n_targets
            # n_targets x n_classes
            target_classes=torch.full_like(positive_pred_l[:,5:], loss_module.negative_class, device=device)
            target_classes[range(n_targets), target_class_indices]=loss_module.positive_class # one-hot
            class_loss+=loss_module.BCEcls(positive_pred_l[:,5:], target_classes)

True True tensor(0.2522, grad_fn=<DivBackward0>)
