In [1]:
%matplotlib inline 
import cv2, os, ast, time, math, shutil, sys, glob, re
from datetime import datetime
from PIL import Image
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
tqdm.pandas() 
from joblib import Parallel, delayed
from scipy import stats

import torch
import torchvision
from torchvision.ops import box_iou
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from matplotlib.image import imread
from typing import List
from mAP import mean_average_precision 
from torchinfo import summary
from torchvision.models.detection import retinanet_resnet50_fpn_v2, RetinaNet_ResNet50_FPN_V2_Weights
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection import FasterRCNN


In [2]:
%cd yolov7
cwd = os.getcwd()

h:\Python\ob\train_models\YOLO\yolov7


In [3]:
base_dir = 'h:/Python/ob/train_models'
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
ckpt_path_model_1 = base_dir + '/YOLO/runs_yolov7_1280_720/weights/best.pt'
ckpt_path_model_2 = base_dir + '/FasterRCNN/fasterrcnn_resnet50_fpn_free_last2_batch3_e12.bin'

In [4]:
from models.experimental import attempt_load 
from utils.datasets import LoadImages
from utils.general import check_img_size, check_requirements, check_imshow, non_max_suppression
model_1 = attempt_load(ckpt_path_model_1, map_location=device)
model_1.eval()

'''
model_1 = torchvision.models.detection.fasterrcnn_resnet50_fpn()
in_features = model_1.roi_heads.box_predictor.cls_score.in_features
model_1.roi_heads.box_predictor = FastRCNNPredictor(in_features, 2)

model_1.load_state_dict(torch.load(ckpt_path_model_1))
model_1.to(device)
model_1.eval()
'''

model_2 = torchvision.models.detection.fasterrcnn_resnet50_fpn()
in_features = model_2.roi_heads.box_predictor.cls_score.in_features
model_2.roi_heads.box_predictor = FastRCNNPredictor(in_features, 2)

model_2.load_state_dict(torch.load(ckpt_path_model_2))
model_2.to(device)
model_2.eval()

Fusing layers... 
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=1e-05)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=1e-05)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=1e-05)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=1e-05)
          (relu

In [5]:
def detect_cots(img_path, model, img_size, stride, device): 
    
    dataset = LoadImages(img_path, img_size=img_size, stride=stride)

    for path, img, im0s, vid_cap in dataset:
        img = torch.from_numpy(img).to(device).float()
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)
    # Inference
        #print(im0s)
        #print(img[:, 2, 100, 500])
        t0 = time.time()
        pred = model(img, augment=None)[0]
        pred = non_max_suppression(pred) #opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms)
        inference_time = time.time() - t0
        del img
        torch.cuda.empty_cache()
    return pred, inference_time

def yolo2xy(bboxes, height=720, width=1280):
    """
    yolo => [xmid, ymid, w, h] (normalized)
    coco => [xmin, ymin, w, h]
    
    """ 
    # denormalizing
    bboxes[..., 0::2] *= width
    bboxes[..., 1::2] *= height
    
    # converstion (xmid, ymid) => (xmin, ymin) 
    bboxes[..., 0:2] -= bboxes[..., 2:4]/2
    bboxes[:,[2]] += bboxes[:,[0]]
    bboxes[:,[3]] += bboxes[:,[1]]
    
    return bboxes

In [6]:
# paired t-test on detection speed 
df_val = pd.read_csv('./cots/val.txt', header=None, names=['img_path'])
df_train = pd.read_csv('./cots/train.txt', header=None, names=['img_path'])
col_names={0:'class', 1:'x_mid', 2:'y_mid', 3:'w_ratio', 4:'h_ratio'}

In [7]:
def calculate_score(preds: List[torch.Tensor], gts: List[torch.Tensor], iou_th: float) -> float:
    num_tp = 0
    num_fp = 0
    num_fn = 0
    for p, GT in zip(preds, gts):
        if len(p) and len(GT):
            gt = GT.clone()
            gt[:, 2] = gt[:, 0] + gt[:, 2]
            gt[:, 3] = gt[:, 1] + gt[:, 3]
            pp = p.clone()
            pp[:, 2] = pp[:, 0] + pp[:, 2]
            pp[:, 3] = pp[:, 1] + pp[:, 3]
            iou_matrix = box_iou(pp, gt)
            tp = len(torch.where(iou_matrix.max(0)[0] >= iou_th)[0])
            fp = len(p) - tp
            fn = len(torch.where(iou_matrix.max(0)[0] < iou_th)[0])
            num_tp += tp
            num_fp += fp
            num_fn += fn
        elif len(p) == 0 and len(GT):
            num_fn += len(GT)
        elif len(p) and len(GT) == 0:
            num_fp += len(p)
    score = 5 * num_tp / (5 * num_tp + 4 * num_fn + num_fp)
    return score

In [8]:
# F2 score on training data and test data for model_1

infer_time_train_1 = {}
f2_value_train_1 = {}
iou_ths = np.arange(0.3, 0.85, 0.05)
time_begin = time.time()

with torch.no_grad():
    for i in range(df_train.shape[0]):
        preds = []
        gts = []
        img_path = df_train.img_path[i]
        img_id = re.search('(\d-.+?).jpg', img_path).group(1)
        label_path = img_path.replace('jpg', 'txt')
        label_path = label_path.replace('images', 'labels')
        pred, infer_time = detect_cots(img_path, model=model_1, img_size=1280, stride=32, device=device) 
        pred = pred[0].cpu().detach()
        pred_box = pred[:, 0:4]
        preds.append(pred_box)
        infer_time_train_1[img_id] = infer_time
        bbox = pd.read_csv(label_path, sep=' ', header=None)
        bbox = bbox.rename(columns=col_names)
        bbox = bbox[['x_mid', 'y_mid', 'w_ratio', 'h_ratio']]
        gt_box = yolo2xy(bbox.to_numpy()).astype(np.int32)
        gts.append(torch.tensor(gt_box))
        scores = np.mean([calculate_score(preds, gts, iou_th) for iou_th in iou_ths])
        f2_value_train_1[img_id] = scores
        del pred
        torch.cuda.empty_cache()

print('Inference Time Used: {:.2f} seconds'.format(time.time()- time_begin))

infer_time_val_1 = {}
f2_value_val_1 = {}
time_begin = time.time()

with torch.no_grad():
    for i in range(df_val.shape[0]):
        preds = []
        gts = []
        img_path = df_val.img_path[i]
        img_id = re.search('(\d-.+?).jpg', img_path).group(1)
        label_path = img_path.replace('jpg', 'txt')
        label_path = label_path.replace('images', 'labels')
        pred, infer_time = detect_cots(img_path, model=model_1, img_size=1280, stride=32, device=device) 
        pred = pred[0].cpu().detach()
        pred_box = pred[:, 0:4]
        preds.append(pred_box)
        infer_time_val_1[img_id] = infer_time
        bbox = pd.read_csv(label_path, sep=' ', header=None)
        bbox = bbox.rename(columns=col_names)
        bbox = bbox[['x_mid', 'y_mid', 'w_ratio', 'h_ratio']]
        gt_box = yolo2xy(bbox.to_numpy()).astype(np.int32)
        gts.append(torch.tensor(gt_box))
        scores = np.mean([calculate_score(preds, gts, iou_th) for iou_th in iou_ths])
        f2_value_val_1[img_id] = scores
        del pred
        torch.cuda.empty_cache()

print('Inference Time Used: {:.2f} seconds'.format(time.time()- time_begin))

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Inference Time Used: 780.95 seconds
Inference Time Used: 84.16 seconds


In [9]:
base_dir = 'h:\Python\ob'
train_csv = os.path.join(base_dir, "train.csv")
train_df = pd.read_csv(train_csv)
train_df["img_path"] = os.path.join(base_dir, "train_images") + "/video_" + train_df.video_id.astype(str) + "/" + train_df.video_frame.astype(str) + ".jpg"
train_df["annotations"] = train_df["annotations"].apply(eval)
train_df["a_count"] = train_df["annotations"].apply(len)
train_df = train_df.drop(columns=['video_id', 'sequence', 'video_frame', 'sequence_frame'])
train_df_positive = train_df[train_df['a_count'] != 0]
train_df_positive= train_df_positive.reset_index(drop=True)

train_df_ratio = (train_df_positive.set_index('image_id').explode('annotations').
                  apply(lambda row: pd.Series(row['annotations']), axis=1).reset_index())
train_df_ratio['aspect_ratio'] = train_df_ratio['height']/train_df_ratio['width']


train_df_p, val_df_p = train_test_split(train_df_positive, test_size=0.1, random_state=0)

class COTS_Dataset(torch.utils.data.Dataset):
    def __init__(self, df_img, df_bbox, original_size=(1280, 720), resize_size=(1280, 720)):
        self.df_img = df_img
        self.df_bbox = df_bbox
        self.orginal_size = original_size
        self.resize_size = resize_size
        
    def __getitem__(self, idx):
        # load images and masks
        row = self.df_img.iloc[idx]
        img = Image.open(row['img_path']).convert('RGB')
        if True: 
            img = np.array(img)/255.
        else:
            img = np.array(img.resize((self.w, self.h), resample=Image.BILINEAR))/255.
        data = self.df_bbox[self.df_bbox['image_id'] == row['image_id']]
        labels = ['cots'] * data.shape[0]
        data = data[['x','y','width','height']].values
        area = data[:, 2] * data[:, 3]
        data[:,[2]] += data[:,[0]]
        data[:,[3]] += data[:,[1]]
        boxes = data.astype(np.uint32).tolist() # convert to absolute coordinates
        # torch FRCNN expects ground truths as a dictionary of tensors
        iscrowd = torch.zeros((data.shape[0],), dtype=torch.int64)
        target = {}
        target["boxes"] = torch.Tensor(boxes).float()
        target["labels"] = torch.Tensor([1 for i in labels]).long()
        target["image_id"] = row['image_id']
        target["area"] = torch.Tensor(area).float()
        target["iscrowd"] = iscrowd

        img = torch.tensor(img).permute(2,0,1)
        return img.to(device).float(), target
    
    def collate_fn(self, batch):
        return tuple(zip(*batch)) 

    def __len__(self):
        return self.df_img.shape[0]

train_ds = COTS_Dataset(train_df_p, train_df_ratio)
test_ds = COTS_Dataset(val_df_p, train_df_ratio)

train_loader = DataLoader(train_ds, batch_size=1, collate_fn=train_ds.collate_fn, drop_last=False)
test_loader = DataLoader(test_ds, batch_size=1, collate_fn=test_ds.collate_fn, drop_last=False)

In [10]:
'''
# F2 score on training data and test data for model_1, if both are Faster RCNN
infer_time_train_1 = {}
f2_value_train_1 = {}
iou_ths = np.arange(0.3, 0.85, 0.05)

time_begin = time.time()
for ix, (image, targets) in enumerate(train_loader):
    preds = []
    gts = []
    t0 = time.time() 
    outputs = model_1(image)[0]
    infe_time = time.time() - t0 
    infer_time_train_1[targets[0]['image_id']] = infe_time
    gts.append(targets[0]['boxes']) 
    preds.append(outputs['boxes'].cpu().detach())
    scores = [calculate_score(preds, gts, iou_th) for iou_th in iou_ths]
    f2_value_train_1[targets[0]['image_id']] = np.mean(scores)
    
print('Inference Time Used: {:.2f} seconds'.format(time.time()- time_begin))

infer_time_val_1 = {}
f2_value_val_1 = {}

time_begin = time.time()
for ix, (image, targets) in enumerate(test_loader):
    preds = []
    gts = []
    t0 = time.time() 
    outputs = model_1(image)[0]
    infe_time = time.time() - t0 
    infer_time_val_1[targets[0]['image_id']] = infe_time
    gts.append(targets[0]['boxes']) 
    preds.append(outputs['boxes'].cpu().detach())
    scores = [calculate_score(preds, gts, iou_th) for iou_th in iou_ths]
    f2_value_val_1[targets[0]['image_id']] = np.mean(scores)
    
print('Inference Time Used: {:.2f} seconds'.format(time.time()- time_begin))
'''

"\n# F2 score on training data and test data for model_1, if both are Faster RCNN\ninfer_time_train_1 = {}\nf2_value_train_1 = {}\niou_ths = np.arange(0.3, 0.85, 0.05)\n\ntime_begin = time.time()\nfor ix, (image, targets) in enumerate(train_loader):\n    preds = []\n    gts = []\n    t0 = time.time() \n    outputs = model_1(image)[0]\n    infe_time = time.time() - t0 \n    infer_time_train_1[targets[0]['image_id']] = infe_time\n    gts.append(targets[0]['boxes']) \n    preds.append(outputs['boxes'].cpu().detach())\n    scores = [calculate_score(preds, gts, iou_th) for iou_th in iou_ths]\n    f2_value_train_1[targets[0]['image_id']] = np.mean(scores)\n    \nprint('Inference Time Used: {:.2f} seconds'.format(time.time()- time_begin))\n\ninfer_time_val_1 = {}\nf2_value_val_1 = {}\n\ntime_begin = time.time()\nfor ix, (image, targets) in enumerate(test_loader):\n    preds = []\n    gts = []\n    t0 = time.time() \n    outputs = model_1(image)[0]\n    infe_time = time.time() - t0 \n    infer

In [11]:
# F2 score on training data and test data for model_2
infer_time_train_2 = {}
f2_value_train_2 = {}

time_begin = time.time()
for ix, (image, targets) in enumerate(train_loader):
    preds = []
    gts = []
    t0 = time.time() 
    outputs = model_2(image)[0]
    infe_time = time.time() - t0 
    infer_time_train_2[targets[0]['image_id']] = infe_time
    gts.append(targets[0]['boxes']) 
    preds.append(outputs['boxes'].cpu().detach())
    scores = [calculate_score(preds, gts, iou_th) for iou_th in iou_ths]
    f2_value_train_2[targets[0]['image_id']] = np.mean(scores)
    
print('Inference Time Used: {:.2f} seconds'.format(time.time()- time_begin))


infer_time_val_2 = {}
f2_value_val_2 = {}

time_begin = time.time()
for ix, (image, targets) in enumerate(test_loader):
    preds = []
    gts = []
    t0 = time.time() 
    outputs = model_2(image)[0]
    infe_time = time.time() - t0 
    infer_time_val_2[targets[0]['image_id']] = infe_time
    gts.append(targets[0]['boxes']) 
    preds.append(outputs['boxes'].cpu().detach())
    scores = [calculate_score(preds, gts, iou_th) for iou_th in iou_ths]
    f2_value_val_2[targets[0]['image_id']] = np.mean(scores)
    
print('Inference Time Used: {:.2f} seconds'.format(time.time()- time_begin))

Inference Time Used: 846.48 seconds
Inference Time Used: 93.36 seconds


In [15]:
# paired t-test on detection speed and F2 score 
df_t_test_train = pd.DataFrame([infer_time_train_1, infer_time_train_2, f2_value_train_1, f2_value_train_2]).T
col_name = ['Inference_time_model_1', 'Inference_time_model_2', 'f2_model_1', 'f2_model_2']
df_t_test_train.columns = col_name

df_t_test_val = pd.DataFrame([infer_time_val_1, infer_time_val_2, f2_value_val_1, f2_value_val_2]).T
col_name = ['Inference_time_model_1', 'Inference_time_model_2', 'f2_model_1', 'f2_model_2']
df_t_test_val.columns = col_name

print('Paired t-test for training set:')
print(stats.ttest_rel(df_t_test_train['Inference_time_model_1'], 
                      df_t_test_train['Inference_time_model_2'], 
                      nan_policy='omit', alternative='less')) 

print(stats.ttest_rel(df_t_test_train['f2_model_1'], df_t_test_train['f2_model_2'], 
                      nan_policy='omit', alternative='less'))

print('Paired t-test for test set:')
print(stats.ttest_rel(df_t_test_val['Inference_time_model_1'], 
                      df_t_test_val['Inference_time_model_2'], 
                      nan_policy='omit', alternative='less')) 

print(stats.ttest_rel(df_t_test_val['f2_model_1'], df_t_test_val['f2_model_2'], 
                      nan_policy='omit', alternative='less'))

Paired t-test for training set:
Ttest_relResult(statistic=-156.216680882234, pvalue=0.0)
Ttest_relResult(statistic=-24.33555268018417, pvalue=3.532817788643768e-123)
Paired t-test for test set:
Ttest_relResult(statistic=-81.651962590752, pvalue=3.793314724480224e-288)
Ttest_relResult(statistic=-6.887899320151636, pvalue=8.70197956455147e-12)


In [13]:
print(df_t_test_train.mean()) 
print('-' * 50)
print(df_t_test_val.mean())

Inference_time_model_1    0.056338
Inference_time_model_2    0.094270
f2_model_1                0.900007
f2_model_2                0.979974
dtype: float64
--------------------------------------------------
Inference_time_model_1    0.056696
Inference_time_model_2    0.094454
f2_model_1                0.887822
f2_model_2                0.960163
dtype: float64


In [14]:
print(df_t_test_train.head(8).iloc[:, 0:4].to_latex(index=True))

\begin{tabular}{lrrrr}
\toprule
{} &  Inference\_time\_model\_1 &  Inference\_time\_model\_2 &  f2\_model\_1 &  f2\_model\_2 \\
\midrule
1-9322 &                0.911984 &                0.093620 &    0.909091 &    1.000000 \\
1-8839 &                0.065822 &                0.091413 &    0.714286 &    0.937500 \\
0-4237 &                0.065822 &                0.094010 &    1.000000 &    1.000000 \\
0-78   &                0.061711 &                0.093529 &    1.000000 &    1.000000 \\
0-9832 &                0.075787 &                0.093728 &    1.000000 &    1.000000 \\
1-4256 &                0.068812 &                0.093726 &    1.000000 &    1.000000 \\
1-5853 &                0.075995 &                0.093734 &    1.000000 &    1.000000 \\
1-9122 &                0.067844 &                0.093991 &    0.975758 &    0.986842 \\
\bottomrule
\end{tabular}



  print(df_t_test_train.head(8).iloc[:, 0:4].to_latex(index=True))
