## Import

In [1]:
import cv2 as cv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torchvision
from tqdm.auto import tqdm

In [2]:
import os
import sys

# Root of the local `ucsd_capstone` checkout. The default is the original
# machine-specific location; set the STREETSTUDY_REPO_ROOT environment variable
# to run the notebook from a different checkout without editing this cell.
REPO_ROOT = os.environ.get(
    "STREETSTUDY_REPO_ROOT",
    "/home/sardarchitect/repos/github.com/ucsd_capstone/",
)

# Make both the repository root and its `streetstudy/` directory importable.
# The membership check keeps sys.path free of duplicates when the cell is re-run.
for _import_dir in (REPO_ROOT, os.path.join(REPO_ROOT, "streetstudy/")):
    if _import_dir not in sys.path:
        sys.path.append(_import_dir)

In [3]:
from streetstudy.data import virat
from streetstudy.model import yolo

## Hyperparameters

In [4]:
# Use the GPU when PyTorch reports that CUDA is available; otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): BATCH_SIZE is not referenced by any other cell shown in this notebook.
BATCH_SIZE = 32

In [5]:
DEVICE

'cuda'

## Data Ingestion

In [6]:
# Build the video index with the project's `virat.build()` helper and preview it.
# In the preview, rows are keyed by video name and carry the video path, frame
# count, duration and the names of the event/object/mapping annotation files.
video_df = virat.build()
video_df.head()

Unnamed: 0_level_0,path,num_frames,duration,event_file,object_file,mapping_file
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
VIRAT_S_010200_08_000838_000867,/home/sardarchitect/repos/github.com/ucsd_caps...,532,22,VIRAT_S_010200_08_000838_000867.viratdata.even...,VIRAT_S_010200_08_000838_000867.viratdata.obje...,VIRAT_S_010200_08_000838_000867.viratdata.mapp...
VIRAT_S_000200_03_000657_000899,/home/sardarchitect/repos/github.com/ucsd_caps...,7243,241,VIRAT_S_000200_03_000657_000899.viratdata.even...,VIRAT_S_000200_03_000657_000899.viratdata.obje...,VIRAT_S_000200_03_000657_000899.viratdata.mapp...
VIRAT_S_050000_08_001235_001295,/home/sardarchitect/repos/github.com/ucsd_caps...,1792,59,VIRAT_S_050000_08_001235_001295.viratdata.even...,VIRAT_S_050000_08_001235_001295.viratdata.obje...,VIRAT_S_050000_08_001235_001295.viratdata.mapp...
VIRAT_S_050000_06_000908_000970,/home/sardarchitect/repos/github.com/ucsd_caps...,1855,61,VIRAT_S_050000_06_000908_000970.viratdata.even...,VIRAT_S_050000_06_000908_000970.viratdata.obje...,VIRAT_S_050000_06_000908_000970.viratdata.mapp...
VIRAT_S_000207_04_000902_000934,/home/sardarchitect/repos/github.com/ucsd_caps...,938,31,VIRAT_S_000207_04_000902_000934.viratdata.even...,VIRAT_S_000207_04_000902_000934.viratdata.obje...,VIRAT_S_000207_04_000902_000934.viratdata.mapp...


In [7]:
# Pick a single video by its index label (the video name) to develop against.
current_video = video_df.loc["VIRAT_S_000002"]
current_video

path            /home/sardarchitect/repos/github.com/ucsd_caps...
num_frames                                                   9075
duration                                                      302
event_file                    VIRAT_S_000002.viratdata.events.txt
object_file                  VIRAT_S_000002.viratdata.objects.txt
mapping_file                 VIRAT_S_000002.viratdata.mapping.txt
Name: VIRAT_S_000002, dtype: object

In [8]:
# Load the per-frame object annotations for the selected video.
annotations_df = virat.get_annotations(current_video['path'])
# Keep only the rows whose object_type equals 1.
# NOTE(review): presumably the "person" class of the VIRAT annotation format — confirm.
annotations_df = annotations_df[annotations_df['object_type'] == 1]
annotations_df.head()

Unnamed: 0,object_id,object_duration,current_frame,bbox_lefttop_x,bbox_lefttop_y,bbox_width,bbox_height,object_type
0,2,9076,0,1262,381,53,116,1
1,2,9076,1,1261,381,53,116,1
2,2,9076,2,1260,381,53,116,1
3,2,9076,3,1259,381,53,116,1
4,2,9076,4,1258,381,53,116,1


## Model Architecture

In [9]:
# Instantiate the detector through the project's `yolo.yolov5()` wrapper.
clf = yolo.yolov5()
# NOTE(review): assuming `clf` is the torch.hub YOLOv5 AutoShape model (the load log
# below suggests so), `conf` is the detection confidence threshold — 0 keeps every
# candidate box — and `classes` restricts the output to the listed class ids
# (0 is "person" in COCO). Confirm against the wrapper.
clf.conf = 0
clf.classes = [0]

Using cache found in /home/sardarchitect/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2023-4-30 Python-3.11.3 torch-2.0.0+cu117 CUDA:0 (NVIDIA GeForce GTX 1650, 4096MiB)



[31m[1mrequirements:[0m /home/sardarchitect/.cache/torch/hub/requirements.txt not found, check failed.


Fusing layers... 
YOLOv5s summary: 166 layers, 7053910 parameters, 0 gradients
Adding AutoShape... 


## Evaluation Criteria

In [10]:
def bbox_iou(boxA, boxB):
    """
    Intersection-over-union of two axis-aligned boxes.

    Each box is an indexable of four numbers ``(x1, y1, x2, y2)`` where
    ``(x1, y1)`` is the minimum corner and ``(x2, y2)`` the maximum corner.

    Returns the integer ``0`` when the boxes do not overlap (boxes that merely
    touch along an edge or at a corner count as non-overlapping); otherwise
    returns the intersection area divided by the union area.
    """
    def _area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    # Corners of the candidate overlap rectangle; for disjoint boxes the
    # "right"/"bottom" edge ends up at or before the "left"/"top" edge.
    left, top = max(boxA[0], boxB[0]), max(boxA[1], boxB[1])
    right, bottom = min(boxA[2], boxB[2]), min(boxA[3], boxB[3])

    overlap_w = right - left
    overlap_h = bottom - top
    if overlap_w <= 0 or overlap_h <= 0:
        return 0

    overlap_area = overlap_w * overlap_h
    union_area = _area(boxA) + _area(boxB) - overlap_area
    return overlap_area / float(union_area)

# TEST
# (boxA, boxB) pairs, evaluated in order: corner-touching boxes in both argument
# orders, a contained box in both argument orders, identical boxes, and finally
# the contained-box case again with tensor inputs.
iou_cases = (
    ([0,0,10,10], [10,10,20,20]),
    ([10,10,20,20], [0,0,10,10]),
    ([0,0,10,10], [5,5,10,10]),
    ([5,5,10,10], [0,0,10,10]),
    ([0,0,10,10], [0,0,10,10]),
    (torch.tensor([5,5,10,10]), torch.tensor([0,0,10,10])),
)

for boxA, boxB in iou_cases:
    print(bbox_iou(boxA, boxB))

0
0
0.25
0.25
1.0
tensor(0.25000)


In [11]:
import time

In [12]:
# One-shot wall-clock timing of a single bbox_iou call on tensor inputs.
# The measured interval starts before the two tensors are built, so it includes
# their construction as well as the IoU computation; a single sample like this
# is also noisy.
tic = time.perf_counter()

boxA = torch.tensor([5,5,10,10])
boxB = torch.tensor([0,0,10,10])
bbox_iou(boxA, boxB)

toc = time.perf_counter()
# Elapsed seconds, displayed as the cell output.
toc-tic

0.001295997000852367

In [13]:
import scipy.optimize

def match_bboxes(bbox_gt, bbox_pred, IOU_THRESH=0.01):
    '''
    Given sets of ground truth and predicted bounding boxes,
    determine best possible one-to-one match.

    The pairing is the one found by ``scipy.optimize.linear_sum_assignment``
    on the cost matrix ``1 - IoU``.

    Parameters
    ----------
    bbox_gt : array with ``num_gt`` rows, one box per row
    bbox_pred : array with ``num_pred`` rows, one box per row
        Rows are passed as-is to ``bbox_iou``.
    IOU_THRESH : float
        A pair counts as a match only if its IoU is strictly greater than this.

    Returns
    -------
    idx_gt : indices into ``bbox_gt`` of the matched ground-truth boxes
    idx_pred : indices into ``bbox_pred`` of the matched predictions
    ious : IoU of each matched pair
    label : int array with one entry per prediction, in the order returned by
        the assignment; 1 if that prediction was matched, 0 otherwise
    '''
    num_gt = bbox_gt.shape[0]
    num_pred = bbox_pred.shape[0]
    MIN_IOU = 0.0

    iou_matrix = np.zeros((num_gt, num_pred))
    for i in range(num_gt):
        for j in range(num_pred):
            iou_matrix[i, j] = bbox_iou(bbox_gt[i], bbox_pred[j])

    # Pad the matrix to a square with MIN_IOU so every real box can be assigned:
    # extra rows act as dummy ground truths, extra columns as dummy predictions.
    if num_pred > num_gt:
        diff = num_pred - num_gt
        iou_matrix = np.concatenate((iou_matrix, np.full((diff, num_pred), MIN_IOU)), axis=0)
    elif num_gt > num_pred:
        diff = num_gt - num_pred
        iou_matrix = np.concatenate((iou_matrix, np.full((num_gt, diff), MIN_IOU)), axis=1)

    # Minimising (1 - IoU) maximises the total IoU of the assignment.
    idxs_gt, idxs_pred = scipy.optimize.linear_sum_assignment(1 - iou_matrix)

    # Drop pairs whose column is a dummy prediction.
    sel_pred = idxs_pred < num_pred
    idx_pred_actual = idxs_pred[sel_pred]
    idx_gt_actual = idxs_gt[sel_pred]
    ious_actual = iou_matrix[idx_gt_actual, idx_pred_actual]

    # A real prediction paired with a dummy ground-truth row is never a match,
    # whatever the threshold (previously a negative IOU_THRESH let those through
    # with ground-truth indices that do not exist).
    sel_valid = (ious_actual > IOU_THRESH) & (idx_gt_actual < num_gt)
    label = sel_valid.astype(int)

    return idx_gt_actual[sel_valid], idx_pred_actual[sel_valid], ious_actual[sel_valid], label

# TEST
# Smoke test with more predictions (4) than ground-truth boxes (3).
bbox_gt = np.array([[0,0,5,5], [10,10,25,25], [40,40,65,65]])
bbox_pred = np.array([[0,0,4,5], [35,35,70,70], [0,0,1,1], [10,10,26,20]])
# `ap` receives the full (idx_gt, idx_pred, ious, label) tuple returned by
# match_bboxes, not a precision value.
ap = match_bboxes(bbox_gt, bbox_pred)

In [14]:
# Degenerate case: the only prediction is a zero-area box, which overlaps nothing,
# so no pairs are returned and the lone prediction is labelled 0.
bbox_gt = np.array([[0,0,5,5], [10,10,25,25], [40,40,65,65]])
bbox_pred = np.array([[0,0,0,0]])
match_bboxes(bbox_gt, bbox_pred)

(array([], dtype=int64),
 array([], dtype=int64),
 array([], dtype=float64),
 array([0]))

In [15]:
# mAP = []
# num_frames = int(current_video['num_frames'])
# capture = cv.VideoCapture(current_video['path'])

# for current_frame in tqdm(range(num_frames)):
#     ret, frame = capture.read()
#     preds = clf(frame)
    
#     bbox_pred = (preds.xyxy[0][:, :4]).cpu().numpy()
#     bbox_gt = annotations_df[annotations_df['current_frame'] == current_frame].to_numpy()[:,3:7]
#     bbox_gt[:, 2] = bbox_gt[:, 0] + bbox_gt[:, 2]
#     bbox_gt[:, 3] = bbox_gt[:, 1] + bbox_gt[:, 3]

#     mAP.append(match_bboxes(bbox_gt, bbox_pred)[3])
# #     break
    
# capture.release()

# TP = 0
# TPFP = 0

# for i in mAP:
#     TPFP += len(i)
#     TP += sum(i)
# average_precision = TP/TPFP    
# print("Average Precision:", average_precision)

## Optimization

## Training Loop

## Testing Loop

In [16]:
# Order the videos from fewest to most frames so that the evaluation loop below,
# which takes the first rows by position, starts with the cheapest videos.
# Rebinding (instead of inplace=True) avoids silently mutating the frame that
# earlier cells displayed and keeps the cell safe to re-run.
video_df = video_df.sort_values('num_frames')

In [19]:
# Per-video precision of the detector against the VIRAT annotations.
TOTAL_AP = []

# Evaluate at most 20 videos; clamped so a shorter index cannot raise IndexError.
total_videos = min(20, len(video_df))

for video_idx in tqdm(range(total_videos)):

    current_video = video_df.iloc[video_idx]
    annotations_df = virat.get_annotations(current_video['path'])
    annotations_df = annotations_df[annotations_df['object_type'] == 1]

    num_frames = int(current_video['num_frames'])
    # One label array per processed frame (1 = matched prediction, 0 = unmatched).
    AP = []

    capture = cv.VideoCapture(current_video['path'])
    try:
        for current_frame in tqdm(range(num_frames)):

            ret, frame = capture.read()
            if not ret:
                # No frame was returned (e.g. the file holds fewer frames than the
                # index reports); stop instead of passing an invalid frame to the model.
                break
            preds = clf(frame)

            bbox_pred = (preds.xyxy[0][:, :4]).cpu().numpy()
            # Columns 3:7 are bbox_lefttop_x, bbox_lefttop_y, bbox_width, bbox_height
            # in the annotation frame previewed earlier in the notebook.
            bbox_gt = (annotations_df[annotations_df['current_frame'] == current_frame].to_numpy()[:,3:7])
            # Convert (x, y, width, height) to (x1, y1, x2, y2) to match the predictions.
            bbox_gt[:, 2] = bbox_gt[:, 0] + bbox_gt[:, 2]
            bbox_gt[:, 3] = bbox_gt[:, 1] + bbox_gt[:, 3]

            AP.append(match_bboxes(bbox_gt, bbox_pred)[3])
    finally:
        # Always free the decoder, including when the run is interrupted.
        capture.release()

    # Pool the labels over all frames: matched predictions / all predictions.
    TP = sum(int(labels.sum()) for labels in AP)
    TPFP = sum(len(labels) for labels in AP)
    average_precision = TP / TPFP if TPFP else 0

    TOTAL_AP.append(average_precision)
    print("Average Precision:", average_precision)

# Mean of the per-video scores (0.0 when no video was evaluated).
sum(TOTAL_AP) / total_videos if total_videos else 0.0

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/362 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [24]:
# Inspect the detections of the most recently processed frame. This relies on
# `preds` having been left in the kernel namespace by the evaluation loop above
# (which was interrupted), so it will not reproduce on a fresh Run All.
preds.pandas()

YOLOv5 <class 'models.common.Detections'> instance
image 1/1: 720x1280 (no detections)
Speed: 1.1ms pre-process, 13.2ms inference, 1.6ms NMS per image at shape (1, 3, 384, 640)

## Model Export