## Importuri

In [3]:
import os
import cv2
import numpy as np
from glob import glob

import torch
from yolov5.models.experimental import attempt_load
from yolov5.utils.general import non_max_suppression, scale_boxes
from yolov5.utils.torch_utils import select_device

from deep_sort.deep_sort import nn_matching                   # Calculul distanÈ›elor Ã®ntre caracteristici
from deep_sort.deep_sort.tracker import Tracker               # Gestionarea obiectelor urmÄƒrite
from deep_sort.deep_sort.detection import Detection           # Definirea detecÈ›iilor
from deep_sort.tools import generate_detections as gdet       # ExtracÈ›ia caracteristicilor CNN

## Variabile utile

In [4]:
# Calea cÄƒtre videoclipul de intrare
path_video_input = './data_and_labels/data/Test/set07/set07/V000.seq'

# Calea cÄƒtre videoclipul de iesire
path_video_output = './result/result.avi'

# Folderul pentru cadrele extrase
path_output_frames = './result/frames/'

# Formatul frame-urilor
frame_format = 'jpg'

# Frame Rate
fps = 30

## Salvarea fiecarui frame

#### Se ruleaza doar daca se doreste extragerea frame-urilor

In [5]:
video = cv2.VideoCapture(path_video_input)
counter = 0

while True:
    existed, frame = video.read()
    if not existed:
        break
                
    # Salvarea imaginii
    cv2.imwrite(os.path.join(path_output_frames, f'{counter}.{frame_format}'), frame)
                
    counter += 1

## Configurare YOLOv5n

In [6]:
# GPU
device = select_device('0')

# GreutÄƒÈ›i antrenate personalizate
weights = './yolov5/runs/train/exp/weights/best.pt'

YOLOv5 ðŸš€ v7.0-388-g882c35fc Python-3.8.15 torch-2.4.1+cu121 CUDA:0 (NVIDIA GeForce RTX 3050 Laptop GPU, 3799MiB)



In [7]:
# ÃŽncarcÄƒ modelul YOLOv5
model = attempt_load(weights, device=device)

# SeteazÄƒ modelul Ã®n modul de evaluare
model.eval()

Fusing layers... 
YOLOv5n summary: 157 layers, 1761871 parameters, 0 gradients, 4.1 GFLOPs


DetectionModel(
  (model): Sequential(
    (0): Conv(
      (conv): Conv2d(3, 16, kernel_size=(6, 6), stride=(2, 2), padding=(2, 2))
      (act): SiLU(inplace=True)
    )
    (1): Conv(
      (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (act): SiLU(inplace=True)
    )
    (2): C3(
      (cv1): Conv(
        (conv): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1))
        (act): SiLU(inplace=True)
      )
      (cv2): Conv(
        (conv): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1))
        (act): SiLU(inplace=True)
      )
      (cv3): Conv(
        (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
        (act): SiLU(inplace=True)
      )
      (m): Sequential(
        (0): Bottleneck(
          (cv1): Conv(
            (conv): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1))
            (act): SiLU(inplace=True)
          )
          (cv2): Conv(
            (conv): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  

## Configurare DeepSORT

In [8]:
# Calea catre weight-urile CNN-ului
path_weights_CNN = "./deep_sort/resources/networks/mars-small128.pb"

# IniÈ›ializeazÄƒ extractorul de caracteristici
encoder = gdet.create_box_encoder(path_weights_CNN, batch_size=1)

# SeteazÄƒ parametrii trackerului
metric = nn_matching.NearestNeighborDistanceMetric("cosine", 0.4, None)
tracker = Tracker(metric)


## Functie care aplica YOLOv5n pe frame-ul curent

In [9]:
# FuncÈ›ie pentru detectare
def detect(frame):
    # Convertire BGR -> RGB
    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    
    # Redimensionare pentru model
    img = cv2.resize(img, (640, 640))
    
    # Rearanjare pentru Torch (C, H, W)
    img = img.transpose(2, 0, 1)
    
    # Optimizare memorie
    img = np.ascontiguousarray(img)
    
    # Normalizare [0,1]
    img = torch.from_numpy(img).to(device).float() / 255.0
    
    # AdaugÄƒ dimensiune batch
    img = img.unsqueeze(0)

    # AplicÄƒ modelul pentru predicÈ›ii
    pred = model(img)[0]

    # Filtrare prin praguri confidence È™i IoU
    pred = non_max_suppression(pred, 0.2, 0.3)
    
    return pred

# MAIN

#### - Se deschide secventa video si se extrage fiecare frame
#### - Se aplica YOLOv5n
#### - Se convertesc datele astfel incat sa se poata extrage trasaturile cu CNN (parte din DeepSORT)
#### - Apoi se pun la un loc bounding box-urile, confidenta si trasaturile extrase si se aplica Tracker-ul (din DeepSORT)
#### - Se deseneaza bounding box-urile si ID-urile persoanelor identificate
#### - Se creaza un videoclip in format .avi cu rezultatul obtinut (stocat in folderul ./result/)

In [10]:
video = cv2.VideoCapture(path_video_input)
counter = 0

while True:
    # Extrage frame-ul
    existed, frame = video.read()
    if not existed:
        break

    # DetecteazÄƒ obiectele Ã®n frame-ul curent cu YOLOv5n
    predictions = detect(frame)

    # DeseneazÄƒ  pentru detecÈ›iile de persoane
    for det in predictions:
        if det is not None and len(det):
            det[:, :4] = scale_boxes((640, 640), det[:, :4], frame.shape).round()

            bbox_xywh = []
            confidences = []
            
            for *xyxy, conf, cls in det:
                if int(cls) == 0:  # Clasa 0 pentru persoane
                    #cv2.rectangle(frame, (int(xyxy[0]), int(xyxy[1])), (int(xyxy[2]), int(xyxy[3])), (0, 255, 0), 2)
                    #cv2.putText(frame, f'{conf:.2f}', (int(xyxy[0]), int(xyxy[1]) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

                    # YOLO creaza bounding box-ul in formatul (x_top_left, y_top_left, x_bottom_right, y_bottom_right)
                    x1, y1, x2, y2 = xyxy

                    # EliminÄƒ bounding box-urile cu dimensiuni zero
                    if (x2 - x1) == 0 or (y2 - y1) == 0:
                        continue

                    # Este necesara mutarea acestor date pe CPU
                    x1 = x1.cpu()
                    y1 = y1.cpu()
                    x2 = x2.cpu()
                    y2 = y2.cpu()

                    # Formatul acceptat de DeepSORT este (x_top_left, y_top_left, weight, height)
                    #bbox = [(x1+x2)/2, (y1+y2)/2, x2-x1, y2-y1]
                    bbox = [x1, y1, x2-x1, y2-y1]
                    bbox_xywh.append(bbox)

                    # Nivelul de confidenta
                    confidences.append(conf.item())
            
            # Extragem caracteristicile cu CNN
            features = encoder(frame, bbox_xywh)
            detections = [Detection(bbox, conf, feature) for bbox, conf, feature in zip(bbox_xywh, confidences, features)]

            # Actualizarea trackerul
            tracker.predict()
            tracker.update(detections)

            # DeseneazÄƒ bounding box-urile È™i ID-urile
            for track in tracker.tracks:
                if not track.is_confirmed() or track.time_since_update > 1:
                    continue

                # Convertirea din (x_top_left, y_top_left, weight, height) in (x_top_left, y_top_left, x_bottom_right, y_bottom_right)
                bbox = track.to_tlbr()

                # ID-ul persoanei identificate
                track_id = track.track_id

                # Desenarea bounding box-ului
                cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 255, 0), 2)
                
                # Desenarea ID-ului
                cv2.putText(frame, f'ID {track_id}', (int(bbox[0]), int(bbox[1]) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Salvam imaginea
    cv2.imwrite(os.path.join(path_output_frames, f'{counter}.{frame_format}'), frame)
                
    counter += 1

## Fisierul video rezultat

In [11]:
# Lista imaginilor
aux = glob(f'{path_output_frames}*.jpg')
nr_img = len(aux)

images = []
for nr in range(nr_img):
    images.append(os.path.join(path_output_frames, f"{nr}.jpg"))

# CiteÈ™te primul frame pentru a obÈ›ine dimensiunile
frame = cv2.imread(images[0])
height, width, layers = frame.shape

# IniÈ›ializeazÄƒ scriitorul video
fourcc = cv2.VideoWriter_fourcc(*'XVID')  # Codec video
video = cv2.VideoWriter(path_video_output, fourcc, fps, (width, height))

# AdaugÄƒ fiecare frame Ã®n videoclip
video.write(frame)
for image in images:
    frame = cv2.imread(image)
    video.write(frame)

# ElibereazÄƒ resursele
video.release()
cv2.destroyAllWindows()