## Documentation:-

Tracking is the process of identifying the positions of objects across a sequence of frames (i.e., a video): we get an initial set of detections, assign unique IDs, and follow the objects through the frames of the video feed while maintaining the assigned IDs. It's a two-step process:

1. Detection and localization of the object in the frame using some object detector like YOLOv8, CenterNet, etc.
2. Predicting the future motion of the object using its past information using a tracking algorithm.

The deep_sort folder in the repo has the original Deep SORT implementation, complete with the Kalman filter, Hungarian algorithm and feature extractor. But the original repo is built only for validating the algorithm with the MARS test dataset. So, we have written a custom class, Tracker.py, for ourselves that builds on the original repo.

Made changes in the generate_detections.py module to support the latest version of TensorFlow:
1. "np.int" is changed to "int"

2.      self.session = tf.Session()
        with tf.gfile.GFile(checkpoint_filename, "rb") as file_handle:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(file_handle.read())
        tf.import_graph_def(graph_def, name="net")
        self.input_var = tf.get_default_graph().get_tensor_by_name(
            "net/%s:0" % input_name)
        self.output_var = tf.get_default_graph().get_tensor_by_name(
            "net/%s:0" % output_name)

    changed to:

        self.session = tf.compat.v1.Session()
        with tf.compat.v1.gfile.GFile(checkpoint_filename, "rb") as file_handle:
            graph_def = tf.compat.v1.GraphDef()
            graph_def.ParseFromString(file_handle.read())
        tf.import_graph_def(graph_def, name="net")
        self.input_var = tf.compat.v1.get_default_graph().get_tensor_by_name(
            "%s:0" % input_name)
        self.output_var = tf.compat.v1.get_default_graph().get_tensor_by_name(
            "%s:0" % output_name)

Objective: Develop a computer vision solution to monitor and track people at an airport for security and operational efficiency.

The solution uses yolov8n-pose for object detection and extraction of body-joint coordinates; it also uses DeepSORT for tracking objects across frames.
Once a person is detected and tracked, we try to determine whether they are a threat by estimating their hand position. The assumption is that if a hand is held high and extended,
the person might be holding an object, such as a gun or a knife, and pointing it in some direction.
Based on this assumption, the code tries to detect an extended-arm position using the coordinates of the wrist, elbow and shoulder joints.
We can further implement a threshold of frames/time after which we may consider a person a threat, instead of instantly considering him/her a threat
as soon as the criteria are met, which is the case with this demo solution.

In [None]:
from deep_sort.tracker import Tracker as DeepSortTracker
from tools import generate_detections_ as gdet
from deep_sort import nn_matching
from deep_sort.detection import Detection
import numpy as np
import cv2 
from ultralytics import YOLO
import numpy as np
import random

class Tracker:
    """Wrapper that drives the Deep SORT tracker from plain detection rows.

    After every call to ``update`` the attribute ``tracks`` holds a list of
    ``Track`` objects, one per confirmed track that was updated recently.
    """

    # Class-level placeholders: ``tracker`` and ``encoder`` are set in
    # __init__, ``tracks`` is (re)built by update_tracks().
    tracker = None
    encoder = None
    tracks = None

    def __init__(self):
        """Build the distance metric, the Deep SORT tracker and the box encoder."""
        encoder_model_filename = 'mars-small128.pb'
        max_cosine_distance = 0.4
        nn_budget = None

        # Distance metric used for the data association step.
        appearance_metric = nn_matching.NearestNeighborDistanceMetric(
            "cosine", max_cosine_distance, nn_budget
        )

        # Multi-target tracker driven by that metric.
        self.tracker = DeepSortTracker(appearance_metric)

        # Callable that turns (frame, boxes) into one feature vector per box.
        self.encoder = gdet.create_box_encoder(encoder_model_filename, batch_size=1)

    def update(self, frame, detections):
        """Advance the tracker by one frame and refresh ``self.tracks``.

        Args:
            frame: Image passed to the encoder for feature extraction.
            detections: Sequence of rows; the last element of each row is the
                score and the leading elements are the box corners
                (x1, y1, x2, y2), which are converted to (x, y, w, h) here.
        """
        if len(detections) == 0:
            # Nothing detected: the tracker still has to age existing tracks.
            dets = []
        else:
            boxes = np.asarray([row[:-1] for row in detections])
            # Corner format -> top-left + width/height.
            boxes[:, 2:] -= boxes[:, :2]
            confidences = [row[-1] for row in detections]

            features = self.encoder(frame, boxes)

            dets = [
                Detection(box, confidences[idx], features[idx])
                for idx, box in enumerate(boxes)
            ]

        self.tracker.predict()
        self.tracker.update(dets)
        self.update_tracks()

    def update_tracks(self):
        """Rebuild ``self.tracks`` from the tracker's current track list.

        Only tracks that are confirmed and whose ``time_since_update`` is at
        most 1 are kept; each one is exposed as ``Track(track_id, tlbr_box)``.
        """
        self.tracks = [
            Track(trk.track_id, trk.to_tlbr())
            for trk in self.tracker.tracks
            if trk.is_confirmed() and not trk.time_since_update > 1
        ]


class Track:
    """Plain record pairing a track identifier with its bounding box."""

    # Class-level defaults; every instance overwrites both in __init__.
    bbox = None
    track_id = None

    def __init__(self, id, bbox):
        """Store the identifier and the box exactly as given.

        Args:
            id: Track identifier (parameter name kept for compatibility even
                though it shadows the builtin inside this method).
            bbox: Bounding box associated with the track.
        """
        self.track_id, self.bbox = id, bbox

def relative_pos(o, p2, p3):
    """Express two points relative to a common origin point.

    Used to build a local coordinate system with the elbow as origin and the
    wrist / shoulder as the two other points.

    Args:
        o: Origin point (sequence of coordinates).
        p2: First point.
        p3: Second point.

    Returns:
        Tuple ``(o - p2, o - p3)`` of NumPy arrays. Note the sign: each vector
        points from the given point towards the origin.
    """
    origin = np.array(o)
    first = origin - np.array(p2)
    second = origin - np.array(p3)
    return (first, second)

def angle_between(v1, v2):
    """Return the angle, in degrees, between two vectors.

    Used to measure the angle between the wrist and shoulder vectors with the
    elbow taken as origin.

    Args:
        v1: First vector (sequence of numbers or NumPy array).
        v2: Second vector, same length as ``v1``.

    Returns:
        Angle in the range [0, 180] degrees. If either vector has zero length
        the angle is undefined and NaN is returned.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        # Degenerate input: avoid the 0/0 division (and the NumPy
        # floating-point warning/error it triggers) but keep the NaN result.
        return np.float64(np.nan)
    cos_theta = np.dot(v1, v2) / norm_product
    # Clip guards against rounding pushing the cosine just outside [-1, 1].
    return np.degrees(np.arccos(np.clip(cos_theta, -1.0, 1.0)))

def estimate_threat(kypts):
    """Estimate whether a person shows a threatening arm gesture.

    A gesture counts as threatening when, for either arm, the wrist is at or
    above the elbow level (image y grows downwards) and the angle at the
    elbow between the shoulder and the wrist is greater than 90 degrees.

    Args:
        kypts: Sequence of ``(x, y, confidence)`` keypoints. Indices 5/6 are
            read as left/right shoulder, 7/8 as left/right elbow and 9/10 as
            left/right wrist.

    Returns:
        True if either arm matches the gesture, otherwise False. Also False
        when fewer than 11 keypoints are supplied.
    """
    # Keypoints with confidence <= 0.5 are replaced by the [0, 0] sentinel
    # and treated as missing.
    points = [[x, y] if p > 0.5 else [0, 0] for x, y, p in kypts]
    if len(points) < 11:
        # Not enough joints to inspect the arms; nothing can be concluded.
        return False

    missing = [0, 0]
    # (shoulder, elbow, wrist) index triples; right arm first, then left.
    for shoulder_idx, elbow_idx, wrist_idx in ((6, 8, 10), (5, 7, 9)):
        shoulder = points[shoulder_idx]
        elbow = points[elbow_idx]
        wrist = points[wrist_idx]

        if missing in (shoulder, elbow, wrist):
            continue

        # Smaller y means higher in the image: wrist raised to elbow level or above.
        if wrist[1] <= elbow[1]:
            wrist_vec, shoulder_vec = relative_pos(elbow, wrist, shoulder)
            if angle_between(shoulder_vec, wrist_vec) > 90:
                return True
    return False


# Demo loop: read frames, detect people and their joints, flag raised/extended
# arms, track people across frames and display the annotated video.

# Load the YOLOv8-pose model for object detection & gathering coordinate data of joints
pose_model = YOLO('yolov8n-pose.pt')

# Create a tracker object of DeepSORT
tracker = Tracker()

# List of random colors (BGR)
colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 200)) for j in range(10)]

# Load input file
cap = cv2.VideoCapture(0)  # if not using webcam then replace 0 with video path
# NOTE(review): this first frame is read and then discarded by the loop below;
# presumably a warm-up read -- confirm it is intentional.
success, frame = cap.read()

font = cv2.FONT_HERSHEY_SIMPLEX

try:
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            # End of stream or camera failure: stop instead of spinning forever.
            print('Failed!!')
            break

        results = pose_model(frame, conf=0.2, classes=[0], show=False)

        detections = []
        labels = []  # (x1, y1, threat) per detected person, drawn after tracking
        for result in results:
            for r, pr in zip(result.boxes.data.tolist(), result.keypoints.data.tolist()):
                x1, y1, x2, y2, probs, _cls = r
                x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
                detections.append([x1, y1, x2, y2, probs])
                labels.append((x1, y1, estimate_threat(pr)))

        # Update the tracker BEFORE drawing anything, so the appearance encoder
        # works on the unannotated frame.
        tracker.update(frame, detections)

        for x1, y1, threat in labels:
            if threat:
                cv2.putText(frame, 'THREAT', (x1, y1), font, 1, (0, 0, 255), 3, cv2.LINE_AA)
            else:
                cv2.putText(frame, 'SAFE', (x1, y1), font, 1, (0, 255, 0), 3, cv2.LINE_AA)

        for track in tracker.tracks:
            x1, y1, x2, y2 = track.bbox
            # One colour per track id, cycling through the palette.
            cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)),
                          colors[track.track_id % len(colors)], 3)

        cv2.imshow("YOLOv8 Inference", frame)

        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Always free the capture device and close windows, including on errors
    # or a keyboard interrupt.
    cap.release()
    cv2.destroyAllWindows()