OBJECT DETECTION WITH YOLOv2 AND OBJECT TRACKING & CAR COUNTING WITH SORT

In [27]:
from sort import *

#create instance of SORT
mot_tracker = Sort() 


In [28]:
# import required packages
import cv2
import numpy as np

In [29]:
# read class names from text file
with open('C:/Users/ritti/OneDrive/Desktop/MT2022090,MT2022162/object-detection-opencv-master/yolov2.txt', 'r') as f:
        classes = [line.strip() for line in f.readlines()]

    # generate different colors for different classes 
COLORS = np.random.uniform(0, 255, size=(len(classes), 3))

    # read pre-trained model and config file
net = cv2.dnn.readNet('C:/Users/ritti/OneDrive/Desktop/MT2022090,MT2022162/object-detection-opencv-master/yolov2.weights','C:/Users/ritti/OneDrive/Desktop/MT2022090,MT2022162/object-detection-opencv-master/yolov2.cfg')

In [30]:
# function to get the output layer names 
# in the architecture
def get_output_layers(net):
    
    layer_names = net.getLayerNames()
    
    output_layers = [layer_names[i - 1] for i in net.getUnconnectedOutLayers()]

    return output_layers

# function to draw bounding box on the detected object with class name
def draw_bounding_box(img, class_id, confidence, x, y, x_plus_w, y_plus_h):

    label = str(classes[class_id])

    color = COLORS[class_id]

    cv2.rectangle(img, (x,y), (x_plus_w,y_plus_h), color, 2)

    cv2.putText(img, label, (x-10,y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,255,0), 2)

    return img

In [31]:
def yolov2(image):
    n_i = []
    Width = image.shape[1]
    Height = image.shape[0]
    scale = 0.00392

    # create input blob 
    blob = cv2.dnn.blobFromImage(image, scale, (416,416), (0,0,0), True, crop=False)

    # set input blob for the network
    net.setInput(blob)

    # run inference through the network
    # and gather predictions from output layers
    outs = net.forward(get_output_layers(net))

    # initialization
    class_ids = []
    confidences = []
    boxes = []
    conf_threshold = 0.5
    nms_threshold = 0.4

    # for each detetion from each output layer 
    # get the confidence, class id, bounding box params
    # and ignore weak detections (confidence < 0.5)
    for out in outs:
        for detection in out:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.5 and classes[class_id]=='car':
                center_x = int(detection[0] * Width)
                center_y = int(detection[1] * Height)
                w = int(detection[2] * Width)
                h = int(detection[3] * Height)
                x = center_x - w / 2
                y = center_y - h / 2
                class_ids.append(class_id)
                confidences.append(float(confidence))
                boxes.append([x, y, w, h])
    
    # apply non-max suppression
    indices = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold, nms_threshold)

    # go through the detections remaining
    # after nms and draw bounding box
    for i in indices:
        box = boxes[i]
        x = box[0]
        y = box[1]
        w = box[2]
        h = box[3]
        n_i.append([round(x), round(y),round(x+w),round(y+h),confidences[i]])

    return image,n_i      


In [32]:
# Load the video file using OpenCV
cap = cv2.VideoCapture("C:/Users/ritti/OneDrive/Desktop/MT2022090,MT2022162/object-detection-opencv-master/Video3.mp4")
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

In [33]:
frames = []
car_ids = []
while cap.isOpened():
    # Extract the next frame from the video
    ret, frame = cap.read()

    if not ret:
        break
    
    f,n_i = yolov2(frame)
    
    
    if(len(n_i)==0):
      frames.append(f)
      continue
    
    #preds=model(img)
    #d = preds.pred[0].numpy()
    #print(d)
    
    tracks = mot_tracker.update(np.array(n_i))
  
    
    for track in tracks:
      left, top, right, bottom, track_id = track
      print(track_id)
      car_ids.append(track_id)
      
        
      cv2.rectangle(f,(int(left), int(top)),(int(right), int(bottom)),(0,255,255),2)
      cv2.putText(f, "ID:" + str(track_id), (int(left+33), int(top)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 2)
      cv2.putText(f, 'CAR', (int(left), int(top)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,255,0), 2)
  
    
    
    frames.append(f)
    #cv2_imshow(f)
    
cap.release()
cv2.destroyAllWindows()

92.0
94.0
93.0
92.0
93.0
92.0
95.0
95.0
97.0
97.0
97.0
97.0
97.0
97.0
97.0
99.0
97.0
100.0
99.0
99.0
99.0
99.0
99.0
99.0
99.0
99.0
99.0
104.0
99.0
104.0
99.0
104.0
106.0
104.0
106.0
104.0
106.0
104.0
106.0
104.0
106.0
104.0
106.0
106.0
106.0
111.0
106.0
112.0
111.0
106.0
112.0
111.0
106.0
112.0
111.0
106.0
112.0
111.0
106.0
113.0
112.0
111.0
106.0
113.0
112.0
111.0
106.0
113.0
112.0
111.0
106.0
113.0
112.0
111.0
113.0
112.0
111.0
113.0
112.0
111.0
113.0
112.0
111.0
113.0
112.0
111.0
115.0
113.0
112.0
111.0
115.0
114.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
112.0
111.0
115.0
113.0
112.0
111.0
115.0
113.0
112.0
111.0
115.0
113.0
112.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
113.0
111.0
115.0
11

In [34]:
unique_ids = set(car_ids)
num_cars = len(unique_ids)
print("Number of cars:{}".format(num_cars))

Number of cars:24


In [35]:
output_file = 'vid3_yolov2_sort.mp4'


fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_file, fourcc, fps, (width,height))

for frame in frames:
    out.write(frame)

out.release()

OBJECT DETECTION WITH FASTER RCNN AND OBJECT TRACKING & CAR COUNTING WITH SORT

In [44]:
from sort import *

#create instance of SORT
mot_tracker = Sort() 

In [45]:
import torch
import torchvision
import cv2

In [46]:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()



FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [47]:
# Load the video file using OpenCV
cap = cv2.VideoCapture('C:/Users/ritti/OneDrive/Desktop/MT2022090,MT2022162/object-detection-opencv-master/video3.mp4')
fps = cap.get(cv2.CAP_PROP_FPS)
print(fps)
# Define the video wri
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
print(width)
print(height)

29.88663410085282
352
640


In [48]:
coco_names = ["person" , "bicycle" , "car" , "motorcycle" , "airplane" , "bus" , "train" , "truck" , "boat" , "traffic light" , "fire hydrant" , "street sign" , "stop sign" , "parking meter" , "bench" , "bird" , "cat" , "dog" , "horse" , "sheep" , "cow" , "elephant" , "bear" , "zebra" , "giraffe" , "hat" , "backpack" , "umbrella" , "shoe" , "eye glasses" , "handbag" , "tie" , "suitcase" , 
"frisbee" , "skis" , "snowboard" , "sports ball" , "kite" , "baseball bat" , 
"baseball glove" , "skateboard" , "surfboard" , "tennis racket" , "bottle" , 
"plate" , "wine glass" , "cup" , "fork" , "knife" , "spoon" , "bowl" , 
"banana" , "apple" , "sandwich" , "orange" , "broccoli" , "carrot" , "hot dog" ,
"pizza" , "donut" , "cake" , "chair" , "couch" , "potted plant" , "bed" ,
"mirror" , "dining table" , "window" , "desk" , "toilet" , "door" , "tv" ,
"laptop" , "mouse" , "remote" , "keyboard" , "cell phone" , "microwave" ,
"oven" , "toaster" , "sink" , "refrigerator" , "blender" , "book" ,
"clock" , "vase" , "scissors" , "teddy bear" , "hair drier" , "toothbrush" , "hair brush"]

In [49]:
frames = []
car_ids = []
while cap.isOpened():
    # Extract the next frame from the video
    ret, frame = cap.read()

    if not ret:
        break
    
    
    
    n_i = []
    
  
    
    transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
    input_image = transform(frame)
    
        # Pass the image through the model
    with torch.no_grad():
        output = model([input_image])
           # Draw the predicted bounding boxes on the image
        boxes = output[0]['boxes']
        labels = output[0]['labels']
        scores = output[0]['scores']
        
        num = torch.argwhere(scores>0.8).shape[0]
  
        for i in range(num):
          x1,y1,x2,y2 = boxes[i].numpy().astype("int")
          class_name = coco_names[labels.numpy()[i]-1]
          if(class_name == "car"):
            n_i.append([x1,y1,x2,y2,scores[i]])
        
        if(len(n_i)==0):
          frames.append(frame)
          continue
        
    tracks = mot_tracker.update(np.array(n_i))
  
    
    for track in tracks:
      left, top, right, bottom, track_id = track
      print(track_id)
      car_ids.append(track_id)
      
        
      cv2.rectangle(frame,(int(left), int(top)),(int(right), int(bottom)),(0,255,255),2)
      cv2.putText(frame, "ID:" + str(track_id), (int(left+33), int(top)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 2)
      cv2.putText(frame, 'CAR', (int(left), int(top)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,255,0), 2)
  
    
    
    frames.append(frame)
    #cv2_imshow(f)
    
cap.release()
cv2.destroyAllWindows()

138.0
137.0
136.0
135.0
134.0
137.0
136.0
135.0
134.0
137.0
136.0
135.0
134.0
136.0
136.0
136.0
140.0
139.0
136.0
140.0
139.0
136.0
140.0
139.0
136.0
140.0
139.0
136.0
140.0
139.0
136.0
145.0
140.0
139.0
136.0
145.0
140.0
139.0
136.0
140.0
139.0
136.0
139.0
136.0
139.0
136.0
139.0
136.0
140.0
139.0
136.0
140.0
139.0
136.0
140.0
139.0
136.0
140.0
139.0
136.0
140.0
139.0
136.0
140.0
139.0
136.0
147.0
140.0
139.0
136.0
148.0
147.0
140.0
139.0
136.0
148.0
147.0
140.0
139.0
136.0
148.0
147.0
140.0
139.0
136.0
148.0
147.0
140.0
139.0
136.0
147.0
140.0
139.0
136.0
147.0
140.0
139.0
136.0
147.0
140.0
139.0
136.0
147.0
140.0
139.0
136.0
147.0
140.0
139.0
136.0
150.0
147.0
140.0
139.0
136.0
150.0
147.0
140.0
139.0
136.0
150.0
147.0
140.0
139.0
136.0
150.0
147.0
140.0
139.0
136.0
150.0
147.0
140.0
139.0
136.0
150.0
147.0
140.0
139.0
136.0
150.0
147.0
140.0
139.0
136.0
150.0
147.0
140.0
139.0
136.0
150.0
147.0
140.0
139.0
136.0
150.0
147.0
140.0
139.0
136.0
150.0
147.0
140.0
139.0
136.0
150.0
147.

In [50]:
unique_ids = set(car_ids)
num_cars = len(unique_ids)
print("Number of cars:{}".format(num_cars))

Number of cars:13


In [51]:
output_file = 'vid3_frcnn_sort.mp4'


fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_file, fourcc, fps, (width,height))

for frame in frames:
    out.write(frame)

out.release()