## Goal

Use ML techniques such as transfer learning, together with OpenCV computer vision, to identify the objects in a video and build KPI metrics.

### Steps:

1. Download the dataset with video data and label
2. Download the YOLO config and weights from this [link](https://pjreddie.com/darknet/yolo/). Copy these files into the project's `cfg/` directory
3. Download coco.names from [here](https://drive.google.com/file/d/1AoYGMJ7FxS4a0KVnXxmMSIbKfpb8TYDM/view?usp=sharing). Copy this file into the project's `cfg/` directory
4. Create a python service which will overlay the text on the video frame
5. Once the target accuracy and performance are achieved, create a Docker image that can be deployed to a cloud platform (Azure)
6. Save it to the database
7. Create a dashboard with KPI metrics

In [16]:
#!pip install matplotlib

In [17]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import argparse
import time

In [18]:
def load_yolo3():
    """Load the YOLOv3 network together with its class names and colors.

    Reads ``cfg/yolov3.weights``, ``cfg/yolov3.cfg`` and ``cfg/coco.names``
    relative to the current working directory and prints the class names.

    Returns:
        tuple: ``(net, class_names, colors, output_layers)`` where ``net`` is
        the network returned by ``cv2.dnn.readNet``, ``class_names`` is the
        list of labels (one per line of ``coco.names``), ``colors`` is a
        random ``(len(class_names), 3)`` array drawn from ``[0, 255)`` and
        ``output_layers`` is the list of unconnected output layer names.
    """
    net = cv2.dnn.readNet("cfg/yolov3.weights", "cfg/yolov3.cfg")

    with open('cfg/coco.names', 'rt') as f:
        class_names = f.read().rstrip('\n').split('\n')

    print(class_names)

    layer_names = net.getLayerNames()
    # Depending on the OpenCV build, getUnconnectedOutLayers() yields either
    # an (N, 1) array or a flat (N,) array of 1-based layer indices.
    # Flattening handles both; indexing each entry with [0] only works for
    # the former.
    out_ids = np.array(net.getUnconnectedOutLayers()).flatten()
    output_layers = [layer_names[int(i) - 1] for i in out_ids]
    colors = np.random.uniform(0, 255, size=(len(class_names), 3))

    return net, class_names, colors, output_layers

In [19]:
def load_image(img_path):
    """Read an image from disk and shrink it to 40% of its original size.

    Args:
        img_path: Path of the image file to load.

    Returns:
        tuple: ``(img, height, width, channels)`` for the resized image.

    Raises:
        OSError: If ``cv2.imread`` could not read ``img_path``.
    """
    img = cv2.imread(img_path)
    # cv2.imread signals failure by returning None instead of raising, so
    # report the offending path here rather than failing inside cv2.resize.
    if img is None:
        raise OSError("Could not read image: {!r}".format(img_path))
    img = cv2.resize(img, None, fx=0.4, fy=0.4)
    height, width, channels = img.shape
    return img, height, width, channels

In [20]:
def display_blob(blob):
    """Show every plane of every image in ``blob`` in an OpenCV window.

    Each window is titled with the plane's index within its image, so planes
    with the same index from different images share one window.
    NOTE(review): the original docstring described the planes as the RED,
    GREEN and BLUE channels -- confirm against how the blob is built.
    """
    indexed_planes = (
        (index, plane)
        for image in blob
        for index, plane in enumerate(image)
    )
    for index, plane in indexed_planes:
        cv2.imshow(str(index), plane)

In [21]:
def detect_objects(img, net, outputLayers):
    """Run one forward pass of ``net`` over ``img``.

    The image is converted to a 320x320 blob scaled by 0.00392, with zero
    mean subtraction, the R and B channels swapped and no cropping.

    Args:
        img: Image passed to ``cv2.dnn.blobFromImage``.
        net: Network exposing ``setInput`` and ``forward``.
        outputLayers: Names of the layers whose outputs are requested.

    Returns:
        tuple: ``(blob, outputs)`` -- the input blob and the result of
        ``net.forward(outputLayers)``.
    """
    preprocessing = dict(
        scalefactor=0.00392,
        size=(320, 320),
        mean=(0, 0, 0),
        swapRB=True,
        crop=False,
    )
    blob = cv2.dnn.blobFromImage(img, **preprocessing)
    net.setInput(blob)
    return blob, net.forward(outputLayers)

In [22]:
def get_box_dimensions(outputs, height, width, conf_threshold=0.8):
    """Convert raw network outputs into pixel-space boxes.

    Each detection row is read as ``[cx, cy, w, h, <unused>, score_0, ...]``
    where the first four values are scaled by the frame size below; the class
    scores start at index 5.

    Args:
        outputs: Iterable of 2-D arrays, one per output layer, one row per
            detection.
        height: Frame height in pixels, used to scale ``cy`` and ``h``.
        width: Frame width in pixels, used to scale ``cx`` and ``w``.
        conf_threshold: A detection is kept only when its best class score is
            strictly greater than this value. Defaults to 0.8.

    Returns:
        tuple: ``(boxes, confs, class_ids)`` -- parallel lists holding
        ``[x, y, w, h]`` (top-left corner plus size, as ints), the best class
        score as ``float`` and the index of that class as ``int``.
    """
    boxes = []
    confs = []
    class_ids = []
    for output in outputs:
        for detect in output:
            scores = detect[5:]
            # Plain int rather than a NumPy scalar, so the ids can be used
            # as list indices or serialised without conversion.
            class_id = int(np.argmax(scores))
            conf = float(scores[class_id])
            if conf <= conf_threshold:
                continue
            center_x = int(detect[0] * width)
            center_y = int(detect[1] * height)
            w = int(detect[2] * width)
            h = int(detect[3] * height)
            # The network reports box centres; drawing needs the top-left
            # corner.
            x = int(center_x - w / 2)
            y = int(center_y - h / 2)
            boxes.append([x, y, w, h])
            confs.append(conf)
            class_ids.append(class_id)
    return boxes, confs, class_ids

In [23]:
def draw_labels(boxes, confs, colors, class_ids, classes, img):
    """Draw the boxes that survive NMS onto ``img`` and show it.

    Non-maximum suppression runs with a score threshold of 0.5 and an NMS
    threshold of 0.4. Every surviving box gets a rectangle and its class
    label, both in that class's color, and the image is then displayed in
    the "Image" window.

    Args:
        boxes: List of ``[x, y, w, h]`` boxes.
        confs: Confidence per box, parallel to ``boxes``.
        colors: Color table indexed by class id.
        class_ids: Class id per box, parallel to ``boxes``.
        classes: Class names indexed by class id.
        img: Image to draw on; modified in place.
    """
    indexes = cv2.dnn.NMSBoxes(boxes, confs, 0.5, 0.4)
    # NMSBoxes may return an (N, 1) array, a flat array or an empty tuple
    # depending on the OpenCV build; normalise to a sorted list of ints so
    # boxes are drawn in their original order.
    kept = sorted(int(i) for i in np.array(indexes).flatten())
    font = cv2.FONT_HERSHEY_PLAIN
    for i in kept:
        x, y, w, h = boxes[i]
        class_id = class_ids[i]
        label = str(classes[class_id])
        # The color table has one row per class, so it must be indexed by
        # class id; indexing by box position gives inconsistent colors and
        # overruns the table when there are more boxes than classes.
        color = tuple(float(c) for c in colors[class_id])
        cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
        cv2.putText(img, label, (x, y - 5), font, 1, color, 1)
    cv2.imshow("Image", img)

In [24]:
def image_detect(img_path):
    """Detect objects in one image and keep it on screen until Esc.

    Loads the model, runs detection on the image at ``img_path``, draws the
    labelled boxes and then polls the keyboard until key code 27 is read.
    """
    net, class_names, palette, out_layers = load_yolo3()
    picture, rows, cols, _channels = load_image(img_path)
    _blob, raw_outputs = detect_objects(picture, net, out_layers)
    found_boxes, scores, ids = get_box_dimensions(raw_outputs, rows, cols)
    draw_labels(found_boxes, scores, palette, ids, class_names, picture)

    esc_key = 27
    # Poll in 1 ms steps until Esc is pressed.
    while cv2.waitKey(1) != esc_key:
        pass

In [25]:
def webcam_detect():
    """Run object detection on webcam frames until Esc or a failed read.

    Loads the model, opens capture device 0 and, for every frame, runs
    detection and shows the labelled frame. The loop ends when key code 27
    (Esc) is read or when a frame cannot be read. The capture is always
    released.
    """
    model, classes, colors, output_layers = load_yolo3()
    # The original called start_webcam(), which is not defined anywhere in
    # this notebook; open the default camera (device 0) directly instead.
    cap = cv2.VideoCapture(0)
    try:
        while True:
            ok, frame = cap.read()
            # read() reports failure via its flag and a None frame; stop
            # instead of failing on frame.shape.
            if not ok or frame is None:
                break
            height, width, channels = frame.shape
            blob, outputs = detect_objects(frame, model, output_layers)
            boxes, confs, class_ids = get_box_dimensions(outputs, height, width)
            draw_labels(boxes, confs, colors, class_ids, classes, frame)
            key = cv2.waitKey(1)
            if key == 27:
                break
    finally:
        # Release the device even if detection raises part-way through.
        cap.release()

In [26]:
def start_video(video_path):
    """Run object detection on each frame of a video file.

    Loads the model, opens ``video_path`` and, for every frame, runs
    detection and shows the labelled frame. The loop ends when the video
    runs out of frames (or cannot be read) or when key code 27 (Esc) is
    read. The capture is always released.

    Args:
        video_path: Path of the video passed to ``cv2.VideoCapture``.
    """
    model, classes, colors, output_layers = load_yolo3()
    cap = cv2.VideoCapture(video_path)
    try:
        while True:
            ok, frame = cap.read()
            # At end of stream read() returns (False, None); stop cleanly
            # instead of failing on frame.shape.
            if not ok or frame is None:
                break
            height, width, channels = frame.shape
            blob, outputs = detect_objects(frame, model, output_layers)
            boxes, confs, class_ids = get_box_dimensions(outputs, height, width)
            draw_labels(boxes, confs, colors, class_ids, classes, frame)
            key = cv2.waitKey(1)
            if key == 27:
                break
    finally:
        # Release the file handle even if detection raises part-way through.
        cap.release()

In [27]:
if __name__ == '__main__':
    # Switches selecting which demos run; every enabled demo runs, in the
    # order webcam -> video -> image.
    webcam, video_play, image = False, True, False

    if webcam:
        print('---- Starting Web Cam object detection ----')
        webcam_detect()

    if video_play:
        video_path = "data/VIRAT_S_000200_00_000100_000171.mp4"
        print("Opening {} .... ".format(video_path))
        start_video(video_path)

    if image:
        image_path = ""
        print("Opening {} .... ".format(image_path))
        image_detect(image_path)

    # Close any OpenCV windows left open by the demos above.
    cv2.destroyAllWindows()

Opening data/VIRAT_S_000200_00_000100_000171.mp4 .... 
['person', 'bicycle', 'car', 'motorbike', 'aeroplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
