In [1]:
import time
import cv2 as cv
import numpy as np
import math

In [2]:
# Files for the pre-trained MobileNet detector (Caffe format): the trained
# weights, and the prototxt file that describes the network architecture.
model_file = "mobilenet_iter_73000.caffemodel"
prototxt_file = "deploy.prototxt"

In [3]:
# Labels the MobileNet detector can report. The position of a label in this
# list is the class id that is looked up for each detection.
classes = [
    "background",   # 0
    "aeroplane",    # 1
    "bicycle",      # 2
    "bird",         # 3
    "boat",         # 4
    "bottle",       # 5
    "bus",          # 6
    "car",          # 7
    "cat",          # 8
    "chair",        # 9
    "cow",          # 10
    "diningtable",  # 11
    "dog",          # 12
    "horse",        # 13
    "motorbike",    # 14
    "person",       # 15
    "pottedplant",  # 16
    "sheep",        # 17
    "sofa",         # 18
    "train",        # 19
    "tvmonitor",    # 20
]

### Step 1. Collect a source video. It may be necessary to divide the video into discrete image frames.

### Step 2. Conduct inference on each frame of the video, drawing bounding boxes around detected vehicles.

In [4]:
# Load the network once at module level so that every call to
# frame_processing reuses the same loaded model.
net = cv.dnn.readNetFromCaffe(prototxt=prototxt_file, caffeModel=model_file)

def frame_processing(next_f, min_confidence=0.4, target_class="car"):
    """Run the detector on one frame and draw boxes around matching objects.

    The frame is passed through the module-level ``net``. Every detection
    whose confidence is above ``min_confidence`` and whose class label equals
    ``target_class`` gets a green rectangle drawn directly onto ``next_f``.

    Parameters
    ----------
    next_f : numpy.ndarray
        Image to process; its first two dimensions are read as
        (height, width). Presumably a BGR frame as returned by
        ``cv.VideoCapture.read()`` -- confirm against the caller.
    min_confidence : float, optional
        Detections with confidence less than or equal to this value are
        ignored. Defaults to 0.4.
    target_class : str, optional
        Label from ``classes`` to draw boxes for. Defaults to "car".

    Returns
    -------
    numpy.ndarray
        The same ``next_f`` object, with the rectangles drawn on it.
    """
    (height, width) = next_f.shape[:2]

    # Build an 8-bit 300x300 blob from the frame; the mean subtraction and
    # scaling are applied by setInput instead of blobFromImage.
    blob = cv.dnn.blobFromImage(next_f, size=(300, 300), ddepth=cv.CV_8U)
    net.setInput(blob, scalefactor=1.0/127.5, mean=[127.5, 127.5, 127.5])
    prediction = net.forward()

    # Box coordinates are multiplied by the frame size to get pixel positions;
    # the factor is the same for every detection, so build it only once.
    frame_scale = np.array([width, height, width, height])

    # Each prediction[0, 0, i] row holds: [1] class id, [2] confidence,
    # [3:7] box corners (start x, start y, end x, end y).
    for i in range(prediction.shape[2]):
        confidence = prediction[0, 0, i, 2]

        # Skip detections the model is not confident enough about.
        if confidence <= min_confidence:
            continue

        idx = int(prediction[0, 0, i, 1])

        # Ignore class ids outside the label list instead of raising
        # IndexError (or silently wrapping around for a negative id).
        if not 0 <= idx < len(classes):
            continue

        if classes[idx] != target_class:
            continue

        box = prediction[0, 0, i, 3:7] * frame_scale
        # Plain Python ints (truncated toward zero, like astype("int")) are
        # used so that cv.rectangle does not receive NumPy scalar types.
        s_X, s_Y, e_X, e_Y = (int(v) for v in box)

        cv.rectangle(next_f, (s_X, s_Y), (e_X, e_Y), (0, 255, 0), 3)

    return next_f

### Step 3. Format the results back into a video.

In [5]:
def cardetection(filename, output_path='output_video.mp4'):
    """Detect cars in a video and write the annotated frames to a new video.

    Each frame of ``filename`` is read, passed through ``frame_processing``
    (which draws the bounding boxes) and written to ``output_path`` using the
    'mp4v' codec at the source frame size.

    Parameters
    ----------
    filename : str
        Path of the source video, passed to ``cv.VideoCapture``.
    output_path : str, optional
        Path of the video file to create. Defaults to 'output_video.mp4'.

    Raises
    ------
    IOError
        If the source video cannot be opened or the output video cannot be
        created. Previously these failures were silent and the success
        message was printed anyway.
    """
    # Step 1: open the source video.
    cap = cv.VideoCapture(filename)
    output = cv.VideoWriter()

    # try/finally so the capture and writer are released even if frame
    # processing raises part-way through the video.
    try:
        if not cap.isOpened():
            raise IOError("Could not open input video: {}".format(filename))

        # Output file dimensions match the source.
        width = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))
        size = (width, height)

        # Use the source frame rate so the output plays at the original
        # speed; fall back to 20 when the reported value is not a positive
        # number (this comparison is also False for NaN).
        fps = cap.get(cv.CAP_PROP_FPS)
        if not fps > 0:
            fps = 20

        # Step 3: set up the output video.
        format_v = cv.VideoWriter_fourcc('m','p','4','v')
        if not output.open(output_path, format_v, fps, size, True):
            raise IOError("Could not open output video: {}".format(output_path))

        print("Algorithm Successfully started")

        while True:
            ret, next_f = cap.read()  # read the next video frame into memory

            if not ret:  # no more frames (or the read failed)
                break

            # Step 2: draw the detections, then append the frame to the output.
            # No cv.waitKey() here: no window is ever created, so a key press
            # could never be detected and the call would only add a delay.
            output.write(frame_processing(next_f))

        print("Car detection video successfully generated")
    finally:
        cap.release()
        output.release()

In [6]:
# Step 1: collect a source video.
# Set the path of the video file to process here; the function reads it
# frame by frame.
cardetection(filename='raw_video.mp4')

Algorithm Successfully started
Car detection video successfully generated


### Comments and Observations:

In this algorithm I've used the OpenCV library to split the video into frames. `cap.read()` (the `read()` method of `cv.VideoCapture`) reads each frame into memory, and that frame is processed by the `frame_processing` function. After processing, the frame is appended to an output file. A pre-trained MobileNet model is used to detect objects in each frame. The model can detect many classes, but here I've used it to detect only cars. OpenCV lets you load a pre-trained Caffe model, so we can pass each frame through a deep CNN.

The prototxt file describes the structure of the neural network, and the .caffemodel file holds a pre-trained MobileNet model built with Caffe.

The reason far-off cars are not detected in the video is the confidence threshold. A detection is only kept when the model is sufficiently sure it is a car (a confidence above 0.4 in this notebook). That only happens when the car is closer in the frame and yields features that result in a high confidence.