# Object Detection using SSD

### Importing the libraries

In [1]:
import torch
from torch.autograd import Variable
import cv2
from data import BaseTransform, VOC_CLASSES as labelmap
from ssd import build_ssd
import imageio

### Defining a function that will do the detections

In [2]:
def detect(frame, net, transform, threshold=0.6):
    """Draw the SSD detections on a frame and return that frame.

    Args:
        frame: Image as a NumPy array whose first two dimensions are
            (height, width). It is drawn on in place.
        net: Callable SSD network. Its output is indexed here as
            [batch, class, occurrence, (score, x0, y0, x1, y1)].
        transform: Callable whose first return value is the frame prepared
            for the network (a NumPy array in height/width/channel order).
        threshold: Minimum matching score for an occurrence to be drawn.
            Defaults to 0.6.

    Returns:
        The same ``frame`` object, with a rectangle and a class label drawn
        for every occurrence that was kept.
    """
    # Height and width of the original frame, used to rescale the boxes.
    height, width = frame.shape[:2]

    # The transformation returns several values; only the first one
    # (the transformed frame) is needed.
    frame_t = transform(frame)[0]

    # NumPy (H, W, C) -> torch (C, H, W), then add the batch dimension.
    x = torch.from_numpy(frame_t).permute(2, 0, 1).unsqueeze(0)

    # Pure inference: no gradients are needed, so do not build a graph.
    with torch.no_grad():
        y = net(x)
    detections = y.data

    # Multiplying (x0, y0, x1, y1) by [width, height, width, height] maps the
    # box corners to pixel coordinates of the original frame
    # (assumes the network emits coordinates relative to the frame size).
    scale = torch.Tensor([width, height, width, height])

    num_classes = detections.size(1)
    max_occurrences = detections.size(2)

    # Labels are looked up with labelmap[i - 1], so class index 0 has no
    # label of its own (presumably the background class). It is skipped:
    # otherwise it would be mislabelled with labelmap[-1].
    for i in range(1, num_classes):
        j = 0
        # Stop at the first occurrence below the threshold (this relies on
        # the occurrences being ordered by decreasing score, as the original
        # loop did), and never index past the last occurrence slot.
        while j < max_occurrences and detections[0, i, j, 0] >= threshold:
            # OpenCV needs plain numbers, hence the conversion to NumPy.
            pt = (detections[0, i, j, 1:] * scale).numpy()
            top_left = (int(pt[0]), int(pt[1]))
            bottom_right = (int(pt[2]), int(pt[3]))

            # Rectangle around the detected object.
            cv2.rectangle(frame, top_left, bottom_right, (255, 0, 0), 2)

            # Class label anchored at the top-left corner of the rectangle.
            cv2.putText(frame, labelmap[i - 1], top_left,
                        cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2,
                        cv2.LINE_AA)

            j += 1

    return frame

### Creating the SSD neural network

In [3]:
 # We create an object that is our neural network ssd.
net = build_ssd('test')

# Read the pretrained weights from disk. The map_location callback hands back
# the storage it is given unchanged, so the weights stay where they are
# deserialized instead of being moved to the device they were saved from.
pretrained_weights = torch.load(
    'ssd300_mAP_77.43_v2.pth',
    map_location=lambda storage, loc: storage,
)

# Copy the pretrained weights into our network.
net.load_state_dict(pretrained_weights)

  self.priors = Variable(self.priorbox.forward(), volatile=True)


<All keys matched successfully>

### Creating the transformation

In [4]:
# Build the BaseTransform object that prepares a frame so that it can be fed
# to the neural network.
#   - net.size: the frame is transformed according to what the ssd expects.
#   - the three values 104, 117 and 123 (each divided by 256) are the numbers
#     the ssd was trained with; no need to remember them.
transform = BaseTransform(
    net.size,
    tuple(channel / 256.0 for channel in (104, 117, 123)),
)

### Doing some Object Detection on a video

In [5]:
# Switch the network to evaluation mode once, rather than on every frame.
net.eval()

# Open the input video. The context managers close the reader and the writer
# even if an error interrupts the processing, so the output file is always
# finalized and no file handle is leaked.
with imageio.get_reader('funny_dog.mp4') as reader:

    # Frames per second of the input video.
    fps = reader.get_meta_data()['fps']

    # The output video is written with the same frame rate.
    with imageio.get_writer('output.mp4', fps=fps) as writer:

        # Iterate over the frames of the input video.
        for i, frame in enumerate(reader):

            # Detect the objects on the frame (detect is defined above).
            frame = detect(frame, net, transform)

            # Append the annotated frame to the output video.
            writer.append_data(frame)

            # Print the number of the processed frame.
            print(i)

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
