# INFERENCE:
- After experimenting with many configurations, I found that model 6 gives the highest validation accuracy, i.e. 80%.
- As mentioned in the analysis notebook, a probability threshold of 0.4 gives fewer false negatives, hence I am keeping the threshold at 0.4.
- I have implemented inference in two ways: one without multithreading and the other with multithreading.
- In the dataset, the videos were 5 seconds long and I extracted 20 frames from each 5-second video, i.e. 4 frames per second.
- For inference, to get better predictions, I am using a 2-second window, i.e. extracting 20 frames from 2 seconds of video (10 frames per second).
- When you run the code you will see that I have added a box which shows the status of the input video.

In [None]:
import cv2 
import numpy as np
import matplotlib.pyplot as plt
import os 
from tqdm.notebook import tqdm
from IPython.display import HTML
from base64 import b64encode
import glob
from random import shuffle
import tensorflow as tf
import sys
from tensorflow.keras.layers import Conv3D, MaxPooling1D,Embedding,Concatenate, Input,LSTM,BatchNormalization,SpatialDropout1D
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense,GlobalAveragePooling3D
from tensorflow.keras.models import Model
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from zipfile import ZipFile
from datetime import datetime
import time

%load_ext tensorboard

In [None]:
# Unpack the bundled archive only when the weights file is not already present.
if not os.path.isfile('mdl6_wts.hdf5'):
    # The context manager closes the archive on exit, so no explicit close() is needed.
    with ZipFile('final.zip', 'r') as zipObj:
        zipObj.extractall()

In [None]:
def SELayer(layer):
    """Re-weight a 5-D feature tensor with a squeeze-and-excitation style gate.

    The input is averaged over axes 2, 3 and 4, the result is passed through
    two ReLU dense layers and a sigmoid, and the resulting weights (one per
    index of axis 1) are broadcast-multiplied back onto ``layer``.

    Args:
        layer: Keras tensor with five axes whose size along axis 1 is known.

    Returns:
        A tensor with the same shape as ``layer``, scaled along axis 1.
    """
    # NOTE(review): the gate runs along axis 1, not the last (channel) axis as
    # in the usual SE formulation. Presumably the saved weights were trained
    # this way, so the layout is kept as-is -- confirm before changing.
    gate_len = int(layer.shape[1])
    pool = tf.math.reduce_mean(layer, axis=[2, 3, 4], keepdims=True)
    flatt = Flatten()(pool)
    # Floor division keeps the unit count an int; a float unit count is
    # rejected by newer Keras versions.
    fc1 = tf.keras.layers.Dense(gate_len // 2, activation='relu',
                                kernel_initializer='he_uniform')(flatt)
    fc2 = tf.keras.layers.Dense(gate_len, activation='relu',
                                kernel_initializer='he_uniform')(fc1)
    sig = tf.keras.layers.Activation('sigmoid')(fc2)
    # Shape the gate as (batch, gate_len, 1, 1, 1) so it broadcasts over axes 2-4.
    gate = tf.reshape(sig, shape=(-1, gate_len, 1, 1, 1))
    return tf.keras.layers.Multiply()([layer, gate])

In [None]:
def conv3Dnet_a(layer,k_size,pool_size):
    """Single-convolution 3-D stage.

    Applies Conv3D (5 filters, ReLU) -> Dropout(0.1) -> BatchNormalization
    -> ReLU -> MaxPooling3D to ``layer`` and returns the pooled tensor.

    Args:
        layer: input Keras tensor.
        k_size: kernel size passed to Conv3D.
        pool_size: pool size passed to MaxPooling3D.
    """
    # Layers are instantiated in application order.
    stages = [
        Conv3D(kernel_size=k_size, filters=5, activation='relu',
               kernel_initializer='he_uniform'),
        Dropout(0.1),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.MaxPooling3D(pool_size=pool_size),
    ]
    out = layer
    for stage in stages:
        out = stage(out)
    return out

In [None]:
def conv3Dnet_b(layer,k1_size,k2_size,pool_size):
    """Double-convolution 3-D stage.

    Applies Conv3D (3 filters, ReLU) -> Dropout(0.1) -> BatchNormalization
    -> ReLU, then a second Conv3D (3 filters, ReLU) -> Dropout(0.1) -> ReLU,
    and finally MaxPooling3D. Only the first convolution is batch-normalised.

    Args:
        layer: input Keras tensor.
        k1_size: kernel size of the first Conv3D.
        k2_size: kernel size of the second Conv3D.
        pool_size: pool size passed to MaxPooling3D.
    """
    # Layers are instantiated in application order.
    stages = [
        Conv3D(kernel_size=k1_size, filters=3, activation='relu',
               kernel_initializer='he_uniform'),
        Dropout(0.1),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        Conv3D(kernel_size=k2_size, filters=3, activation='relu',
               kernel_initializer='he_uniform'),
        Dropout(0.1),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.MaxPooling3D(pool_size=pool_size),
    ]
    out = layer
    for stage in stages:
        out = stage(out)
    return out

In [None]:
# Build the network: five Conv3D stages, each followed by an SELayer gate,
# then a dense classifier head ending in a single sigmoid unit.
# Reset Keras global state first.
tf.keras.backend.clear_session()
# One clip per sample, shaped (20, 224, 224, 3); the inference cells reshape
# 20 frames of 224x224 3-channel images to this shape.
# NOTE: the name `input` shadows the Python builtin for the rest of the notebook.
input=Input((20,224,224,3))
# Stages 1-3 use the single-convolution block (conv3Dnet_a).
con_net_1=conv3Dnet_a(input,k_size=(3,5,5),pool_size=(2,2,2))
se_1=SELayer(con_net_1)
con_net_2=conv3Dnet_a(se_1,k_size=(2,3,3),pool_size=(2,2,2))
se_2=SELayer(con_net_2)
con_net_3=conv3Dnet_a(se_2,k_size=(1,2,2),pool_size=(1,2,2))
se_3=SELayer(con_net_3)
# Stages 4-5 use the double-convolution block (conv3Dnet_b).
con_net_4=conv3Dnet_b(se_3,k1_size=(1,2,2),k2_size=(1,2,2),pool_size=(1,2,2))
se_4=SELayer(con_net_4)
con_net_5=conv3Dnet_b(se_4,k1_size=(1,2,2),k2_size=(1,2,2),pool_size=(1,2,2))
se_5=SELayer(con_net_5)
# Classifier head: flatten, then Dense(512) and Dense(100) with dropout in between.
flatt=Flatten()(se_5)
drop3=Dropout(0.4)(flatt)
fc1=tf.keras.layers.Dense(512, activation='relu',kernel_initializer='he_uniform')(drop3)
drop4=Dropout(0.4)(fc1)
fc2=tf.keras.layers.Dense(100, activation='relu',kernel_initializer='he_uniform')(drop4)
drop5=Dropout(0.3)(fc2)
# Single sigmoid output; the inference cells compare it against a 0.4 threshold.
out=tf.keras.layers.Dense(1, activation='sigmoid')(drop5)
model=Model(inputs=input, outputs=out)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0007),
                loss='binary_crossentropy',metrics=['accuracy'])

model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 20, 224, 224 0                                            
__________________________________________________________________________________________________
conv3d (Conv3D)                 (None, 18, 220, 220, 1130        input_1[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 18, 220, 220, 0           conv3d[0][0]                     
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 18, 220, 220, 20          dropout[0][0]                    
_______________________________________________________________________________________

In [None]:
# Load the trained weights from 'mdl6_wts.hdf5' into the model.
# NOTE(review): the file is presumably produced by extracting final.zip -- confirm.
model.load_weights('mdl6_wts.hdf5')

## Method 1: Without Multithreading
- In this method, I am using only one thread for visualizing input video and predicting violence for that video.
- I am printing probability and the time taken for each prediction.

In [None]:
# Single-threaded inference: show the video with a status box and run the
# model on every window of 20 sampled frames.
# https://www.geeksforgeeks.org/python-opencv-cv2-puttext-method/
font = cv2.FONT_HERSHEY_SIMPLEX
start_point = (80, 20)  # top-left corner of the status box
end_point = (320, 60)  # bottom-right corner of the status box
org = (100, 50)  # bottom-left corner of the status text
fontScale = 1
color = (255, 0, 0)  # status text colour: blue in OpenCV's BGR order
thickness = 4  # text stroke thickness in px
color_b = (0, 0, 0)  # black border of the status box
thickness_b = 5  # border thickness in px
vidcap = cv2.VideoCapture('videoplayback_1.mp4')
count = 0
frames = []
success = True
pred = 0.0
prediction = 'NO VIOLENCE'
try:
    while True:
        success, frame = vidcap.read()
        if not success:
            break
        frame = cv2.resize(frame, (224, 224), interpolation=cv2.INTER_AREA)
        # The preview is built from the BGR frame because cv2.imshow expects
        # BGR; only the copy handed to the model is converted to RGB.
        image = cv2.resize(frame, (424, 424), interpolation=cv2.INTER_AREA)
        image = cv2.rectangle(image, start_point, end_point, color_b, thickness_b)
        image = cv2.putText(image, prediction, org, font, fontScale, color, thickness, cv2.LINE_AA)
        cv2.imshow("video stream", image)
        # A single waitKey both paces the playback and reads the quit key.
        if cv2.waitKey(25) & 0xFF == ord('q'):
            break
        if count % 3 == 0:  # keep every 3rd frame
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        count = count + 1
        if len(frames) == 20:  # a full window of 20 frames: run the model
            batch = tf.image.per_image_standardization(np.array(frames))
            batch = tf.reshape(batch, shape=(1, 20, 224, 224, 3))
            start = time.time()
            pred = model.predict(batch)
            end = time.time()
            print('violence_prob', pred)

            if float(pred[0][0]) > 0.4:
                prediction = 'VIOLENCE'
                color = (0, 0, 255)  # red in BGR
                org = (120, 50)
            else:
                prediction = 'NO VIOLENCE'
                color = (255, 0, 0)  # blue in BGR
                org = (100, 50)

            # total time taken
            print(f"prediction time {end - start}")
            frames = []
finally:
    # Always close the window and free the capture, even on errors.
    cv2.destroyAllWindows()
    vidcap.release()

violence_prob [[0.0965968]]
prediction time 1.2168927192687988
violence_prob [[0.12539954]]
prediction time 1.1324501037597656
violence_prob [[0.7220209]]
prediction time 1.2344350814819336
violence_prob [[0.14148954]]
prediction time 1.2813127040863037
violence_prob [[0.0678285]]
prediction time 1.2240521907806396
violence_prob [[0.46223605]]
prediction time 1.2969388961791992
violence_prob [[0.02798069]]
prediction time 1.2703886032104492


### Observations:
- My system is a simple CPU-based system with quite an old i3 processor.
- Prediction for each window of the video takes approximately 1 second, and we can see the video getting stuck for that prediction time.
- In real time, even if the prediction takes more time, the video we get from CCTV will still be continuous and will not get stuck the way our visualization does.
- This happens because I am using a single thread both to visualize the video and to predict violence.

## Method 2: Multithreading 
- In a real-time setting, the video coming from the CCTV will be displayed on-screen by one processor while the violence prediction will use a different processor, and this will be managed by the computer itself.
- Here I am using one thread to visualize the input video and another thread to produce predictions, hence it looks closer to real time.

In [None]:
def prediction(poll_interval=0.05, idle_timeout=30.0):
    """Predict violence for each frame window written to disk by load_video.

    Polls for 'frame<k>.npy' files in order (k = 0, 1, 2, ...), runs the model
    on each one, updates the globals ``pred_var``, ``org`` and ``color`` that
    the display thread reads, and deletes the file afterwards.

    Args:
        poll_interval: seconds to sleep between checks while the next file is
            not available yet, so the waiting loop does not spin at full CPU.
        idle_timeout: stop after this many seconds without a new window;
            ``None`` waits forever.
    """
    global pred_var
    global org
    global color
    k = 0  # index of the next window file to read
    last_progress = time.monotonic()
    while True:
        path = 'frame' + str(k) + '.npy'
        try:
            frames = np.load(path)
        except (OSError, EOFError, ValueError):
            # File not there yet, or still being written: wait and retry.
            # Only loading errors are tolerated; anything else propagates.
            if idle_timeout is not None and time.monotonic() - last_progress > idle_timeout:
                break
            time.sleep(poll_interval)
            continue

        image = tf.image.per_image_standardization(frames)
        image = tf.reshape(image, shape=(1, 20, 224, 224, 3))
        start = time.time()
        pred = model.predict(image)
        end = time.time()  # taken before printing so only predict() is timed
        print('proba', pred)
        print(f"prediction {end - start}")
        if float(pred[0][0]) > 0.4:
            pred_var = 'VIOLENCE'
            color = (0, 0, 255)
            org = (120, 50)
        else:
            pred_var = 'NO VIOLENCE'
            color = (255, 0, 0)
            org = (100, 50)
        try:
            os.remove(path)
        except OSError as err:
            # Advance anyway so the same window is never predicted twice.
            print(f"could not remove {path}: {err}")
        k = k + 1
        last_progress = time.monotonic()
    # No window cleanup here: this function never opens a display window.


In [None]:
import cv2
import numpy as np
from threading import Thread

def load_video(video_path='videoplayback_1.mp4'):
    """Play a video with a status box and hand frame windows to the predictor.

    Every frame is resized to 224x224 and shown (upscaled to 424x424) with the
    current values of the globals ``pred_var``, ``org`` and ``color``. Every
    3rd frame is kept; each time 20 frames have been collected they are saved
    as 'frame<s>.npy' (s = 0, 1, 2, ...) for the prediction thread to read.
    Press 'q' in the window to stop early.

    Args:
        video_path: path of the video file to play.
    """
    import os  # local import: used only for the atomic rename below

    font = cv2.FONT_HERSHEY_SIMPLEX
    start_point = (80, 20)  # top-left corner of the status box
    end_point = (320, 60)  # bottom-right corner of the status box
    fontScale = 1
    thickness = 4  # text stroke thickness in px
    color_b = (0, 0, 0)  # black border of the status box
    thickness_b = 5  # border thickness in px
    count = 0
    s = 0  # index used to name the next .npy window file
    frames = []
    vid = cv2.VideoCapture(video_path)
    try:
        while True:
            success, frame = vid.read()
            if not success:
                break
            frame = cv2.resize(frame, (224, 224), interpolation=cv2.INTER_AREA)
            # The preview is built from the BGR frame because cv2.imshow
            # expects BGR; only the frames saved for the model are RGB.
            image = cv2.resize(frame, (424, 424), interpolation=cv2.INTER_AREA)
            image = cv2.rectangle(image, start_point, end_point, color_b, thickness_b)
            image = cv2.putText(image, pred_var, org, font, fontScale, color, thickness, cv2.LINE_AA)
            cv2.imshow("video stream", image)
            if count % 3 == 0:  # keep every 3rd frame
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            count = count + 1
            if cv2.waitKey(25) & 0xFF == ord('q'):
                break
            if len(frames) == 20:
                # Write under a temporary name and rename afterwards, so the
                # reader never sees a partially written 'frame<s>.npy'.
                final_path = 'frame' + str(s) + '.npy'
                tmp_path = 'frame' + str(s) + '.tmp.npy'
                np.save(tmp_path, np.array(frames))
                os.replace(tmp_path, final_path)
                frames = []
                s = s + 1
    finally:
        # Always free the capture and close the window, even on errors.
        vid.release()
        cv2.destroyAllWindows()





# Initial status shown by the display thread until the first prediction arrives.
pred_var, org, color = 'NO VIOLENCE', (100, 50), (255, 0, 0)
# t2 displays the video, t1 runs the model; the display thread is started first.
t2, t1 = Thread(target=load_video), Thread(target=prediction)
t2.start()
t1.start()
# Runs immediately in the main thread; it does not wait for either worker.
cv2.destroyAllWindows()



proba [[0.0965968]]
prediction 2.219330310821533
proba [[0.12539954]]
prediction 1.353900671005249
proba [[0.7220209]]
prediction 1.2031834125518799
proba [[0.14148954]]
prediction 1.3113369941711426
proba [[0.0678285]]
prediction 1.1714770793914795
proba [[0.46223605]]
prediction 1.1834166049957275
proba [[0.02798069]]
prediction 1.2777040004730225
proba [[0.17393102]]
prediction 1.4151017665863037
proba [[0.18131189]]
prediction 1.1875569820404053
proba [[0.22220267]]
prediction 1.2952513694763184
proba [[0.20460574]]
prediction 1.4219462871551514
proba [[0.48522833]]
prediction 1.3917770385742188
proba [[0.50561386]]
prediction 1.1875572204589844
proba [[0.42332336]]
prediction 1.4472758769989014
proba [[0.16204146]]
prediction 1.5782034397125244
proba [[0.7450101]]
prediction 1.4753801822662354
proba [[0.57157993]]
prediction 3.2990429401397705
proba [[0.57157993]]
prediction 3.297426700592041
proba [[0.4599659]]
prediction 1.381849765777588
proba [[0.313421]]
prediction 1.45266294

### Observations:
- There are some false predictions; this is expected because we got 80% accuracy.
- I think we should take action only when we get 2 to 5 consecutive violence warnings, because I have observed that false violence warnings are not consecutive, whereas actual violence warnings appear 2–3 times in a row since violence usually doesn't end within 2 seconds.

## Getting output with GPU:

- I ran this part of the code in Google Colab so that we can also check the result with a GPU.
- cv2.imshow() doesn't work in Colab, hence I am using matplotlib's animation function for this visualization.
- I am not using multithreading here, because with a GPU the prediction time is approx. 50 milliseconds, hence it looks the same as real time.

In [None]:
%matplotlib notebook
from matplotlib import pyplot as plt 
import numpy as np 
from matplotlib.animation import FuncAnimation 

In [None]:
def load_video(video_path='videoplayback_1.mp4'):
    """Yield annotated RGB frames of a video, predicting violence as it goes.

    Every frame is resized to 224x224, converted to RGB and yielded as a
    424x424 image with a status box drawn on it. Every 3rd frame is kept;
    each time 20 frames have been collected the model is run on them and the
    status text and colour are updated for the following frames.

    Args:
        video_path: path of the video file to read.

    Yields:
        The annotated 424x424 RGB image for each frame of the video.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    start_point = (80, 20)  # top-left corner of the status box
    end_point = (320, 60)  # bottom-right corner of the status box
    org = (100, 50)  # bottom-left corner of the status text
    fontScale = 1
    # Frames are RGB here, so colour tuples are in RGB order.
    color = (0, 0, 255)  # blue
    thickness = 4  # text stroke thickness in px
    color_b = (0, 0, 0)  # black border of the status box
    thickness_b = 5  # border thickness in px
    vidcap = cv2.VideoCapture(video_path)
    count = 0
    frames = []
    prediction = 'NO VIOLENCE'
    try:
        while True:
            success, frame = vidcap.read()
            if not success:
                break
            frame = cv2.resize(frame, (224, 224), interpolation=cv2.INTER_AREA)
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            annotated = cv2.resize(frame, (424, 424), interpolation=cv2.INTER_AREA)
            annotated = cv2.rectangle(annotated, start_point, end_point, color_b, thickness_b)
            annotated = cv2.putText(annotated, prediction, org, font, fontScale, color, thickness, cv2.LINE_AA)
            if count % 3 == 0:  # keep every 3rd frame
                frames.append(frame)
            if len(frames) == 20:  # a full window of 20 frames: run the model
                batch = tf.image.per_image_standardization(np.array(frames))
                batch = tf.reshape(batch, shape=(1, 20, 224, 224, 3))
                start = time.time()
                pred = model.predict(batch)
                end = time.time()
                print('violence_prob', pred)

                if float(pred[0][0]) > 0.4:
                    prediction = 'VIOLENCE'
                    color = (255, 0, 0)  # red
                    org = (120, 50)
                else:
                    prediction = 'NO VIOLENCE'
                    color = (0, 0, 255)  # blue
                    org = (100, 50)

                # total time taken
                print(f"prediction time {end - start}")
                frames = []
            count = count + 1
            yield annotated
    finally:
        # Runs when the video ends and also when the consumer stops early
        # (generator closed), so the capture is always released.
        vidcap.release()

In [None]:
fig, ax = plt.subplots()
# Placeholder shown until the first video frame arrives.
image = cv2.imread('NO_VIOLENCE.png')
if image is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError("could not read placeholder image 'NO_VIOLENCE.png'")
# cv2.imread returns BGR; matplotlib displays RGB.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
line = ax.imshow(image)


def show_f(e):
    """Animation callback: show frame ``e`` and return the updated artists."""
    line.set_data(e)
    # With blit=True, FuncAnimation requires an iterable of artists.
    return (line,)


# Frames are pulled from the load_video generator; save_count caps how many
# frames are written to the output file.
anim = FuncAnimation(fig, show_f, frames=load_video, interval=20,
                     repeat=False, blit=True, save_count=3144)

anim.save('output_with_GPU.mp4', writer='ffmpeg', fps=30)

<IPython.core.display.Javascript object>

violence_prob [[0.09675204]]
prediction time 0.04960966110229492
violence_prob [[0.12437101]]
prediction time 0.05294489860534668
violence_prob [[0.7213771]]
prediction time 0.04805588722229004
violence_prob [[0.14154276]]
prediction time 0.0482022762298584
violence_prob [[0.06740767]]
prediction time 0.04903864860534668
violence_prob [[0.4619738]]
prediction time 0.047614336013793945
violence_prob [[0.02797326]]
prediction time 0.05232882499694824
violence_prob [[0.17352913]]
prediction time 0.046449899673461914
violence_prob [[0.1816435]]
prediction time 0.04693865776062012
violence_prob [[0.22156796]]
prediction time 0.05076122283935547
violence_prob [[0.20494768]]
prediction time 0.05040693283081055
violence_prob [[0.48580834]]
prediction time 0.05139327049255371
violence_prob [[0.5051299]]
prediction time 0.047771453857421875
violence_prob [[0.42287347]]
prediction time 0.048868656158447266
violence_prob [[0.16180274]]
prediction time 0.04840588569641113
violence_prob [[0.7448874]