# **Data Preprocessing**

In [47]:
cd /content/drive/MyDrive/DangerousDrivingRecognition

/content/drive/MyDrive/DangerousDrivingRecognition


In [48]:
!pip install tensorflow_model_optimization

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [49]:
from tensorflow import keras
from imutils import paths

from imutils import paths
from tqdm import tqdm
import shutil
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os
from time import time
import shutil

from tensorflow.keras.models import load_model

In [50]:
# Open the .txt file which have names of training videos
f = open("/content/drive/MyDrive/DangerousDrivingRecognition/traintestlist/trainlist.txt", "r")
temp = f.read()
videos = temp.split('\n')

# Create a dataframe having video names
train = pd.DataFrame()
train['video_name'] = videos
train = train[:-1]

In [51]:
# Open the .txt file which have names of test videos
with open("/content/drive/MyDrive/DangerousDrivingRecognition/traintestlist/testlist.txt", "r") as f:
    temp = f.read()
videos = temp.split("\n")

# Create a dataframe having video names
test = pd.DataFrame()
test["video_name"] = videos
test = test[:-1]

In [52]:
def extract_tag(video_path):
    return video_path.split("/")[1].split("_")[0]

def separate_video_name(video_name):
    return video_name.split("/")[1]

def rectify_video_name(video_name):
    return video_name.split(" ")[0]

def move_videos(df, output_dir):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    for i in tqdm(range(df.shape[0])):
        videoFile = df['video_name'][i].split("/")[-1]
        videoPath = os.path.join("data", videoFile)
        shutil.copy2(videoPath, output_dir)
    print()
    print(f"Total videos: {len(os.listdir(output_dir))}")

train["tag"] = train["video_name"].apply(extract_tag)
train["video_name"] = train["video_name"].apply(separate_video_name)

In [53]:
train["video_name"] = train["video_name"].apply(rectify_video_name)

In [54]:
test["tag"] = test["video_name"].apply(extract_tag)
test["video_name"] = test["video_name"].apply(separate_video_name)

In [55]:
n = 2
topNActs = train["tag"].value_counts().nlargest(n).reset_index()["index"].tolist()
train_new = train[train["tag"].isin(topNActs)]
test_new = test[test["tag"].isin(topNActs)]

In [56]:
train_new = train_new.reset_index(drop=True)
test_new = test_new.reset_index(drop=True)

In [57]:
train_new.to_csv("train.csv", index=False)
test_new.to_csv("test.csv", index=False)

In [58]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

Total videos for training: 472
Total videos for testing: 80


# **Hyperparameters**

In [60]:
IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 500

MAX_SEQ_LENGTH = 30
NUM_FEATURES = 1280

# **Models**

In [15]:
MobileNet = tf.keras.applications.MobileNet(
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    alpha=1.0,
    depth_multiplier=1,
    dropout=0.001,
    include_top=False,
    weights="imagenet",
    input_tensor=None,
    pooling="avg",
    classes=2,
    classifier_activation="softmax",
)

def build_feature_extractor(conv_model):
    
    feature_extractor = conv_model

    preprocess_input = tf.keras.applications.mobilenet.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="MobileNet")


MobileNet = build_feature_extractor(MobileNet)

In [16]:
MobileNetV2 = tf.keras.applications.MobileNetV2(
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    alpha=1.0,
    include_top=False,
    weights="imagenet",
    input_tensor=None,
    pooling="avg",
    classes=2,
    classifier_activation="softmax",
)

def build_feature_extractor(conv_model):
    
    feature_extractor = conv_model
    
    preprocess_input = tf.keras.applications.mobilenet_v2.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="MobileNetV2")


MobileNetV2 = build_feature_extractor(MobileNetV2)

In [17]:
MobileNetV3Small = tf.keras.applications.MobileNetV3Small(
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    alpha=1.0,
    minimalistic=False,
    include_top=False,
    weights="imagenet",
    input_tensor=None,
    classes=2,
    pooling="avg",
    dropout_rate=0.2,
    classifier_activation="softmax",
    include_preprocessing=True,
)

def build_feature_extractor(conv_model):

    feature_extractor = conv_model

    preprocess_input = tf.keras.applications.mobilenet_v3.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="MobileNetV3Small")


MobileNetV3Small = build_feature_extractor(MobileNetV3Small)

In [18]:
NASNetMobile = tf.keras.applications.NASNetMobile(
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    include_top=False,
    weights="imagenet",
    input_tensor=None,
    pooling="avg",
    classes=2,
)

def build_feature_extractor(conv_model):
    
    feature_extractor = conv_model
    
    preprocess_input = tf.keras.applications.nasnet.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="NASNetMobile")


NASNetMobile = build_feature_extractor(NASNetMobile)

In [19]:
EfficientNetB0 = tf.keras.applications.EfficientNetB0(
    include_top=False,
    weights="imagenet",
    input_tensor=None,
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    pooling="avg",
    classes=2,
    classifier_activation="softmax",
)

def build_feature_extractor(conv_model):
    
    feature_extractor = conv_model
    
    preprocess_input = keras.applications.efficientnet.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="EfficientNetB0")


EfficientNetB0 = build_feature_extractor(EfficientNetB0)

In [20]:
EfficientNetV2B0 = tf.keras.applications.EfficientNetV2B0(
    include_top=False,
    weights="imagenet",
    input_tensor=None,
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    pooling="avg",
    classes=2,
    classifier_activation="softmax",
    include_preprocessing=True,
)

def build_feature_extractor(conv_model):
    
    feature_extractor = conv_model
    
    preprocess_input = keras.applications.efficientnet_v2.preprocess_input
    
    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="EfficientNetV2B0")


EfficientNetV2B0 = build_feature_extractor(EfficientNetV2B0)

# **Training**

In [21]:
def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

In [22]:
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"])
)

In [None]:
def prepare_all_videos(df, root_dir, feature_extractor):
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    labels = df["tag"].values
    labels = label_processor(labels[..., None]).numpy()

    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    # For each video.
    for idx, path in enumerate(video_paths):
        print(f'{idx}/{len(video_paths)}')
        # Gather all its frames and add a batch dimension.
        frames = load_video(os.path.join(root_dir, path))
        frames = frames[None, ...]

        # Initialize placeholders to store the masks and features of the current video.
        temp_frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
        temp_frame_features = np.zeros(
            shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
        )

        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            for j in range(length):


                # interpreter = tf.lite.Interpreter(model_path="feature_extractor_quantized.tflite")
                # interpreter.allocate_tensors()
                # input_details = interpreter.get_input_details()
                # output_details = interpreter.get_output_details()
                # input_shape = input_details[0]['shape']
                # input_data = np.array(np.random.random_sample(input_shape), dtype=np.float32)
                # input_data[0] = batch[None, j, :]
                # interpreter.set_tensor(input_details[0]['index'], input_data)
                # interpreter.invoke()
                # output_data = interpreter.get_tensor(output_details[0]['index'])
                # temp_frame_features[i, j, :] = output_data


                temp_frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
            temp_frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

        frame_features[idx,] = temp_frame_features.squeeze()
        frame_masks[idx,] = temp_frame_mask.squeeze()

    return (frame_features, frame_masks), labels


train_data, train_labels = prepare_all_videos(train_df, "train", MobileNetV2)
test_data, test_labels = prepare_all_videos(test_df, "test", MobileNetV2)

print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")

0/472
1/472
2/472
3/472
4/472
5/472
6/472
7/472
8/472
9/472
10/472
11/472
12/472
13/472
14/472
15/472
16/472
17/472
18/472
19/472
20/472
21/472
22/472
23/472
24/472
25/472
26/472
27/472
28/472
29/472
30/472
31/472
32/472
33/472
34/472
35/472
36/472
37/472
38/472
39/472
40/472
41/472
42/472
43/472
44/472
45/472
46/472
47/472
48/472
49/472
50/472
51/472
52/472
53/472
54/472
55/472
56/472
57/472
58/472
59/472
60/472
61/472
62/472
63/472
64/472
65/472
66/472
67/472
68/472
69/472
70/472
71/472
72/472
73/472
74/472
75/472
76/472
77/472
78/472
79/472
80/472
81/472
82/472
83/472
84/472
85/472
86/472
87/472
88/472
89/472
90/472
91/472
92/472
93/472
94/472
95/472
96/472
97/472
98/472
99/472
100/472
101/472
102/472
103/472
104/472
105/472
106/472
107/472
108/472
109/472
110/472
111/472
112/472
113/472
114/472
115/472
116/472
117/472
118/472
119/472
120/472
121/472
122/472
123/472
124/472
125/472
126/472
127/472
128/472
129/472
130/472

In [None]:
LSTM = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH, NUM_FEATURES), name='input'),
    tf.keras.layers.LSTM(16, time_major=False, return_sequences=True),
    tf.keras.layers.LSTM(8),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(8, activation="relu"),
    tf.keras.layers.Dense(2, activation="softmax")
])
LSTM.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

GRU = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH, NUM_FEATURES), name='input'),
    tf.keras.layers.GRU(16, time_major=False, return_sequences=True),
    tf.keras.layers.GRU(8),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(8, activation="relu"),
    tf.keras.layers.Dense(2, activation="softmax")
])
GRU.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

RNN = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH, NUM_FEATURES), name='input'),
    tf.keras.layers.SimpleRNN(16, time_major=False, return_sequences=True),
    tf.keras.layers.SimpleRNN(8),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(8, activation="relu"),
    tf.keras.layers.Dense(2, activation="softmax")
])
RNN.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

LSTM.summary()
GRU.summary()
RNN.summary()

In [None]:
# Utility for running experiments.
def run_experiment(seq_model):
    filepath = "/content/drive/MyDrive/DangerousDrivingRecognition/DangerousDrivingRecognition"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    history = seq_model.fit(
        [train_data[0]],
        train_labels,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[checkpoint],
        use_multiprocessing=True,
    )

    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([test_data[0]], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model

print("LSTM")
history_lstm, LSTM = run_experiment(LSTM)
print("\n")
print("GRU")
history_gru, GRU = run_experiment(GRU)
print("\n")
print("RNN")
history_rnn, RNN = run_experiment(RNN)

## **Saving Models**

In [None]:
cd /content

In [None]:
MobileNet.save("MobileNet.h5")
MobileNetV2.save("MobileNetV2.h5")
MobileNetV3Small.save("MobileNetV3Small.h5")
NASNetMobile.save("NASNetMobile.h5")
EfficientNetB0.save("EfficientNetB0.h5")
EfficientNetV2B0.save("EfficientNetV2B0.h5")
RNN.save("RNN.h5")
GRU.save("GRU.h5")
LSTM.save("LSTM.h5")

# **TF Model**

In [None]:
run_model = tf.function(lambda x: LSTM(x))
# This is important, let's fix the input size.
BATCH_SIZE = BATCH_SIZE
STEPS = MAX_SEQ_LENGTH
INPUT_SIZE = NUM_FEATURES
concrete_func = run_model.get_concrete_function(
    tf.TensorSpec([BATCH_SIZE, STEPS, INPUT_SIZE], LSTM.inputs[0].dtype))

# model directory.
MODEL_DIR = "keras_lstm"
LSTM.save(MODEL_DIR, save_format="tf", signatures=concrete_func)

In [None]:
converter = tf.lite.TFLiteConverter.from_saved_model(MODEL_DIR)
tflite_model = converter.convert()

with open('sequence_model.tflite', 'wb') as f:
  f.write(tflite_model)

In [None]:
converter = tf.lite.TFLiteConverter.from_saved_model(MODEL_DIR)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# converter.representative_dataset = representative_dataset
tflite_quant_model = converter.convert()
with open('sequence_model_quant.tflite', 'wb') as f:
  f.write(tflite_quant_model)

In [None]:
converter = tf.lite.TFLiteConverter.from_saved_model(MODEL_DIR)
tflite_model = converter.convert()

with open('sequence_model.tflite', 'wb') as f:
  f.write(tflite_model)

In [None]:
converter = tf.lite.TFLiteConverter.from_saved_model(MODEL_DIR)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# converter.representative_dataset = representative_dataset
tflite_quant_model = converter.convert()
with open('sequence_model_quant.tflite', 'wb') as f:
  f.write(tflite_quant_model)

# **Inference Time & Accuracy**

In [35]:
def prepare_single_video(frames, feature_extractor):
    frames = frames[None, ...]
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            
            # interpreter = tf.lite.Interpreter(model_path="feature_extractor_quantized.tflite")
            # interpreter.allocate_tensors()
            # input_details = interpreter.get_input_details()
            # output_details = interpreter.get_output_details()
            # input_shape = input_details[0]['shape']
            # input_data = np.array(np.random.random_sample(input_shape), dtype=np.float32)
            # input_data[0] = batch[None, j, :]
            # interpreter.set_tensor(input_details[0]['index'], input_data)
            # interpreter.invoke()
            # output_data = interpreter.get_tensor(output_details[0]['index'])
            # frame_features[i, j, :] = output_data
            
            
            frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
        frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

    return frame_features, frame_mask


def sequence_prediction(path, feature_extractor, sequence_model):
    class_vocab = label_processor.get_vocabulary()

    frames = load_video(os.path.join("test", path))
    start = time()
    frame_features, frame_mask = prepare_single_video(frames, feature_extractor)
    probabilities = sequence_model.predict([frame_features, ])[0]
    end = time()
    print(f'Inference Time: {end - start}')
    inference_time = end - start

    for i in np.argsort(probabilities)[::-1]:
        print(f"  {class_vocab[i]}: {probabilities[i] * 100:5.2f}%")
    return inference_time, frames

In [44]:
def with_opencv(filename):
    video = cv2.VideoCapture(filename)

    duration = video.get(cv2.CAP_PROP_POS_MSEC)
    frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT)

    return duration, frame_count


timing_rnn = 0
timing_gru = 0
timing_lstm = 0
for i in range(20):
  test_video = np.random.choice(test_df["video_name"].values.tolist())
  inference_time_rnn, test_frames = sequence_prediction(test_video, MobileNet, RNN)
  inference_time_gru, test_frames = sequence_prediction(test_video, MobileNet, GRU)
  inference_time_lstm, test_frames = sequence_prediction(test_video, MobileNet, LSTM)
  filename = f"/content/drive/MyDrive/DangerousDrivingRecognition/test/{test_video}"
  duration, frame_count = with_opencv(filename)
  timing_rnn += ((inference_time_rnn) / frame_count) * 1000
  timing_gru += ((inference_time_gru) / frame_count) * 1000
  timing_lstm += ((inference_time_lstm) / frame_count) * 1000
timing_rnn /= 20
timing_gru /= 20
timing_lstm /= 20
print(f"\n\n\n\nRNN Processing of each frame: {timing_rnn}\n")
print(f"GRU Processing of each frame: {timing_gru}\n")
print(f"LSTM Processing of each frame: {timing_lstm}\n\n")


_, accuracy = RNN.evaluate([test_data[0], ], test_labels)
print(f"RNN Test Accuracy: {round(accuracy * 100, 2)}%\n")
_, accuracy = GRU.evaluate([test_data[0], ], test_labels)
print(f"GRU Test Accuracy: {round(accuracy * 100, 2)}%\n")
_, accuracy = LSTM.evaluate([test_data[0], ], test_labels)
print(f"LSTM Test Accuracy: {round(accuracy * 100, 2)}%\n")

Inference Time: 0.0781712532043457
  Dangerous: 99.80%
  Safe:  0.20%
Inference Time: 0.08265042304992676
  Safe: 58.35%
  Dangerous: 41.65%
Inference Time: 0.0708005428314209
  Dangerous: 58.83%
  Safe: 41.17%
Inference Time: 0.11215829849243164
  Dangerous: 99.80%
  Safe:  0.20%
Inference Time: 0.08005118370056152
  Safe: 58.35%
  Dangerous: 41.65%
Inference Time: 0.07982420921325684
  Dangerous: 58.83%
  Safe: 41.17%
Inference Time: 0.08401799201965332
  Dangerous: 99.80%
  Safe:  0.20%
Inference Time: 0.09575557708740234
  Safe: 58.35%
  Dangerous: 41.65%
Inference Time: 0.08109521865844727
  Dangerous: 58.83%
  Safe: 41.17%
Inference Time: 0.08296847343444824
  Dangerous: 99.80%
  Safe:  0.20%
Inference Time: 0.09009432792663574
  Safe: 58.35%
  Dangerous: 41.65%
Inference Time: 0.0891568660736084
  Dangerous: 58.83%
  Safe: 41.17%
Inference Time: 0.07462310791015625
  Dangerous: 99.80%
  Safe:  0.20%
Inference Time: 0.0945436954498291
  Safe: 58.35%
  Dangerous: 41.65%
Inference 