# Distracted Driver Detection Project

> In this notebook, I'll use the dataset which includes images of drivers while performing a number of tasks including drinking, texting etc. The aim is to correctly identify if the driver is distracted from driving. We might also like to check what activity the person is performing.

The notebook will be broken into the following steps:

0. Import the Libraries.
1. Import the Datasets.
2. Create a vanilla CNN model.
3. Create a vanilla CNN model with data augmentation.
4. Train a CNN with Transfer Learning (VGG16).
5. Kaggle Results.


## Import the Libraries

I'll use Keras and Tensorflow libraries to create a **Convolutional Neural Network**. So, I'll import the necessary libraries to do the same.

In [None]:
import os
from glob import glob
import random
import time
import tensorflow
import datetime
os.environ['KERAS_BACKEND'] = 'tensorflow'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # 3 = INFO, WARNING, and ERROR messages are not printed

from tqdm import tqdm

import numpy as np
import pandas as pd
from IPython.display import FileLink
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns 
%matplotlib inline
from IPython.display import display, Image
import matplotlib.image as mpimg
import cv2

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_files       
from keras.utils import np_utils
from sklearn.utils import shuffle
from sklearn.metrics import log_loss

from keras.models import Sequential, Model
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, GlobalAveragePooling2D
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.applications.vgg16 import VGG16


## Import the Datasets

I'll import the `.csv` file to read the labels.

In [None]:
# Read the CSV index of the training images and preview its first rows.
dataset = pd.read_csv(
    filepath_or_buffer='/kaggle/input/imageslist/driver_imgs_list.csv',
)
dataset.head(n=5)

From the csv file, I'll use the `classname` as the labels for the images and use the image names to match the labels with the correct images.

#### Import Driver Dataset

In [None]:
# Group the index rows by the 'subject' column and show the distinct group keys.
by_drivers = dataset.groupby(by='subject')
unique_drivers = by_drivers.groups.keys()
print(unique_drivers)

In the following cells, I define the helper functions that load the driver images and then populate a few variables.

In [None]:
# Helpers for loading the dataset previously downloaded from Kaggle.
# Number of class folders (c0 ... c9) that load_train iterates over.
NUMBER_CLASSES = 10
# color_type used by the helpers below: 1 = grayscale, 3 = colour

def get_cv2_image(path, img_rows, img_cols, color_type=3):
    """Read an image with OpenCV and resize it to img_rows x img_cols.

    Args:
        path: Path of the image file to read.
        img_rows: Target height in pixels.
        img_cols: Target width in pixels.
        color_type: 1 to load as grayscale, 3 to load as 3-channel colour.

    Returns:
        The resized image array as returned by ``cv2.resize``.

    Raises:
        ValueError: If ``color_type`` is neither 1 nor 3.
        OSError: If OpenCV could not read the file (``cv2.imread`` gave None).
    """
    if color_type == 1:
        read_flag = cv2.IMREAD_GRAYSCALE
    elif color_type == 3:
        read_flag = cv2.IMREAD_COLOR
    else:
        # Previously this fell through and failed later on an unbound `img`.
        raise ValueError(
            'color_type must be 1 (grayscale) or 3 (colour), got {!r}'.format(color_type))

    img = cv2.imread(path, read_flag)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError('Could not read image: {}'.format(path))

    # cv2.resize takes dsize as (width, height), i.e. (cols, rows); passing
    # (rows, cols) only works by accident when the target is square.
    return cv2.resize(img, (img_cols, img_rows))

# Training
def load_train(img_rows, img_cols, color_type=3):
    """Load every training image from the per-class folders c0 ... c{N-1}.

    Args:
        img_rows: Target image height passed to get_cv2_image.
        img_cols: Target image width passed to get_cv2_image.
        color_type: 1 for grayscale, 3 for colour (passed to get_cv2_image).

    Returns:
        A tuple ``(train_images, train_labels)``: a list of image arrays and a
        parallel list of integer class indices. Missing class folders are
        reported and skipped.
    """
    train_dir = '/kaggle/input/state-farm-distracted-driver-detection/imgs/train'
    start_time = time.time()
    train_images = []
    train_labels = []
    # Loop over the training folder, one sub-directory per class
    for classed in tqdm(range(NUMBER_CLASSES)):
        print(f'Loading directory c{classed}')
        class_path = os.path.join(train_dir, f'c{classed}')

        # Check if the directory exists
        if not os.path.exists(class_path):
            print(f"Directory c{classed} does not exist.")
            continue

        # glob() returns files in filesystem order; sort them so the later
        # train/validation split (fixed random_state) is reproducible.
        for file in sorted(glob(os.path.join(class_path, '*.jpg'))):
            train_images.append(get_cv2_image(file, img_rows, img_cols, color_type))
            train_labels.append(classed)
    print("Data Loaded in {} second".format(time.time() - start_time))
    return train_images, train_labels

def read_and_normalize_train_data(img_rows, img_cols, color_type):
    """Load the training images and split them into train / validation arrays.

    Despite the name, pixel values are not rescaled here: the images are
    returned as uint8 arrays.

    Args:
        img_rows: Image height.
        img_cols: Image width.
        color_type: Number of channels (1 or 3).

    Returns:
        ``(x_train, x_test, y_train, y_test)`` where the x arrays have shape
        (n, img_rows, img_cols, color_type) and the y arrays are one-hot
        encoded labels with NUMBER_CLASSES columns.
    """
    X, labels = load_train(img_rows, img_cols, color_type)
    # One-hot encode: one row per image, one column per class.
    y = np_utils.to_categorical(labels, NUMBER_CLASSES)

    # 80/20 shuffled split with a fixed seed.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, train_size=0.8, random_state=42, shuffle=True)

    # Add the explicit channel axis expected by the Conv2D input layer.
    x_train = np.array(x_train, dtype=np.uint8).reshape(-1, img_rows, img_cols, color_type)
    x_test = np.array(x_test, dtype=np.uint8).reshape(-1, img_rows, img_cols, color_type)

    return x_train, x_test, y_train, y_test



# Test set
def load_test(size=200000, img_rows=64, img_cols=64, color_type=3):
    """Load up to ``size`` test images, in sorted file-name order.

    Returns a tuple ``(images, image_ids)``: the list of image arrays produced
    by get_cv2_image and the parallel list of file base names.
    """
    pattern = '/kaggle/input/state-farm-distracted-driver-detection/imgs/test/*.jpg'
    images = []
    image_ids = []
    for count, file_path in enumerate(tqdm(sorted(glob(pattern)))):
        # Stop once the requested number of images has been collected.
        if count >= size:
            break
        images.append(get_cv2_image(file_path, img_rows, img_cols, color_type))
        image_ids.append(os.path.basename(file_path))
    return images, image_ids

def read_and_normalize_sampled_test_data(size, img_rows, img_cols, color_type=3):
    """Load up to ``size`` test images as one uint8 array plus their ids.

    The array is reshaped to (n, img_rows, img_cols, color_type); pixel values
    are left unscaled.
    """
    images, image_ids = load_test(size, img_rows, img_cols, color_type)
    batch = np.array(images, dtype=np.uint8).reshape(-1, img_rows, img_cols, color_type)
    return batch, image_ids

In [None]:
# Input geometry shared by the loaders and the vanilla CNN models.
img_rows, img_cols = 64, 64
# 1 = grayscale input (single channel)
color_type = 1

In [None]:

# Load the training images and split them into train / validation arrays.
x_train, x_test, y_train, y_test = read_and_normalize_train_data(
    img_rows, img_cols, color_type
)

print('Train shape:', x_train.shape)
print(len(x_train), 'train samples')



In [None]:
# Only a small sample of the test set is loaded here.
nb_test_samples = 200
# test_targets holds the file names (ids) of the loaded test images.
test_files, test_targets = read_and_normalize_sampled_test_data(
    nb_test_samples, img_rows, img_cols, color_type
)
print('Test shape:', test_files.shape)
print(len(test_files), 'Test samples')

#### Statistics

In [None]:
# Summary statistics; x_train and x_test are the arrays loaded above.

# Class folder names: take the last path component of every sub-directory of
# the training folder (slicing the path at fixed offsets returned the same
# two characters for every folder).
names = [os.path.basename(os.path.normpath(item))
         for item in sorted(glob("/kaggle/input/state-farm-distracted-driver-detection/imgs/train/*/"))]
test_files_size = len(glob(os.path.join('/kaggle/input/state-farm-distracted-driver-detection/imgs/test', '*.jpg')))
x_train_size = len(x_train)
categories_size = len(names)
x_test_size = len(x_test)
print('There are %s total images.\n' % (test_files_size + x_train_size + x_test_size))
print('There are %d training images.' % x_train_size)
print('There are %d total training categories.' % categories_size)
print('There are %d validation images.' % x_test_size)
print('There are %d test images.' % test_files_size)


#### Dataset Visualization

In [None]:
# Bar chart of the number of index rows per class label.
plt.figure(figsize=(10, 10))
sns.countplot(data=dataset, x='classname')
plt.title('Categories Distribution')
plt.ylabel('Count')
plt.show()

In [None]:
# Number of images per driver, as a two-column table.
drivers_id = dataset['subject'].value_counts().reset_index()
drivers_id.columns = ['driver_id', 'Counts']
drivers_id

In [None]:
# Plotting class distribution
# Extract the digit from 'cN' as a number. The builtin float replaces the
# np.float alias (removed from NumPy), and the pattern is a raw string so
# that '\d' is not treated as a string escape.
dataset['class_type'] = dataset['classname'].str.extract(r'(\d)', expand=False).astype(float)
# DataFrame.hist opens its own figure, so the size is passed to it directly
# instead of creating a separate (empty) figure first.
dataset.hist('class_type', alpha=0.5, layout=(1, 1), bins=10, figsize=(20, 20))
plt.title('Class distribution')
plt.show()

#### Images overview

Let's take a look at the various images in the dataset. I'll plot an image for each of the 10 classes. As the directory names are not descriptive, I'll use a map to define the title for each image that is more descriptive.

In [None]:
# Human-readable description for each class folder name.
activity_map = dict(
    c0='Safe driving',
    c1='Texting - right',
    c2='Talking on the phone - right',
    c3='Texting - left',
    c4='Talking on the phone - left',
    c5='Operating the radio',
    c6='Drinking',
    c7='Reaching behind',
    c8='Hair and makeup',
    c9='Talking to passenger',
)

In [None]:
plt.figure(figsize=(12, 20))

BASE_URL = '/kaggle/input/state-farm-distracted-driver-detection/imgs/train/'

# NOTE: this rebinds activity_map with title-cased labels; the prediction
# helpers further down print the labels from this version.
activity_map = {'c0': 'Safe Driving', 'c1': 'Texting - Right', 'c2': 'Talking on the Phone - Right', 'c3': 'Texting - Left', 'c4': 'Talking on the Phone - Left', 'c5': 'Operating the Radio', 'c6': 'Drinking', 'c7': 'Reaching Behind', 'c8': 'Hair and Makeup', 'c9': 'Talking to Passenger'}

# os.listdir returns entries in arbitrary order; sort so the grid follows the
# class order, and keep only non-hidden directories.
class_dirs = sorted(
    entry for entry in os.listdir(BASE_URL)
    if not entry.startswith('.') and os.path.isdir(os.path.join(BASE_URL, entry))
)

# The grid is 5 x 2, so at most ten folders can be shown.
for image_count, directory in enumerate(class_dirs[:10], start=1):
    files_path = os.path.join(BASE_URL, directory)
    files = sorted(os.listdir(files_path))
    if not files:
        continue
    plt.subplot(5, 2, image_count)
    # Named `sample` so it does not shadow the imported keras `image` module.
    sample = mpimg.imread(os.path.join(files_path, files[0]))
    plt.imshow(sample)
    plt.title(activity_map.get(directory, directory))

plt.show()

In [None]:
def create_submission(predictions, test_id, info):
    """Write class probabilities to a timestamped CSV under kaggle_submissions/.

    Args:
        predictions: 2-D array-like with one row per image and ten columns
            (the probabilities for c0 ... c9).
        test_id: Image identifiers, one per row of ``predictions``.
        info: Free-form tag embedded in the output file name.

    Returns:
        The path of the CSV file that was written.
    """
    class_columns = ['c{}'.format(i) for i in range(10)]
    result = pd.DataFrame(predictions, columns=class_columns)
    result.loc[:, 'img'] = pd.Series(test_id, index=result.index)

    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    out_dir = 'kaggle_submissions'
    os.makedirs(out_dir, exist_ok=True)

    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
    sub_file = os.path.join(out_dir, 'submission_{}_{}.csv'.format(info, timestamp))

    result.to_csv(sub_file, index=False)

    return sub_file

## Create a vanilla CNN model

#### Building the model

I'll develop the model with a total of 4 Convolutional layers, then a Flatten layer and then 2 Dense layers. I'll use the optimizer as `rmsprop`, and loss as `categorical_crossentropy`.

In [None]:
# Mini-batch size and maximum number of epochs for the vanilla models.
batch_size, nb_epoch = 40, 10

In [None]:
!rm -f saved_models/weights_best_vanilla.hdf5  #Removing the file from s_m

In [None]:
# Directory that receives the checkpointed weights.
models_dir = "saved_models"
# exist_ok avoids the check-then-create race of exists() + makedirs().
os.makedirs(models_dir, exist_ok=True)

# Keep only the weights of the epoch with the lowest validation loss.
checkpointer = ModelCheckpoint(filepath=os.path.join(models_dir, 'weights_best_vanilla.hdf5'),
                               monitor='val_loss', mode='min',
                               verbose=1, save_best_only=True)
# Stop training once validation loss has not improved for 2 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
callbacks = [checkpointer, es]

#### Model v1
---

In [None]:
def create_model_v1():
    """Build the baseline CNN (uncompiled).

    Four Conv2D + MaxPooling2D stages with 64, 128, 256 and 512 filters,
    then dropout, a 500-unit dense layer and a 10-way softmax. The input
    shape comes from the module-level img_rows, img_cols and color_type.
    """
    stack = []
    for stage, n_filters in enumerate((64, 128, 256, 512)):
        conv_args = dict(filters=n_filters, kernel_size=3, padding='same', activation='relu')
        if stage == 0:
            # Only the first layer declares the input shape.
            conv_args['input_shape'] = (img_rows, img_cols, color_type)
        stack.append(Conv2D(**conv_args))
        stack.append(MaxPooling2D(pool_size=2))

    stack.append(Dropout(0.5))
    stack.append(Flatten())
    stack.append(Dense(500, activation='relu'))
    stack.append(Dropout(0.5))
    stack.append(Dense(10, activation='softmax'))

    model = Sequential()
    for layer in stack:
        model.add(layer)
    return model

In [None]:
# Instantiate the baseline CNN and print its layer-by-layer summary.
model_v1 = create_model_v1()
model_v1.summary()

# Compile with RMSprop (Root Mean Square Propagation) and categorical
# cross-entropy, tracking accuracy.
model_v1.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Training the Vanilla Model version 1
history_v1 = model_v1.fit(x_train, y_train, 
          validation_data=(x_test, y_test),
          callbacks=callbacks,
          epochs=nb_epoch, batch_size=batch_size, verbose=1)

Load the classifier with the best validation loss

In [None]:
# Restore the checkpointed weights into model v1.
model_v1.load_weights(filepath='saved_models/weights_best_vanilla.hdf5')

In [None]:
def plot_train_history(history):
    """Plot training and validation accuracy, then training and validation loss.

    Args:
        history: Object with a ``history`` dict mapping metric names to
            per-epoch value lists (as returned by ``model.fit``).
    """
    metrics = history.history
    # Depending on the Keras version the accuracy metric is logged as 'acc'
    # or as 'accuracy'; use whichever key is present.
    acc_key = 'acc' if 'acc' in metrics else 'accuracy'

    for key, title, ylabel in ((acc_key, 'Model accuracy', 'accuracy'),
                               ('loss', 'Model loss', 'loss')):
        plt.plot(metrics[key])
        plt.plot(metrics['val_' + key])
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

In [None]:
# Learning curves of model v1.
plot_train_history(history=history_v1)

In [None]:
def plot_test_class(model, test_files, image_number, color_type=1):
    """Show one test image and print the model's prediction for it.

    Args:
        model: Model exposing ``predict``.
        test_files: Indexable collection of image arrays.
        image_number: Index of the image to display and classify.
        color_type: Number of channels used when reshaping the model input.

    The image is resized to the module-level img_rows x img_cols and is fed
    to the model without any rescaling of the pixel values.
    """
    img_brute = test_files[image_number]
    # cv2.resize takes dsize as (width, height) = (cols, rows), which is what
    # the reshape below assumes.
    img_brute = cv2.resize(img_brute, (img_cols, img_rows))
    plt.imshow(img_brute, cmap='gray')

    # Add the batch and channel axes expected by the model.
    new_img = img_brute.reshape(-1, img_rows, img_cols, color_type)

    y_prediction = model.predict(new_img, batch_size=batch_size, verbose=1)
    print('Y prediction: {}'.format(y_prediction))
    print('Predicted: {}'.format(activity_map.get('c{}'.format(np.argmax(y_prediction)))))

    plt.show()

In [None]:
# Evaluate model v1 on the validation split.
score = model_v1.evaluate(x=x_test, y=y_test, verbose=1)
print('Score: ', score)

In [None]:
# Visual spot-check of model v1 on one test image.
plot_test_class(model=model_v1, test_files=test_files, image_number=20)

In [None]:
!rm -f saved_models/weights_best_vanilla.hdf5

#### Optimised Vanilla CNN Model 

#### Model v2
---

In [None]:
def create_model_v2():
    """Build the deeper CNN with batch normalisation and dropout (uncompiled).

    Three stages of two 3x3 convolutions (32, 64, 128 filters), each conv
    followed by batch normalisation and each stage by max pooling and
    dropout; then dense layers of 512 and 128 units and a 10-way softmax.
    The input shape comes from the module-level img_rows, img_cols and
    color_type.
    """
    def conv_stage(n_filters, drop_rate, first=False):
        # The very first convolution declares the input shape and keeps the
        # default padding; every other convolution uses 'same' padding.
        if first:
            opening = Conv2D(n_filters, (3, 3), activation='relu',
                             input_shape=(img_rows, img_cols, color_type))
        else:
            opening = Conv2D(n_filters, (3, 3), activation='relu', padding='same')
        return [
            opening,
            BatchNormalization(),
            Conv2D(n_filters, (3, 3), activation='relu', padding='same'),
            BatchNormalization(axis=3),
            MaxPooling2D(pool_size=(2, 2), padding='same'),
            Dropout(drop_rate),
        ]

    stack = []
    stack.extend(conv_stage(32, 0.3, first=True))
    stack.extend(conv_stage(64, 0.3))
    stack.extend(conv_stage(128, 0.5))

    # Classifier head
    stack.append(Flatten())
    stack.append(Dense(512, activation='relu'))
    stack.append(BatchNormalization())
    stack.append(Dropout(0.5))
    stack.append(Dense(128, activation='relu'))
    stack.append(Dropout(0.25))
    stack.append(Dense(10, activation='softmax'))

    model = Sequential()
    for layer in stack:
        model.add(layer)
    return model

In [None]:
# Instantiate model v2 and print its layer-by-layer summary.
model_v2 = create_model_v2()
model_v2.summary()

# Same optimizer, loss and metric as model v1.
model_v2.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Training model v2.
# A new ModelCheckpoint is created for this run: the callback keeps its best
# monitored value between fit() calls, so reusing the one from model v1 would
# only save model v2 if it beat model v1's best validation loss.
checkpointer_v2 = ModelCheckpoint(filepath='saved_models/weights_best_vanilla.hdf5',
                                  monitor='val_loss', mode='min',
                                  verbose=1, save_best_only=True)
history_v2 = model_v2.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          callbacks=[checkpointer_v2, es],
          epochs=nb_epoch, batch_size=batch_size, verbose=1)

In [None]:
# Learning curves of model v2.
plot_train_history(history=history_v2)

In [None]:
# Restore the checkpointed weights into model v2.
model_v2.load_weights(filepath='saved_models/weights_best_vanilla.hdf5')

In [None]:
# Evaluate model v2 on the validation split.
score = model_v2.evaluate(x=x_test, y=y_test, verbose=1)
print('Score: ', score)

# Multi-class log loss computed from the predicted probabilities.
y_pred = model_v2.predict(x_test, batch_size=batch_size, verbose=1)
score = log_loss(y_true=y_test, y_pred=y_pred)
print('Score log loss:', score)

In [None]:
# The model really performs badly
plot_test_class(model=model_v2, test_files=test_files, image_number=101)

In [None]:
# The model really performs badly
plot_test_class(model=model_v2, test_files=test_files, image_number=1)

In [None]:
# Visual spot-check of model v2 on one more test image.
plot_test_class(model=model_v2, test_files=test_files, image_number=143)

## Create a vanilla CNN model with data augmentation

Here I continue training the previous model (`model_v2`) on augmented data. The `train` folder includes the images I need. I'll generate additional images using **ImageDataGenerator**, with the training data split into 80% train and 20% validation.

In [None]:
!rm -f saved_models/weights_best_vanilla.hdf5

In [None]:
# Prepare data augmentation configuration.
# horizontal_flip is disabled on purpose: several classes differ only in
# which hand is used (e.g. 'Texting - right' vs 'Texting - left'), so a
# mirrored image would keep its label while looking like the opposite class.
train_datagen = ImageDataGenerator(rescale = 1.0/255,
                                   shear_range = 0.2,
                                   zoom_range = 0.2,
                                   horizontal_flip = False,
                                   validation_split = 0.2)

# Validation images are only rescaled, never augmented.
test_datagen = ImageDataGenerator(rescale=1.0/ 255, validation_split = 0.2)

In [None]:
# Sample counts of the in-memory splits, used for the steps-per-epoch maths.
nb_train_samples = len(x_train)
nb_validation_samples = len(x_test)
print(nb_train_samples)
print(nb_validation_samples)

# Batch generators over the in-memory arrays (batch_size samples at a time).
training_generator = train_datagen.flow(x_train, y_train, batch_size=batch_size)
validation_generator = test_datagen.flow(x_test, y_test, batch_size=batch_size)

#### Train the model with Data Augmentation

Using `fit_generator`, I'll train the model.

In [None]:
# Checkpoint on validation loss, which is always logged under the key
# 'val_loss'. The accuracy key is 'val_acc' or 'val_accuracy' depending on
# the Keras version; monitoring a missing key saves nothing, which would make
# the following load_weights fail. This also matches what `es` monitors.
checkpoint = ModelCheckpoint('saved_models/weights_best_vanilla.hdf5', monitor='val_loss', verbose=1, save_best_only=True, mode='min')
history_v3 = model_v2.fit_generator(training_generator,
                         steps_per_epoch = nb_train_samples // batch_size,
                         epochs = 5,
                         callbacks=[es, checkpoint],
                         verbose = 1,
                         validation_data = validation_generator,
                         validation_steps = nb_validation_samples // batch_size)

In [None]:
# Restore the weights checkpointed during the augmented training run.
model_v2.load_weights(filepath='saved_models/weights_best_vanilla.hdf5')

In [None]:
# Learning curves of the augmented training run.
plot_train_history(history=history_v3)

In [None]:
# Evaluate on the validation generator; score holds [loss, accuracy].
score = model_v2.evaluate_generator(
    validation_generator,
    steps=nb_validation_samples // batch_size,
)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

In [None]:
# model_v2 has just been trained on generator batches rescaled by 1/255, so
# the raw uint8 test images are rescaled the same way before prediction.
plot_test_class(model_v2, test_files.astype(np.float32) / 255.0, 101)

In [None]:
# model_v2 has just been trained on generator batches rescaled by 1/255, so
# the raw uint8 test images are rescaled the same way before prediction.
plot_test_class(model_v2, test_files.astype(np.float32) / 255.0, 1)

In [None]:
# model_v2 has just been trained on generator batches rescaled by 1/255, so
# the raw uint8 test images are rescaled the same way before prediction.
plot_test_class(model_v2, test_files.astype(np.float32) / 255.0, 145)

In [None]:
# model_v2 has just been trained on generator batches rescaled by 1/255, so
# the raw uint8 test images are rescaled the same way before prediction.
plot_test_class(model_v2, test_files.astype(np.float32) / 255.0, 143)

In [None]:
# The generators fed model_v2 pixels rescaled by 1/255; apply the same
# rescaling to the raw uint8 test images so inference matches training.
# NOTE: test_files only holds the nb_test_samples images loaded earlier, so
# this file covers that sample rather than the whole test folder.
predictions = model_v2.predict(test_files.astype(np.float32) / 255.0, batch_size=batch_size)
FileLink(create_submission(predictions, test_targets, score[0]))

The trained model achieved a validation accuracy of over 93%.

In [None]:
!rm -f saved_models/weights_best_vanilla.hdf5

## Train a CNN with Transfer Learning (VGG, MobileNet)

To reduce training time without sacrificing accuracy, I'll train a CNN using **transfer learning**.

In [None]:
def vgg_std16_model(img_rows, img_cols, color_type=3):
    """Build a 10-class classifier on top of a frozen, ImageNet-trained VGG16.

    Args:
        img_rows: Currently unused; the VGG16 base is created without a fixed
            input shape.
        img_cols: Currently unused (see ``img_rows``).
        color_type: Currently unused.

    Returns:
        An uncompiled Model: the VGG16 convolutional base (not trainable),
        global average pooling, a 1024-unit dense layer and a softmax output.
    """
    nb_classes = 10
    # Drop the original fully connected top; a new head is attached below.
    vgg16_model = VGG16(weights="imagenet", include_top=False)

    # Freeze all layers of the pre-trained model
    for layer in vgg16_model.layers:
        layer.trainable = False

    x = vgg16_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(1024, activation='relu')(x)
    predictions = Dense(nb_classes, activation = 'softmax')(x)

    # Keras 2 names these arguments `inputs` / `outputs`; the singular forms
    # are Keras 1 spellings.
    model = Model(inputs = vgg16_model.input, outputs = predictions)

    return model

In [None]:
# Build the transfer-learning model and print its layer summary.
print("Loading network...")
model_vgg16 = vgg_std16_model(img_rows, img_cols)
model_vgg16.summary()

# Same optimizer, loss and metric as the vanilla models.
model_vgg16.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Stream the images straight from the class folders; the `subset` argument
# selects the side of each generator's validation_split.
training_generator = train_datagen.flow_from_directory('/kaggle/input/state-farm-distracted-driver-detection/imgs/train',
                                                 target_size = (img_rows, img_cols),
                                                 batch_size = batch_size,
                                                 shuffle=True,
                                                 class_mode='categorical', subset="training")

validation_generator = test_datagen.flow_from_directory('/kaggle/input/state-farm-distracted-driver-detection/imgs/train',
                                                   target_size = (img_rows, img_cols),
                                                   batch_size = batch_size,
                                                   shuffle=False,
                                                   class_mode='categorical', subset="validation")
# Take the split sizes from the generators instead of hard-coding them, so
# the step counts stay correct if the folder contents or the split change.
nb_train_samples = training_generator.samples
nb_validation_samples = validation_generator.samples

In [None]:
!rm -f saved_models/weights_best_vgg16.hdf5

In [None]:
# Training the VGG16 transfer-learning model.
# The checkpoint monitors 'val_loss', which is always logged and matches what
# `es` monitors; the accuracy key name varies between Keras versions.
checkpoint = ModelCheckpoint('saved_models/weights_best_vgg16.hdf5', monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# class_weight='auto' was dropped: Keras expects a dict mapping class indices
# to weights, so the string was never a usable setting.
history_v4 = model_vgg16.fit_generator(training_generator,
                         steps_per_epoch = nb_train_samples // batch_size,
                         epochs = 5,
                         callbacks=[es, checkpoint],
                         verbose = 1,
                         validation_data = validation_generator,
                         validation_steps = nb_validation_samples // batch_size)

In [None]:
# Restore the checkpointed VGG16 weights.
model_vgg16.load_weights(filepath='saved_models/weights_best_vgg16.hdf5')

In [None]:
# Learning curves of the VGG16 run.
plot_train_history(history=history_v4)

In [None]:
def plot_vgg16_test_class(model, test_files, image_number):
    """Show one test image and print the VGG16-based model's prediction.

    Args:
        model: Model exposing ``predict`` that takes 3-channel input.
        test_files: Indexable collection of image arrays, either single
            channel (grayscale) or 3-channel in OpenCV's BGR order.
        image_number: Index of the image to display and classify.

    The image is converted to RGB, resized to the module-level
    img_rows x img_cols and scaled to [0, 1] before prediction.
    """
    img_brute = test_files[image_number]

    # The test images may have been loaded as grayscale (one channel);
    # BGR2RGB only accepts colour input, so grayscale is expanded instead.
    if img_brute.ndim == 2 or img_brute.shape[-1] == 1:
        rgb = cv2.cvtColor(img_brute, cv2.COLOR_GRAY2RGB)
    else:
        rgb = cv2.cvtColor(img_brute, cv2.COLOR_BGR2RGB)

    # cv2.resize takes dsize as (width, height) = (cols, rows).
    img_display = cv2.resize(rgb, (img_cols, img_rows))
    plt.imshow(img_display, cmap='gray')

    # Scale to [0, 1] and add the batch axis.
    im = np.expand_dims(img_display.astype(np.float32) / 255.0, axis=0)

    y_preds = model.predict(im, batch_size=batch_size, verbose=1)
    print(y_preds)
    y_prediction = np.argmax(y_preds)
    print('Y Prediction: {}'.format(y_prediction))
    print('Predicted as: {}'.format(activity_map.get('c{}'.format(y_prediction))))

    plt.show()

In [None]:
# Texting left
plot_vgg16_test_class(model=model_vgg16, test_files=test_files, image_number=133)

In [None]:
# Texting left
plot_vgg16_test_class(model=model_vgg16, test_files=test_files, image_number=29)

In [None]:
# Hair
plot_vgg16_test_class(model=model_vgg16, test_files=test_files, image_number=82)

In [None]:
# Evaluate the VGG16 model on the validation generator; score is [loss, accuracy].
score = model_vgg16.evaluate_generator(
    validation_generator,
    steps=nb_validation_samples // batch_size,
    verbose=1,
)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])