### State Farm Distracted Driving: Classifying images based on driver safety

See also: https://www.kaggle.com/praveenmaripeti/state-farm-distracted-driver-detection-with-keras for a neural-network implementation with TensorFlow.

Distracted driving causes a lot of accidents and is 100% preventable. Machine learning algorithms can gauge driver safety using 2D dashboard camera images of drivers. The goal is to classify these images based on the driver's behavior (cell phone, texting, etc.).

The 10 behaviors are classified as follows:

In [1]:
# Class codes 'c0' .. 'c9', in label order
classes = [f'c{idx}' for idx in range(10)]

# Human-readable description of each class code
class_def = dict(zip(classes, [
    'safe driving',
    'texting - right',
    'talking on the phone - right',
    'texting - left',
    'talking on the phone - left',
    'operating the radio',
    'drinking',
    'reaching behind',
    'hair and makeup',
    'talking to passenger',
]))

In [2]:
# Import relevant libraries
import os
from pathlib import Path
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import random
#Create and load dataset
import h5py
#Train Test Split
from sklearn.model_selection import train_test_split
#SVM
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
#Learning Curve



In [3]:
# Run settings for this notebook

# Name of the HDF5 file that holds the training set
hdf5_train = "StateFarm_Train.h5"

# Rebuild the image dataset and write it to the HDF5 file?
write_dataset = False

# Run the grid search over the SVM hyperparameters?
run_gridsearch = False

### Make and/or load the HDF5 dataset (training data)
See: https://realpython.com/storing-images-in-python/
and the accompanying [GitHub page](https://github.com/realpython/materials/blob/storing-images/storing-images/storing_images.ipynb).


In [4]:
# Data locations, all resolved relative to the current working directory
data_dir = Path.cwd() / 'data'
base_dir = data_dir / "state-farm-distracted-driver-detection"
img_folder = base_dir / 'imgs'
train_imgs = img_folder / 'train'
test_imgs = img_folder / 'test'

# Tables shipped alongside the images: the training image list and the sample submission
driver_imgs_list = pd.read_csv(base_dir / 'driver_imgs_list.csv')
sample_sub = pd.read_csv(base_dir / 'sample_submission.csv')

In [5]:
# Preview the first five rows of the training image list
driver_imgs_list.head(n=5)

Unnamed: 0,subject,classname,img
0,p002,c0,img_44733.jpg
1,p002,c0,img_72999.jpg
2,p002,c0,img_25094.jpg
3,p002,c0,img_69092.jpg
4,p002,c0,img_92629.jpg


In [6]:
def rescale_image(filepath, resize_scale=2, gray_scale=True):
  '''
  Load an image, optionally convert it to grayscale and downsample it, and
  return the pixels as a NumPy array with a leading axis of length 1.

  Parameters
  ----------
  filepath : path of the image file, handed to PIL's Image.open.
  resize_scale : factor by which both image dimensions are divided (the
      result is truncated to int). Any value <= 1 keeps the original size.
  gray_scale : True converts the image to mode 'L' (single channel);
      set gray_scale=False to keep the image's original mode (e.g. color).

  Returns
  -------
  np.ndarray shaped (1, rows, cols) for a single-channel image, or
  (1, rows, cols, channels) when the image keeps several channels, so that
  a list of results can be joined with np.concatenate(..., axis=0).
  '''
  # The context manager releases the file handle even if conversion fails.
  with Image.open(filepath) as source:
    im = source.convert('L') if gray_scale else source
    if resize_scale > 1:
      resize_dims = tuple(int(x / resize_scale) for x in im.size)
      im = im.resize(resize_dims)
    # Materialise the pixels once, while the underlying file is still open.
    pixels = np.array(im)
  # Prepend the batch axis without assuming the array is 2-D, so multi-channel
  # images no longer fail on a hard-coded (1, rows, cols) reshape.
  return pixels[np.newaxis, ...]

if write_dataset: 
    # Build the training image array from the files listed in driver_imgs_list.
    # Images are collected in a list and concatenated once at the end, which is
    # much faster than growing a NumPy array inside the loop.
    classnames = driver_imgs_list['classname'].values
    # Numeric label = the digit after the 'c' in the class name ('c3' -> 3)
    labels = [int(x[1]) for x in classnames]
    filenames = driver_imgs_list['img'].values

    image_list = []
    for i, (classname, filename) in enumerate(zip(classnames, filenames)):
        if i % 1000 == 0:
            print(f"adding {filename} to list, i={i}")

        # The sub-folder must be a string; the integer label cannot be joined
        # into a path. NOTE(review): assumes images are stored as
        # train/<classname>/<img> (e.g. train/c0/img_1.jpg) - confirm.
        path_to_file = train_imgs / classname / filename
        image_list.append(rescale_image(path_to_file, resize_scale=2))

    # Join the per-image (1, rows, cols) arrays into a single nd array
    images = np.concatenate(image_list, axis=0)

In [7]:
if write_dataset: 
    # Persist the image array and the numeric labels so later runs can skip
    # the slow image-loading step.
    print('Writing HDF5 file')

    # Mode "w" creates a fresh file (replacing any previous one), so re-running
    # this cell does not fail because the datasets already exist. The with-block
    # closes the file even if writing a dataset raises.
    with h5py.File(data_dir / hdf5_train, "w") as file:
        # Images are stored as unsigned 8-bit integers
        image_set = file.create_dataset(
            "images",
            np.shape(images),
            h5py.h5t.STD_U8BE,
            data=images,
        )
        # One numeric label per image, in the same order as the images
        classnames_set = file.create_dataset(
            "labels",
            np.shape(labels),
            h5py.h5t.STD_U8BE,
            data=labels,
        )

Load the HDF5 training dataset:


In [8]:
# Read the stored training arrays back from the HDF5 file.
# The file is opened read-only ("r"): nothing is written here, so this also
# works on a read-only copy and cannot modify the stored data by accident.
# The with-block closes the file even if a dataset is missing.
with h5py.File(data_dir / hdf5_train, "r") as file:
    images = np.array(file["/images"]).astype("uint8")
    labels = np.array(file["/labels"]).astype("uint8")

In [9]:
# Flatten every image into one feature row: (n_images, rows * cols)
X = images.reshape(len(images), images.shape[1] * images.shape[2])
# Targets are the stored labels, unchanged
y = labels

Train, test split

In [10]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
# NOTE(review): this is a random per-image split, so images of the same subject
# may land in both sets - confirm whether a per-subject split is wanted.
xtrain, xcv, ytrain, ycv = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### SVM:

In [11]:
# Starting pipeline: PCA down to 150 whitened components, then an RBF-kernel SVM
# (starting parameters taken from week 2: 2_4 Support Vector Machines)
mypca = PCA(
    n_components=150,
    whiten=True,
    random_state=42,
)
mysvm = SVC(
    kernel='rbf',
    class_weight='balanced',
)
model = make_pipeline(mypca, mysvm)

In [27]:
if run_gridsearch:
    # Candidate values for the SVC step of the pipeline
    param_grid = {
        'svc__C': [1, 5, 10, 50],
        'svc__gamma': [0.0001, 0.0005, 0.001, 0.005, .01],
    }

    # Cross-validated search over every C / gamma combination
    grid = GridSearchCV(estimator=model, param_grid=param_grid)
    grid.fit(xtrain, ytrain)

    print(grid.best_params_)

    # Report the mean and standard deviation of the test score per combination
    cv_results = grid.cv_results_
    means = cv_results['mean_test_score']
    stds = cv_results['std_test_score']
    report_line = "%0.5f (+/-%0.05f) for %r"
    for mean, std, param in zip(means, stds, cv_results['params']):
        print(report_line % (mean, std, param))
    # Chosen setting: {'svc__C': 10, 'svc__gamma': 0.005}, which scored
    # 0.99433 (+/-0.00037) - preferred for its smaller standard deviation.

0.88086 (+/-0.00262) for {'svc__C': 1, 'svc__gamma': 0.0001}
0.94177 (+/-0.00371) for {'svc__C': 1, 'svc__gamma': 0.0005}
0.96292 (+/-0.00214) for {'svc__C': 1, 'svc__gamma': 0.001}
0.99051 (+/-0.00013) for {'svc__C': 1, 'svc__gamma': 0.005}
0.99363 (+/-0.00053) for {'svc__C': 1, 'svc__gamma': 0.01}
0.93578 (+/-0.00569) for {'svc__C': 5, 'svc__gamma': 0.0001}
0.96955 (+/-0.00149) for {'svc__C': 5, 'svc__gamma': 0.0005}
0.98006 (+/-0.00124) for {'svc__C': 5, 'svc__gamma': 0.001}
0.99427 (+/-0.00045) for {'svc__C': 5, 'svc__gamma': 0.005}
0.99452 (+/-0.00051) for {'svc__C': 5, 'svc__gamma': 0.01}
0.95094 (+/-0.00298) for {'svc__C': 10, 'svc__gamma': 0.0001}
0.97490 (+/-0.00175) for {'svc__C': 10, 'svc__gamma': 0.0005}
0.98496 (+/-0.00148) for {'svc__C': 10, 'svc__gamma': 0.001}
0.99433 (+/-0.00037) for {'svc__C': 10, 'svc__gamma': 0.005}
0.99452 (+/-0.00051) for {'svc__C': 10, 'svc__gamma': 0.01}
0.96693 (+/-0.00231) for {'svc__C': 50, 'svc__gamma': 0.0001}
0.98324 (+/-0.00117) for {'svc

In [28]:
# Rebuild the pipeline with the hyperparameters picked from the grid search
mypca = PCA(
    n_components=150,
    whiten=True,
    random_state=42,
)
mysvm = SVC(
    kernel='rbf',
    C=10,
    gamma=0.005,
    class_weight='balanced',
)
model = make_pipeline(mypca, mysvm)