In [2]:
import os
import re
from pathlib import Path
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import random
#Create and load dataset
import h5py

#SVM
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

#Learning Curve
from sklearn.model_selection import learning_curve

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# metrics
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision

In [3]:
# Dataset locations, all rooted at <current working directory>/data
data_dir = Path(os.getcwd()) / 'data'
base_dir = data_dir / "state-farm-distracted-driver-detection"
img_folder = base_dir / 'imgs'
train_imgs = img_folder / 'train'
test_imgs = img_folder / 'test'

# Load the driver image list and the sample submission table
df = pd.read_csv(base_dir / 'driver_imgs_list.csv')
sample_sub = pd.read_csv(base_dir / 'sample_submission.csv')

In [4]:
# HDF5 dataset filenames.
#
# The names encode GRAY vs. COLOR and the downscale factor
# (this distinguishes them from the leakage datasets that didn't have driver info).
# Older files were named "StateFarm_Train_2X.h5" (scaled down by 2) and
# "StateFarm_Train_5X.h5" (scaled down by 5).
downscale = 2
gray_scale = True

hdf5_train = (
    f"StateFarm_Train_Gray_{downscale}X.h5"
    if gray_scale
    else f"StateFarm_Train_Color_{downscale}X.h5"
)
hdf5_test = (
    f"StateFarm_Test_Gray_{downscale}X.h5"
    if gray_scale
    else f"StateFarm_Test_Color_{downscale}X.h5"
)

Functions to split on driver

In [None]:
def str2int(istr):
    """Return the first run of decimal digits in ``istr`` as an int (e.g. 'p012' -> 12)."""
    return int(re.findall(r'\d+', istr)[0])


def split_on_driver(df, train_split=.8, seed=0):
    """Split the drivers of an image-list dataframe into train and test groups.

    Drivers are shuffled with a seeded generator and assigned to the training
    group for as long as the running total of their images stays below
    ``train_split`` times the total number of images; the remaining drivers
    form the test group. Whole drivers are assigned, so the realised
    proportion is only approximately ``train_split``.

    Parameters
    ----------
    df : pandas.DataFrame
        Image list. If it has no ``driver`` column, integer ``driver`` and
        ``class`` columns are added IN PLACE, parsed from the ``subject`` and
        ``classname`` columns.
    train_split : float
        Target fraction of images that should land in the training group.
    seed : int
        Seed for the driver shuffle.

    Returns
    -------
    tuple
        ``(df, train_drivers, test_drivers, nPerDriver)`` where ``df`` is the
        (possibly augmented) input frame, ``train_drivers`` / ``test_drivers``
        are arrays of driver ids, and ``nPerDriver`` holds the image count of
        every driver in shuffled order.
    """
    # if the dataframe doesn't already have driver ids, add them (and the class ids)
    if 'driver' not in df.columns:
        df['driver'] = df.subject.apply(str2int)   # numeric subject id
        df['class'] = df.classname.apply(str2int)  # numeric class id

    # random number generator
    rng = np.random.default_rng(seed=seed)

    # unique drivers and the number of rows each one has; taking both from a
    # single call keeps the two arrays aligned
    drivers, nPerDriver = np.unique(df['driver'].values, return_counts=True)
    ndrivers = len(drivers)

    # shuffle the drivers and their counts with the same permutation
    shuff_idx = rng.permutation(ndrivers)
    drivers = drivers[shuff_idx]
    nPerDriver = nPerDriver[shuff_idx]

    # separate drivers according to train_split, relative to the size of THIS
    # dataframe rather than a hard-coded dataset size
    n_total = nPerDriver.sum()
    train_log = np.cumsum(nPerDriver) < (n_total * train_split)
    test_log = np.logical_not(train_log)
    train_drivers = drivers[train_log]
    test_drivers = drivers[test_log]

    return (df, train_drivers, test_drivers, nPerDriver)

In [6]:
# Load the data set.
# Reload the driver image list
df = pd.read_csv(os.path.join(base_dir, 'driver_imgs_list.csv'))

# Open the HDF5 file read-only: nothing is written back to it here. The
# context manager closes the file even if one of the reads raises. Each
# dataset is copied into an in-memory uint8 NumPy array before the file closes.
with h5py.File(data_dir / hdf5_train, "r") as file:
    images = np.array(file["/images"]).astype("uint8")
    classes = np.array(file["/c"]).astype("uint8")
    drivers = np.array(file["/driver"]).astype("uint8")

In [7]:
# Close the HDF5 file handle; images, classes and drivers were already copied into NumPy arrays
file.close()

In [8]:
# Build the data matrix X: one row per image, one column per pixel value
image_size = images.shape
# Flatten every image to a vector; -1 also covers colour images that carry a
# trailing channel axis, not only (n, height, width) gray-scale stacks
X = images.reshape(images.shape[0], -1)
# Zero-mean each image (row). The result is floating point, not uint8.
X = X - X.mean(axis=1, keepdims=True)

# Split the training and validation data by driver
df, train_drivers, test_drivers, nPerDriver = split_on_driver(df, train_split=.8, seed=0)
# Vectorised membership test (replaces a per-row Python `in` scan over the driver arrays)
train_log = np.isin(drivers, train_drivers)
test_log = np.isin(drivers, test_drivers)
assert not np.any(train_log & test_log), "a driver ended up in both splits"

Xtrain = X[train_log, ...]
Ytrain = classes[train_log]

Xval = X[test_log, ...]
Yval = classes[test_log]

In [8]:
# Train with the same hyper-parameters that were used before on the whole dataset
mypca = PCA(
    n_components=160,
    whiten=True,
    random_state=42,
)
mysvm = SVC(
    kernel='rbf',
    C=5,
    gamma=0.01,
    class_weight='balanced',
)
model = make_pipeline(mypca, mysvm)

# Fit the PCA -> SVM pipeline on the training split
history = model.fit(Xtrain, Ytrain)

In [9]:
# Predict class labels for the validation set Xval with the fitted pipeline
ypred_val = model.predict(Xval)

In [10]:
# Per-class precision / recall / F1 of the validation predictions against Yval
print(classification_report(Yval, ypred_val))

              precision    recall  f1-score   support

           0       0.22      0.54      0.32       471
           1       0.81      0.25      0.38       452
           2       0.99      0.18      0.30       485
           3       0.60      0.47      0.52       475
           4       0.97      0.08      0.14       479
           5       0.90      0.94      0.92       489
           6       0.95      0.42      0.58       474
           7       0.97      0.57      0.72       401
           8       0.19      0.83      0.30       393
           9       0.73      0.13      0.22       432

    accuracy                           0.43      4551
   macro avg       0.73      0.44      0.44      4551
weighted avg       0.74      0.43      0.44      4551



For 5x downscaled images (gray scale) <br>
`mypca = PCA(n_components=160, whiten=True, random_state= 42)` <br>
`mysvm = SVC(C=5, gamma=0.01,kernel='rbf', class_weight='balanced')`
 
 
 precision    recall  f1-score   support

           0       0.24      0.47      0.32       471
           1       0.75      0.22      0.34       452
           2       0.99      0.16      0.27       485
           3       0.67      0.41      0.51       475
           4       1.00      0.07      0.13       479
           5       0.88      0.89      0.89       489
           6       0.97      0.32      0.49       474
           7       0.72      0.55      0.62       401
           8       0.16      0.85      0.28       393
           9       0.78      0.19      0.30       432

    accuracy                           0.41      4551
`  macro avg       0.72      0.41      0.41      4551` <br>
`weighted avg       0.73      0.41      0.42      4551` 

In [None]:
# The performance looks a lot worse now. How can we improve it?
# Phones are small, so it makes sense to use larger images. Running again with the 2x downscaled images.

For 2x downscaled images (same parameters as above)
   precision    recall  f1-score   support

           0       0.22      0.54      0.32       471
           1       0.81      0.25      0.38       452
           2       0.99      0.18      0.30       485
           3       0.60      0.47      0.52       475
           4       0.97      0.08      0.14       479
           5       0.90      0.94      0.92       489
           6       0.95      0.42      0.58       474
           7       0.97      0.57      0.72       401
           8       0.19      0.83      0.30       393
           9       0.73      0.13      0.22       432

    accuracy                           0.43      4551 
`   macro avg       0.73      0.44      0.44      4551`<br>
`weighted avg       0.74      0.43      0.44      4551`


In [14]:
# We get a slight improvement from using a larger image, but it makes computing the principal components (PCA) more expensive

(17873, 76800)

In [None]:
# let's try S3VM (semi-supervised SVM), first with default parameters

In [20]:
from semisupervised import S3VM

https://libraries.io/pypi/semisupervised

In [9]:
# Hide the labels of a random ~10% of the training rows to simulate unlabeled data
rng = np.random.RandomState(42)
random_unlabeled_points = rng.rand(len(Xtrain)) < 0.1

# The -1 "unlabeled" marker needs a signed dtype: in an unsigned 8-bit array
# it would wrap to 255 (or raise, depending on the NumPy version) and the
# `!= -1` / `== -1` tests would never separate anything.
Ytrain = Ytrain.astype(np.int64)
Ytrain[random_unlabeled_points] = -1

# Rows that kept their label
index, = np.where(Ytrain != -1)
label_X_train = Xtrain[index, :]
label_y_train = Ytrain[index]

In [10]:
# Rows whose label equals the -1 marker form the unlabeled pool
index = np.flatnonzero(Ytrain == -1)
unlabel_X_train = Xtrain[index, :]
# One -1 placeholder label per unlabeled row
unlabel_y = np.full(unlabel_X_train.shape[0], -1, dtype=int)

In [25]:


# Fit S3VM on the labeled rows followed by the unlabeled rows (labels of -1).
# NOTE: the recorded run of this cell ended in a MemoryError
# (10.2 GiB for a (17873, 76800) float64 array).
model = S3VM()
model.fit(np.vstack((label_X_train, unlabel_X_train)), np.append(label_y_train, unlabel_y))
# predict
predict = model.predict(Xval)
# metric: `accuracy` is sklearn.metrics.accuracy_score as aliased in the
# import cell; the bare name `metrics` is not imported there
acc = accuracy(Yval, predict)
print("accuracy", acc)
print(classification_report(Yval, predict))

MemoryError: Unable to allocate 10.2 GiB for an array with shape (17873, 76800) and data type float64

In [12]:
# let's try fitting with sklearn's version
from sklearn.semi_supervised import SelfTrainingClassifier

# Project onto the same 160 whitened principal components as the supervised
# PCA + SVC model before self-training, so the SVC sees 160 features instead
# of the raw pixel vectors. The recorded run on raw pixels ended in a
# MemoryError (10.2 GiB for a (17873, 76800) float64 array).
# NOTE(review): not re-run; confirm the PCA step itself fits in memory here.
svc = SVC(probability=True, gamma="auto")
self_training_model = make_pipeline(
    PCA(n_components=160, whiten=True, random_state=42),
    SelfTrainingClassifier(svc),
)

# Labeled rows first, then the unlabeled rows
xtrain = np.vstack((label_X_train, unlabel_X_train))


NameError: name 'y_train' is not defined

In [13]:
# Labels in the same order as xtrain: true labels first, then the -1 markers
ytrain = np.append(label_y_train, unlabel_y)

self_training_model.fit(xtrain, ytrain)

predict = self_training_model.predict(Xval)
# metric: `accuracy` is sklearn.metrics.accuracy_score as aliased in the
# import cell; the bare name `metrics` is not imported there
acc = accuracy(Yval, predict)
print("accuracy", acc)
print(classification_report(Yval, predict))



MemoryError: Unable to allocate 10.2 GiB for an array with shape (17873, 76800) and data type float64