In [80]:
# Discriminant Analysis
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

import sklearn.discriminant_analysis as skl_da
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (accuracy_score,confusion_matrix,f1_score)


In [81]:
#Import data
data = pd.read_csv("../../machine_learning/siren_data_train.csv", sep=",")
print(f"Data size: {data.memory_usage().sum() / 1e6:.2f} MB")
print("The first 10 rows in the data:")



Data size: 0.59 MB
The first 10 rows in the data:


In [82]:
#Calculate distance to horn
list_loc_horn = []
list_loc_person = []
list_distance_to_horn = []

x_cor_horn = data["near_x"]
y_cor_horn = data["near_y"]

x_cor_person = data["xcoor"]
y_cor_person = data["ycoor"]

for row in range(len(x_cor_horn)):
    loc_horn = [x_cor_horn[row], y_cor_horn[row]]
    list_loc_horn.append(loc_horn)

    loc_person = [x_cor_person[row], y_cor_person[row]]
    list_loc_person.append(loc_person)

for i in range(len(x_cor_horn)):
    
    coordinate_horn = list_loc_horn[i] 
    coordinate_person = list_loc_person[i]
    distance_to_horn = math.dist(coordinate_horn,coordinate_person)
    list_distance_to_horn.append(distance_to_horn)
data["distance to nearest horn"] = list_distance_to_horn


In [83]:
#Splitting test and training data
np.random.seed(1)
trainI = np.random.choice(data.shape[0], size=4000, replace=False)
trainIndex = data.index.isin(trainI)
train = data.iloc[trainIndex] #training set
test = data.iloc[~trainIndex]

X_test = train[['building','noise', 'in_vehicle', 'asleep','age','distance to nearest horn','near_angle']]
Y_test = train['heard']
X_train = train[['building','noise', 'in_vehicle', 'asleep','age','distance to nearest horn','near_angle']]
Y_train = train['heard']



In [86]:
#Tuning model
model = skl_da.LinearDiscriminantAnalysis()
params = {'solver':['lsqr','eigen'],'shrinkage':[None,'auto',0.5]}
grid_search = GridSearchCV(model, params, cv=5, scoring='f1')
grid_search.fit(X_train, Y_train)
best_params = grid_search.best_params_
print(best_params)
f1_scores = grid_search.cv_results_['mean_test_score']
print('f1:',f1_scores )
params = grid_search.cv_results_['params']
print(params)
best_lda_model = grid_search.best_estimator_
accuracy = best_lda_model.score(X_test, Y_test)
print(accuracy)




{'shrinkage': None, 'solver': 'lsqr'}
f1: [0.9319853  0.9319853  0.9290485  0.9290485  0.92452398 0.92452398]
[{'shrinkage': None, 'solver': 'lsqr'}, {'shrinkage': None, 'solver': 'eigen'}, {'shrinkage': 'auto', 'solver': 'lsqr'}, {'shrinkage': 'auto', 'solver': 'eigen'}, {'shrinkage': 0.5, 'solver': 'lsqr'}, {'shrinkage': 0.5, 'solver': 'eigen'}]
0.89225


In [50]:
#predict 
predict_prob = best_lda_model.predict_proba(X_test)
print('The class order in the model:')
print(best_lda_model.classes_)
print('Examples of predicted probablities for the above classes:')
with np.printoptions(suppress=True, precision=3): # Supress scienti
    print(predict_prob[0:5]) # inspect the first 5 predictions

The class order in the model:
[0 1]
Examples of predicted probablities for the above classes:
[[0.006 0.994]
 [0.008 0.992]
 [0.016 0.984]
 [0.05  0.95 ]
 [0.055 0.945]]


In [34]:

prediction = np.empty(len(X_test), dtype=object)
prediction = np.where(predict_prob[:, 0]>=0.5, 0, 1)
print("First five predictions:")
print(prediction[0:5], '\n') # Inspect the first 5 predictions after labeling.
# Confusion matrix
print("Consufion matrix:")
print(pd.crosstab(prediction, Y_test),'\n')
# Accuracy
print(f"Accuracy: {np.mean(prediction == Y_test):.3f}")
print(f"F1:{f1_score(Y_test,prediction):.2}")


First five predictions:
[1 1 1 1 1] 

Consufion matrix:
heard    0     1
row_0           
0      591    55
1      377  2977 

Accuracy: 0.892
F1:0.93
