In [3]:
# Discriminant Analysis
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

import sklearn.discriminant_analysis as skl_da
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import (accuracy_score,confusion_matrix,f1_score, classification_report)


In [5]:
#Import data
data = pd.read_csv("../../machine_learning/siren_data_train.csv", sep=",")
print(f"Data size: {data.memory_usage().sum() / 1e6:.2f} MB")
print("The first 10 rows in the data:")


Data size: 0.59 MB
The first 10 rows in the data:


In [6]:
#Calculate distance to horn
list_loc_horn = []
list_loc_person = []
list_distance_to_horn = []

x_cor_horn = data["near_x"]
y_cor_horn = data["near_y"]

x_cor_person = data["xcoor"]
y_cor_person = data["ycoor"]

for row in range(len(x_cor_horn)):
    loc_horn = [x_cor_horn[row], y_cor_horn[row]]
    list_loc_horn.append(loc_horn)

    loc_person = [x_cor_person[row], y_cor_person[row]]
    list_loc_person.append(loc_person)

for i in range(len(x_cor_horn)):
    
    coordinate_horn = list_loc_horn[i] 
    coordinate_person = list_loc_person[i]
    distance_to_horn = math.dist(coordinate_horn,coordinate_person)
    list_distance_to_horn.append(distance_to_horn)
data["distance to nearest horn"] = list_distance_to_horn



" np.random.seed(1)\ntrainI = np.random.choice(data.shape[0], size=4283, replace=False)\ntrainIndex = data.index.isin(trainI)\ntrain = data.iloc[trainIndex] #training data\ntest = data.iloc[~trainIndex] #test data\n\nX_test = train[['building','noise','asleep','in_vehicle','no_windows','age','distance to nearest horn']]\nY_test = train['heard']\nX_train = train[['building','noise','asleep','in_vehicle','no_windows','age','distance to nearest horn']]\nY_train = train['heard'] "

In [14]:
#Splitting
x = data[['building','noise','asleep','in_vehicle','no_windows','age','distance to nearest horn']]
y = data['heard']
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.25, random_state = 1, shuffle = True)
print(Y_test)

4499    1
3499    1
2430    1
325     0
1967    1
       ..
2899    1
2752    1
4363    0
2270    1
4927    0
Name: heard, Length: 1428, dtype: int64


In [8]:
#Tuning model with GridSearchCV with parameters solver and shrinkage. Solver 'svd' does not support shrinkage and is therefore removed
model = skl_da.LinearDiscriminantAnalysis()
params = {'solver':['lsqr','eigen'],'shrinkage':[None,'auto',0.5]}
grid_search = GridSearchCV(model, params, cv=5, scoring='f1')
grid_search.fit(X_train, Y_train)
best_params = grid_search.best_params_
f1_scores = grid_search.cv_results_['mean_test_score']
params = grid_search.cv_results_['params']
best_gridsearch_model = grid_search.best_estimator_
accuracy = best_gridsearch_model.score(X_test, Y_test)
print(accuracy)
print(best_params)
print(params)
print(f1_scores)


0.9124649859943977
{'shrinkage': None, 'solver': 'lsqr'}
[{'shrinkage': None, 'solver': 'lsqr'}, {'shrinkage': None, 'solver': 'eigen'}, {'shrinkage': 'auto', 'solver': 'lsqr'}, {'shrinkage': 'auto', 'solver': 'eigen'}, {'shrinkage': 0.5, 'solver': 'lsqr'}, {'shrinkage': 0.5, 'solver': 'eigen'}]
[0.94031286 0.94031286 0.9373965  0.9373965  0.92311039 0.92311039]


In [9]:
#Default model
model = skl_da.LinearDiscriminantAnalysis()
model.fit(X_train, Y_train)
accuracy = model.score(X_test, Y_test)
print(accuracy)

0.9124649859943977


In [10]:
#predict 
predict_prob = model.predict_proba(X_test)
print('The class order in the model:')
print(model.classes_)
print('Examples of predicted probablities for the above classes:')
with np.printoptions(suppress=True, precision=3): 
    print(predict_prob[0:5]) # First 5 predictions

The class order in the model:
[0 1]
Examples of predicted probablities for the above classes:
[[0.013 0.987]
 [0.283 0.717]
 [0.004 0.996]
 [0.976 0.024]
 [0.07  0.93 ]]


In [11]:

#Evaluate model with confusion matrix
prediction = np.empty(len(X_test), dtype=object)
prediction = np.where(predict_prob[:, 0]>=0.5, 0, 1)
print("First five predictions:")
print(prediction[0:5], '\n') # First 5 predictions after labeling

# Confusion matrix
print("Consufion matrix:")
print(pd.crosstab(prediction, Y_test),'\n')
# Accuracy

print(f"Accuracy: {np.mean(prediction == Y_test):.3f}")
print(f"F1:{f1_score(Y_test,prediction):.3}")
print(classification_report(Y_test,prediction, digits = 3))


First five predictions:
[1 1 1 0 1] 

Consufion matrix:
heard    0     1
row_0           
0      223    19
1      106  1080 

Accuracy: 0.912
F1:0.945
              precision    recall  f1-score   support

           0      0.921     0.678     0.781       329
           1      0.911     0.983     0.945      1099

    accuracy                          0.912      1428
   macro avg      0.916     0.830     0.863      1428
weighted avg      0.913     0.912     0.907      1428



In [16]:
naive = np.ones(Y_test.shape[0])
# Confusion matrix
print("Consufion matrix:")
print(pd.crosstab(naive, Y_test),'\n')
# Accuracy
print(f"Accuracy: {np.mean(naive == Y_test):.3f}")
print(f"F1:{f1_score(Y_test,naive):.2}")
print(classification_report(Y_test,naive, digits = 3))



Consufion matrix:
heard    0     1
row_0           
1.0    329  1099 

Accuracy: 0.770
F1:0.87
              precision    recall  f1-score   support

           0      0.000     0.000     0.000       329
           1      0.770     1.000     0.870      1099

    accuracy                          0.770      1428
   macro avg      0.385     0.500     0.435      1428
weighted avg      0.592     0.770     0.669      1428



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
