In [2]:
# Discriminant Analysis
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

import sklearn.discriminant_analysis as skl_da
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (accuracy_score,confusion_matrix,f1_score)


In [3]:
#Import data
data = pd.read_csv("../../machine_learning/siren_data_train.csv", sep=",")
print(f"Data size: {data.memory_usage().sum() / 1e6:.2f} MB")
print("The first 10 rows in the data:")


Data size: 0.59 MB
The first 10 rows in the data:


In [38]:
#Calculate distance to horn
list_loc_horn = []
list_loc_person = []
list_distance_to_horn = []

x_cor_horn = data["near_x"]
y_cor_horn = data["near_y"]

x_cor_person = data["xcoor"]
y_cor_person = data["ycoor"]

for row in range(len(x_cor_horn)):
    loc_horn = [x_cor_horn[row], y_cor_horn[row]]
    list_loc_horn.append(loc_horn)

    loc_person = [x_cor_person[row], y_cor_person[row]]
    list_loc_person.append(loc_person)

for i in range(len(x_cor_horn)):
    
    coordinate_horn = list_loc_horn[i] 
    coordinate_person = list_loc_person[i]
    distance_to_horn = math.dist(coordinate_horn,coordinate_person)
    list_distance_to_horn.append(distance_to_horn)
data["distance to nearest horn"] = list_distance_to_horn

data.head(10)


Unnamed: 0,near_fid,near_x,near_y,near_angle,heard,building,xcoor,ycoor,noise,in_vehicle,asleep,no_windows,age,distance to nearest horn
0,2712,1998301.0,9011692.0,-171.588672,1,0,1999193.0,9011824,0,0,0,0,59,901.283517
1,2721,1928907.0,8954624.0,-51.208102,1,0,1928298.0,8955382,0,0,0,0,29,972.00626
2,297,2026384.0,8256164.0,39.018754,1,0,2025706.0,8255615,0,0,0,0,32,872.340924
3,739,1743184.0,8052652.0,15.046022,1,0,1742935.0,8052585,0,0,0,0,36,257.804449
4,1852,1350375.0,7909850.0,144.60317,1,0,1350807.0,7909543,0,0,0,0,55,529.686791
5,2737,1934971.0,8959613.0,-159.477621,0,1,1938739.0,8961023,0,0,0,1,62,4023.130333
6,3692,1327964.0,7940968.0,18.712045,1,1,1327733.0,7940890,0,0,0,0,74,244.293455
7,404,2005312.0,8234592.0,146.602179,1,0,2007113.0,8233405,0,0,0,0,24,2157.25058
8,133,2010260.0,8244737.0,91.888991,1,0,2010284.0,8244010,0,0,0,0,66,727.512654
9,2003,1339570.0,7892866.0,160.654726,1,1,1340898.0,7892400,0,0,0,0,53,1407.236379


In [58]:
#Splitting test and training data
np.random.seed(1)
trainI = np.random.choice(data.shape[0], size=4000, replace=False)
trainIndex = data.index.isin(trainI)
train = data.iloc[trainIndex] #training set
test = data.iloc[~trainIndex]

X_test = train[['building','noise','asleep','in_vehicle','no_windows','age','distance to nearest horn','near_angle']]
Y_test = train['heard']
X_train = train[['building','noise','asleep','in_vehicle','no_windows','age','distance to nearest horn','near_angle']]
Y_train = train['heard']



In [59]:
#Tuning best model
model = skl_da.LinearDiscriminantAnalysis()
params = {'solver':['lsqr','eigen'],'shrinkage':[None,'auto',0.5]}
grid_search = GridSearchCV(model, params, cv=5, scoring='f1')
grid_search.fit(X_train, Y_train)
best_params = grid_search.best_params_
print(best_params)
f1_scores = grid_search.cv_results_['mean_test_score']
print('f1:',f1_scores )
params = grid_search.cv_results_['params']
print(params)
best_lda_model = grid_search.best_estimator_
accuracy = best_lda_model.score(X_test, Y_test)
print(accuracy)




{'shrinkage': None, 'solver': 'lsqr'}
f1: [0.93820912 0.93820912 0.93659596 0.93659596 0.92452398 0.92452398]
[{'shrinkage': None, 'solver': 'lsqr'}, {'shrinkage': None, 'solver': 'eigen'}, {'shrinkage': 'auto', 'solver': 'lsqr'}, {'shrinkage': 'auto', 'solver': 'eigen'}, {'shrinkage': 0.5, 'solver': 'lsqr'}, {'shrinkage': 0.5, 'solver': 'eigen'}]
0.90425


In [60]:
#predict 
predict_prob = best_lda_model.predict_proba(X_test)
print('The class order in the model:')
print(best_lda_model.classes_)
print('Examples of predicted probablities for the above classes:')
with np.printoptions(suppress=True, precision=3): # Supress scienti
    print(predict_prob[0:5]) # inspect the first 5 predictions

The class order in the model:
[0 1]
Examples of predicted probablities for the above classes:
[[0.004 0.996]
 [0.005 0.995]
 [0.01  0.99 ]
 [0.977 0.023]
 [0.015 0.985]]


In [61]:

#Evaluate model with confusion matrix
prediction = np.empty(len(X_test), dtype=object)
prediction = np.where(predict_prob[:, 0]>=0.5, 0, 1)
print("First five predictions:")
print(prediction[0:5], '\n') # Inspect the first 5 predictions after labeling.
# Confusion matrix
print("Consufion matrix:")
print(pd.crosstab(prediction, Y_test),'\n')
# Accuracy
print(f"Accuracy: {np.mean(prediction == Y_test):.3f}")
print(f"F1:{f1_score(Y_test,prediction):.2}")


First five predictions:
[1 1 1 0 1] 

Consufion matrix:
heard    0     1
row_0           
0      646    61
1      322  2971 

Accuracy: 0.904
F1:0.94


In [53]:
naive= np.ones(Y_test.shape[0])
# Confusion matrix
print("Consufion matrix:")
print(pd.crosstab(naive, Y_test),'\n')
# Accuracy
print(f"Accuracy: {np.mean(naive == Y_test):.3f}")
print(f"F1:{f1_score(Y_test,naive):.2}")


Consufion matrix:
heard    0     1
row_0           
1.0    968  3032 

Accuracy: 0.758
F1:0.86


In [48]:
#Tuning default model
model = skl_da.LinearDiscriminantAnalysis()
model.fit(X_train, Y_train)
accuracy = model.score(X_test, Y_test)
print(accuracy)

#predict default model
predict_prob_default = model.predict_proba(X_test)
print('The class order in the model:')
print(model.classes_)
print('Examples of predicted probablities for the above classes:')
with np.printoptions(suppress=True, precision=3): # Supress scienti
    print(predict_prob_default[0:5]) # inspect the first 5 predictions

prediction = np.empty(len(X_test), dtype=object)
prediction = np.where(predict_prob_default[:, 0]>=0.5, 0, 1)
print("First five predictions:")
print(prediction[0:5], '\n') # Inspect the first 5 predictions after labeling.
# Confusion matrix
print("Consufion matrix:")
print(pd.crosstab(prediction, Y_test),'\n')
# Accuracy
print(f"Accuracy: {np.mean(prediction == Y_test):.3f}")
print(f"F1:{f1_score(Y_test,prediction):.2}")


0.90425
The class order in the model:
[0 1]
Examples of predicted probablities for the above classes:
[[0.004 0.996]
 [0.005 0.995]
 [0.01  0.99 ]
 [0.977 0.023]
 [0.016 0.984]]
First five predictions:
[1 1 1 0 1] 

Consufion matrix:
heard    0     1
row_0           
0      646    61
1      322  2971 

Accuracy: 0.904
F1:0.94
