In [64]:
# Logistic regression

import pandas as pd
import numpy as np
from collections import defaultdict
import math
import matplotlib.pyplot as plt
from sklearn . model_selection import train_test_split
import sklearn.linear_model as lm
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,precision_score, recall_score)
import seaborn as sns
from sklearn.model_selection import GridSearchCV

# Seed handed to randomized steps as random_state (e.g. the train/test split) so reruns give the same result
RANDOM_SEED = 42

In [65]:
# Read the training data from the csv file
training_data = pd.read_csv("../siren_data_train.csv", sep=",")

# Encode the target column numerically: 'hearing' -> 1, 'not_hearing' -> 0
training_data = training_data.assign(
    heard=training_data['heard'].replace({'hearing': 1, 'not_hearing': 0})
)

print('Data imported!')
print(training_data)

Data imported!
      near_fid        near_x        near_y  near_angle  heard  building  \
0         2712  1.998301e+06  9.011692e+06 -171.588672      1         0   
1         2721  1.928907e+06  8.954624e+06  -51.208102      1         0   
2          297  2.026384e+06  8.256164e+06   39.018754      1         0   
3          739  1.743184e+06  8.052652e+06   15.046022      1         0   
4         1852  1.350375e+06  7.909850e+06  144.603170      1         0   
...        ...           ...           ...         ...    ...       ...   
5705         5  2.008871e+06  8.255775e+06 -176.234663      1         0   
5706      4069  1.981871e+06  8.270452e+06   45.691415      1         0   
5707      2170  1.463760e+06  8.074997e+06 -175.473118      1         0   
5708      1591  1.479843e+06  7.526377e+06  142.958054      1         0   
5709      4034  1.995078e+06  8.280029e+06  -61.591457      1         1   

          xcoor    ycoor  noise  in_vehicle  asleep  no_windows  age  
0     1999193

In [55]:
# Creating a parameter distance to nearest horn

# Coordinates of the nearest horn and of the person, one value per row
x_cor_horn = training_data['near_x']
y_cor_horn = training_data['near_y']

x_cor_person = training_data['xcoor']
y_cor_person = training_data['ycoor']

# Per-row [x, y] pairs (same names as before so other cells can still use them)
list_loc_horn = training_data[['near_x', 'near_y']].to_numpy().tolist()
list_loc_person = training_data[['xcoor', 'ycoor']].to_numpy().tolist()

# Calculate the distance to nearest horn
# Euclidean distance for all rows at once. This replaces two Python-level loops
# and works positionally, so it does not depend on the frame having a default
# 0..n-1 index the way `series[row]` lookups do.
list_distance_to_horn = np.hypot(
    (x_cor_horn - x_cor_person).to_numpy(),
    (y_cor_horn - y_cor_person).to_numpy(),
).tolist()

# NOTE(review): the output recorded under this cell shows the frame WITH
# 'distance_nearest_horn' and WITHOUT the coordinate/id/angle columns, i.e. it
# was produced with the two statements below enabled. As committed they are
# disabled, so the distance is computed but never used as a feature. Enable
# both to reproduce the recorded output.
#training_data["distance_nearest_horn"] = list_distance_to_horn

# Dropping unnecessary features

# training_data = training_data.drop(['near_x', 'near_y', 'xcoor', 'ycoor', 'near_fid', 'near_angle' ], axis=1)
print(training_data)

      heard  building  noise  in_vehicle  asleep  no_windows  age  \
0         1         0      0           0       0           0   59   
1         1         0      0           0       0           0   29   
2         1         0      0           0       0           0   32   
3         1         0      0           0       0           0   36   
4         1         0      0           0       0           0   55   
...     ...       ...    ...         ...     ...         ...  ...   
5705      1         0      0           0       0           0   29   
5706      1         0      0           0       0           0   49   
5707      1         0      0           0       0           0   62   
5708      1         0      0           0       0           0   37   
5709      1         1      0           0       0           0   33   

      distance_nearest_horn  
0                901.283517  
1                972.006260  
2                872.340924  
3                257.804449  
4                529.

In [66]:
# Creating training set and test set

# Target vector y: the column 'heard'
y = training_data['heard']

# Feature matrix X: every column except 'heard'
X = training_data.drop(columns=['heard'])

# Shuffled split, 75% for training and 25% for testing, seeded with RANDOM_SEED
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=RANDOM_SEED,
)
print("Data splitted!")

Data splitted!


In [67]:
# Learn a logistic model

# Creating logistic regression model.
# random_state is fixed because the 'sag', 'saga' and 'liblinear' solvers tried
# in the grid search shuffle the data; without a seed, repeated runs can pick
# different coefficients and therefore different "best" parameters.
logistic_regression_model = lm.LogisticRegression(random_state=RANDOM_SEED)

In [68]:
# Parameter tuning

model = logistic_regression_model

# Performing a grid search for hyper-parameter optimization, finding the optimal combination of hyper-parameters

# C = Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.
# Solver = Algorithm to use in the optimization problem. Default is ‘lbfgs’.

# The grid is split per solver because 'sag' only supports the 'l2' penalty.
# Pairing it with 'l1' makes every such fit fail, which shows up as nan entries
# in the cross-validation scores.
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = [
    {'C': c_values, 'penalty': ['l2'], 'solver': ['sag']},
    {'C': c_values, 'penalty': ['l1', 'l2'], 'solver': ['saga', 'liblinear']},
]

# No `scoring` is passed, so candidates are ranked by the estimator's default
# score, which for a classifier is mean accuracy (not F1).
# n_jobs=-1 only runs the independent CV fits in parallel.
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)

# Fitting model to training data
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print('Best parameters: ', best_params)

# Mean cross-validated accuracy per candidate, in the same order as `params` below
mean_cv_accuracy = grid_search.cv_results_['mean_test_score']
f1_scores = mean_cv_accuracy  # legacy name kept for other cells; these are accuracies, NOT F1
print('Mean CV accuracy:', mean_cv_accuracy)

# Parameter combination evaluated for each entry of mean_cv_accuracy
params = grid_search.cv_results_['params']
print("Params: ", params)

# Best candidate (refit on the whole training split by GridSearchCV) scored on the held-out test split
best_lr_model = grid_search.best_estimator_
accuracy = best_lr_model.score(X_test, y_test)
print("Accuracy: ", accuracy)



KeyboardInterrupt: 

In [59]:
# Evaluate best model

# Predict labels for X_test with best_lr_model, which is defined in the parameter-tuning cell.
# NOTE(review): the recorded run of that cell ends in KeyboardInterrupt and has a later
# execution count than this cell, so the metrics recorded below come from an earlier fit.
# Restart the kernel and run all cells to get numbers that match the code.
y_pred = best_lr_model.predict(X_test)

def print_performance_metrics(y_true, y_pred):
    """Print the confusion matrix and summary classification metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, passed to the metric functions together
            with ``y_true``.

    Returns:
        None. Everything is written to standard output.
    """
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred), "\n")
    # ':.2f' pins two decimals. The bare ':.2' used before means two
    # *significant digits*, so 0.0456 printed as '0.046' and 1.0 as '1.0'.
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.2f}")
    print(f"Recall: {recall_score(y_true, y_pred):.2f}")
    print(f"Precision: {precision_score(y_true, y_pred):.2f}")
    print(f"F1: {f1_score(y_true, y_pred):.2f}")

# Print confusion matrix, accuracy, recall, precision and F1 for the predictions against y_test
print_performance_metrics(y_test, y_pred)

Confusion Matrix:
 [[ 265   89]
 [  28 1046]] 

Accuracy: 0.92
Recall: 0.97
Precision: 0.92
F1: 0.95


In [None]:


# Confusion matrix: without coordinates

# Best parameters:  {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
# f1: [       nan 0.83908958 0.88673241 0.84189195 0.83932295 0.88649849
#        nan 0.83932295 0.88883467 0.84165831 0.83932295 0.89140476
#        nan 0.83932295 0.91872785 0.84165831 0.83932295 0.90611975
#        nan 0.83932295 0.92293373 0.84165831 0.83932295 0.90658731
#        nan 0.83932295 0.92246671 0.84165831 0.83932295 0.91452415
#        nan 0.83932295 0.92270036 0.84165831 0.83932295 0.90869012
#        nan 0.83932295 0.92270036 0.84165831 0.83932295 0.91452415]

# Confusion Matrix:
# [[ 265   89]
# [  29 1045]] 

# Accuracy: 0.92
# Recall: 0.97
# Precision: 0.92
# F1: 0.95

# Grid search and confusion matrix: without coordinates and without near_fid

# Best parameters:  {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
# f1: [       nan 0.27673857 0.88766562 0.35918003 0.27860582 0.88813291
#        nan 0.2781388  0.88836738 0.35941368 0.27837245 0.89023544
#        nan 0.27837245 0.9175599  0.35941368 0.27860582 0.89467415
#        nan 0.27837245 0.922934   0.35941368 0.27837245 0.89934705
#        nan 0.27860609 0.92293373 0.35964732 0.27860582 0.90587656
#        nan 0.27837245 0.92340075 0.35941368 0.27837245 0.90587656
#        nan 0.27837245 0.92340075 0.35941368 0.27837272 0.90587656]

# Confusion Matrix:
# [[ 265   89]
# [  27 1047]] 

# Accuracy: 0.92
# Recall: 0.97
# Precision: 0.92
# F1: 0.95

# Grid search and confusion matrix: without coordinates, without near_fid and without near_angle
# Best parameters:  {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
# f1: [       nan 0.27673884 0.88813291 0.35777926 0.27743923 0.88906722
#        nan 0.27743923 0.88930196 0.35754561 0.27767288 0.8900018
#        nan 0.27743923 0.91802773 0.35777926 0.27743923 0.8900018
#        nan 0.27743923 0.92223252 0.35777926 0.27743923 0.8900018
#        nan 0.27767288 0.92269954 0.35777926 0.27743923 0.8900018
#        nan 0.27743923 0.92269981 0.35777926 0.27743923 0.8900018
#        nan 0.27743923 0.92269954 0.35777926 0.27743923 0.8900018 ]

# Confusion Matrix:
# [[ 265   89]
# [  28 1046]] 

# Accuracy: 0.92
# Recall: 0.97
# Precision: 0.92
# F1: 0.95

In [None]:
# Testing with a naive model
