In [81]:
# Logistic regression

import pandas as pd
import numpy as np
from collections import defaultdict
import math
import matplotlib.pyplot as plt
from sklearn . model_selection import train_test_split
import sklearn.linear_model as LogisticRegression
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,precision_score, recall_score)
import seaborn as sns

RANDOM_SEED = 42

In [82]:
# Load the training set from the CSV file and recode the 'heard' labels:
# 'hearing' becomes 1 and 'not_hearing' becomes 0.
heard_codes = {'hearing': 1, 'not_hearing': 0}
training_data = pd.read_csv("../siren_data_train.csv", sep=",")
training_data['heard'] = training_data['heard'].replace(heard_codes)
print('Data imported!')
print(training_data)

Data imported!
      near_fid        near_x        near_y  near_angle  heard  building  \
0         2712  1.998301e+06  9.011692e+06 -171.588672      1         0   
1         2721  1.928907e+06  8.954624e+06  -51.208102      1         0   
2          297  2.026384e+06  8.256164e+06   39.018754      1         0   
3          739  1.743184e+06  8.052652e+06   15.046022      1         0   
4         1852  1.350375e+06  7.909850e+06  144.603170      1         0   
...        ...           ...           ...         ...    ...       ...   
5705         5  2.008871e+06  8.255775e+06 -176.234663      1         0   
5706      4069  1.981871e+06  8.270452e+06   45.691415      1         0   
5707      2170  1.463760e+06  8.074997e+06 -175.473118      1         0   
5708      1591  1.479843e+06  7.526377e+06  142.958054      1         0   
5709      4034  1.995078e+06  8.280029e+06  -61.591457      1         1   

          xcoor    ycoor  noise  in_vehicle  asleep  no_windows  age  
0     1999193

In [83]:
# Feature engineering: straight-line distance from each person to the nearest horn.
#
# The Euclidean distance between (near_x, near_y) and (xcoor, ycoor) is computed
# for every row at once. Working on whole columns replaces the per-row Python
# loops and the `series[row]` lookups, which are label-based and therefore only
# correct while the frame keeps its default 0..n-1 index.

x_cor_horn = training_data['near_x']
y_cor_horn = training_data['near_y']

x_cor_person = training_data['xcoor']
y_cor_person = training_data['ycoor']

# np.hypot(dx, dy) == sqrt(dx**2 + dy**2), evaluated element-wise; the result is
# a Series aligned on the frame's own index.
training_data["distance_nearest_horn"] = np.hypot(
    x_cor_horn - x_cor_person,
    y_cor_horn - y_cor_person,
)

# Dropping unnecessary features (currently disabled; the raw coordinates and
# near_fid are still part of the frame that is printed below)

# training_data = training_data.drop(['near_x', 'near_y', 'xcoor', 'ycoor', 'near_fid'], axis=1)
print(training_data)

      near_fid        near_x        near_y  near_angle  heard  building  \
0         2712  1.998301e+06  9.011692e+06 -171.588672      1         0   
1         2721  1.928907e+06  8.954624e+06  -51.208102      1         0   
2          297  2.026384e+06  8.256164e+06   39.018754      1         0   
3          739  1.743184e+06  8.052652e+06   15.046022      1         0   
4         1852  1.350375e+06  7.909850e+06  144.603170      1         0   
...        ...           ...           ...         ...    ...       ...   
5705         5  2.008871e+06  8.255775e+06 -176.234663      1         0   
5706      4069  1.981871e+06  8.270452e+06   45.691415      1         0   
5707      2170  1.463760e+06  8.074997e+06 -175.473118      1         0   
5708      1591  1.479843e+06  7.526377e+06  142.958054      1         0   
5709      4034  1.995078e+06  8.280029e+06  -61.591457      1         1   

          xcoor    ycoor  noise  in_vehicle  asleep  no_windows  age  \
0     1999193.0  9011824   

In [84]:
# Build the feature matrix and target vector, then hold out a test set

# Target y: the column 'heard'
y = training_data['heard']

# Features X: every column except 'heard'
X = training_data.drop(columns='heard')

# 75% of the rows are used for training and 25% for testing; the rows are
# shuffled, and the split is seeded with RANDOM_SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=RANDOM_SEED,
)
print("Data splitted!")

Data splitted!


In [85]:
# Train a logistic regression classifier on the training split

# NOTE: in this notebook the name `LogisticRegression` is an alias for the
# `sklearn.linear_model` module, so the estimator class is reached as an
# attribute of it. The estimator is created with its default settings.
logistic_regression_model = LogisticRegression.LogisticRegression()

# Fit on the training data; the call is left as the cell's final expression so
# the fitted estimator is shown as the cell output.
logistic_regression_model.fit(X=X_train, y=y_train)

LogisticRegression()

In [86]:
# Evaluate the model

def print_performance_metrics(y_true, y_pred):
    """Print the confusion matrix, accuracy, recall, precision and F1 score.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
    """
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred), "\n")

    # Each score is computed and printed in turn, using the `.2` format spec.
    named_scorers = (
        ("Accuracy", accuracy_score),
        ("Recall", recall_score),
        ("Precision", precision_score),
        ("F1", f1_score),
    )
    for label, scorer in named_scorers:
        print(f"{label}: {scorer(y_true, y_pred):.2}")


# Predict on the held-out test set and report the scores
y_pred = logistic_regression_model.predict(X_test)
print_performance_metrics(y_test, y_pred)

Confusion Matrix:
 [[ 200  154]
 [  10 1064]] 

Accuracy: 0.89
Recall: 0.99
Precision: 0.87
F1: 0.93


In [87]:
# Confusion Matrix: all parameters
# [[  16  338]
# [   1 1073]] 

# Accuracy: 0.76
# Recall: 1.0
# Precision: 0.76
# F1: 0.86

# Confusion Matrix: with distance_nearest_horn, without near_x, near_y, xcoor, ycoor
# Confusion Matrix:
# [[ 249  105]
# [  27 1047]] 

# Accuracy: 0.91
# Recall: 0.97
# Precision: 0.91
# F1: 0.94

# Confusion Matrix: with distance_nearest_horn, without near_x, near_y, xcoor, ycoor, and near_fid
# Confusion Matrix:
# [[ 257   97]
# [  26 1048]] 

# Accuracy: 0.91
# Recall: 0.98
# Precision: 0.92
# F1: 0.94

# Tuning
# CV5 

# Grid Search
    # Regularization strength
    # Penalty
    # Solver

# Score = f1 

In [88]:


# Parameter tuning
# Choices of parameters: penalty, solver, regularization strength

