In [50]:
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import confusion_matrix, classification_report

In [51]:
# Loading the data
file_path = Path('./Output_Files/paces_bonks_2015.csv')
df_bonk_log = pd.read_csv(file_path)
df_bonk_log.head()

Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Pace (0-5K),Pace (5-10K),Pace (10-15K),Pace (15-20K),Pace (20-25K),Pace (25-30K),Pace (30-35K),Pace (35-40K),Overall Pace,Calculated Bonk
0,3,"Desisa, Lelisa",25,M,Ambo,,ETH,2.94,3.0,3.05,3.11,3.13,3.18,3.2,2.93,4.93,0
1,4,"Tsegay, Yemane Adhane",30,M,Addis Ababa,,ETH,2.94,3.0,3.05,3.1,3.13,3.17,3.2,2.94,4.97,0
2,11,"Kipyego, Bernard",28,M,Eldoret,,KEN,2.94,3.0,3.06,3.09,3.13,3.18,3.21,3.15,5.0,0
3,10,"Korir, Wesley",32,M,Kitale,,KEN,2.94,3.0,3.05,3.1,3.13,3.18,3.2,3.09,5.0,0
4,5,"Tola, Tadese",27,M,Addis Ababa,,ETH,2.94,3.0,3.05,3.1,3.13,3.18,3.2,3.33,5.1,0


In [52]:
# Defining the target and features set 
y = df_bonk_log["Calculated Bonk"]
X = df_bonk_log.drop(["Calculated Bonk","Pace (30-35K)","Pace (35-40K)","Name","Bib","City","State","Country","M/F","Overall Pace"], axis=1)
X.head()

Unnamed: 0,Age,Pace (0-5K),Pace (5-10K),Pace (10-15K),Pace (15-20K),Pace (20-25K),Pace (25-30K)
0,25,2.94,3.0,3.05,3.11,3.13,3.18
1,30,2.94,3.0,3.05,3.1,3.13,3.17
2,28,2.94,3.0,3.06,3.09,3.13,3.18
3,32,2.94,3.0,3.05,3.1,3.13,3.18
4,27,2.94,3.0,3.05,3.1,3.13,3.18


In [53]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    stratify=y)
Counter(y_train)

Counter({0: 17236, 1: 2492})

In [54]:
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [55]:
Counter(y_resampled)

Counter({0: 17236, 1: 17236})

In [56]:
model = LogisticRegression(solver='lbfgs', random_state=1, max_iter=200)
model.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=200, random_state=1)

In [61]:
predictions = model.predict(X_test)
confusion_matrix(y_test, predictions)

array([[4737, 1009],
       [ 246,  584]], dtype=int64)

In [62]:
acc_score = balanced_accuracy_score(y_test, predictions)

In [64]:
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.95      0.82      0.70      0.88      0.76      0.59      5746
          1       0.37      0.70      0.82      0.48      0.76      0.57       830

avg / total       0.88      0.81      0.72      0.83      0.76      0.59      6576



In [65]:
cm = confusion_matrix(y_test, predictions)

cm_df = pd.DataFrame(cm,
                         index=["Actual No Bonk", "Actual Bonk"], 
                         columns=["Predicted No Bonk", "Predicted Bonk"])

cm_df

Unnamed: 0,Predicted No Bonk,Predicted Bonk
Actual No Bonk,4737,1009
Actual Bonk,246,584


In [66]:
# Produce report
report = classification_report(y_test, predictions)

In [67]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted No Bonk,Predicted Bonk
Actual No Bonk,4737,1009
Actual Bonk,246,584


Accuracy Score : 0.7640070200747298
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.82      0.88      5746
           1       0.37      0.70      0.48       830

    accuracy                           0.81      6576
   macro avg       0.66      0.76      0.68      6576
weighted avg       0.88      0.81      0.83      6576

