In [2]:
# Importing dependencies
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [14]:
# Read the 2015 pace/bonk CSV (path is relative to the notebook's working directory).
file_path = Path('./Output_Files/paces_bonks_2015.csv')
df_bonk_log = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to sanity-check the columns.
df_bonk_log.head(n=5)

Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Pace (0-5K),Pace (5-10K),Pace (10-15K),Pace (15-20K),Pace (20-25K),Pace (25-30K),Pace (30-35K),Pace (35-40K),Overall Pace,Calculated Bonk
0,3,"Desisa, Lelisa",25,M,Ambo,,ETH,2.94,3.0,3.05,3.11,3.13,3.18,3.2,2.93,4.93,0
1,4,"Tsegay, Yemane Adhane",30,M,Addis Ababa,,ETH,2.94,3.0,3.05,3.1,3.13,3.17,3.2,2.94,4.97,0
2,11,"Kipyego, Bernard",28,M,Eldoret,,KEN,2.94,3.0,3.06,3.09,3.13,3.18,3.21,3.15,5.0,0
3,10,"Korir, Wesley",32,M,Kitale,,KEN,2.94,3.0,3.05,3.1,3.13,3.18,3.2,3.09,5.0,0
4,5,"Tola, Tadese",27,M,Addis Ababa,,ETH,2.94,3.0,3.05,3.1,3.13,3.18,3.2,3.33,5.1,0


In [15]:
# Define the features set
# The features kept are every column that is not dropped below.
# Dropped columns, in the order listed:
#   - "Calculated Bonk": the target, extracted separately as y.
#   - "Pace (30-35K)", "Pace (35-40K)", "Overall Pace": late-race / whole-race
#     figures -- presumably excluded so the model only sees information
#     available before the bonk can occur (TODO confirm how the label is built).
#   - "Name", "Bib", "City", "State", "Country", "M/F": identifier and
#     text columns that are not fed to the model here.
# drop() returns a new frame, so df_bonk_log itself is left untouched and this
# cell can be re-run safely.
dropped_columns = [
    "Calculated Bonk",
    "Pace (30-35K)",
    "Pace (35-40K)",
    "Name",
    "Bib",
    "City",
    "State",
    "Country",
    "M/F",
    "Overall Pace",
]
X = df_bonk_log.drop(columns=dropped_columns)
X.head()

Unnamed: 0,Age,Pace (0-5K),Pace (5-10K),Pace (10-15K),Pace (15-20K),Pace (20-25K),Pace (25-30K)
0,25,2.94,3.0,3.05,3.11,3.13,3.18
1,30,2.94,3.0,3.05,3.1,3.13,3.17
2,28,2.94,3.0,3.06,3.09,3.13,3.18
3,32,2.94,3.0,3.05,3.1,3.13,3.18
4,27,2.94,3.0,3.05,3.1,3.13,3.18


In [16]:
# Define the target set
# to_numpy() yields the label column as a 1-D NumPy array. Series.ravel() is
# deprecated in recent pandas releases (a Series is already 1-D), so it is not
# used here.
y = df_bonk_log["Calculated Bonk"].to_numpy()

# Peek at the first few labels.
y[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [17]:
# Partition features and labels into training and testing subsets.
# No test_size is given, so scikit-learn's default proportions apply; the fixed
# random_state makes the shuffle repeatable between runs.
split_parts = train_test_split(X, y, random_state=55)
X_train, X_test, y_train, y_test = split_parts

# Show how many rows / columns ended up in the training features.
X_train.shape

(19728, 7)

In [18]:
# Build the scaler and fit it on the training rows only, so the test rows
# have no influence on the fitted scaling parameters.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted transformation to both partitions.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [19]:
# Instantiate the (not yet fitted) random forest classifier.
rf_model = RandomForestClassifier(
    n_estimators=128,  # number of trees in the forest
    random_state=78,   # fixed seed so the fitted forest is repeatable
)

In [20]:
# Train the forest on the scaled training features and their labels.
# The result of fit() is assigned back to rf_model, as in the original cell,
# which also keeps the cell from echoing the estimator repr.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [21]:
# Predict the bonk label for every row of the scaled test partition.
predictions = rf_model.predict(X=X_test_scaled)

In [22]:
# Tabulate true labels against predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap the raw matrix in a labelled DataFrame: rows are the actual class,
# columns are the predicted class.
row_labels = ["Actual No Bonk", "Actual Bonk"]
col_labels = ["Predicted No Bonk", "Predicted Bonk"]
cm_df = pd.DataFrame(data=cm, index=row_labels, columns=col_labels)

cm_df

Unnamed: 0,Predicted No Bonk,Predicted Bonk
Actual No Bonk,5651,137
Actual Bonk,610,178


In [23]:
# Fraction of test rows whose predicted label matches the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

acc_score

0.886405109489051

In [25]:
# Summarise the evaluation in one place: confusion matrix, accuracy, and the
# per-class precision / recall / F1 report.
print("Confusion Matrix")
display(cm_df)  # rich (HTML) rendering of the labelled confusion matrix
print(f"Accuracy Score : {acc_score}", "Classification Report", sep="\n")
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix


Unnamed: 0,Predicted No Bonk,Predicted Bonk
Actual No Bonk,5651,137
Actual Bonk,610,178


Accuracy Score : 0.886405109489051
Classification Report
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      5788
           1       0.57      0.23      0.32       788

    accuracy                           0.89      6576
   macro avg       0.73      0.60      0.63      6576
weighted avg       0.86      0.89      0.86      6576



In [26]:
# Calculate feature importance in the Random Forest model.
# feature_importances_ is read from the fitted forest; it holds one value per
# column of the training matrix, in the same order as X.columns.
importances = rf_model.feature_importances_
# Echo the raw array as the cell output.
importances

array([0.10691965, 0.12389386, 0.12374527, 0.12980236, 0.1348325 ,
       0.13810037, 0.24270599])

In [27]:
# Pair each importance with its column name, then order the pairs from the
# most to the least important feature.
ranked_features = list(zip(rf_model.feature_importances_, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.2427059855146885, 'Pace (25-30K)'),
 (0.13810037200401015, 'Pace (20-25K)'),
 (0.13483250239176212, 'Pace (15-20K)'),
 (0.12980235759721406, 'Pace (10-15K)'),
 (0.12389386131120686, 'Pace (0-5K)'),
 (0.123745271690063, 'Pace (5-10K)'),
 (0.1069196494910553, 'Age')]