In [1]:
# Import dependencies
from pathlib import Path
from urllib.parse import quote

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

from config import db_password

In [2]:
# Load the race data from the local PostgreSQL database.

# Build the connection string. The password is percent-encoded so that
# characters such as "@", "/", ":" or "%" cannot be mistaken for URL
# delimiters when the connection string is parsed.
db_string = (
    f"postgresql://postgres:{quote(db_password, safe='')}"
    "@127.0.0.1:5432/bonkers"
)

# Create the database engine.
engine = create_engine(db_string)

# Select every row of the table that backs the dataframe.
query = "SELECT * FROM paces_bonks_2015"

# Build the dataframe from the query result.
df_bonk_log = pd.read_sql_query(query, con=engine)

# Preview the first rows.
# NOTE(review): this preview prints runner names — clear the output before sharing.
df_bonk_log.head()

Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Pace (0-5K),Pace (5-10K),Pace (10-15K),Pace (15-20K),Pace (20-25K),Pace (25-30K),Pace (30-35K),Pace (35-40K),Overall Pace,Calculated Bonk
0,3,"Desisa, Lelisa",25,M,Ambo,,ETH,2.94,3.0,3.05,3.11,3.13,3.18,3.2,2.93,4.93,0
1,4,"Tsegay, Yemane Adhane",30,M,Addis Ababa,,ETH,2.94,3.0,3.05,3.1,3.13,3.17,3.2,2.94,4.97,0
2,11,"Kipyego, Bernard",28,M,Eldoret,,KEN,2.94,3.0,3.06,3.09,3.13,3.18,3.21,3.15,5.0,0
3,10,"Korir, Wesley",32,M,Kitale,,KEN,2.94,3.0,3.05,3.1,3.13,3.18,3.2,3.09,5.0,0
4,5,"Tola, Tadese",27,M,Addis Ababa,,ETH,2.94,3.0,3.05,3.1,3.13,3.18,3.2,3.33,5.1,0


In [3]:
# Target vector: the bonk label.
y = df_bonk_log["Calculated Bonk"]

# Feature matrix: everything left after removing the target, the two
# late-race splits, the overall pace and the runner-identity columns.
# NOTE(review): presumably the late splits and overall pace are dropped to
# avoid leaking the label into the features — confirm.
X = df_bonk_log.drop(
    columns=[
        "Calculated Bonk",
        "Pace (30-35K)",
        "Pace (35-40K)",
        "Name",
        "Bib",
        "City",
        "State",
        "Country",
        "M/F",
        "Overall Pace",
    ]
)
X.head()

Unnamed: 0,Age,Pace (0-5K),Pace (5-10K),Pace (10-15K),Pace (15-20K),Pace (20-25K),Pace (25-30K)
0,25,2.94,3.0,3.05,3.11,3.13,3.18
1,30,2.94,3.0,3.05,3.1,3.13,3.17
2,28,2.94,3.0,3.06,3.09,3.13,3.18
3,32,2.94,3.0,3.05,3.1,3.13,3.18
4,27,2.94,3.0,3.05,3.1,3.13,3.18


In [4]:
# Split into training and testing sets.
# stratify=y keeps the class proportions of y the same in both splits.
# random_state pins the shuffle so that Restart & Run All reproduces the
# same split (and therefore the same metrics) on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

X_train.shape

(19728, 7)

In [5]:
# Logistic-regression model using the L-BFGS solver, capped at 1000 iterations.
classifier = LogisticRegression(max_iter=1000, solver="lbfgs")

In [6]:
# Fit the model on the training split.
classifier.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=1000)

In [8]:
# Predict labels for the held-out rows and show them next to the true labels.
predictions = classifier.predict(X_test)
(
    y_test
    .to_frame(name="Actual")
    .assign(Prediction=predictions)
    [["Prediction", "Actual"]]
)

Unnamed: 0,Prediction,Actual
12601,0,0
13300,0,0
7622,0,0
17729,0,0
9634,0,0
...,...,...
25346,0,1
7850,0,0
16009,0,0
18566,0,0


In [9]:
# Accuracy of the predictions against the true test labels.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.8683090024330901

In [10]:
# Confusion matrix of true test labels versus predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap the matrix in a labelled DataFrame: rows are labelled as the actual
# outcome, columns as the predicted outcome.
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual No Bonk", "Actual Bonk"],
    columns=["Predicted No Bonk", "Predicted Bonk"],
)

cm_df

Unnamed: 0,Predicted No Bonk,Predicted Bonk
Actual No Bonk,5631,114
Actual Bonk,752,79


In [11]:
# Summary of results: confusion matrix, accuracy, then the classification report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report", classification_report(y_test, predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted No Bonk,Predicted Bonk
Actual No Bonk,5631,114
Actual Bonk,752,79


Accuracy Score : 0.8683090024330901
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.98      0.93      5745
           1       0.41      0.10      0.15       831

    accuracy                           0.87      6576
   macro avg       0.65      0.54      0.54      6576
weighted avg       0.82      0.87      0.83      6576

