# **Marathon time Predictions**

## **Import libraries**

In [290]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [291]:
!pip install pandas-profiling



## **Data Exploration**

In [292]:
from pandas_profiling import ProfileReport

df = pd.read_csv('/kaggle/input/marathon-time-predictions/MarathonData.csv')
profile = ProfileReport(df, title="Profiling Report")

In [293]:
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

## **Data Preprocessing**

In [294]:
# Check head of dataframe
df.head()

Unnamed: 0,id,Marathon,Name,Category,km4week,sp4week,CrossTraining,Wall21,MarathonTime,CATEGORY
0,1,Prague17,Blair MORGAN,MAM,132.8,14.434783,,1.16,2.37,A
1,2,Prague17,Robert Heczko,MAM,68.6,13.674419,,1.23,2.59,A
2,3,Prague17,Michon Jerome,MAM,82.7,13.520436,,1.3,2.66,A
3,4,Prague17,Daniel Or lek,M45,137.5,12.258544,,1.32,2.68,A
4,5,Prague17,Luk ? Mr zek,MAM,84.6,13.945055,,1.36,2.74,A


In [295]:
# Check unique values of each column
for i in df.columns:
    if type(i[0])!='int64' and type(i[0])!='float64':
        print(i, df[i].unique())

id [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87]
Marathon ['Prague17']
Name ['Blair MORGAN' 'Robert Heczko' 'Michon Jerome' 'Daniel Or\xa0lek'
 'Luk\xa0? Mr\xa0zek' 'David Pecina' 'Tomas Drabek' 'Jan Rada'
 'martin ?indel\xa0?' 'Maksim Remezau' 'Jaroslaw Marchewka'
 'Tom\xa0? K?e?ek' 'Ji?، Polcar' 'Denis Wachtl' 'David Lehnen'
 'Jپrgen Steiner' 'David Marek' 'Jan Fri?ek' 'Vlastimil Lys\xa0k'
 'Radek Dvo?\xa0k' 'Jind?ich Lisى' 'Carlos Valladares Calvo'
 'Jiri Syrovatko' 'Florian Kobler' 'Jan Ro?ek' 'Cat Simpson'
 'Pavel Hlo?ek' 'Enrico Ballerin' 'Jaroslav Jur\xa0sek'
 'Luka Slap?ak Pelliccioni' 'Ondrej Barta' 'Emilio Zamarriego Garcia'
 'Ian  LoriggioIan Loriggio' 'Pavel MarekPavel Marek' 'Jan Kervitcer'
 'Luk\xa0? Kozubik' 'David CoxDavid Cox' 'Brian Parkinson'
 'Kate?i

In [296]:
# Check shape of dataframe and check null value count
print(df.shape)
print(df.isnull().sum())

(87, 10)
id                0
Marathon          0
Name              0
Category          6
km4week           0
sp4week           0
CrossTraining    74
Wall21            0
MarathonTime      0
CATEGORY          0
dtype: int64


In [297]:
# Check types of each field
print(df.dtypes)

id                 int64
Marathon          object
Name              object
Category          object
km4week          float64
sp4week          float64
CrossTraining     object
Wall21            object
MarathonTime     float64
CATEGORY          object
dtype: object


In [298]:
df_copy = df.copy() # Make a copy of the dataset
df = df.drop("CrossTraining", axis=1) # Drop column with mostly NULL Values

In [299]:
df.describe()

Unnamed: 0,id,km4week,sp4week,MarathonTime
count,87.0,87.0,87.0,87.0
mean,44.0,62.347126,139.840706,3.31908
std,25.258662,26.956019,1191.427864,0.376923
min,1.0,17.9,8.031414,2.37
25%,22.5,44.2,11.498168,3.045
50%,44.0,58.8,12.163424,3.32
75%,65.5,77.5,12.854036,3.605
max,87.0,137.5,11125.0,3.98


In [300]:
len(df["Marathon"].unique()), len(df["CATEGORY"].unique()), len(df["Category"].unique()), len(df["Name"].unique())

(1, 4, 7, 86)

In [301]:
# Drop Marathon as it has only 1 value, and names because it has high cardinality
df = df.drop(["Marathon", "Name"], axis=1)

In [302]:
# Handle Category Column's NULL values
filtered_df = df[df['Category'].notna()]
df = filtered_df
df.isna().sum()

id              0
Category        0
km4week         0
sp4week         0
Wall21          0
MarathonTime    0
CATEGORY        0
dtype: int64

In [303]:
# One-Hot Encode the CATEGORY Column
OH_CATEGORY = pd.get_dummies(df["CATEGORY"], drop_first=True).astype(int)
OH_CATEGORY.head()

Unnamed: 0,B,C,D
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


In [305]:
# Concat the One hot encoded dataframe and the original dataframe
df = pd.concat([df, OH_CATEGORY], axis=1)
df = df.drop("CATEGORY", axis=1) # Drop the one hot encoded column

In [306]:
print(df.dtypes)

id                int64
Category         object
km4week         float64
sp4week         float64
Wall21           object
MarathonTime    float64
B                 int64
C                 int64
D                 int64
dtype: object


In [307]:
# Check the rows with this weird value
df[df['Wall21'] == ' -   ']

Unnamed: 0,id,Category,km4week,sp4week,Wall21,MarathonTime,B,C,D


In [308]:
# Calculate mean of the available rows in the column "Wall21"
b=0
c=0
for i in df["Wall21"].unique():
    if i == ' -   ':
        continue
    elif float(i):
        c += float(i)
        b += 1
b, c

(51, 82.80000000000001)

In [309]:
# Replace the weird string values with the mean of that column
Mean = c/b
df.loc[df['Wall21'] == ' -   ', 'Wall21'] = Mean
df.head()

Unnamed: 0,id,Category,km4week,sp4week,Wall21,MarathonTime,B,C,D
0,1,MAM,132.8,14.434783,1.16,2.37,0,0,0
1,2,MAM,68.6,13.674419,1.23,2.59,0,0,0
2,3,MAM,82.7,13.520436,1.3,2.66,0,0,0
3,4,M45,137.5,12.258544,1.32,2.68,0,0,0
4,5,MAM,84.6,13.945055,1.36,2.74,0,0,0


In [310]:
df["Wall21"] = df["Wall21"].astype(np.float64)
print(df.dtypes)

id                int64
Category         object
km4week         float64
sp4week         float64
Wall21          float64
MarathonTime    float64
B                 int64
C                 int64
D                 int64
dtype: object


In [311]:
# Label encode the Category Column
# Import label encoder 
from sklearn import preprocessing 
  
# label_encoder object knows  
# how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'species'. 
df['Category']= label_encoder.fit_transform(df['Category']) 

In [312]:
df = df.drop_duplicates() # Drop duplicate rows, if any
df.shape

(81, 9)

## **Split data into train and test splits**

In [313]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df.drop(['id', 'MarathonTime'], axis=1)
y = df['MarathonTime']

scaler = StandardScaler() # To scale the data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [314]:
!pip install pycaret



In [315]:
from pycaret.regression import *

data = df.drop('id', axis=1) # Create temp dataframe to use for pycaret
s = setup(data, target = 'MarathonTime', session_id = 123)

# model training and selection
best = compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,MarathonTime
2,Target type,Regression
3,Original data shape,"(81, 8)"
4,Transformed data shape,"(81, 8)"
5,Transformed train set shape,"(56, 8)"
6,Transformed test set shape,"(25, 8)"
7,Numeric features,7
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0563,0.0056,0.0689,0.9174,0.0165,0.0175,0.073
lr,Linear Regression,0.0583,0.0057,0.0708,0.9063,0.0171,0.0184,0.421
lar,Least Angle Regression,0.0583,0.0057,0.0708,0.9063,0.0171,0.0184,0.02
br,Bayesian Ridge,0.0592,0.0058,0.0714,0.9061,0.0173,0.0187,0.015
huber,Huber Regressor,0.0613,0.0066,0.0765,0.9035,0.0179,0.0187,0.024
rf,Random Forest Regressor,0.0627,0.0075,0.0789,0.8976,0.019,0.0198,0.101
gbr,Gradient Boosting Regressor,0.0688,0.0078,0.0821,0.8953,0.0197,0.0216,0.032
catboost,CatBoost Regressor,0.067,0.0097,0.0841,0.8819,0.02,0.0211,0.755
ada,AdaBoost Regressor,0.0689,0.009,0.0862,0.8781,0.0207,0.0218,0.042
dt,Decision Tree Regressor,0.0647,0.008,0.0844,0.8661,0.0201,0.0202,0.015


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

In [316]:
# evaluate trained model
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [317]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import math

extra_trees_model = ExtraTreesRegressor(n_estimators=100, max_depth=None, random_state=42)
extra_trees_model.fit(X_train, y_train)

y_pred = extra_trees_model.predict(X_test)
    
print(f'Classifier: Extra Trees Regressor')
print(f'MSE: {mean_absolute_error(y_test, y_pred):.2f}')
print(f'MAE: {mean_squared_error(y_test, y_pred):.2f}')
print(f'RMSE: {math.sqrt(mean_squared_error(y_test, y_pred)):.2f}')
print(f'R2 Score: {r2_score(y_test, y_pred):.2f}')

Classifier: Extra Trees Regressor
MSE: 0.05
MAE: 0.01
RMSE: 0.08
R2 Score: 0.96


## **Using manual model declarations as a dictionary of regressors**

In [None]:
# Import libraries for Machine Learning models and metrics

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import LGBMRegressor

In [None]:
# Define a list of classifiers to test
regressors = {
    'Extra Trees Regressor': ExtraTreesRegressor(n_estimators=100),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Support Vector Machine': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Light GBM': LGBMRegressor(),
    'XG Boost': XGBRegressor(),
    'CatBoost': CatBoostRegressor(),
    'AdaBoost': AdaBoostRegressor()
}

trained_models = {}

# Train and evaluate each classifier
for name, clf in regressors.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    report = classification_report(y_test, y_pred)
    
    print(f'Classifier: {name}')
    print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
    print(f'Precision: {precision_score(y_test, y_pred, average="macro"):.2f}')
    print(f'Recall: {recall_score(y_test, y_pred, average="macro"):.2f}')
    print(f'F1 Score: {f1_score(y_test, y_pred, average="macro"):.2f}')
    print(f'Classification Report:\n{report}\n')
    
    trained_models[name] = clf

# **Conclusion - Final Metrics**

### Classifier: Extra Trees Regressor
### Mean Squared Error: 0.05
### Mean Absolute Error: 0.01
### Root Mean Square Error: 0.08
### R2 Score: 0.96