## Import Race Results

In [357]:
# Import tools

import pandas as pd



In [578]:
# Import data

r1 = pd.read_csv("race_results/AngelesCrest.csv")
r2 = pd.read_csv("race_results/TheBear.csv")
r3 = pd.read_csv("race_results/CascadeCrest.csv")
r4 = pd.read_csv("race_results/HardRock.csv")
r5 = pd.read_csv("race_results/HURT.csv")
r6 = pd.read_csv("race_results/IMTUF.csv")
r7 = pd.read_csv("race_results/JavelinaJundred.csv")
r8 = pd.read_csv("race_results/KettleMoraine.csv")
r9 = pd.read_csv("race_results/OldDominion.csv")
r10 = pd.read_csv("race_results/OrcasIsland.csv")
r11 = pd.read_csv("race_results/RunRabbitRun.csv")
r12 = pd.read_csv("race_results/TunnelHill.csv")
r13 = pd.read_csv("race_results/WasatchFront.csv")
r14 = pd.read_csv("race_results/WesternStates.csv")
r15 = pd.read_csv("race_results/Yeti.csv")

race_results = pd.concat([r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15], axis = 0)
race_results.head()

Unnamed: 0,Race,Year,Cutoff,Place,First,Last,City,Location,Age,Gender,GP,Time,Rank
0,Angeles Crest,2019,33,1,Ruperto,Romero,Huntington Park,CA,55,M,1,19:39:09,92.58
1,Angeles Crest,2019,33,2,Wyatt,Million,Santa Monica,CA,23,M,2,20:46:07,93.87
2,Angeles Crest,2019,33,3,Dominic,Grossman,Wrightwood,CA,32,M,3,21:18:01,89.34
3,Angeles Crest,2019,33,4,Jonas,Johansson,Jarfalla,,40,M,4,21:43:32,86.36
4,Angeles Crest,2019,33,5,Eric,Earnshaw,Clovis,CA,40,M,5,22:28:48,88.64


In [579]:
# Remove DNFs and DNSs
race_results = race_results[race_results["Place"] > 0]
len(race_results)

11231

In [565]:
# Remove unneeded columns

race_results.drop(["Race", "Year", "First", "Last", "City", "Location", "Place", "GP"], axis = 1, inplace = True)
race_results.fillna("INTL", inplace = True)
race_results.head()

Unnamed: 0,Cutoff,Age,Gender,Time,Rank
0,33,55,M,19:39:09,92.58
1,33,23,M,20:46:07,93.87
2,33,32,M,21:18:01,89.34
3,33,40,M,21:43:32,86.36
4,33,40,M,22:28:48,88.64


In [566]:
# Split the data into features and labels

np.random.seed(0)
race_results_shuffled = race_results.sample(frac = 1)
x = race_results_shuffled.drop("Time", axis = 1)

time = race_results_shuffled["Time"]
y = pd.to_timedelta(time).dt.total_seconds()

Unnamed: 0,Cutoff,Age,Gender,Rank
32,30,52,M,75.62
416,30,61,M,81.79
991,36,31,M,79.0
175,36,28,M,76.56
259,48,38,F,73.13


32      81533.0
416     77255.0
991     77388.0
175    120993.0
259    168558.0
Name: Time, dtype: float64

In [568]:
# Convert non-numerical values to numerical values (feature encoding)

x.dtypes

Cutoff     object
Age         int64
Gender     object
Rank      float64
dtype: object

In [569]:
# Turn the categories into numbers

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Gender"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder = "passthrough")
x = transformer.fit_transform(x)
pd.DataFrame(x)

Unnamed: 0,0,1,2,3,4,5
0,0,1,0,30,52,75.62
1,0,1,0,30,61,81.79
2,0,1,0,36,31,79
3,0,1,0,36,28,76.56
4,1,0,0,48,38,73.13
...,...,...,...,...,...,...
11226,0,1,0,30,33,67.59
11227,0,1,0,36,57,70.35
11228,0,1,0,30,39,57.77
11229,1,0,0,30,31,69.38


In [570]:
# Split the data into training and test sets

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [571]:
# from sklearn.linear_model import Ridge

# model = Ridge()
# model.fit(x_train, y_train)
# model.score(x_test, y_test)

In [572]:
# model.score(x_train, y_train)

In [573]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(0)

model = RandomForestRegressor()
model.fit(x_train, y_train)

# Evaluation: r^2

model.score(x_test, y_test)

0.6961528338216965

In [574]:
model.score(x_train, y_train)

0.9558365545834053

In [575]:
# Evaluation: Mean absolute error

from sklearn.metrics import mean_absolute_error

y_preds = model.predict(x_test)
mae = mean_absolute_error(y_test, y_preds)

pd.to_timedelta(mae, unit = "s")

Timedelta('0 days 02:12:47.816180876')

In [576]:
df = pd.DataFrame()
df["Predicted"] = pd.to_timedelta(y_preds.round(0), unit = "s")
df["Actual"] = pd.to_timedelta(np.array(y_test), unit = "s")
df["Difference"] = abs(df["Predicted"] - df["Actual"])
df

Unnamed: 0,Predicted,Actual,Difference
0,1 days 01:02:02,1 days 04:40:12,0 days 03:38:10
1,1 days 04:04:21,0 days 23:42:22,0 days 04:21:59
2,1 days 05:12:42,1 days 07:26:12,0 days 02:13:30
3,0 days 23:42:20,1 days 01:44:33,0 days 02:02:13
4,1 days 06:25:32,1 days 02:49:05,0 days 03:36:27
...,...,...,...
2242,1 days 07:32:06,1 days 10:06:58,0 days 02:34:52
2243,1 days 04:51:06,1 days 08:30:55,0 days 03:39:49
2244,1 days 01:47:07,0 days 23:05:27,0 days 02:41:40
2245,1 days 03:29:16,1 days 03:43:39,0 days 00:14:23


In [577]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}