# Feature Engineering Analysis and Machine Learning for Fast Race ML (FASTRACEML) - Horse Race prediction using Past Performance 

This notebook loads the engineered features CSV produced by the pipeline, performs some basic exploratory data analysis (EDA), and demonstrates a simple machine learning workflow using a linear regression model.

You can modify or extend the analysis as needed.

In [3]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Enable inline plotting
%matplotlib inline

# Set plot style
sns.set(style='whitegrid')

In [5]:
# Define the path where you want to save the data
data_output_path = '../data/outputs/'

# Check the current working directory
print("Current Working Directory:", os.getcwd())

# Ensure the directory exists
directory = os.path.dirname(data_output_path)
if not os.path.exists(directory):
	os.makedirs(directory)
	print(f"Created directory: {directory}")

# List files in the directory to verify the path
print("Files in the directory:", os.listdir(directory))

print("Files in the directory:", os.listdir(data_output_path))

# Load the engineered features CSV (adjust the path as needed)
engineered_df = pd.read_csv(os.path.join(data_output_path, 'engineered_features.csv'))
print("Engineered Features DataFrame Shape:", engineered_df.shape)
engineered_df.head()

# Now you can use this variable to save your DataFrame
#engineered_df.to_csv(data_output_path, index=False)

Current Working Directory: c:\Users\SamBo\Projects\FastRaceML\main
Files in the directory: ['engineered_features.csv', 'Features', 'merged_data.csv', 'results.csv']
Files in the directory: ['engineered_features.csv', 'Features', 'merged_data.csv', 'results.csv']
Engineered Features DataFrame Shape: (1778, 76)


Unnamed: 0,track,date,race_no,distance_in_yards,surface,race_type,trainer_sts_current_meet,trainer_wins_current_meet,trainer_places_current_meet,trainer_shows_cureent_meet,...,res_final_time,res_track_condition,res_race_class,res_post_position,res_horse_weight,res_claiming_price,res_finish_position,res_beaten_lengths,res_medication,res_equipment
0,SA,20250101,1,1760,T,S,6,0,0,1,...,97.38,FM,Md Sp Wt,1,119,0,4,3.75,1,b
1,SA,20250101,1,1760,T,S,6,1,2,0,...,97.38,FM,Md Sp Wt,2,124,0,1,0.0,1,b
2,SA,20250101,1,1760,T,S,13,1,0,2,...,97.38,FM,Md Sp Wt,3,124,0,3,1.5,1,b
3,SA,20250101,1,1760,T,S,0,0,0,0,...,97.38,FM,Md Sp Wt,4,119,0,5,5.75,1,b
4,SA,20250101,1,1760,T,S,4,0,1,1,...,97.38,FM,Md Sp Wt,5,124,0,2,0.5,1,


In [6]:
# Display basic summary statistics
engineered_df.describe()

Unnamed: 0,date,race_no,distance_in_yards,trainer_sts_current_meet,trainer_wins_current_meet,trainer_places_current_meet,trainer_shows_cureent_meet,program_number_if_available,morn_line_oddsif_available,year_of_birth,...,jockey_sts_previous_year,jockey_win_previous_year,jockey_place_previous_year,jockey_show_previous_year,jockey_roi_previous_year,best_bris_speed_life,best_bris_speed_most_recent,best_bris_speed_2nd_most_re,best_bris_speed_todays_tra,$2returnoninvestment_no1
count,1778.0,1778.0,1778.0,1778.0,1778.0,1778.0,1778.0,1778.0,1778.0,1778.0,...,1778.0,1778.0,1778.0,1778.0,1778.0,1638.0,1775.0,1778.0,1106.0,1778.0
mean,20248340.0,5.73622,1531.40045,9.119798,1.328459,1.293588,1.219348,5.102925,9.86766,20.516873,...,531.355056,89.214831,81.285714,75.714286,-0.007233,82.529054,74.561127,44.774466,81.504521,-0.279229
std,3557.698,2.92884,244.706493,10.594205,2.054434,2.074139,1.682853,2.942725,7.436185,1.436323,...,263.401563,67.397471,51.614338,39.145191,5.762536,12.292305,24.53708,41.537709,12.602179,2.944185
min,20241230.0,1.0,1210.0,0.0,0.0,0.0,0.0,1.0,0.4,15.0,...,-0.98,-1.85,0.0,0.0,-2.0,-0.41,0.0,-2.0,0.0,-2.0
25%,20250100.0,3.0,1320.0,1.0,0.0,0.0,0.0,3.0,4.0,20.0,...,296.0,37.0,42.0,45.0,-0.58,77.0,73.0,0.0,76.0,-0.98
50%,20250110.0,6.0,1430.0,5.0,0.0,0.0,0.0,5.0,8.0,21.0,...,512.0,80.0,73.0,71.0,-0.4,84.5,82.0,67.0,84.0,-0.53
75%,20250120.0,8.0,1760.0,14.0,2.0,2.0,2.0,7.0,15.0,22.0,...,742.0,119.0,111.0,107.0,-0.17,90.0,88.0,84.0,89.0,-0.1
max,20250130.0,11.0,2200.0,61.0,11.0,13.0,8.0,14.0,30.0,22.0,...,1344.0,627.0,627.0,153.0,106.0,108.0,108.0,108.0,108.0,48.47


In [7]:
# Check for missing values
print("Missing values per column:")
print(engineered_df.isnull().sum())

Missing values per column:
track                    0
date                     0
race_no                  0
distance_in_yards        0
surface                  0
                      ... 
res_claiming_price       0
res_finish_position      0
res_beaten_lengths       0
res_medication           0
res_equipment          570
Length: 76, dtype: int64


In [None]:
# Plot a correlation matrix to explore relationships
plt.figure(figsize=(12,10))
# One-hot encode all non-numeric (object) columns.
engineered_df_encoded = pd.get_dummies(engineered_df, drop_first=True)
correlation_matrix = engineered_df_encoded.corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix of Engineered Features')
plt.show()

## Basic Machine Learning Example

In the example below, we assume that one of the columns in the engineered features DataFrame is our target (for example, `res_finish_position`).
We split the data into training and test sets, train a linear regression model, and evaluate its performance using Mean Squared Error.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Set the target column. Adjust this if your target column has a different name.
target_column = "res_finish_position"

# Verify that the target column exists
if target_column not in engineered_df.columns:
    raise ValueError(f"Target column '{target_column}' not found in engineered_df columns: {engineered_df.columns.tolist()}")

# Separate features (X) and target (y)
X = engineered_df.drop(columns=[target_column])
y = engineered_df[target_column]

# Split the data into training and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a simple linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model using Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on test set: {mse}")