In [None]:
#import modules for analyzing ,plotting, and formatting
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import seaborn as sns


In [None]:
# Load the merged table from a CSV file in the working directory. Later cells
# expect it to contain the '400 Meters', '1500 Meters' and 'Grade Level' columns.
merged_400_1500 = pd.read_csv("merged_400m_1500m_df.csv")

In [None]:
#import sklearn and relevant packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hold out a third of the rows for the final evaluation (67% train / 33% test).
# The fixed random_state keeps the split identical from run to run.
m400_1500_train, m400_1500_test = train_test_split(
    merged_400_1500,
    test_size=0.33,
    random_state=0,
)

# "gl" stands for grade level: this model only uses the 400m time, the
# 1500m time and the grade level, so keep just those three columns.
model_columns = ['400 Meters', '1500 Meters', 'Grade Level']
gl_400_1500_train = m400_1500_train[model_columns]
gl_400_1500_test = m400_1500_test[model_columns]


In [None]:
# Split each frame into model inputs (features) and the value to predict (label).
# The features are the 400m time and the grade level, i.e. every column except
# the 1500m time; the label is the 1500m time on its own.
target_column = '1500 Meters'

features_400_train = gl_400_1500_train.drop(columns=target_column)
label_train = gl_400_1500_train[target_column].copy()

# The test frame is split in exactly the same way.
features_400_test = gl_400_1500_test.drop(columns=target_column)
label_test = gl_400_1500_test[target_column].copy()

In [None]:
# Display the training features ('400 Meters' and 'Grade Level') before encoding.
features_400_train

Unnamed: 0,400 Meters,Grade Level
605,55.76,11th Grade
481,60.04,11th Grade
590,55.40,11th Grade
667,61.74,10th Grade
672,72.50,9th Grade
...,...,...
1033,54.88,12th Grade
763,57.71,12th Grade
835,56.29,11th Grade
559,53.94,11th Grade


In [None]:
# Regression models need numeric inputs, so the categorical 'Grade Level'
# column is one-hot encoded: pd.get_dummies replaces it with one indicator
# column per grade and leaves the numeric '400 Meters' column as it is.
#
# The train and test frames are encoded separately, so a grade that happens to
# be absent from one split would get no indicator column there. Reindexing both
# frames to the same explicit column list fixes the column order the models are
# trained on and fills any missing indicator with 0 instead of raising KeyError.
feature_columns = [
    '400 Meters',
    'Grade Level_9th Grade',
    'Grade Level_10th Grade',
    'Grade Level_11th Grade',
    'Grade Level_12th Grade',
]

dummy_400_train = pd.get_dummies(features_400_train)
features_400_train = dummy_400_train.reindex(columns=feature_columns, fill_value=0)

# The same steps are repeated for the test frame.
dummy_400_test = pd.get_dummies(features_400_test)
features_400_test = dummy_400_test.reindex(columns=feature_columns, fill_value=0)
features_400_test

Unnamed: 0,400 Meters,Grade Level_9th Grade,Grade Level_10th Grade,Grade Level_11th Grade,Grade Level_12th Grade
671,57.60,0,1,0,0
877,56.90,0,0,0,1
971,55.84,0,0,0,1
184,53.90,0,1,0,0
1137,53.64,0,0,0,1
...,...,...,...,...,...
974,56.04,0,0,0,1
37,56.90,0,0,0,1
143,50.46,0,0,0,1
158,68.30,0,0,0,1


In [None]:
# Reference value for the single-runner predictions further down: take the
# training runners whose 400m time is strictly between 59 and 60 seconds and
# average their actual 1500m times (looked up in the labels by row index).
# It gives a rough sense of which model predicts closest to what we would
# expect such a runner to run.
ran_59_to_60 = (features_400_train['400 Meters'] > 59) & (features_400_train['400 Meters'] < 60)
times = features_400_train[ran_59_to_60]
label_train.loc[times.index].mean()

288.91527272727274

In [None]:
# I used a small definition from Intro to machine learning to display the scores from the cross val.
#this code will be used later to display the scores from cross val cv
def display_scores(scores):
    """Print a set of scores together with their mean and standard deviation.

    Parameters
    ----------
    scores : object exposing ``.mean()`` and ``.std()``
        In this notebook, the NumPy array of per-fold RMSE values derived
        from ``cross_val_score``.

    Returns
    -------
    None
        The three summary lines are written to standard output.
    """
    print('Scores:', scores)
    print('Mean:', scores.mean())
    print('Standard Deviation:', scores.std())


The data is now processed and formatted, so we can train several ML models on it.
These will be Linear Regression, Decision Tree Regression, and Random Forest Regression.

Our main metric is the RMSE (root mean squared error), which measures how far the predictions are from the labels. We compare this value across all models on the training data and choose the model that works best before we use the test data.

In [None]:
# With the data prepared, create one instance of each regression model and
# train it on the training features and labels.

# Baseline: ordinary linear regression.
linreg = LinearRegression()
linreg.fit(features_400_train, label_train)

# A more flexible model, to check whether extra capacity gives better results.
# random_state is fixed so that the fitted tree, and every score computed from
# it below, is the same on each run.
tree_reg = DecisionTreeRegressor(random_state=0)
tree_reg.fit(features_400_train, label_train)

# The most powerful model used in this project: a random forest.
# It is created and fitted once, again with a fixed random_state for
# reproducibility (the cross-validation and grid search below reuse this
# estimator's settings).
forest_reg = RandomForestRegressor(random_state=0)
forest_reg.fit(features_400_train, label_train)

After creating an instance for each regression model, we fit each of them on the training data. 

After each fit, we predict on the training data and compare the predictions with the training labels to assess the accuracy.


In [None]:
# Check the trained linear model against the data it was trained on: predict
# the training set, then compare those predictions with the training labels.
# RMSE (Root Mean Square Error) measures the difference between predicted
# values and labels; it is a common metric that works well for regression.
train_pred = linreg.predict(features_400_train)
lin_mse = mean_squared_error(y_true=label_train, y_pred=train_pred)
lin_rmse = np.sqrt(lin_mse)
print('linreg RMSE: ', lin_rmse)

linreg RMSE:  25.32408039414019


In [None]:
# Training-set RMSE of the decision tree regressor, computed the same way:
# predict the training features and compare with the training labels.
tree_pred = tree_reg.predict(features_400_train)
tree_mse = mean_squared_error(y_true=label_train, y_pred=tree_pred)
tree_rmse = np.sqrt(tree_mse)
print('decision tree RMSE: ', tree_rmse)

decision tree RMSE:  8.980467950783417


In [None]:
# Training-set RMSE of the random forest regressor: predict the training
# features and compare with the training labels.
forest_pred = forest_reg.predict(features_400_train)
forest_mse = mean_squared_error(y_true=label_train, y_pred=forest_pred)
forest_rmse = np.sqrt(forest_mse)

print('forest RMSE:',  forest_rmse)

forest RMSE: 13.023350509730088


In [None]:
# A single hand-made sample: a runner with a 60 second 400m.
# The values follow the feature column order: 400m time, then the 9th, 10th,
# 11th and 12th grade indicators. The 1 in the last position marks the runner
# as a 12th grader.
# Built directly as a 2-D array (one row, five columns) because predict()
# expects one row per sample.
test_arr = np.array([[60, 0, 0, 0, 1]])

# Predict this single result with the linear regression to see how accurate
# one prediction is.
test_pred_lin = linreg.predict(test_arr)
test_pred_lin



array([295.69639602])

In [None]:
# Run the same single-sample test as before and compare how the decision tree predicts.
# Although the decision tree has a lower training RMSE than linreg, its prediction is further from what we would expect.
# This suggests that the decision tree is likely overfitting the training data.

# Predicting a single result with the decision tree.
test_pred_tree = tree_reg.predict(test_arr)
test_pred_tree



array([271.8])

In [None]:
# Predicting the same single sample with the random forest regressor.
test_pred_forest = forest_reg.predict(test_arr)
test_pred_forest



array([297.96296667])

In [None]:
# 10-fold cross-validation of the linear regression on the training data.
# The scoring used here is the negated MSE, so the sign is flipped before
# taking the square root to obtain one RMSE per fold.
lin_scores = cross_val_score(
    linreg,
    features_400_train,
    label_train,
    scoring='neg_mean_squared_error',
    cv=10,
)

# Compared to the training RMSE of linreg computed earlier, the mean here is
# marginally better, but it is still insightful to compare the two.
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [22.91634618 26.94032175 35.41110396 22.45967796 18.59265706 20.1902799
 20.55152456 34.37641886 24.97160794 22.62253861]
Mean: 24.903247676084476
Standard Deviation: 5.480911579428736


In [None]:
# Cross-validate the decision tree with cross_val_score: the training data is
# split into 10 folds and the model is scored on each held-out fold in turn,
# which gives a more honest picture of how well it fits than the training RMSE.
tree_scores = cross_val_score(
    tree_reg,
    features_400_train,
    label_train,
    scoring='neg_mean_squared_error',
    cv=10,
)

# The scores show a larger mean error than the training RMSE did.
# This is expected, because the tree was overfitting the training data.
tree_rmse_scores = np.sqrt(-tree_scores)
display_scores(tree_rmse_scores)

Scores: [31.34992399 44.63369298 37.89607286 32.07949664 27.16141386 28.38631476
 27.61306344 43.65391578 28.9352337  28.45967614]
Mean: 33.01688041584433
Standard Deviation: 6.308079119910784


In [None]:
# Just as with the decision tree, the random forest was likely overfitting the
# training data: under cross-validation its mean RMSE is almost twice as bad
# as its training RMSE.
# It still does better than the single decision tree. Note, however, that in
# the recorded run the linear regression had the lowest cross-validated mean
# RMSE of the three models.
forest_scores = cross_val_score(
    forest_reg,
    features_400_train,
    label_train,
    scoring='neg_mean_squared_error',
    cv=10,
)

forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [25.75849586 35.90964503 36.2642136  25.34455937 22.06336842 26.02470607
 22.84815079 39.77984848 26.48499063 26.2493657 ]
Mean: 28.672734395236073
Standard Deviation: 5.901029028074876


In [None]:
# Tune the random forest with GridSearchCV, which tries every parameter
# combination in the grid below and keeps the best-scoring one.
#
# The feature matrix has 5 columns (the 400m time plus four grade indicators),
# so the max_features candidates stay within that range; a value above 5
# cannot select more features than exist.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 3, 4, 5]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

# Fit the grid search on the training data, then print the best parameter
# combination and the estimator refitted with it.
grid_search.fit(features_400_train, label_train)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

{'max_features': 4, 'n_estimators': 30}
RandomForestRegressor(max_features=4, n_estimators=30)


In [None]:
# Pair every parameter combination the grid search tried with its mean test
# score and print them, so we can see which combination did best and how much
# the results vary with the parameters. The stored score is the negated MSE,
# so it is negated again and square-rooted to show an RMSE.
cv_res = grid_search.cv_results_

for neg_mse, candidate in zip(cv_res['mean_test_score'], cv_res['params']):
    print(np.sqrt(-neg_mse), candidate)

30.561859096972178 {'max_features': 2, 'n_estimators': 3}
30.050159212961496 {'max_features': 2, 'n_estimators': 10}
29.249118792720985 {'max_features': 2, 'n_estimators': 30}
32.21142132514255 {'max_features': 4, 'n_estimators': 3}
29.658109012314938 {'max_features': 4, 'n_estimators': 10}
29.06264528408334 {'max_features': 4, 'n_estimators': 30}
31.265504170952504 {'max_features': 6, 'n_estimators': 3}
29.50909699546451 {'max_features': 6, 'n_estimators': 10}
29.305172781047382 {'max_features': 6, 'n_estimators': 30}
30.242449444263606 {'max_features': 8, 'n_estimators': 3}
29.68181451864192 {'max_features': 8, 'n_estimators': 10}
29.29974283816185 {'max_features': 8, 'n_estimators': 30}
33.35184154727088 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
33.17201865801009 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
32.836180431374494 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
32.76875824097099 {'bootstrap': False, 'max_features': 3, 'n_estimat

In [None]:
# Finally, take the best estimator found by the grid search, predict the
# TEST data with it and compare the predictions with the test labels.
final_model = grid_search.best_estimator_
final_pred = final_model.predict(features_400_test)

# RMSE on the held-out test set.
final_mse = mean_squared_error(y_true=label_test, y_pred=final_pred)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

29.435421558177417


In [None]:
# Predict the single hand-made sample (60 second 400m, 12th grade) with the
# final tuned model.
test_pred = final_model.predict(test_arr)
print(test_pred)

[291.14344444]


