In [1]:
#import modules for analyzing ,plotting, and formatting
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import seaborn as sns


In [2]:
merged_400_800 = pd.read_csv("merged_400m_800m_df.csv")

In [3]:
#import sklearn and relevant packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

#separate datasets for the model
#here the model is split 67/33
m400_800_train, m400_800_test = train_test_split(merged_400_800, test_size=0.33, random_state=0)

#gl means grade level
#for this model we will use 400/800m times and grade level
gl_400_800_train = m400_800_train[['400 Meters', '800 Meters', 'Grade Level']]
gl_400_800_test = m400_800_test[['400 Meters', '800 Meters', 'Grade Level']]


In [4]:
#here we separate the features and labels. The features used are the 400m times as well as the grade leve
# this calls for dropping the 800m column for the features df
#next I select only the 800m column for the label df
features_400_train = gl_400_800_train.drop('800 Meters', axis=1)
label_train = gl_400_800_train['800 Meters'].copy()

#same steps are repeated for the test df
features_400_test = gl_400_800_test.drop('800 Meters', axis=1)
label_test = gl_400_800_test['800 Meters'].copy()

In [5]:
features_400_train

Unnamed: 0,400 Meters,Grade Level
1480,60.24,10th Grade
1478,58.08,9th Grade
966,51.15,12th Grade
471,60.66,12th Grade
1199,62.84,12th Grade
...,...,...
763,60.84,10th Grade
835,54.98,10th Grade
1216,91.47,9th Grade
559,59.54,10th Grade


In [6]:
#regression models require numeric data, this means that we have to convert variables from a categorical feature to a binary one.
#grade level is converted to 0s and 1s
# get dummies turned all non-numeric to numeric.
dummy_400_train = pd.get_dummies(features_400_train)

#after converting the grade level column, we reindex the df
dummy_400_train = dummy_400_train.reindex(columns=dummy_400_train.columns, fill_value=0)
features_400_train = dummy_400_train[['400 Meters', 'Grade Level_9th Grade', 'Grade Level_10th Grade', 'Grade Level_11th Grade', 'Grade Level_12th Grade']]
features_400_train

#the same steps are repeated for the test df
dummy_400_test = pd.get_dummies(features_400_test)

dummy_400_test = dummy_400_test.reindex(columns=dummy_400_test.columns, fill_value=0)
features_400_test = dummy_400_test[['400 Meters', 'Grade Level_9th Grade', 'Grade Level_10th Grade', 'Grade Level_11th Grade', 'Grade Level_12th Grade']]
features_400_test

Unnamed: 0,400 Meters,Grade Level_9th Grade,Grade Level_10th Grade,Grade Level_11th Grade,Grade Level_12th Grade
319,64.14,0,0,1,0
1041,54.84,0,0,1,0
798,54.44,0,1,0,0
579,50.50,0,0,0,1
414,53.14,0,0,0,1
...,...,...,...,...,...
971,56.53,0,0,0,1
1417,54.90,1,0,0,0
737,65.74,0,1,0,0
477,57.24,0,0,1,0


In [7]:
#here I am selecting the runners with times between 59 and 60 seconds in the 400m
#next I am finding those same runners in the labeled data (800m times)
#I do this as a simple measure so that I can compare the single test predicitions below, and get 
# a sense of which model is predicting closes to what we expect a runner to run. 

times = features_400_train[(features_400_train['400 Meters'] > 59) & (features_400_train['400 Meters'] < 60)]
label_train[times.index].mean()

138.2977777777778

In [8]:
# I used a small definition from Intro to machine learning to display the scores from the cross val.
#this code will be used later to display the scores from cross val cv
def display_scores(scores):
  print('Scores:', scores)
  print('Mean:', scores.mean()),
  print('Standard Deviation:', scores.std())


Here we have the data processed and formatted to run multiple ML models.
These will be LinearRegression, Decision Tree Regression, and Forest Regression.

Our main measure of the data is the RMSE. This function compares the values of the labels to the predictions. We compare this value for all the training data and choose a model that works best before we use the test data.

In [9]:
#after preparing the data we are ready to create the linear regression instance
# and train the model with out data

linreg = LinearRegression()
linreg.fit(features_400_train, label_train)


After creating an instance for each regression model, we fit each of them on the training data. 

After each fit we predict on training data and compare the training labels to analyze the accuracy.


In [10]:
#after training we check out model by predicting using out training data
#we compare out predictions on training data vs. the labels and study the difference
# RMSE is the Root Mean Square Error, which measures the difference between predicted values and the labels. 
# this measure is common and works well with regression models.

train_pred = linreg.predict(features_400_train)
lin_mse = mean_squared_error(label_train, train_pred)
lin_rmse = np.sqrt(lin_mse)
print('linreg RMSE: ', lin_rmse)

linreg RMSE:  8.776397328177701


In [12]:
# the same cross val is run using linear regression

lin_scores = cross_val_score(linreg, features_400_train, label_train, 
                         scoring='neg_mean_squared_error', cv=10)

#compared to the original linreg, the mean here is marginally better, but it is still insightful to check 
#the original values and compare.

lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [ 6.27144725  8.72056697  8.18682836 10.30698787  9.52955247 10.25102466
  7.28613984  8.20567994  8.99320148  9.74589946]
Mean: 8.749732829168252
Standard Deviation: 1.2343570588323662


In [13]:

param_grid = {'fit_intercept': [False]}

grid_search_linreg = GridSearchCV(linreg, param_grid, cv=50, scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search_linreg.fit(features_400_train, label_train)

# Print the best hyperparameters found
print("Best hyperparameters:", grid_search_linreg.best_params_)

Best hyperparameters: {'fit_intercept': False}


In [21]:
print("Best coef:", linreg.coef_)
print("Best intercept:", linreg.intercept_)

Best coef: [ 1.9992472   2.34037533  0.55091021 -0.75330821 -2.13797733]
Best intercept: 21.210862944636446


In [14]:
#here the features and estimators are zip together with the scores
#then they are printed out so we can see which returned the best and the range of values 
#that result from the changes in parameters

cv_res = grid_search_linreg.cv_results_

for mean_score, params in zip(cv_res['mean_test_score'], cv_res['params']):
  print(np.sqrt(-mean_score), params)

8.832054689566819 {'fit_intercept': False}


In [15]:
final_model_linreg = grid_search_linreg.best_estimator_
final_linreg_pred = final_model_linreg.predict(features_400_test)

final_mse_linreg = mean_squared_error(label_test, final_linreg_pred)
final_rmse_linreg = np.sqrt(final_mse_linreg)
print(final_rmse_linreg) 

8.602721089208453


In [16]:
final_model_linreg.score(features_400_test, label_test)

0.5876326698565723

In [17]:
from sklearn.metrics import r2_score
score = r2_score(label_test, final_linreg_pred)
print("The accuracy of our model is {}%".format(round(score, 2) *100))

The accuracy of our model is 59.0%
