In [32]:
#import modules for analyzing ,plotting, and formatting
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import seaborn as sns
import joblib 
import os

In [2]:
os.chdir(r"C:\Users\5luca\Documents\Python\Projects\Track_DS\1merged_df")
merged_800_1500 = pd.read_csv("merged_800m_1500m_df.csv")

In [3]:
#import sklearn and relevant packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

#separate datasets for the model
#here the model is split 67/33
m800_1500_train, m800_1500_test = train_test_split(merged_800_1500, test_size=0.33, random_state=0)

#gl means grade level
#for this model we will use 800/1500m times and grade level
gl_800_1500_train = m800_1500_train[['800 Meters', '1500 Meters', 'Grade Level']]
gl_800_1500_test = m800_1500_test[['800 Meters', '1500 Meters', 'Grade Level']]


In [4]:
#here we separate the features and labels. The features used are the 800m times as well as the grade leve
# this calls for dropping the 1500m column for the features df
#next I select only the 1500m column for the label df
features_800_train = gl_800_1500_train.drop('1500 Meters', axis=1)
label_train = gl_800_1500_train['1500 Meters'].copy()

#same steps are repeated for the test df
features_800_test = gl_800_1500_test.drop('1500 Meters', axis=1)
label_test = gl_800_1500_test['1500 Meters'].copy()

In [5]:
features_800_train

Unnamed: 0,800 Meters,Grade Level
1380,123.60,12th Grade
1287,151.10,10th Grade
1081,124.52,12th Grade
572,123.79,11th Grade
1775,133.60,11th Grade
...,...,...
1033,123.30,12th Grade
1731,131.30,11th Grade
763,155.00,9th Grade
835,172.20,9th Grade


In [6]:
#regression models require numeric data, this means that we have to convert variables from a categorical feature to a binary one.
#grade level is converted to 0s and 1s
# get dummies turned all non-numeric to numeric.
dummy_800_train = pd.get_dummies(features_800_train)

#after converting the grade level column, we reindex the df
dummy_800_train = dummy_800_train.reindex(columns=dummy_800_train.columns, fill_value=0)
features_800_train = dummy_800_train[['800 Meters', 'Grade Level_9th Grade', 'Grade Level_10th Grade', 'Grade Level_11th Grade', 'Grade Level_12th Grade']]
features_800_train

#the same steps are repeated for the test df
dummy_800_test = pd.get_dummies(features_800_test)

dummy_800_test = dummy_800_test.reindex(columns=dummy_800_test.columns, fill_value=0)
features_800_test = dummy_800_test[['800 Meters', 'Grade Level_9th Grade', 'Grade Level_10th Grade', 'Grade Level_11th Grade', 'Grade Level_12th Grade']]
features_800_test

Unnamed: 0,800 Meters,Grade Level_9th Grade,Grade Level_10th Grade,Grade Level_11th Grade,Grade Level_12th Grade
568,137.30,0,1,0,0
1180,122.30,0,1,0,0
1119,128.13,0,0,0,1
1820,156.00,0,0,0,1
399,146.00,1,0,0,0
...,...,...,...,...,...
1880,161.00,0,1,0,0
1487,130.60,0,0,1,0
564,150.00,1,0,0,0
294,148.42,0,1,0,0


In [7]:
#here I am selecting the runners with times between 59 and 60 seconds in the 800m
#next I am finding those same runners in the labeled data (1500m times)
#I do this as a simple measure so that I can compare the single test predicitions below, and get 
# a sense of which model is predicting closes to what we expect a runner to run. 

times = features_800_train[(features_800_train['800 Meters'] > 119) & (features_800_train['800 Meters'] < 120)]
label_train[times.index].mean()

255.7447222222222

In [8]:
# I used a small definition from Intro to machine learning to display the scores from the cross val.
#this code will be used later to display the scores from cross val cv
def display_scores(scores):
  print('Scores:', scores)
  print('Mean:', scores.mean()),
  print('Standard Deviation:', scores.std())


Here we have the data processed and formatted to run multiple ML models.
These will be LinearRegression, Decision Tree Regression, and Forest Regression.

Our main measure of the data is the RMSE. This function compares the values of the labels to the predictions. We compare this value for all the training data and choose a model that works best before we use the test data.

In [9]:
#after preparing the data we are ready to create the linear regression instance
# and train the model with out data

linreg = LinearRegression()
linreg.fit(features_800_train, label_train)

#here we are importing another regression model to test the difference,
# and checking is a more powerful model will produce better results.
#we fit this model and predict on our training data as we did before.

tree_reg = DecisionTreeRegressor()
tree_reg.fit(features_800_train, label_train)

#the most powerful model used in this project
forest_reg = RandomForestRegressor()
forest_reg.fit(features_800_train, label_train)

#we import the one more regression model which will be the most powerful
#the steps will be the same as before

forest_reg = RandomForestRegressor()
forest_reg.fit(features_800_train, label_train)

RandomForestRegressor()

After creating an instance for each regression model, we fit each of them on the training data. 

After each fit we predict on training data and compare the training labels to analyze the accuracy.


In [10]:
#after training we check out model by predicting using out training data
#we compare out predictions on training data vs. the labels and study the difference
# RMSE is the Root Mean Square Error, which measures the difference between predicted values and the labels. 
# this measure is common and works well with regression models.

train_pred = linreg.predict(features_800_train)
lin_mse = mean_squared_error(label_train, train_pred)
lin_rmse = np.sqrt(lin_mse)
print('linreg RMSE: ', lin_rmse)

linreg RMSE:  14.54682161150026


In [11]:
#predicting with the Decision Tree Regression 
tree_pred = tree_reg.predict(features_800_train)
tree_mse = mean_squared_error(label_train, tree_pred)
tree_rmse = np.sqrt(tree_mse)
print('decision tree RMSE: ', tree_rmse)

decision tree RMSE:  7.786857701352396


In [12]:
#prediction with Forest Regression
forest_pred = forest_reg.predict(features_800_train)
forest_mse = mean_squared_error(label_train, forest_pred)
forest_rmse = np.sqrt(forest_mse)

print('forest RMSE:',  forest_rmse)

forest RMSE: 9.126253965264246


In [13]:
#here is a small test using a runner with a 60 second 800m.
# the number 1 represents a grade level.
#using this we can run a simple test to see how acurate a single prediction is.
#the array corresponds to 800m time, 9th grade, 10th grade, 11th grade, and 12th grade (in this order).

test_arr = np.array([120,0,0,0,1])
test_arr = test_arr.reshape(1,-1)

#predicting a single result with linreg
test_pred_lin = linreg.predict(test_arr)
test_pred_lin



array([255.68008304])

In [14]:
#here we run the same test as before, with a single sample, and compare how the decision tree predicts.
#although the decision tree has a lower RMSE than linreg, the prediction is futher off what we would expect.
# this shows that the decision tree likely is overfitting the data

#predicting a single result with Decison tree
test_pred_tree = tree_reg.predict(test_arr)
test_pred_tree



array([251.31])

In [15]:
# predicting a single result with forest regression 
test_pred_forest = forest_reg.predict(test_arr)
test_pred_forest



array([254.6653])

In [16]:
# the same cross val is run using linear regression

lin_scores = cross_val_score(linreg, features_800_train, label_train, 
                         scoring='neg_mean_squared_error', cv=10)

#compared to the original linreg, the mean here is marginally better, but it is still insightful to check 
#the original values and compare.

lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [12.92322322 12.8017793  12.30115494 14.35071304 21.87252578 15.1711855
 12.62522304 13.17126945 13.63156169 14.68578445]
Mean: 14.353442040808577
Standard Deviation: 2.6624468673682355


In [17]:
#here I import cross_val_score which will allow me to test the decision tree 
#and improve how it fits by taking small chunks of the data and training on each chunk
# this results in a better fitting.

tree_scores = cross_val_score(tree_reg, features_800_train, label_train, 
                         scoring='neg_mean_squared_error', cv=10)

# displaying the scors shows a larger mean
#this is expected because before, the data was being overfit
tree_rmse_scores = np.sqrt(-tree_scores)
display_scores(tree_rmse_scores)

Scores: [19.20613756 18.47652506 16.85349257 21.47565491 26.76202547 20.47335056
 16.04833702 18.02050285 17.81468564 19.10848664]
Mean: 19.423919828302566
Standard Deviation: 2.8768745124217756


In [18]:
#just as with the decision tree, randome forest was likely overfitting the training data
# when using crossval we see that the mean is a amost twice as bad
# still, this model produces the best results
#this is also expected because this model is more powerful 

forest_scores = cross_val_score(forest_reg, features_800_train, label_train, 
                         scoring='neg_mean_squared_error', cv=10)

forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [15.57834802 16.56018062 13.56016794 18.20237114 23.93699855 17.24383253
 13.72901173 15.1609527  15.24928928 16.1757079 ]
Mean: 16.539686043985057
Standard Deviation: 2.8174650815274296


In [19]:
#after going through the models, we import one last package, GridSearchCV
#this sorts through many parameters and returns the best one.

param_grid = [
    {'n_estimators': [3,10,30], 'max_features': [2,4,6,8]},
    {'bootstrap': [False], 'n_estimators': [3,10], 'max_features': [2,3,4]}
]
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

#here the grid search model is used to fit the training data
#next we print out the best params and best estimators
grid_search.fit(features_800_train, label_train)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

{'max_features': 4, 'n_estimators': 30}
RandomForestRegressor(max_features=4, n_estimators=30)


30 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\5luca\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\5luca\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "C:\Users\5luca\anaconda3\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\5luca\anaconda3\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\5luca\anacon

In [20]:
#here the features and estimators are zip together with the scores
#then they are printed out so we can see which returned the best and the range of values 
#that result from the changes in parameters

cv_res = grid_search.cv_results_

for mean_score, params in zip(cv_res['mean_test_score'], cv_res['params']):
  print(np.sqrt(-mean_score), params)

18.11195527871897 {'max_features': 2, 'n_estimators': 3}
17.28016193106719 {'max_features': 2, 'n_estimators': 10}
17.133489351222405 {'max_features': 2, 'n_estimators': 30}
17.768946542242368 {'max_features': 4, 'n_estimators': 3}
17.159524590969735 {'max_features': 4, 'n_estimators': 10}
16.853409174241442 {'max_features': 4, 'n_estimators': 30}
nan {'max_features': 6, 'n_estimators': 3}
nan {'max_features': 6, 'n_estimators': 10}
nan {'max_features': 6, 'n_estimators': 30}
nan {'max_features': 8, 'n_estimators': 3}
nan {'max_features': 8, 'n_estimators': 10}
nan {'max_features': 8, 'n_estimators': 30}
19.131521835094883 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
19.212772869204596 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
19.13210387012403 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
19.07403486649624 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
18.986678590604885 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3

In [21]:
#lastly we isolate the best estimators from the model
#we then predict the TEST data and compare to the test labels
final_model = grid_search.best_estimator_
final_pred = final_model.predict(features_800_test)

final_mse = mean_squared_error(label_test, final_pred)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

16.98522599605781


In [22]:
from sklearn.metrics import r2_score
score = r2_score(label_test, final_pred)
print("The accuracy of our model is {}%".format(round(score, 2) *100))

The accuracy of our model is 66.0%


In [26]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(random_state=42)

scaler = StandardScaler()
features_800_train = scaler.fit_transform(features_800_train)
features_800_test = scaler.fit_transform(features_800_test)


param_grid =[ 
    {           
               'n_estimators': [60,80,100],
                'max_depth': [3,4,5],
                'max_leaf_nodes':[11,16,20],
#                'n_jobs' : [-1],
#              'min_samples_leaf':[1,2,3],
#              'min_samples_split':[2,3],
              'criterion': ['friedman_mse','squared_error'],
              'max_features': [None, 'sqrt', 'log2']
    }
]
grid_search_rf = GridSearchCV(rfr, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3)
grid_search_rf.fit(features_800_train, label_train)

# Print the best hyperparameters found
print("Best hyperparameters:", grid_search_rf.best_params_)
print("Best hyperparameters:", np.sqrt(-grid_search_rf.best_score_))

Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV 1/5] END criterion=friedman_mse, max_depth=3, max_features=None, max_leaf_nodes=11, n_estimators=60;, score=-196.066 total time=   0.0s
[CV 2/5] END criterion=friedman_mse, max_depth=3, max_features=None, max_leaf_nodes=11, n_estimators=60;, score=-188.474 total time=   0.0s
[CV 3/5] END criterion=friedman_mse, max_depth=3, max_features=None, max_leaf_nodes=11, n_estimators=60;, score=-368.418 total time=   0.0s
[CV 4/5] END criterion=friedman_mse, max_depth=3, max_features=None, max_leaf_nodes=11, n_estimators=60;, score=-174.542 total time=   0.0s
[CV 5/5] END criterion=friedman_mse, max_depth=3, max_features=None, max_leaf_nodes=11, n_estimators=60;, score=-209.564 total time=   0.0s
[CV 1/5] END criterion=friedman_mse, max_depth=3, max_features=None, max_leaf_nodes=11, n_estimators=80;, score=-195.744 total time=   0.0s
[CV 2/5] END criterion=friedman_mse, max_depth=3, max_features=None, max_leaf_nodes=11, n_estimat

[CV 5/5] END criterion=friedman_mse, max_depth=3, max_features=sqrt, max_leaf_nodes=11, n_estimators=100;, score=-259.222 total time=   0.0s
[CV 1/5] END criterion=friedman_mse, max_depth=3, max_features=sqrt, max_leaf_nodes=16, n_estimators=60;, score=-262.443 total time=   0.0s
[CV 2/5] END criterion=friedman_mse, max_depth=3, max_features=sqrt, max_leaf_nodes=16, n_estimators=60;, score=-231.659 total time=   0.0s
[CV 3/5] END criterion=friedman_mse, max_depth=3, max_features=sqrt, max_leaf_nodes=16, n_estimators=60;, score=-402.427 total time=   0.0s
[CV 4/5] END criterion=friedman_mse, max_depth=3, max_features=sqrt, max_leaf_nodes=16, n_estimators=60;, score=-234.350 total time=   0.0s
[CV 5/5] END criterion=friedman_mse, max_depth=3, max_features=sqrt, max_leaf_nodes=16, n_estimators=60;, score=-252.957 total time=   0.0s
[CV 1/5] END criterion=friedman_mse, max_depth=3, max_features=sqrt, max_leaf_nodes=16, n_estimators=80;, score=-264.939 total time=   0.0s
[CV 2/5] END criter

[CV 4/5] END criterion=friedman_mse, max_depth=3, max_features=log2, max_leaf_nodes=16, n_estimators=100;, score=-237.036 total time=   0.0s
[CV 5/5] END criterion=friedman_mse, max_depth=3, max_features=log2, max_leaf_nodes=16, n_estimators=100;, score=-259.222 total time=   0.0s
[CV 1/5] END criterion=friedman_mse, max_depth=3, max_features=log2, max_leaf_nodes=20, n_estimators=60;, score=-262.443 total time=   0.0s
[CV 2/5] END criterion=friedman_mse, max_depth=3, max_features=log2, max_leaf_nodes=20, n_estimators=60;, score=-231.659 total time=   0.0s
[CV 3/5] END criterion=friedman_mse, max_depth=3, max_features=log2, max_leaf_nodes=20, n_estimators=60;, score=-402.427 total time=   0.0s
[CV 4/5] END criterion=friedman_mse, max_depth=3, max_features=log2, max_leaf_nodes=20, n_estimators=60;, score=-234.350 total time=   0.0s
[CV 5/5] END criterion=friedman_mse, max_depth=3, max_features=log2, max_leaf_nodes=20, n_estimators=60;, score=-252.957 total time=   0.0s
[CV 1/5] END crite

[CV 4/5] END criterion=friedman_mse, max_depth=4, max_features=None, max_leaf_nodes=20, n_estimators=100;, score=-172.259 total time=   0.0s
[CV 5/5] END criterion=friedman_mse, max_depth=4, max_features=None, max_leaf_nodes=20, n_estimators=100;, score=-203.537 total time=   0.0s
[CV 1/5] END criterion=friedman_mse, max_depth=4, max_features=sqrt, max_leaf_nodes=11, n_estimators=60;, score=-238.159 total time=   0.0s
[CV 2/5] END criterion=friedman_mse, max_depth=4, max_features=sqrt, max_leaf_nodes=11, n_estimators=60;, score=-207.032 total time=   0.0s
[CV 3/5] END criterion=friedman_mse, max_depth=4, max_features=sqrt, max_leaf_nodes=11, n_estimators=60;, score=-381.555 total time=   0.0s
[CV 4/5] END criterion=friedman_mse, max_depth=4, max_features=sqrt, max_leaf_nodes=11, n_estimators=60;, score=-210.224 total time=   0.0s
[CV 5/5] END criterion=friedman_mse, max_depth=4, max_features=sqrt, max_leaf_nodes=11, n_estimators=60;, score=-235.778 total time=   0.0s
[CV 1/5] END crite

[CV 4/5] END criterion=friedman_mse, max_depth=4, max_features=log2, max_leaf_nodes=11, n_estimators=100;, score=-212.253 total time=   0.0s
[CV 5/5] END criterion=friedman_mse, max_depth=4, max_features=log2, max_leaf_nodes=11, n_estimators=100;, score=-239.074 total time=   0.0s
[CV 1/5] END criterion=friedman_mse, max_depth=4, max_features=log2, max_leaf_nodes=16, n_estimators=60;, score=-223.667 total time=   0.0s
[CV 2/5] END criterion=friedman_mse, max_depth=4, max_features=log2, max_leaf_nodes=16, n_estimators=60;, score=-197.846 total time=   0.0s
[CV 3/5] END criterion=friedman_mse, max_depth=4, max_features=log2, max_leaf_nodes=16, n_estimators=60;, score=-370.943 total time=   0.0s
[CV 4/5] END criterion=friedman_mse, max_depth=4, max_features=log2, max_leaf_nodes=16, n_estimators=60;, score=-198.009 total time=   0.0s
[CV 5/5] END criterion=friedman_mse, max_depth=4, max_features=log2, max_leaf_nodes=16, n_estimators=60;, score=-227.800 total time=   0.0s
[CV 1/5] END crite

[CV 4/5] END criterion=friedman_mse, max_depth=5, max_features=None, max_leaf_nodes=16, n_estimators=100;, score=-172.933 total time=   0.0s
[CV 5/5] END criterion=friedman_mse, max_depth=5, max_features=None, max_leaf_nodes=16, n_estimators=100;, score=-205.050 total time=   0.0s
[CV 1/5] END criterion=friedman_mse, max_depth=5, max_features=None, max_leaf_nodes=20, n_estimators=60;, score=-195.630 total time=   0.0s
[CV 2/5] END criterion=friedman_mse, max_depth=5, max_features=None, max_leaf_nodes=20, n_estimators=60;, score=-198.189 total time=   0.0s
[CV 3/5] END criterion=friedman_mse, max_depth=5, max_features=None, max_leaf_nodes=20, n_estimators=60;, score=-363.576 total time=   0.0s
[CV 4/5] END criterion=friedman_mse, max_depth=5, max_features=None, max_leaf_nodes=20, n_estimators=60;, score=-174.082 total time=   0.0s
[CV 5/5] END criterion=friedman_mse, max_depth=5, max_features=None, max_leaf_nodes=20, n_estimators=60;, score=-206.985 total time=   0.0s
[CV 1/5] END crite

[CV 4/5] END criterion=friedman_mse, max_depth=5, max_features=sqrt, max_leaf_nodes=20, n_estimators=100;, score=-187.436 total time=   0.0s
[CV 5/5] END criterion=friedman_mse, max_depth=5, max_features=sqrt, max_leaf_nodes=20, n_estimators=100;, score=-220.642 total time=   0.0s
[CV 1/5] END criterion=friedman_mse, max_depth=5, max_features=log2, max_leaf_nodes=11, n_estimators=60;, score=-241.431 total time=   0.0s
[CV 2/5] END criterion=friedman_mse, max_depth=5, max_features=log2, max_leaf_nodes=11, n_estimators=60;, score=-208.632 total time=   0.0s
[CV 3/5] END criterion=friedman_mse, max_depth=5, max_features=log2, max_leaf_nodes=11, n_estimators=60;, score=-383.445 total time=   0.0s
[CV 4/5] END criterion=friedman_mse, max_depth=5, max_features=log2, max_leaf_nodes=11, n_estimators=60;, score=-214.378 total time=   0.0s
[CV 5/5] END criterion=friedman_mse, max_depth=5, max_features=log2, max_leaf_nodes=11, n_estimators=60;, score=-239.246 total time=   0.0s
[CV 1/5] END crite

[CV 3/5] END criterion=squared_error, max_depth=3, max_features=None, max_leaf_nodes=11, n_estimators=100;, score=-368.350 total time=   0.0s
[CV 4/5] END criterion=squared_error, max_depth=3, max_features=None, max_leaf_nodes=11, n_estimators=100;, score=-174.205 total time=   0.0s
[CV 5/5] END criterion=squared_error, max_depth=3, max_features=None, max_leaf_nodes=11, n_estimators=100;, score=-209.178 total time=   0.0s
[CV 1/5] END criterion=squared_error, max_depth=3, max_features=None, max_leaf_nodes=16, n_estimators=60;, score=-196.066 total time=   0.0s
[CV 2/5] END criterion=squared_error, max_depth=3, max_features=None, max_leaf_nodes=16, n_estimators=60;, score=-188.474 total time=   0.0s
[CV 3/5] END criterion=squared_error, max_depth=3, max_features=None, max_leaf_nodes=16, n_estimators=60;, score=-368.418 total time=   0.0s
[CV 4/5] END criterion=squared_error, max_depth=3, max_features=None, max_leaf_nodes=16, n_estimators=60;, score=-174.542 total time=   0.0s
[CV 5/5] E

[CV 1/5] END criterion=squared_error, max_depth=3, max_features=sqrt, max_leaf_nodes=16, n_estimators=100;, score=-266.118 total time=   0.0s
[CV 2/5] END criterion=squared_error, max_depth=3, max_features=sqrt, max_leaf_nodes=16, n_estimators=100;, score=-236.928 total time=   0.0s
[CV 3/5] END criterion=squared_error, max_depth=3, max_features=sqrt, max_leaf_nodes=16, n_estimators=100;, score=-408.151 total time=   0.0s
[CV 4/5] END criterion=squared_error, max_depth=3, max_features=sqrt, max_leaf_nodes=16, n_estimators=100;, score=-237.036 total time=   0.0s
[CV 5/5] END criterion=squared_error, max_depth=3, max_features=sqrt, max_leaf_nodes=16, n_estimators=100;, score=-259.222 total time=   0.0s
[CV 1/5] END criterion=squared_error, max_depth=3, max_features=sqrt, max_leaf_nodes=20, n_estimators=60;, score=-262.443 total time=   0.0s
[CV 2/5] END criterion=squared_error, max_depth=3, max_features=sqrt, max_leaf_nodes=20, n_estimators=60;, score=-231.659 total time=   0.0s
[CV 3/5]

[CV 5/5] END criterion=squared_error, max_depth=3, max_features=log2, max_leaf_nodes=20, n_estimators=80;, score=-256.413 total time=   0.0s
[CV 1/5] END criterion=squared_error, max_depth=3, max_features=log2, max_leaf_nodes=20, n_estimators=100;, score=-266.118 total time=   0.0s
[CV 2/5] END criterion=squared_error, max_depth=3, max_features=log2, max_leaf_nodes=20, n_estimators=100;, score=-236.928 total time=   0.0s
[CV 3/5] END criterion=squared_error, max_depth=3, max_features=log2, max_leaf_nodes=20, n_estimators=100;, score=-408.151 total time=   0.0s
[CV 4/5] END criterion=squared_error, max_depth=3, max_features=log2, max_leaf_nodes=20, n_estimators=100;, score=-237.036 total time=   0.0s
[CV 5/5] END criterion=squared_error, max_depth=3, max_features=log2, max_leaf_nodes=20, n_estimators=100;, score=-259.222 total time=   0.0s
[CV 1/5] END criterion=squared_error, max_depth=4, max_features=None, max_leaf_nodes=11, n_estimators=60;, score=-193.777 total time=   0.0s
[CV 2/5]

[CV 4/5] END criterion=squared_error, max_depth=4, max_features=sqrt, max_leaf_nodes=11, n_estimators=80;, score=-212.462 total time=   0.0s
[CV 5/5] END criterion=squared_error, max_depth=4, max_features=sqrt, max_leaf_nodes=11, n_estimators=80;, score=-237.038 total time=   0.0s
[CV 1/5] END criterion=squared_error, max_depth=4, max_features=sqrt, max_leaf_nodes=11, n_estimators=100;, score=-241.568 total time=   0.0s
[CV 2/5] END criterion=squared_error, max_depth=4, max_features=sqrt, max_leaf_nodes=11, n_estimators=100;, score=-210.174 total time=   0.0s
[CV 3/5] END criterion=squared_error, max_depth=4, max_features=sqrt, max_leaf_nodes=11, n_estimators=100;, score=-384.306 total time=   0.0s
[CV 4/5] END criterion=squared_error, max_depth=4, max_features=sqrt, max_leaf_nodes=11, n_estimators=100;, score=-212.253 total time=   0.0s
[CV 5/5] END criterion=squared_error, max_depth=4, max_features=sqrt, max_leaf_nodes=11, n_estimators=100;, score=-239.074 total time=   0.0s
[CV 1/5]

[CV 2/5] END criterion=squared_error, max_depth=4, max_features=log2, max_leaf_nodes=16, n_estimators=80;, score=-198.537 total time=   0.0s
[CV 3/5] END criterion=squared_error, max_depth=4, max_features=log2, max_leaf_nodes=16, n_estimators=80;, score=-371.230 total time=   0.0s
[CV 4/5] END criterion=squared_error, max_depth=4, max_features=log2, max_leaf_nodes=16, n_estimators=80;, score=-198.829 total time=   0.0s
[CV 5/5] END criterion=squared_error, max_depth=4, max_features=log2, max_leaf_nodes=16, n_estimators=80;, score=-227.089 total time=   0.0s
[CV 1/5] END criterion=squared_error, max_depth=4, max_features=log2, max_leaf_nodes=16, n_estimators=100;, score=-224.168 total time=   0.0s
[CV 2/5] END criterion=squared_error, max_depth=4, max_features=log2, max_leaf_nodes=16, n_estimators=100;, score=-198.837 total time=   0.0s
[CV 3/5] END criterion=squared_error, max_depth=4, max_features=log2, max_leaf_nodes=16, n_estimators=100;, score=-371.801 total time=   0.0s
[CV 4/5] E

[CV 1/5] END criterion=squared_error, max_depth=5, max_features=None, max_leaf_nodes=20, n_estimators=80;, score=-194.772 total time=   0.0s
[CV 2/5] END criterion=squared_error, max_depth=5, max_features=None, max_leaf_nodes=20, n_estimators=80;, score=-198.111 total time=   0.0s
[CV 3/5] END criterion=squared_error, max_depth=5, max_features=None, max_leaf_nodes=20, n_estimators=80;, score=-363.328 total time=   0.0s
[CV 4/5] END criterion=squared_error, max_depth=5, max_features=None, max_leaf_nodes=20, n_estimators=80;, score=-173.176 total time=   0.0s
[CV 5/5] END criterion=squared_error, max_depth=5, max_features=None, max_leaf_nodes=20, n_estimators=80;, score=-204.941 total time=   0.0s
[CV 1/5] END criterion=squared_error, max_depth=5, max_features=None, max_leaf_nodes=20, n_estimators=100;, score=-193.948 total time=   0.0s
[CV 2/5] END criterion=squared_error, max_depth=5, max_features=None, max_leaf_nodes=20, n_estimators=100;, score=-197.062 total time=   0.0s
[CV 3/5] EN

[CV 4/5] END criterion=squared_error, max_depth=5, max_features=log2, max_leaf_nodes=11, n_estimators=60;, score=-214.378 total time=   0.0s
[CV 5/5] END criterion=squared_error, max_depth=5, max_features=log2, max_leaf_nodes=11, n_estimators=60;, score=-239.246 total time=   0.0s
[CV 1/5] END criterion=squared_error, max_depth=5, max_features=log2, max_leaf_nodes=11, n_estimators=80;, score=-244.594 total time=   0.0s
[CV 2/5] END criterion=squared_error, max_depth=5, max_features=log2, max_leaf_nodes=11, n_estimators=80;, score=-211.634 total time=   0.0s
[CV 3/5] END criterion=squared_error, max_depth=5, max_features=log2, max_leaf_nodes=11, n_estimators=80;, score=-386.440 total time=   0.0s
[CV 4/5] END criterion=squared_error, max_depth=5, max_features=log2, max_leaf_nodes=11, n_estimators=80;, score=-217.274 total time=   0.0s
[CV 5/5] END criterion=squared_error, max_depth=5, max_features=log2, max_leaf_nodes=11, n_estimators=80;, score=-242.035 total time=   0.0s
[CV 1/5] END 

In [27]:
#we then predict the TEST data and compare to the test labels
model_rf = grid_search_rf.best_estimator_
pred_rf = model_rf.predict(features_800_train)

mse_rf = mean_squared_error(label_train, pred_rf)
rmse_rf = np.sqrt(mse_rf)
print(rmse_rf)

14.16548663918681


In [28]:
from sklearn.metrics import r2_score
score_rf = r2_score(label_train, pred_rf)
print("The accuracy of our model is {}%".format(round(score_rf, 2) *100))

The accuracy of our model is 73.0%


In [29]:
#we then predict the TEST data and compare to the test labels
model_rft = grid_search_rf.best_estimator_
pred_rft = model_rf.predict(features_800_test)

mse_rft = mean_squared_error(label_test, pred_rft)
rmse_rft = np.sqrt(mse_rft)
print(rmse_rft)

14.789777835157963


In [30]:
score_rft = r2_score(label_test, pred_rft)
print("The accuracy of our model is {}%".format(round(score_rft, 2) *100))

The accuracy of our model is 74.0%


In [34]:
joblib.dump(mse_rft, "mse_rft.pkl")

['mse_rft.pkl']