In [2]:
from preprocessing import data_from_sql, preprocess_users, preprocess_ascents, merge_user_ascents

In [10]:
df_user, df_ascent, df_grade = data_from_sql("./database.sqlite")
df_user_preprocessed = preprocess_users(df_user)
df_ascents_preprocessed = preprocess_ascents(df_ascent, df_grade)
data = merge_user_ascents(df_user_preprocessed, df_ascents_preprocessed)

In [29]:
X_data = np.array(data[['difference', 'sex', 'height', 'weight']])
y_data = np.array(data['usa_boulders'])
print(X_data)
print(y_data)

[[6 0 177 73]
 [6 0 180 78]
 [5 1 165 58]
 ...
 [8 0 185 73]
 [2 0 175 68]
 [1 0 185 68]]
[ 7  8  4 ... 10  4  3]


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
import numpy as np

In [31]:
scaler = StandardScaler()
normalized_data = scaler.fit_transform(X_data)
X = normalized_data
y = np.array(y_data)
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=1729)

In [32]:
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

LinearRegression()

In [33]:
n = 25 # enough data points to use high 'n' values
weights_strategy = 'uniform' 
k_neighbors_regression = KNeighborsRegressor(n_neighbors=n, weights=weights_strategy)
k_neighbors_regression.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=25)

In [34]:
split = 'best' 
depth = 3 
decision_tree_regression = DecisionTreeRegressor(splitter=split, max_depth=depth, random_state=1729)
decision_tree_regression.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=3, random_state=1729)

In [35]:
estimators = 100 
forest_depth = 4

# parameter dict
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

random_forest_regression = RandomForestRegressor(n_estimators=estimators, max_depth=forest_depth, random_state=1729)
random_forest_regression.fit(X_train, y_train)

RandomForestRegressor(max_depth=4, random_state=1729)

In [36]:
C = 1.0 # default values
epsilon = 0.1

# parameter dict
svr_parameters = [
    {"kernel": ["rbf"], "gamma": [1e-3, 1e-4], "C": [int(x) for x in np.linspace(start=1, stop=1000, num=10)]},
    {"kernel": ["linear"], "C": [int(x) for x in np.linspace(start=1, stop=1000, num=10)]},
]

support_vector_regression = SVR(C=C, epsilon=epsilon)
support_vector_regression.fit(X_train, y_train)

SVR()

In [40]:
from sklearn.model_selection import RandomizedSearchCV
# hyperparameter optimize rf and svr
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=1729, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   4.0s
[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=600; total time=   4.1s
[CV] END bootstrap=False, max_depth=70, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1400; total time=   9.3s
[CV] END bootstrap=True, max_depth=90, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=600; total time=   3.1s
[CV] END bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=600; total time=   3.0s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=800; total time=   2.5s
[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=6



[CV] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time=  11.0s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1400; total time=   5.5s
[CV] END bootstrap=True, max_depth=90, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=600; total time=   3.1s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   2.6s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=800; total time=   2.5s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=1000; total time=   4.1s
[CV] END bootstrap=False, max_depth=40, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=600; total time=   4.7s
[CV] END bootstrap=True, max_depth=60, ma

[CV] END bootstrap=True, max_depth=60, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   1.5s
[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=1800; total time=  12.1s
[CV] END bootstrap=True, max_depth=60, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1400; total time=   8.3s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1800; total time=   6.0s
[CV] END bootstrap=False, max_depth=70, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1400; total time=   9.6s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=1600; total time=   5.2s
[CV] END bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=600; total time=   2.9s
[CV] END bootstrap=False, max_depth=

[CV] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time=  11.2s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1400; total time=   5.7s
[CV] END bootstrap=True, max_depth=90, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=600; total time=   3.0s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   2.6s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=800; total time=   2.5s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=1000; total time=   4.2s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=   1.0s
[CV] END bootstrap=False, max_depth=40, 

RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=1729, verbose=2)

In [41]:
svr = SVR()
svr_random = RandomizedSearchCV(estimator = svr, param_distributions = svr_parameters, n_iter = 24, cv = 3, verbose=2, random_state=1729, n_jobs = -1)
svr_random.fit(X_train, y_train)
svr_random.best_params_

Fitting 3 folds for each of 24 candidates, totalling 72 fits


{'kernel': 'rbf', 'gamma': 0.001, 'C': 223}

In [42]:
from sklearn.metrics import mean_squared_error # metrics for random optimized regressors
random_dict = {'rf': rf_random.best_estimator_, 'svr': svr_random.best_estimator_}
for key in random_dict:
    y_pred = random_dict[key].predict(X_test)
    MSE = mean_squared_error(y_test, y_pred)

    print(f"{key} sample predictions:")
    for k in range(10,20):
        print(f"    predicted: {y_pred[k]}, actual: {y_test[k]}")
    print(f"{key} regression MSE is {MSE}")

rf sample predictions:
    predicted: 4.1236684656222415, actual: 6
    predicted: 3.9832222652796023, actual: 4
    predicted: 2.919879461810426, actual: 1
    predicted: 7.630645645258491, actual: 4
    predicted: 4.848052967048011, actual: 10
    predicted: 6.911023094720677, actual: 10
    predicted: 7.76589177280046, actual: 10
    predicted: 7.82690033469699, actual: 7
    predicted: 4.351757935110524, actual: 3
    predicted: 8.298593429132774, actual: 8
rf regression MSE is 4.352713508601094
svr sample predictions:
    predicted: 4.360999000892484, actual: 6
    predicted: 5.051317566514392, actual: 4
    predicted: 4.560535873856843, actual: 1
    predicted: 7.205903888270477, actual: 4
    predicted: 5.720285125385992, actual: 10
    predicted: 7.153033437236267, actual: 10
    predicted: 7.221322760233811, actual: 10
    predicted: 7.450398905283286, actual: 7
    predicted: 4.324304044416117, actual: 3
    predicted: 8.430534120328097, actual: 8
svr regression MSE is 4.6480

In [43]:
from sklearn.model_selection import GridSearchCV

rfgs_params = { # hyperparameter optimize a support vector regressor
    'bootstrap': [True],
    'max_depth': [5, 10, 15, 20],
    'max_features': ['sqrt'],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [2, 3, 4],
    'n_estimators': [1000, 1300, 1600, 2000]
}

gsrf = RandomForestRegressor()
grid_search = GridSearchCV(estimator=gsrf, param_grid=rfgs_params, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [5, 10, 15, 20],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': [1000, 1300, 1600, 2000]},
             verbose=2)

In [44]:
svrgs_params = { # hyperparameter optimize a support vector regressor
    'kernel': ['rbf'],
    'gamma': [0.0001, 0.05, 0.01, 0.09],
    'C': [200, 250, 300, 223]
}

svrgs = SVR()
grid_search_svr = GridSearchCV(estimator=svrgs, param_grid=svrgs_params, cv=3, n_jobs=-1, verbose=2)
grid_search_svr.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


GridSearchCV(cv=3, estimator=SVR(), n_jobs=-1,
             param_grid={'C': [200, 250, 300, 223],
                         'gamma': [0.0001, 0.05, 0.01, 0.09],
                         'kernel': ['rbf']},
             verbose=2)

In [45]:
grid_dict = {'rf': grid_search.best_estimator_, 'svr': grid_search_svr.best_estimator_} # display grid tuned metrics
for key in grid_dict:
    y_pred = grid_dict[key].predict(X_test)
    MSE = mean_squared_error(y_test, y_pred)

    print(f"{key} sample predictions:")
    for k in range(10,20):
        print(f"    predicted: {y_pred[k]}, actual: {y_test[k]}")
    print(f"{key} regression MSE is {MSE}")

rf sample predictions:
    predicted: 4.199245113128766, actual: 6
    predicted: 4.074517744153115, actual: 4
    predicted: 3.3039708971195396, actual: 1
    predicted: 7.498822822100991, actual: 4
    predicted: 5.756652735929882, actual: 10
    predicted: 6.953544636540777, actual: 10
    predicted: 7.76078779928991, actual: 10
    predicted: 7.816327829703326, actual: 7
    predicted: 4.217667196456257, actual: 3
    predicted: 8.264648107089348, actual: 8
rf regression MSE is 4.276180995902377
svr sample predictions:
    predicted: 3.781155840599749, actual: 6
    predicted: 4.489596111548932, actual: 4
    predicted: 3.413508937185836, actual: 1
    predicted: 7.541876419489367, actual: 4
    predicted: 7.169393468855671, actual: 10
    predicted: 6.887835942580577, actual: 10
    predicted: 7.596287417976041, actual: 10
    predicted: 7.6420655306370096, actual: 7
    predicted: 3.7310775355584243, actual: 3
    predicted: 8.863309799409304, actual: 8
svr regression MSE is 4.35

In [46]:
from sklearn.metrics import mean_squared_error
regressor_dict = {'lr':linear_regression, # display each model's performance on the training set
                  'knn': k_neighbors_regression, 
                  'dtree': decision_tree_regression, 
                  'rforest': random_forest_regression,
                  'svr': support_vector_regression,
                  'grid_rforest': grid_search.best_estimator_,
                  'grid_svr': grid_search_svr.best_estimator_
                 }

for key in regressor_dict:
    y_pred = regressor_dict[key].predict(X_test)
    MSE = mean_squared_error(y_test, y_pred)

    print(f"{key} sample predictions:")
    for k in range(10,20):
        print(f"    predicted: {y_pred[k]}, actual: {y_test[k]}")
    print(f"{key} regression MSE is {MSE}")

lr sample predictions:
    predicted: 5.382374847566799, actual: 6
    predicted: 5.780610408739221, actual: 4
    predicted: 5.684247177998455, actual: 1
    predicted: 7.027043268705622, actual: 4
    predicted: 6.042409732848508, actual: 10
    predicted: 7.22472710620084, actual: 10
    predicted: 7.023913163827354, actual: 10
    predicted: 7.330480651576413, actual: 7
    predicted: 5.387070004884203, actual: 3
    predicted: 7.650740283233258, actual: 8
lr regression MSE is 4.892514235764533
knn sample predictions:
    predicted: 4.12, actual: 6
    predicted: 4.4, actual: 4
    predicted: 3.72, actual: 1
    predicted: 7.36, actual: 4
    predicted: 6.32, actual: 10
    predicted: 7.04, actual: 10
    predicted: 7.88, actual: 10
    predicted: 8.04, actual: 7
    predicted: 4.24, actual: 3
    predicted: 8.24, actual: 8
knn regression MSE is 4.368910659395143
dtree sample predictions:
    predicted: 3.782258064516129, actual: 6
    predicted: 3.782258064516129, actual: 4
    pr

In [49]:
# using rforest, create pipeline with StandardScaler to save
from sklearn.pipeline import make_pipeline
pipeline = make_pipeline(scaler, grid_search.best_estimator_)
y_pred = pipeline.predict(X_data)
y_true = y_data

print(mean_squared_error(y_pred, y_true))

3.99762646441334


In [50]:
import pickle # save model pipeline
with open(f'dumped-pipeline.pkl', 'wb') as fid:
    pickle.dump(pipeline, fid)

[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; total time=   4.2s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=600; total time=   2.3s
[CV] END bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=   4.0s
[CV] END bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=   4.3s
[CV] END bootstrap=False, max_depth=110, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=600; total time=   3.8s
[CV] END bootstrap=False, max_depth=110, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=600; total time=   3.8s
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=   3.6s
[CV] END bootstrap=True, max_dept

[CV] END bootstrap=False, max_depth=110, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=600; total time=   3.7s
[CV] END bootstrap=False, max_depth=90, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1400; total time=  10.3s
[CV] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=1600; total time=  10.4s
[CV] END bootstrap=True, max_depth=70, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=800; total time=   2.8s
[CV] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=1600; total time=   5.9s
[CV] END bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=1800; total time=   8.7s
[CV] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=   2.0s
[CV] END bootstrap=True, max_depth=

[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1200; total time=   8.3s
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=   3.2s
[CV] END bootstrap=False, max_depth=70, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=600; total time=   4.6s
[CV] END bootstrap=False, max_depth=80, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=2000; total time=   8.4s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time=   4.7s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1600; total time=   7.1s
[CV] END bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=1800; total time=   8.7s
[CV] END bootstrap=False, max_

[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1200; total time=   8.5s
[CV] END bootstrap=False, max_depth=90, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1400; total time=   9.9s
[CV] END bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1200; total time=   6.6s
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time=   4.7s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time=   4.6s
[CV] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=1600; total time=   5.3s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1400; total time=   6.7s
[CV] END bootstrap=False, max_

[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=1200; total time=   8.3s
[CV] END bootstrap=False, max_depth=90, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=1400; total time=  10.0s
[CV] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=1600; total time=  10.2s
[CV] END bootstrap=False, max_depth=60, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1600; total time=   7.3s
[CV] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=2000; total time=   6.5s
[CV] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time=   2.0s
[CV] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   3.1s
[CV] END bootstrap=False, max_depth