In [2]:
from preprocessing import data_from_sql, preprocess_users, preprocess_ascents, merge_user_ascents

In [8]:
df_user, df_ascent, df_grade = data_from_sql("./database.sqlite")
df_user_preprocessed = preprocess_users(df_user)
df_ascents_preprocessed = preprocess_ascents(df_ascent, df_grade)
data = merge_user_ascents(df_user_preprocessed, df_ascents_preprocessed)

In [9]:
data = data[['difference', 'sex', 'height', 'weight', 'usa_boulders']]

In [78]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
import numpy as np

In [18]:
scaler = StandardScaler()
normalized_data = scaler.fit_transform(data)
X = normalized_data[:, 0:3]
y = np.array(data['usa_boulders'])
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=1729)

In [19]:
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

LinearRegression()

In [65]:
n = 25 # enough data points to use high 'n' values
weights_strategy = 'uniform' 
k_neighbors_regression = KNeighborsRegressor(n_neighbors=n, weights=weights_strategy)
k_neighbors_regression.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=25)

In [66]:
split = 'best' 
depth = 3 
decision_tree_regression = DecisionTreeRegressor(splitter=split, max_depth=depth, random_state=1729)
decision_tree_regression.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=3, random_state=1729)

In [81]:
estimators = 100 
forest_depth = 4

# parameter dict
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

random_forest_regression = RandomForestRegressor(n_estimators=estimators, max_depth=forest_depth, random_state=1729)
random_forest_regression.fit(X_train, y_train)

RandomForestRegressor(max_depth=4, random_state=1729)

In [89]:
C = 1.0 # default values
epsilon = 0.1

# parameter dict
svr_parameters = [
    {"kernel": ["rbf"], "gamma": [1e-3, 1e-4], "C": [int(x) for x in np.linspace(start=1, stop=1000, num=10)]},
    {"kernel": ["linear"], "C": [int(x) for x in np.linspace(start=1, stop=1000, num=10)]},
]

support_vector_regression = SVR(C=C, epsilon=epsilon)
support_vector_regression.fit(X_train, y_train)

SVR()

In [85]:
from sklearn.model_selection import RandomizedSearchCV
# hyperparameter optimize rf and svr
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=1729, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [90]:
svr = SVR()
svr_random = RandomizedSearchCV(estimator = svr, param_distributions = svr_parameters, n_iter = 24, cv = 3, verbose=2, random_state=1729, n_jobs = -1)
svr_random.fit(X_train, y_train)
svr_random.best_params_

Fitting 3 folds for each of 24 candidates, totalling 72 fits


{'kernel': 'rbf', 'gamma': 0.001, 'C': 223}

In [91]:
from sklearn.metrics import mean_squared_error # metrics for random optimized regressors
random_dict = {'rf': rf_random.best_estimator_, 'svr': svr_random.best_estimator_}
for key in random_dict:
    y_pred = random_dict[key].predict(X_test)
    MSE = mean_squared_error(y_test, y_pred)

    print(f"{key} sample predictions:")
    for k in range(10,20):
        print(f"    predicted: {y_pred[k]}, actual: {y_test[k]}")
    print(f"{key} regression MSE is {MSE}")

rf sample predictions:
    predicted: 3.8758373594009448, actual: 6
    predicted: 4.8802837627825175, actual: 4
    predicted: 3.34106647030438, actual: 1
    predicted: 7.502829097689655, actual: 4
    predicted: 7.386215465778793, actual: 10
    predicted: 6.505541462427959, actual: 10
    predicted: 7.578531116081863, actual: 10
    predicted: 7.652796923046638, actual: 7
    predicted: 3.980021610214535, actual: 3
    predicted: 8.211584301784034, actual: 8
rf regression MSE is 4.5006140763822415
svr sample predictions:
    predicted: 4.638947947147173, actual: 6
    predicted: 4.673973604023146, actual: 4
    predicted: 4.288750984457735, actual: 1
    predicted: 7.411388701559517, actual: 4
    predicted: 7.376293513198149, actual: 10
    predicted: 6.934752888834723, actual: 10
    predicted: 7.38801570334077, actual: 10
    predicted: 7.246109113404074, actual: 7
    predicted: 4.665253368457783, actual: 3
    predicted: 8.416481202153065, actual: 8
svr regression MSE is 4.810

In [94]:
from sklearn.model_selection import GridSearchCV

rfgs_params = { # hyperparameter optimize a support vector regressor
    'bootstrap': [True],
    'max_depth': [5, 10, 15, 20],
    'max_features': ['sqrt'],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [2, 3, 4],
    'n_estimators': [1000, 1300, 1600, 2000]
}

gsrf = RandomForestRegressor()
grid_search = GridSearchCV(estimator=gsrf, param_grid=rfgs_params, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [5, 10, 15, 20],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': [1000, 1300, 1600, 2000]},
             verbose=2)

In [95]:
svrgs_params = { # hyperparameter optimize a support vector regressor
    'kernel': ['rbf'],
    'gamma': [0.0001, 0.05, 0.01, 0.09],
    'C': [200, 250, 300, 223]
}

svrgs = SVR()
grid_search_svr = GridSearchCV(estimator=svrgs, param_grid=svrgs_params, cv=3, n_jobs=-1, verbose=2)
grid_search_svr.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


GridSearchCV(cv=3, estimator=SVR(), n_jobs=-1,
             param_grid={'C': [200, 250, 300, 223],
                         'gamma': [0.0001, 0.05, 0.01, 0.09],
                         'kernel': ['rbf']},
             verbose=2)

In [96]:
grid_dict = {'rf': grid_search.best_estimator_, 'svr': grid_search_svr.best_estimator_} # display grid tuned metrics
for key in grid_dict:
    y_pred = grid_dict[key].predict(X_test)
    MSE = mean_squared_error(y_test, y_pred)

    print(f"{key} sample predictions:")
    for k in range(10,20):
        print(f"    predicted: {y_pred[k]}, actual: {y_test[k]}")
    print(f"{key} regression MSE is {MSE}")

rf sample predictions:
    predicted: 4.446011027565217, actual: 6
    predicted: 4.500835155586025, actual: 4
    predicted: 3.892237685921148, actual: 1
    predicted: 7.635933714200464, actual: 4
    predicted: 7.615293184473086, actual: 10
    predicted: 6.720584506204307, actual: 10
    predicted: 7.629093821632704, actual: 10
    predicted: 7.630685611256353, actual: 7
    predicted: 4.414210756707087, actual: 3
    predicted: 8.037964933430725, actual: 8
rf regression MSE is 4.4563272754596905
svr sample predictions:
    predicted: 3.944345535873932, actual: 6
    predicted: 4.025102643105637, actual: 4
    predicted: 3.1083347389227605, actual: 1
    predicted: 7.754992825098778, actual: 4
    predicted: 7.635792747432216, actual: 10
    predicted: 6.604596663268122, actual: 10
    predicted: 7.676195006675297, actual: 10
    predicted: 7.442694843455566, actual: 7
    predicted: 4.006351485027633, actual: 3
    predicted: 9.021242369880266, actual: 8
svr regression MSE is 4.56

In [98]:
regressor_dict = {'lr':linear_regression, # display each model's performance on the training set
                  'knn': k_neighbors_regression, 
                  'dtree': decision_tree_regression, 
                  'rforest': random_forest_regression,
                  'svr': support_vector_regression,
                  'grid_rforest': grid_search.best_estimator_,
                  'grid_svr': grid_search_svr.best_estimator_}

for key in regressor_dict:
    y_pred = regressor_dict[key].predict(X_test)
    MSE = mean_squared_error(y_test, y_pred)

    print(f"{key} sample predictions:")
    for k in range(10,20):
        print(f"    predicted: {y_pred[k]}, actual: {y_test[k]}")
    print(f"{key} regression MSE is {MSE}")

lr sample predictions:
    predicted: 5.703629197432921, actual: 6
    predicted: 5.72481435606749, actual: 4
    predicted: 5.651438483765448, actual: 1
    predicted: 7.1447635036981945, actual: 4
    predicted: 7.128874634722268, actual: 10
    predicted: 6.9927154694354705, actual: 10
    predicted: 7.134170924380911, actual: 10
    predicted: 7.097869079689364, actual: 7
    predicted: 5.719518066408847, actual: 3
    predicted: 7.581266813689697, actual: 8
lr regression MSE is 5.064363938287877
knn sample predictions:
    predicted: 4.0, actual: 6
    predicted: 4.4, actual: 4
    predicted: 3.64, actual: 1
    predicted: 7.0, actual: 4
    predicted: 7.2, actual: 10
    predicted: 6.24, actual: 10
    predicted: 8.16, actual: 10
    predicted: 7.72, actual: 7
    predicted: 4.92, actual: 3
    predicted: 8.4, actual: 8
knn regression MSE is 4.61941497273178
dtree sample predictions:
    predicted: 3.782258064516129, actual: 6
    predicted: 3.782258064516129, actual: 4
    predi

[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=1000; total time=   1.5s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=2000; total time=   2.7s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=3, min_samples_split=3, n_estimators=1600; total time=   2.2s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=1300; total time=   1.8s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time=   1.4s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1300; total time=   1.9s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=4, min_samples_split=3, n_estimators=1000; total time=   1.4s
[CV] END bootstrap=True, max_depth=5, max_featur

[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=1600; total time=   2.3s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=3, min_samples_split=3, n_estimators=1000; total time=   1.4s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=3, min_samples_split=3, n_estimators=1600; total time=   2.2s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=1300; total time=   1.8s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=2000; total time=   2.7s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=   2.8s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=4, min_samples_split=3, n_estimators=1600; total time=   2.4s
[CV] END bootstrap=True, max_depth=5, max_featur

[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=1000; total time=   1.4s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=3, min_samples_split=2, n_estimators=2000; total time=   2.8s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=3, min_samples_split=3, n_estimators=2000; total time=   2.8s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=3, min_samples_split=4, n_estimators=1600; total time=   2.2s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1300; total time=   1.8s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=   2.8s
[CV] END bootstrap=True, max_depth=5, max_features=sqrt, min_samples_leaf=4, min_samples_split=3, n_estimators=2000; total time=   3.1s
[CV] END bootstrap=True, max_depth=5, max_featur

In [100]:
import pickle # save models
for key in regressor_dict:
    with open(f'dumped-{key}.pkl', 'wb') as fid:
        pickle.dump(regressor_dict[key], fid)

In [101]:
for key in regressor_dict: # ensure they load back in properly
    with open(f'dumped-{key}.pkl', 'rb') as fid:
        clf = pickle.load(fid)
        y_pred = clf.predict(X_test)
        MSE = mean_squared_error(y_test, y_pred)
        print(f"{key} regression MSE is {MSE}")

lr regression MSE is 5.064363938287877
knn regression MSE is 4.61941497273178
dtree regression MSE is 4.525676289405228
rforest regression MSE is 4.455972940795981
svr regression MSE is 4.556473825620346
grid_rforest regression MSE is 4.4563272754596905
grid_svr regression MSE is 4.5616434513474085
