In [2]:
import pandas as pd
import numpy as np

In [3]:
# Single seed reused for NumPy's global RNG here and for the random forest's
# random_state further down, so stochastic steps are repeatable between runs.
RANDOM_SEED = 65

# Seeds the legacy global generator that backs np.random.* calls.
np.random.seed(RANDOM_SEED)

In [4]:
def get_data_item(name: str, data_dir: str = ".") -> pd.DataFrame:
    """Read a CSV file and strip a leftover index column, if there is one.

    Parameters
    ----------
    name : str
        File name of the CSV, relative to ``data_dir``.
    data_dir : str or path-like, default "."
        Directory that contains the file. The default reads from the current
        working directory, exactly as the hard-coded ``./`` prefix used to.

    Returns
    -------
    pd.DataFrame
        The parsed file. If its first column is named ``'Unnamed: 0'`` that
        column is removed from the returned frame.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when the file does not exist.
    """
    df = pd.read_csv(f"{data_dir}/{name}")
    # A first column called 'Unnamed: 0' is what pandas produces when a frame
    # was saved with its index and then read back without index_col.
    if df.columns[0] == 'Unnamed: 0':
        print('Removing first column')
        # `columns=` already selects the axis, so no `axis=` argument is needed;
        # returning a new frame avoids in-place mutation.
        df = df.drop(columns=df.columns[0])
    return df

In [5]:
# Load the training features and the training targets; both CSVs are looked up
# relative to the current working directory by get_data_item.
X_train = get_data_item('X_train-holds-binary.csv')
y_train = get_data_item('y_train-holds-binary.csv')

FileNotFoundError: [Errno 2] No such file or directory: './X_train-holds-binary.csv'

In [None]:
# Preview the first rows of the feature table.
X_train.head()

Unnamed: 0,angle,quality_average,ascensionist_count,hold_0,hold_1,hold_2,hold_3,hold_4,hold_5,hold_6,...,hold_1460,hold_1461,hold_1462,hold_1463,hold_1464,hold_1465,hold_1466,hold_1467,hold_1468,hold_1469
0,30,3.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,30,2.6,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,40,3.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,40,3.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,25,3.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Preview the first rows of the target table.
y_train.head()

Unnamed: 0,difficulty_average
0,18.0
1,18.8
2,22.0
3,24.0
4,17.0


In [None]:
# Convert both frames to plain NumPy arrays for scikit-learn.
X = X_train.to_numpy()
# Flatten the target frame to a 1-D vector.
y = y_train.to_numpy().ravel()

In [None]:
# Sanity check: features and targets should have the same number of rows.
X.shape, y.shape

((25048, 1473), (25048,))

## Dummy Regressor

In [None]:
from sklearn.dummy import DummyRegressor

# Baseline model: ignores the features and always predicts the mean target.
dummy_regr = DummyRegressor(strategy="mean").fit(X, y)

# Scored on the same data it was fitted on, giving a reference value for the
# real models below.
dummy_regr.score(X, y)

0.0

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Base estimator handed to the grid search below; only the seed is fixed here,
# the searched hyperparameters come from param_grid.
rgr = RandomForestRegressor(random_state=RANDOM_SEED)

In [None]:
def get_random_percentage_of_data(X, y, percentage, rng=None):
    """Draw a random subset of rows, without replacement, from aligned arrays.

    Parameters
    ----------
    X, y : array-like
        Must have the same length and support integer-array indexing along
        their first axis (e.g. NumPy arrays). The same row indices are taken
        from both so samples and targets stay aligned.
    percentage : float
        Fraction of rows to keep, in the closed interval [0, 1]. The number of
        rows returned is ``int(len(X) * percentage)`` (rounded down).
    rng : optional
        Object exposing ``choice(n, size, replace=...)``, such as a
        ``np.random.Generator``. Defaults to the global ``np.random`` module,
        so results follow ``np.random.seed``.

    Returns
    -------
    tuple
        ``(X[indices], y[indices])`` for the sampled row indices.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``percentage`` is outside [0, 1].
    """
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )
    # Also rejects NaN, because every comparison with NaN is False.
    if not 0 <= percentage <= 1:
        raise ValueError(f"percentage must be within [0, 1], got {percentage}")

    sampler = np.random if rng is None else rng
    num_samples = int(len(X) * percentage)
    indices = sampler.choice(len(X), num_samples, replace=False)
    return X[indices], y[indices]

In [None]:
# Candidate values per hyperparameter. The commented-out entries are options
# that are currently excluded from the search.
param_grid = dict(
    n_estimators=[1000],
    # 'max_features': ['sqrt', 'log2', None], # was None
    max_depth=[25],
    # 'max_leaf_nodes': [80, None], # was None
)

# Grid size = product of the number of candidates for each hyperparameter.
unique_combinations = np.prod(list(map(len, param_grid.values())))
print('Unique combinations: ', unique_combinations)

Unique combinations:  1


In [None]:
from sklearn.metrics import (
    make_scorer,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

# The two error metrics are wrapped with greater_is_better=False, so their
# scorer values are negated; the reporting cell flips the sign back.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

# The keys are the metric names used for refit= and for the mean_test_<key>
# entries read from cv_results_ later on.
scoring = dict(MSE=mse_scorer, MAE=mae_scorer, R2=r2_scorer)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Exhaustive search over param_grid with 5-fold cross-validation. All metrics
# in `scoring` are recorded; the 'MSE' scorer decides which candidate is best
# and gets refitted on the full data.
grid_search = GridSearchCV(
    estimator=rgr,
    param_grid=param_grid,
    scoring=scoring,
    refit='MSE',
    cv=5,
)
grid_search.fit(X, y)

In [None]:
# Look up the cross-validated means for the candidate selected by refit='MSE'.
cv_results = grid_search.cv_results_
best_idx = grid_search.best_index_

print(f"Best params: {grid_search.best_params_}")
# MSE and MAE were scored with greater_is_better=False, so they are stored
# negated; negate again to report the plain error values.
print(f"Best MSE score: {-cv_results['mean_test_MSE'][best_idx]}")
print(f"Best MAE score: {-cv_results['mean_test_MAE'][best_idx]}")
print(f"Best R2 score: {cv_results['mean_test_R2'][best_idx]}")

Best params: {'max_depth': 25, 'n_estimators': 1000}
Best MSE score: 7.829001019405463
Best MAE score: 2.1041027894109963
Best R2 score: 0.5263655397433864


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=3d14fd08-487f-450a-b532-6cfd474a7f71' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>