In [69]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from joblib import dump, load
import pickle

In [None]:
def evaluate(model, X_test, Y_test):
    predictions = model.predict(X_test)
    errors = abs(predictions - Y_test)
    mape = 100 * np.mean(errors / Y_test)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [2]:
df = pd.read_csv('challenge1-train-1.csv', index_col=0)
X = df.drop(columns=['y'])
Y = df['y']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [30]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# auto and sqrt do not seem as good as log2
max_features = ['log2']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [38]:
m = RandomForestRegressor()
m_random = RandomizedSearchCV(estimator = m, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
m_random.fit(X_train, Y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [39]:
m_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 40,
 'bootstrap': False}

In [41]:
m_base = RandomForestRegressor()
m_base.fit(X_train, Y_train)
base_accuracy = evaluate(m_base, X_test, Y_test)

best_random = m_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, Y_test)

Model Performance
Average Error: 1.2709 degrees.
Accuracy = 97.48%.
Model Performance
Average Error: 1.1807 degrees.
Accuracy = 97.57%.


In [44]:
param_grid = {'n_estimators': [350, 400, 450, 1950, 2000, 2050],
               'max_features': ['log2'],
               'max_depth': [None, 40],
               'min_samples_split': [1, 2, 3],
               'min_samples_leaf': [1, 2],
               'bootstrap': [False]}

In [51]:
grid_search = GridSearchCV(estimator = m, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, Y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


72 fits failed out of a total of 216.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Tanks\anaconda3\envs\MSCS-basic\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Tanks\anaconda3\envs\MSCS-basic\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "C:\Users\Tanks\anaconda3\envs\MSCS-basic\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\Tanks\anaconda3\envs\MSCS-basic\lib\site-packages\joblib\parallel.py", line 861, in dispatch_

GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [False], 'max_depth': [None, 40],
                         'max_features': ['log2'], 'min_samples_leaf': [1, 2],
                         'min_samples_split': [1, 2, 3],
                         'n_estimators': [350, 400, 450, 1950, 2000, 2050]},
             verbose=2)

In [52]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': None,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 350}

In [53]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, Y_test)

Model Performance
Average Error: 1.1919 degrees.
Accuracy = 97.54%.


In [55]:
param_grid1 = {'n_estimators': [300, 325, 350, 375, 400],
               'max_features': ['log2'],
               'max_depth': [35, 40, 45],
               'min_samples_split': [2],
               'min_samples_leaf': [1],
               'bootstrap': [False]}

In [56]:
grid_search1 = GridSearchCV(estimator = m, param_grid = param_grid1, cv = 3, n_jobs = -1, verbose = 2)
grid_search1.fit(X_train, Y_train)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [False], 'max_depth': [35, 40, 45],
                         'max_features': ['log2'], 'min_samples_leaf': [1],
                         'min_samples_split': [2],
                         'n_estimators': [300, 325, 350, 375, 400]},
             verbose=2)

In [57]:
grid_search1.best_params_

{'bootstrap': False,
 'max_depth': 45,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 325}

In [58]:
best_grid1 = grid_search1.best_estimator_
grid_accuracy1 = evaluate(best_grid1, X_test, Y_test)

Model Performance
Average Error: 1.1690 degrees.
Accuracy = 97.59%.


In [59]:
param_grid12 = {'n_estimators': [310, 320, 325, 330, 340],
               'max_features': ['log2'],
               'max_depth': [45, 50, 60, None],
               'min_samples_split': [2],
               'min_samples_leaf': [1],
               'bootstrap': [False]}

In [60]:
grid_search2 = GridSearchCV(estimator = m, param_grid = param_grid1, cv = 3, n_jobs = -1, verbose = 2)
grid_search2.fit(X_train, Y_train)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [False], 'max_depth': [35, 40, 45],
                         'max_features': ['log2'], 'min_samples_leaf': [1],
                         'min_samples_split': [2],
                         'n_estimators': [300, 325, 350, 375, 400]},
             verbose=2)

In [61]:
grid_search2.best_params_

{'bootstrap': False,
 'max_depth': 40,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 375}

In [62]:
best_grid2 = grid_search2.best_estimator_
grid_accuracy2 = evaluate(best_grid2, X_test, Y_test)

Model Performance
Average Error: 1.1865 degrees.
Accuracy = 97.57%.


In [65]:
dump(best_grid1, '22-03-09.joblib')

['22-03-09.joblib']

In [67]:
with open('22-03-09.pckl', 'wb') as f:
  pickle.dump(best_grid1, f)

In [81]:
cvs1 = cross_val_score(best_grid, X_test, Y_test, cv=5)
cvs2 = cross_val_score(best_grid1, X_test, Y_test, cv=5)
cvs3 = cross_val_score(best_grid2, X_test, Y_test, cv=5)
cvs4 = cross_val_score(m, X_test, Y_test, cv=5)

print(f'{cvs1.min()=}, {cvs2.min()=}, {cvs3.min()=}, {cvs4.min()=}')

cvs1.min()=0.9602037659810284, cvs2.min()=0.96134940657797, cvs3.min()=0.9602752064614002, cvs4.min()=0.9570404674282698
