In [1]:
#import file from drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import joblib
import pandas as pd
import numpy as np
from time import time

x_train=pd.read_csv("drive/Shareddrives/major_project/data/x_train.csv")
y_train=pd.read_csv("drive/Shareddrives/major_project/data/y_train.csv")

In [3]:
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor()

In [4]:
#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 130, num = 12)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

{'n_estimators': [100, 102, 105, 108, 110, 113, 116, 119, 121, 124, 127, 130], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_leaf': [1, 2, 5, 10]}


In [5]:
def print_res(results):
  print("Best Params: {}\n".format(results.best_params_))
  means=results.cv_results_['mean_test_score']
  stds=results.cv_results_['std_test_score']
  for mean,std,params in zip(means,stds,results.cv_results_['params']):
    print('{} (+/-{}) for {}'.format(round(mean,3),round(std*2,3),params))

In [6]:
# Random search of parameters
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid)
start=time()
rf_random.fit(x_train,y_train.values.ravel())
end=time()
print_res(rf_random)
print("Time Elapsed : {}s".format(round(end-start,3)))

Best Params: {'n_estimators': 124, 'min_samples_leaf': 1, 'max_depth': 25}

0.92 (+/-0.045) for {'n_estimators': 110, 'min_samples_leaf': 10, 'max_depth': 5}
0.955 (+/-0.036) for {'n_estimators': 105, 'min_samples_leaf': 1, 'max_depth': 30}
0.941 (+/-0.041) for {'n_estimators': 130, 'min_samples_leaf': 5, 'max_depth': 10}
0.934 (+/-0.045) for {'n_estimators': 130, 'min_samples_leaf': 10, 'max_depth': 25}
0.955 (+/-0.034) for {'n_estimators': 130, 'min_samples_leaf': 1, 'max_depth': 25}
0.956 (+/-0.037) for {'n_estimators': 124, 'min_samples_leaf': 1, 'max_depth': 25}
0.933 (+/-0.048) for {'n_estimators': 119, 'min_samples_leaf': 10, 'max_depth': 10}
0.932 (+/-0.034) for {'n_estimators': 113, 'min_samples_leaf': 1, 'max_depth': 5}
0.935 (+/-0.044) for {'n_estimators': 127, 'min_samples_leaf': 10, 'max_depth': 20}
0.955 (+/-0.038) for {'n_estimators': 108, 'min_samples_leaf': 1, 'max_depth': 15}
Time Elapsed : 59.512s


In [7]:
rfr_model=rf_random.best_estimator_
print(rfr_model)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=25, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=124, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)


In [8]:
joblib.dump(rfr_model,"drive/Shareddrives/major_project/data/rfRandnew_model.pkl")

['drive/Shareddrives/major_project/data/rfRandnew_model.pkl']