In [1]:
from sklearn.ensemble import BaggingRegressor
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.metrics import roc_auc_score,accuracy_score,r2_score
from sklearn.model_selection import train_test_split, GridSearchCV,KFold,RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings('ignore')


In [2]:
from sklearn.datasets import fetch_california_housing

In [3]:
cali = fetch_california_housing(as_frame=True)

In [4]:
type(cali)

sklearn.utils._bunch.Bunch

In [5]:
X,y = fetch_california_housing(as_frame=True,return_X_y = True)

In [6]:
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [7]:
y.head()

0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: MedHouseVal, dtype: float64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=24, test_size=0.3)

In [12]:
rf = RandomForestRegressor()
params = {'max_features' : [2,3,4,5,6],
         'max_depth':[None,3,5],
         'min_samples_split':[2,10,50],
          'min_samples_leaf': [1,10,50],
           'n_estimators':[20,50] }
kFold = KFold(n_splits=5,shuffle=True,random_state=24)

In [13]:
gcv = GridSearchCV(rf,param_grid= params,cv=kFold ,scoring='r2',verbose=3)

In [22]:
gcv.fit(X,y)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
[CV 1/5] END max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=20;, score=0.801 total time=   1.1s
[CV 2/5] END max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=20;, score=0.790 total time=   1.0s
[CV 3/5] END max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=20;, score=0.811 total time=   1.0s
[CV 4/5] END max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=20;, score=0.809 total time=   1.1s
[CV 5/5] END max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=20;, score=0.813 total time=   1.1s
[CV 1/5] END max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.813 total time=   2.6s
[CV 2/5] END max_depth=None, max_features=2, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.805 total time

KeyboardInterrupt: 

In [None]:
print(gcv.best_params_)
print(gcv.best_score_)

### RandomizedSearchCV

RandomizedSearchCV draws a random sample from all the possible parameter sets and then chooses the best parameter condition. 

In [8]:
rf = RandomForestRegressor(random_state=24,oob_score=True)
params = {'max_features' : [2,3,4,5,6],
         'max_depth':[None,3,5],
         'min_samples_split':[2,10,50],
          'min_samples_leaf': [1,10,50],
           'n_estimators':[20,50] }
kFold = KFold(n_splits=5,shuffle=True,random_state=24)

In [9]:
rgcv = RandomizedSearchCV(rf,param_distributions= params,cv=kFold ,scoring='r2',verbose=3,random_state=24)

In [10]:
rgcv.fit(X,y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END max_depth=5, max_features=3, min_samples_leaf=50, min_samples_split=2, n_estimators=20;, score=0.664 total time=   0.3s
[CV 2/5] END max_depth=5, max_features=3, min_samples_leaf=50, min_samples_split=2, n_estimators=20;, score=0.639 total time=   0.3s
[CV 3/5] END max_depth=5, max_features=3, min_samples_leaf=50, min_samples_split=2, n_estimators=20;, score=0.661 total time=   0.3s
[CV 4/5] END max_depth=5, max_features=3, min_samples_leaf=50, min_samples_split=2, n_estimators=20;, score=0.663 total time=   0.3s
[CV 5/5] END max_depth=5, max_features=3, min_samples_leaf=50, min_samples_split=2, n_estimators=20;, score=0.648 total time=   0.4s
[CV 1/5] END max_depth=3, max_features=3, min_samples_leaf=50, min_samples_split=10, n_estimators=20;, score=0.540 total time=   0.2s
[CV 2/5] END max_depth=3, max_features=3, min_samples_leaf=50, min_samples_split=10, n_estimators=20;, score=0.513 total time=   0.2s
[CV 3/

In [11]:
rgcv.best_score_

0.8172504833002643

In [12]:
rgcv.best_params_

{'n_estimators': 50,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 3,
 'max_depth': None}

### List of param set(Combinations)

In [13]:
list(pd.DataFrame(rgcv.cv_results_)['params'])

[{'n_estimators': 20,
  'min_samples_split': 2,
  'min_samples_leaf': 50,
  'max_features': 3,
  'max_depth': 5},
 {'n_estimators': 20,
  'min_samples_split': 10,
  'min_samples_leaf': 50,
  'max_features': 3,
  'max_depth': 3},
 {'n_estimators': 20,
  'min_samples_split': 2,
  'min_samples_leaf': 1,
  'max_features': 6,
  'max_depth': 3},
 {'n_estimators': 20,
  'min_samples_split': 50,
  'min_samples_leaf': 1,
  'max_features': 5,
  'max_depth': 5},
 {'n_estimators': 50,
  'min_samples_split': 10,
  'min_samples_leaf': 10,
  'max_features': 5,
  'max_depth': None},
 {'n_estimators': 50,
  'min_samples_split': 2,
  'min_samples_leaf': 50,
  'max_features': 4,
  'max_depth': 5},
 {'n_estimators': 20,
  'min_samples_split': 2,
  'min_samples_leaf': 50,
  'max_features': 5,
  'max_depth': 5},
 {'n_estimators': 20,
  'min_samples_split': 10,
  'min_samples_leaf': 50,
  'max_features': 3,
  'max_depth': 5},
 {'n_estimators': 50,
  'min_samples_split': 2,
  'min_samples_leaf': 1,
  'max_fea