In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the iris dataset object; later cells read its 'data', 'target' and
# 'feature_names' entries as well as its target / target_names attributes.
iris = load_iris()

#### iris 데이터를 pandas의 dataframe 형태로 변환

In [3]:
# Place the feature matrix and the label vector side by side in one frame.
# Because both are stacked into a single numeric array, the 'target' column
# shares the features' floating-point dtype.
df = pd.DataFrame(
    data=np.column_stack((iris['data'], iris['target'])),
    columns=[*iris['feature_names'], 'target'],
)

# Add a readable categorical label derived from the integer class codes.
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

In [4]:
# Preview the first five rows of the assembled frame.
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,species
0,5.1,3.5,1.4,0.2,0.0,setosa
1,4.9,3.0,1.4,0.2,0.0,setosa
2,4.7,3.2,1.3,0.2,0.0,setosa
3,4.6,3.1,1.5,0.2,0.0,setosa
4,5.0,3.6,1.4,0.2,0.0,setosa


#### 데이터를 target 변수 y와 독립변수 x로 분리한다

In [5]:
# The first four columns are the measurement features; the fifth (position 4)
# is the numeric 'target' column used as the label.
x, y = df.iloc[:, :4], df.iloc[:, 4]

#### 데이터의 80%를 훈련 데이터, 20%를 테스트 데이터로 분리

In [6]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical across runs.
x_tr, x_ts, y_tr, y_ts = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

### Grid Search

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

#### 간단한 랜덤 포레스트 모델 생성

In [8]:
# Baseline forest: only the seed is set, everything else is left at the
# library defaults.
rf_classifier = RandomForestClassifier(random_state=0)
# Left as a bare final expression so the fitted estimator's repr is displayed.
rf_classifier.fit(X=x_tr, y=y_tr)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

#### hyperparameter가 가질 수 있는 값의 후보군을 설정

In [9]:
# Candidate values for each tuned hyperparameter.
# 5 * 3 * 2 * 5 = 150 combinations in total.
rf_param_grid = dict(
    n_estimators=[10, 20, 30, 50, 100],
    max_depth=[2, 3, 4],
    max_features=[2, 3],
    min_samples_leaf=[1, 2, 3, 4, 5],
)

In [10]:
# Exhaustive search: every combination in rf_param_grid is scored by
# 5-fold cross-validated accuracy, with the fits run in parallel (n_jobs=-1).
rf_classifier_grid = GridSearchCV(
    estimator=rf_classifier,
    param_grid=rf_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [11]:
# Run the grid search on the training split only; the test split is untouched.
rf_classifier_grid.fit(X=x_tr, y=y_tr)

Fitting 5 folds for each of 150 candidates, totalling 750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 530 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 750 out of 750 | elapsed:   13.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=10, n_jobs=None,
                                              oob_score=False, random_state=0,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_dep

#### 최적의 hyperparameter 및 정확도 출력

In [12]:
# best_score_ and best_params_ are read from the fitted search object;
# the score is printed with two decimals.
best_cv_accuracy = rf_classifier_grid.best_score_
best_hyperparams = rf_classifier_grid.best_params_
print("가장 높은 정확도 : {0:.2f}".format(best_cv_accuracy))
print("최적의 hyperparamter :", best_hyperparams)

가장 높은 정확도 : 0.96
최적의 hyperparamter : {'max_depth': 4, 'max_features': 3, 'min_samples_leaf': 2, 'n_estimators': 100}


### Random Search

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

#### 간단한 랜덤 포레스트 모델 생성

In [15]:
# Fresh baseline forest for the random-search section: seeded, with all other
# hyperparameters left at the library defaults.
rf_classifier = RandomForestClassifier(random_state=0)
# Left as a bare final expression so the fitted estimator's repr is displayed.
rf_classifier.fit(X=x_tr, y=y_tr)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

#### hyperparameter 후보군을 설정

In [16]:
# Candidate values the randomized search samples from
# (5 * 3 * 2 * 5 = 150 possible combinations).
rf_param_grid = dict(
    n_estimators=[10, 20, 30, 50, 100],
    max_depth=[2, 3, 4],
    max_features=[2, 3],
    min_samples_leaf=[1, 2, 3, 4, 5],
)

#### Randomized Search 시행
탐색 횟수는 `n_iter`로 지정한다. 또한 GridSearchCV의 `param_grid` 대신 `param_distributions` 인자에 후보군을 전달한다.

In [18]:
# Randomized search: instead of trying every combination, draw n_iter=10
# candidates from rf_param_grid and score each by 5-fold cross-validated
# accuracy, with the fits run in parallel (n_jobs=-1).
#
# random_state is fixed so the same 10 candidates are drawn on every run;
# without it the sampled candidates, and therefore best_params_ and
# best_score_, change between executions. The seed value 0 matches the one
# used for the train/test split and the forest itself.
rf_classifier_grid = RandomizedSearchCV(
    rf_classifier,
    param_distributions=rf_param_grid,
    n_iter=10,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    verbose=1,
    random_state=0,
)

In [19]:
# Run the randomized search on the training split only.
rf_classifier_grid.fit(X=x_tr, y=y_tr)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    7.0s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=10,
                                                    n_jobs=None,
  

#### 최적의 hyperparameter 및 정확도 출력

In [20]:
# Report the best score (two decimals) and the matching hyperparameters
# found by the randomized search.
best_cv_accuracy = rf_classifier_grid.best_score_
best_hyperparams = rf_classifier_grid.best_params_
print("가장 높은 정확도 : {0:.2f}".format(best_cv_accuracy))
print("최적의 hyperparamter :", best_hyperparams)

가장 높은 정확도 : 0.95
최적의 hyperparamter : {'n_estimators': 50, 'min_samples_leaf': 5, 'max_features': 3, 'max_depth': 3}
