#### Ensemble - RandomForest & ExtraTree
- 배깅 방식의 앙상블 ==> 중복 랜덤 샘플 + 동일 모델(DT)
    * 대표 알고리즘 : RandomForestC/R
- 페이스팅 방식의 앙상블 ==> 랜덤 샘플 + 동일모델(DT), 중복X
    * 대표 알고리즘 : ExtraTreesC/R (sklearn.ensemble의 ExtraTreesClassifier/Regressor, 기본값 bootstrap=False이면 각 트리가 전체 학습 데이터를 사용)

[목표] 와인분류 => 0과 1 2개 종류 분류



[1] 모듈 로딩 및 데이터 준비

In [25]:
# 모듈로딩
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [26]:
# Path to the wine dataset (CSV), relative to this notebook
DATA_FILE = '../DATA/wine.csv'

# Load the CSV into a DataFrame
wineDF = pd.read_csv(DATA_FILE)

In [27]:
# Inspect the column names, dtypes and non-null counts of the loaded frame
wineDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   alcohol  6497 non-null   float64
 1   sugar    6497 non-null   float64
 2   pH       6497 non-null   float64
 3   class    6497 non-null   float64
dtypes: float64(4)
memory usage: 203.2 KB


In [28]:
# Preview the first three rows
wineDF.head(3)

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0


In [29]:
# Class distribution of the target/label column (number of rows per class)
wineDF['class'].value_counts()

class
1.0    4898
0.0    1599
Name: count, dtype: int64

In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
wineDF.describe()

Unnamed: 0,alcohol,sugar,pH,class
count,6497.0,6497.0,6497.0,6497.0
mean,10.491801,5.443235,3.218501,0.753886
std,1.192712,4.757804,0.160787,0.430779
min,8.0,0.6,2.72,0.0
25%,9.5,1.8,3.11,1.0
50%,10.3,3.0,3.21,1.0
75%,11.3,8.1,3.32,1.0
max,14.9,65.8,4.01,1.0


[2] 학습준비

In [31]:
# 학습용 & 테스트용 데이터셋 분할
from sklearn.model_selection import train_test_split

In [32]:
# Separate the features (independent variables) from the target/label
# (dependent variable): the last column is the target, all others are features
*feature_cols, target_col = wineDF.columns
featureDF = wineDF[feature_cols]
targetSR = wineDF[target_col]

print(f'featureDF : {featureDF.shape}  targetSR : {targetSR.shape}')

featureDF : (6497, 3)  targetSR : (6497,)


In [33]:
# Split into training and test sets: 20% of the rows are held out for testing,
# the split is stratified on the target, and the seed is fixed so the same
# split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    featureDF, targetSR,
    test_size=0.2,
    random_state=10,
    stratify=targetSR,
)

In [34]:
# Report the shape and number of dimensions of each split, one per line
print(
    f'X_train: {X_train.shape}, {X_train.ndim}D',
    f'X_test: {X_test.shape}, {X_test.ndim}D',
    f'y_train: {y_train.shape}, {y_train.ndim}D',
    f'y_test: {y_test.shape}, {y_test.ndim}D',
    sep='\n',
)

X_train: (5197, 3), 2D
X_test: (1300, 3), 2D
y_train: (5197,), 1D
y_test: (1300,), 1D


[3] 학습 진행

In [35]:
# 학습방법 : 지도학습 ==> 분류
# 알고리즘 : 앙상블 => 배깅 => RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

In [36]:
# Create the forest => a training sample is drawn for each of the 100 internal
#                      decision trees (100 is the n_estimators default)
#                      random_state : fixes the sampling so the same datasets
#                                     are generated on every run
#                      oob_score    : use the rows left out of each tree's
#                                     sample as a validation set
lf_model = RandomForestClassifier(random_state=10,
                                  oob_score=True)
# Train on the training split; as the last expression of the cell, the fitted
# estimator is also what the notebook displays
lf_model.fit(X_train,y_train)

In [37]:
# Attributes learned by the fitted forest, printed as "label value" lines
fitted_attributes = [
    ('classes_ :', lf_model.classes_),
    ('n_classes_ :', lf_model.n_classes_),
    ('feature_names_in_ :', lf_model.feature_names_in_),
    ('n_features_in_ :', lf_model.n_features_in_),
    ('feature_importances_ :', lf_model.feature_importances_),
]
for label, value in fitted_attributes:
    print(label, value)

classes_ : [0. 1.]
n_classes_ : 2
feature_names_in_ : ['alcohol' 'sugar' 'pH']
n_features_in_ : 3
feature_importances_ : [0.23617103 0.49952715 0.26430182]


In [38]:
# Fitted attributes: the class labels, then every tree inside the forest
print('classes_             :', lf_model.classes_)
# The forest keeps each of its internal DecisionTreeClassifier models in
# estimators_, so the individual trees can be listed one per line like this
print(*lf_model.estimators_, sep='\n')

classes_             : [0. 1.]
DecisionTreeClassifier(max_features='sqrt', random_state=1165313289)
DecisionTreeClassifier(max_features='sqrt', random_state=1283169405)
DecisionTreeClassifier(max_features='sqrt', random_state=89128932)
DecisionTreeClassifier(max_features='sqrt', random_state=2124247567)
DecisionTreeClassifier(max_features='sqrt', random_state=574014784)
DecisionTreeClassifier(max_features='sqrt', random_state=1902734705)
DecisionTreeClassifier(max_features='sqrt', random_state=1068604539)
DecisionTreeClassifier(max_features='sqrt', random_state=1425548444)
DecisionTreeClassifier(max_features='sqrt', random_state=2141071321)
DecisionTreeClassifier(max_features='sqrt', random_state=357864157)
DecisionTreeClassifier(max_features='sqrt', random_state=965494256)
DecisionTreeClassifier(max_features='sqrt', random_state=108111773)
DecisionTreeClassifier(max_features='sqrt', random_state=850673521)
DecisionTreeClassifier(max_features='sqrt', random_state=898541562)
DecisionTre

In [39]:
# Out-of-bag score: accuracy estimated on the rows each tree did not train on
# (available because the forest was created with oob_score=True)
print(f'lf_model.oob_score_ :{lf_model.oob_score_}')

lf_model.oob_score_ :0.8993650182797768


[4] 성능평가

In [40]:
# Score the fitted forest on the training split and on the held-out test split
train_score, test_score = (
    lf_model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [41]:
# Compare the two scores; a large gap between them indicates overfitting
print(f'train_score: {train_score}, test_score: {test_score}')

train_score: 0.9976909755628247, test_score: 0.8930769230769231


[5] 튜닝

- RandomizedSearchCV 하이퍼파라미터 최적화 클래스
    + 범위가 넓은 하이퍼파라미터 설정에 좋음
    + 지정된 범위에서 지정된 횟수 만큼 하이퍼파라미터를 추출하여 조합 진행

In [42]:
# 모듈로딩
from sklearn.model_selection import RandomizedSearchCV

In [43]:
# Hyperparameter search space handed to RandomizedSearchCV.
# Integer ranges use the built-in range; for float ranges use numpy's arange.
params = dict(
    max_depth=range(2, 15),
    min_samples_leaf=range(5, 16),
    criterion=['gini', 'entropy', 'log_loss'],
)

In [44]:
# Base estimator to be tuned; the fixed seed keeps the forest's own randomness
# identical across runs
rf_model = RandomForestClassifier(random_state=7)

In [48]:
# Randomized hyperparameter search for the random forest.
#   param_distributions : search space to sample parameter combinations from
#   n_iter=50           : number of combinations that are sampled and evaluated
#   random_state=7      : fixes WHICH combinations are sampled; without it every
#                         run draws different candidates, so best_params_ and
#                         best_score_ change from run to run
#   verbose=4           : print progress and the score of every CV fold
searchCV = RandomizedSearchCV(rf_model,
                              param_distributions=params,
                              n_iter=50,
                              random_state=7,
                              verbose=4)

In [49]:
# Run the search on the training split: every sampled candidate is evaluated
# with cross-validation (the log below reports the folds x candidates total)
searchCV.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END criterion=entropy, max_depth=2, min_samples_leaf=14;, score=0.754 total time=   0.1s
[CV 2/5] END criterion=entropy, max_depth=2, min_samples_leaf=14;, score=0.754 total time=   0.0s
[CV 3/5] END criterion=entropy, max_depth=2, min_samples_leaf=14;, score=0.755 total time=   0.0s
[CV 4/5] END criterion=entropy, max_depth=2, min_samples_leaf=14;, score=0.754 total time=   0.1s
[CV 5/5] END criterion=entropy, max_depth=2, min_samples_leaf=14;, score=0.754 total time=   0.2s
[CV 1/5] END criterion=gini, max_depth=12, min_samples_leaf=8;, score=0.880 total time=   0.2s
[CV 2/5] END criterion=gini, max_depth=12, min_samples_leaf=8;, score=0.872 total time=   0.2s
[CV 3/5] END criterion=gini, max_depth=12, min_samples_leaf=8;, score=0.876 total time=   0.2s
[CV 4/5] END criterion=gini, max_depth=12, min_samples_leaf=8;, score=0.867 total time=   0.2s
[CV 5/5] END criterion=gini, max_depth=12, min_samples_leaf=8;, scor

In [50]:
# Outcome of the search: the best parameter combination, its cross-validated
# score, and the estimator built with those parameters
print(f'[ searchCV.best_params_ ] {searchCV.best_params_}',
      f'[ searchCV.best_score_ ] {searchCV.best_score_}',
      f'[ searchCV.best_estimator_ ] {searchCV.best_estimator_}',
      sep='\n')

# Per-candidate cross-validation results as a table (displayed as cell output)
cv_resultDF = pd.DataFrame(data=searchCV.cv_results_)
cv_resultDF
# Open question from the original author: "can the scores differ??"
# NOTE: yes - the candidates are sampled at random, so best_params_ and
# best_score_ can change between runs unless RandomizedSearchCV is given a
# fixed random_state.

[ searchCV.best_params_ ] {'min_samples_leaf': 5, 'max_depth': 12, 'criterion': 'log_loss'}
[ searchCV.best_score_ ] 0.8791606204190421
[ searchCV.best_estimator_ ] RandomForestClassifier(criterion='log_loss', max_depth=12, min_samples_leaf=5,
                       random_state=7)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.167805,0.055055,0.010081,0.00127,14,2,entropy,"{'min_samples_leaf': 14, 'max_depth': 2, 'crit...",0.753846,0.753846,0.754572,0.753609,0.753609,0.753896,0.000354,49
1,0.269218,0.013854,0.016255,0.000995,8,12,gini,"{'min_samples_leaf': 8, 'max_depth': 12, 'crit...",0.879808,0.872115,0.875842,0.86718,0.875842,0.874157,0.004253,8
2,0.240869,0.008472,0.014318,0.001238,7,7,entropy,"{'min_samples_leaf': 7, 'max_depth': 7, 'crite...",0.869231,0.863462,0.865255,0.856593,0.872955,0.865499,0.005533,30
3,0.173297,0.008965,0.011955,0.001369,14,4,gini,"{'min_samples_leaf': 14, 'max_depth': 4, 'crit...",0.858654,0.841346,0.844081,0.843118,0.844081,0.846256,0.006279,42
4,0.275554,0.006125,0.016477,0.001246,6,10,gini,"{'min_samples_leaf': 6, 'max_depth': 10, 'crit...",0.879808,0.870192,0.873917,0.865255,0.879692,0.873773,0.005601,9
5,0.200957,0.009424,0.014561,0.001413,10,5,gini,"{'min_samples_leaf': 10, 'max_depth': 5, 'crit...",0.870192,0.851923,0.864293,0.860443,0.857555,0.860881,0.006162,36
6,0.258725,0.026102,0.014687,0.001902,5,7,log_loss,"{'min_samples_leaf': 5, 'max_depth': 7, 'crite...",0.873077,0.866346,0.860443,0.854668,0.868142,0.864535,0.006378,33
7,0.312114,0.037496,0.016425,0.001765,7,11,log_loss,"{'min_samples_leaf': 7, 'max_depth': 11, 'crit...",0.877885,0.868269,0.876805,0.866218,0.885467,0.874929,0.006979,6
8,0.167121,0.011903,0.011932,0.00158,11,3,gini,"{'min_samples_leaf': 11, 'max_depth': 3, 'crit...",0.832692,0.815385,0.801732,0.823869,0.829644,0.820664,0.011152,47
9,0.20228,0.00888,0.013532,0.001978,7,5,entropy,"{'min_samples_leaf': 7, 'max_depth': 5, 'crite...",0.866346,0.857692,0.85948,0.856593,0.858518,0.859726,0.003444,38
