# 앙상블

## 보팅 voting

In [2]:
# voting 실습

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

# 데이터 불러오기
cancer = load_breast_cancer()

y = cancer['target']
x = cancer['data']

# 데이터 분할
x_tr, x_te, y_tr, y_te = train_test_split(
    x,
    y,
    stratify=cancer['target'],
    random_state=0
)

# voting 참여 분류기 생성
knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(max_iter=10000)                           
dt3 = DecisionTreeClassifier(max_depth=3)
dt5 = DecisionTreeClassifier(max_depth=5)

# voting 모델 생성
hard = VotingClassifier([('knn1',knn1),('knn2',knn2),('lr',lr),('dt3',dt3),('dt5',dt5)])
soft = VotingClassifier([('knn1',knn1),('knn2',knn2),('lr',lr),('dt3',dt3),('dt5',dt5)], voting='soft')

# voting 분류기 성능 평가
names = ['hard','soft','knn1','knn2','lr','dt3','dt5']

for idx, model in enumerate([hard,soft,knn1,knn2,lr,dt3,dt5]):
    model.fit(x_tr,y_tr)
    name = names[idx]
    tr_score = model.score(x_tr,y_tr) * 100
    te_score = model.score(x_te,y_te) * 100
    print(f'{name} Train Accuracy : {tr_score:.2f}%')
    print(f'{name} Test Accuracy : {te_score:.2f}%')
    print()

hard Train Accuracy : 98.12%
hard Test Accuracy : 95.10%

soft Train Accuracy : 99.53%
soft Test Accuracy : 95.80%

knn1 Train Accuracy : 94.60%
knn1 Test Accuracy : 91.61%

knn2 Train Accuracy : 95.77%
knn2 Test Accuracy : 91.61%

lr Train Accuracy : 96.71%
lr Test Accuracy : 93.71%

dt3 Train Accuracy : 97.65%
dt3 Test Accuracy : 93.01%

dt5 Train Accuracy : 100.00%
dt5 Test Accuracy : 92.31%



## 배깅 bagging
대표적 : Random Forest  
장점 : 과적합 줄임. 스케일 조절 필요 없음, 결정 트리 모델 예측 성능 유지, 매개변수의 튜닝을 많이 하지 않아도 됨.  
단점 : 결정트리 많이 필요, 대량 데이터 셋이라면 다소 시간 걸림, 텍스트 데이터 등 희소&고차원 데이터(one-hot)엔 작동 잘 안됨.  
랜덤으로 결정트리를 많이 많들어 가장 많이 나온 답을 선택 => 랜덤+포레스트(트리 * 많이) 결정트리 집단지성ㅋㅋ  
n_estimators : 결정트리의 갯수  
max_features : 선택할 무작위 특성의 개수, 핵심변수, 기본값을 권장.  
hyperParametor : 모델링할 때 사용자가 직접 세팅해주는 값==> 성능에 영향  

In [11]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=5).fit(x_tr,y_tr)
model.score(x_tr,y_tr), model.score(x_te,y_te)

(1.0, 0.958041958041958)

In [15]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=4).fit(x_tr,y_tr)
model.score(x_tr,y_tr), model.score(x_te,y_te)

(0.9976525821596244, 0.951048951048951)

In [16]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=3).fit(x_tr,y_tr)
model.score(x_tr,y_tr), model.score(x_te,y_te)

(0.9835680751173709, 0.9300699300699301)

In [17]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=2).fit(x_tr,y_tr)
model.score(x_tr,y_tr), model.score(x_te,y_te)

(0.9624413145539906, 0.9230769230769231)

## Boosting

여러개의 결정트리를 묶어 강력한 모델을 만드는 방법  
뎁스가 얕은 트리를 많이 연결(1~5 뎁)  
이전 트리 오차 보완하는 형식  

In [18]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier().fit(x_tr,y_tr)
model.score(x_tr,y_tr), model.score(x_te,y_te)

(1.0, 0.951048951048951)

In [19]:
from sklearn.ensemble import StackingClassifier

estimators = [('rf',RandomForestClassifier()),('gb',GradientBoostingClassifier())]

model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
model.fit(x_tr,y_tr).score(x_te,y_te)

0.958041958041958

In [29]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier

# 데이터 세트 준비

digits = load_digits()
y = digits['target']
x = digits['data']

# 데이터 분할
x_tr, x_te, y_tr, y_te = train_test_split(
    x,
    y,
    stratify=digits['target'],
    random_state=0,
    test_size=0.3
)

# 모델 평가(가장 좋은 Classfication 모델 찾기)

# voting 참여 분류기 생성
knn = KNeighborsClassifier(n_neighbors=5)
lr = LogisticRegression(max_iter=10000)                           
dt = DecisionTreeClassifier(max_depth=3)
rf = RandomForestClassifier(max_depth=5)
gb = GradientBoostingClassifier()
estimators = [('rf',RandomForestClassifier()),('gb',GradientBoostingClassifier())]
sc = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())


# voting 모델 생성
hard = VotingClassifier([('knn',knn),('lr',lr),('dt',dt),('gb',gb),('rf',rf),('sc',sc)])
soft = VotingClassifier([('knn',knn),('lr',lr),('dt',dt),('gb',gb),('rf',rf),('sc',sc)], voting='soft')

# voting 분류기 성능 평가
names = ['hard','soft','knn1','knn2','lr','dt3','dt5','gb','rf','sc']

max_score=0
best_model = ''

for idx, model in enumerate([hard,soft,knn,lr,dt,gb,rf,sc]):
    
    model.fit(x_tr,y_tr)
    name = names[idx]
    tr_score = model.score(x_tr,y_tr) * 100
    te_score = model.score(x_te,y_te) * 100
    print(f'{name} Train Accuracy : {tr_score:.2f}%')
    print(f'{name} Test Accuracy : {te_score:.2f}%')
    
    if te_score > max_score:
        max_score = te_score
        best_model = str(name)
    print()
    
print(f'best model is {best_model}')
print(f'Test Accuracy : {max_score:.2f}%')

hard Train Accuracy : 100.00%
hard Test Accuracy : 97.59%

soft Train Accuracy : 100.00%
soft Test Accuracy : 98.15%

knn1 Train Accuracy : 99.20%
knn1 Test Accuracy : 97.96%

knn2 Train Accuracy : 100.00%
knn2 Test Accuracy : 96.67%

lr Train Accuracy : 47.18%
lr Test Accuracy : 47.59%

dt3 Train Accuracy : 100.00%
dt3 Test Accuracy : 96.30%

dt5 Train Accuracy : 97.22%
dt5 Test Accuracy : 94.81%

gb Train Accuracy : 100.00%
gb Test Accuracy : 97.04%

best model is soft
Test Accuracy : 98.15%


In [31]:
from sklearn.datasets import load_digits
digits = load_digits()
best_model = {}

# 데이터 분할
x_tr, x_te, y_tr, y_te = train_test_split(digits['data'],digits['target'],stratify=digits['target'],random_state=0)

# 모델 설정
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(max_iter=10000)
dt3 = DecisionTreeClassifier(max_depth=3)
dt5 = DecisionTreeClassifier(max_depth=5)

# voting
from sklearn.ensemble import VotingClassifier
hard = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr), ('dt3', dt3), ('dt5', dt5)])

soft = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr), ('dt3', dt3), ('dt5', dt5)], voting='soft')

names = ['hard', 'soft', 'knn1', 'knn2', 'lr', 'dt3', 'dt5']
for idx, model in enumerate([hard, soft, knn1, knn2, lr, dt3, dt5]):
    model.fit(x_tr, y_tr)
    name = names[idx]
    train_score = model.score(x_tr, y_tr) * 100
    test_score = model.score(x_te, y_te) * 100
    best_model[name] = [test_score]
    
# bagging
for i in range(1, 6):
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(max_depth=i).fit(x_tr, y_tr)
    best_model[f'bagging, max_depth={i}'] = [model.score(x_te, y_te)]
    
# boosting
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier().fit(x_tr, y_tr)
best_model['boosting'] = [model.score(x_te, y_te)]

# stacking
from sklearn.ensemble import StackingClassifier

estimators = [('rf', RandomForestClassifier()),
             ('gb', GradientBoostingClassifier())]

model = StackingClassifier(estimators=estimators,
                          final_estimator=LogisticRegression())

best_model['stacking'] = [model.fit(x_tr, y_tr).score(x_te, y_te)]

import pandas as pd
pd.DataFrame(best_model).T

Unnamed: 0,0
hard,98.222222
soft,97.777778
knn1,98.0
knn2,98.666667
lr,96.444444
dt3,46.888889
dt5,67.555556
"bagging, max_depth=1",0.695556
"bagging, max_depth=2",0.826667
"bagging, max_depth=3",0.868889


In [40]:
a = pd.DataFrame(best_model).T.sort_values(by = 0, ascending=False).reset_index().iloc[0,0]
b = pd.DataFrame(best_model).T.sort_values(by = 0, ascending=False).reset_index().iloc[0,1]
print(f'가장 좋은 성능을 가진 모델은 {a}, {b:.2f}% 입니다')

가장 좋은 성능을 가진 모델은 knn2, 98.67% 입니다


In [None]:
#조익준

from sklearn.datasets import load_digits
digits = load_digits()
best_model = {}

# 데이터 분할
x_tr, x_te, y_tr, y_te = train_test_split(digits['data'],
                                                    digits['target'],
                                                    stratify=digits['target'],
                                                    random_state=0)

# 모델 설정
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(max_iter=10000)
dt3 = DecisionTreeClassifier(max_depth=3)
dt5 = DecisionTreeClassifier(max_depth=5)

# voting
from sklearn.ensemble import VotingClassifier
hard = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr), ('dt3', dt3), ('dt5', dt5)])

soft = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr), ('dt3', dt3), ('dt5', dt5)], voting='soft')

names = ['hard', 'soft', 'knn1', 'knn2', 'lr', 'dt3', 'dt5']
for idx, model in enumerate([hard, soft, knn1, knn2, lr, dt3, dt5]):
    model.fit(x_tr, y_tr)
    name = names[idx]
    train_score = model.score(x_tr, y_tr) * 100
    test_score = model.score(x_te, y_te) * 100
    best_model[name] = [test_score]
    
# bagging
for i in range(1, 6):
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(max_depth=i).fit(x_tr, y_tr)
    best_model[f'bagging, max_depth={i}'] = [model.score(x_te, y_te)]
    
# boosting
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier().fit(x_tr, y_tr)
best_model['boosting'] = [model.score(x_te, y_te)]

# stacking
from sklearn.ensemble import StackingClassifier

estimators = [('rf', RandomForestClassifier()),
             ('gb', GradientBoostingClassifier())]

model = StackingClassifier(estimators=estimators,
                          final_estimator=LogisticRegression())

best_model['stacking'] = [model.fit(x_tr, y_tr).score(x_te, y_te)]

import pandas as pd
best_model_df = pd.DataFrame(best_model).T
print(best_model_df)
print(best_model_df.sort_values(0, ascending=False).reset_index())
best_model_df.sort_values(0, ascending=False).reset_index().loc[0,"index"]