## Voting

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()

x_tr, x_te, y_tr, y_te = train_test_split(
    cancer['data'], cancer['target'], stratify=cancer['target'],  random_state=0
)
x_tr.shape, x_te.shape

((426, 30), (143, 30))

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(max_iter=10000)
dt3 = DecisionTreeClassifier(max_depth=3)
dt5 = DecisionTreeClassifier(max_depth=5)

In [4]:
from sklearn.ensemble import VotingClassifier
hard = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr',lr), 
                         ('dt3',dt3), ('dt5',dt5)])
soft = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr',lr), 
                         ('dt3',dt3), ('dt5',dt5)], voting = 'soft')

In [5]:
names = ['hard', 'soft', 'knn1', 'knn2', 'lr', 'dt3', 'dt5']

for idx, model in enumerate([hard, soft, knn1, knn2, lr, dt3, dt5]):
    model.fit(x_tr, y_tr)
    name = names[idx]
    train_score = model.score(x_tr, y_tr) * 100
    test_score = model.score(x_te, y_te) * 100
    print(f'{name} Train Accuracy: {train_score:.2f}%')
    print(f'{name} Test Accuracy: {test_score:.2f}%')
    print()

hard Train Accuracy: 98.12%
hard Train Accuracy: 95.10%

soft Train Accuracy: 99.53%
soft Train Accuracy: 95.80%

knn1 Train Accuracy: 94.60%
knn1 Train Accuracy: 91.61%

knn2 Train Accuracy: 95.77%
knn2 Train Accuracy: 91.61%

lr Train Accuracy: 96.71%
lr Train Accuracy: 93.71%

dt3 Train Accuracy: 97.65%
dt3 Train Accuracy: 91.61%

dt5 Train Accuracy: 100.00%
dt5 Train Accuracy: 92.31%



## Bagging

In [6]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=5).fit(x_tr, y_tr)
model.score(x_tr, y_tr), model.score(x_te, y_te)

(1.0, 0.951048951048951)

In [7]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=2).fit(x_tr, y_tr)
model.score(x_tr, y_tr), model.score(x_te, y_te)

(0.9647887323943662, 0.9300699300699301)

In [8]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=1).fit(x_tr, y_tr)
model.score(x_tr, y_tr), model.score(x_te, y_te)

(0.9366197183098591, 0.9230769230769231)

## Boosting

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier().fit(x_tr, y_tr)
model.score(x_tr, y_tr), model.score(x_te, y_te)

(1.0, 0.958041958041958)

## Stacking

In [11]:
from sklearn.ensemble import StackingClassifier

estimators = [('rf', RandomForestClassifier()),
              ('gb', GradientBoostingClassifier())]
model = StackingClassifier(estimators=estimators,
                          final_estimator=LogisticRegression())
model.fit(x_tr, y_tr).score(x_te, y_te)

0.958041958041958

## 손글씨 데이터 비교

In [21]:
from sklearn.datasets import load_digits

digit = load_digits()
y = digit['target']
X = digit['data']

x_tr, x_te, y_tr, y_te = train_test_split(
    X, y, random_state=0, stratify=y, test_size=0.3
)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(max_iter=10000)
dt3 = DecisionTreeClassifier(max_depth=3)
dt5 = DecisionTreeClassifier(max_depth=5)

from sklearn.ensemble import VotingClassifier
hard = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr',lr), 
                         ('dt3',dt3), ('dt5',dt5)])
soft = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr',lr), 
                         ('dt3',dt3), ('dt5',dt5)], voting = 'soft')

names = ['hard', 'soft', 'knn1', 'knn2', 'lr', 'dt3', 'dt5']

for idx, model in enumerate([hard, soft, knn, lr, dt]):
    model.fit(x_tr, y_tr)
    name = names[idx]
    train_score = model.score(x_tr, y_tr) * 100
    test_score = model.score(x_te, y_te) * 100
    print(f'{name} Train Accuracy: {train_score:.2f}%')
    print(f'{name} Test Accuracy: {test_score:.2f}%')
    print()

hard Train Accuracy: 99.68%
hard Test Accuracy: 98.52%

soft Train Accuracy: 99.84%
soft Test Accuracy: 98.52%

knn Train Accuracy: 99.20%
knn Test Accuracy: 98.52%

lr Train Accuracy: 100.00%
lr Test Accuracy: 96.67%

dt Train Accuracy: 74.30%
dt Test Accuracy: 72.04%



In [23]:
from sklearn.datasets import load_digits
digits = load_digits()
best_model = {}

# 데이터 분할
x_tr, x_te, y_tr, y_te = train_test_split(digits['data'],
                                                    digits['target'],
                                                    stratify=digits['target'],
                                                    random_state=0)

# 모델 설정
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(max_iter=10000)
dt3 = DecisionTreeClassifier(max_depth=3)
dt5 = DecisionTreeClassifier(max_depth=5)

# voting
from sklearn.ensemble import VotingClassifier
hard = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr), ('dt3', dt3), ('dt5', dt5)])

soft = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr), ('dt3', dt3), ('dt5', dt5)], voting='soft')

names = ['hard', 'soft', 'knn1', 'knn2', 'lr', 'dt3', 'dt5']
for idx, model in enumerate([hard, soft, knn1, knn2, lr, dt3, dt5]):
    model.fit(x_tr, y_tr)
    name = names[idx]
    train_score = model.score(x_tr, y_tr) * 100
    test_score = model.score(x_te, y_te) * 100
    best_model[name] = [test_score]
    
# bagging
for i in range(1, 6):
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(max_depth=i).fit(x_tr, y_tr)
    best_model[f'bagging, max_depth={i}'] = [model.score(x_te, y_te)]
    
# boosting
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier().fit(x_tr, y_tr)
best_model['boosting'] = [model.score(x_te, y_te)]

# stacking
from sklearn.ensemble import StackingClassifier

estimators = [('rf', RandomForestClassifier()),
             ('gb', GradientBoostingClassifier())]

model = StackingClassifier(estimators=estimators,
                          final_estimator=LogisticRegression())

best_model['stacking'] = [model.fit(x_tr, y_tr).score(x_te, y_te)]

# 최댓값 저장 후 출력
max_value = max(best_model.values())

for key, value in best_model.items():
    if value == max_value:
        print(f'가장 좋은 모델은 {key}, 성능은 {value:.2f}%')

In [25]:
import pandas as pd
best_model_df = pd.DataFrame(best_model).T
best_model_df

Unnamed: 0,0
hard,98.222222
soft,97.777778
knn1,98.0
knn2,98.666667
lr,96.444444
dt3,46.888889
dt5,67.555556
"bagging, max_depth=1",0.804444
"bagging, max_depth=2",0.82
"bagging, max_depth=3",0.875556


In [40]:
best_model_df.sort_values(0, ascending=False).reset_index()
best_model_df.sort_values(0, ascending=False).reset_index().loc[0,"index"]

'knn2'