## 앙상블(Ensemble) 학습

In [1]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [2]:
from sklearn.preprocessing import MinMaxScaler
cancer_scaled = MinMaxScaler().fit_transform(cancer.data)

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    cancer_scaled, cancer.target, stratify=cancer.target, random_state=2021
)

### 1. 앙상블 학습 - Hard Voting
- 로지스틱 회귀
- 서포트벡터머신(SVC)
- K 최근접 이웃

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
lrc = LogisticRegression()
svc = SVC()
knn = KNeighborsClassifier()

In [6]:
from sklearn.ensemble import VotingClassifier

voc = VotingClassifier(
    estimators=[('LRC',lrc),('SVC',svc),('KNN',knn)], voting ='hard'
)

In [8]:
voc.fit(X_train, y_train)
voc.score(X_test, y_test)

0.986013986013986

- 개별 분류기의 성능

In [9]:
lrc.fit(X_train, y_train)
svc.fit(X_train, y_train)
knn.fit(X_train, y_train)
lrc.score(X_test, y_test), svc.score(X_test, y_test), knn.score(X_test, y_test)

(0.986013986013986, 0.986013986013986, 0.986013986013986)

### 2. 앙상블 학습 - 소프트 보팅

In [10]:
# 메소드와 속성
print(dir(lrc))

['C', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_check_feature_names', '_check_n_features', '_estimator_type', '_get_param_names', '_get_tags', '_more_tags', '_predict_proba_lr', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_validate_data', 'class_weight', 'classes_', 'coef_', 'decision_function', 'densify', 'dual', 'fit', 'fit_intercept', 'get_params', 'intercept_', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_features_in_', 'n_iter_', 'n_jobs', 'penalty', 'predict', 'predict_log_proba', 'predict_proba', 'random_state', 'score', 'set_params', 'solver', 'sparsify', 'tol', 'verbose', 'warm_start']


In [11]:
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [14]:
svc2 = SVC(probability=True)
svc2.fit(X_train, y_train)
svc2.predict_proba(X_test[:5])

array([[9.99922782e-01, 7.72181899e-05],
       [9.99943681e-01, 5.63189493e-05],
       [3.34079882e-06, 9.99996659e-01],
       [9.45347465e-01, 5.46525353e-02],
       [3.47125094e-02, 9.65287491e-01]])

In [15]:
knn.predict_proba(X_test[:5])

array([[1. , 0. ],
       [1. , 0. ],
       [0. , 1. ],
       [0.8, 0.2],
       [0. , 1. ]])

In [29]:
voc2 = VotingClassifier(
    estimators=[('LRC',lrc),('SVC',svc2),('KNN',knn)], voting ='soft'
)

In [30]:
voc2.fit(X_train, y_train)
voc2.score(X_test, y_test)

0.993006993006993

In [31]:
voc2.predict_proba(X_test[:5])

array([[0.99529561, 0.00470439],
       [0.99436662, 0.00563338],
       [0.00747534, 0.99252466],
       [0.804216  , 0.195784  ],
       [0.05812803, 0.94187197]])

- GridSearchCV 적용

In [35]:
params = {
    'LRC__C': [0.01,0.05, 0.1, 0.2, 0.5],
    'SVC__C': [5,10,20,50]
}

In [36]:
from sklearn.model_selection import GridSearchCV
grid_voc2 = GridSearchCV(voc2, param_grid=params, scoring='accuracy', cv=3)
grid_voc2.fit(X_train, y_train)
grid_voc2.best_params_

{'LRC__C': 0.05, 'SVC__C': 50}

In [37]:
grid_voc2.best_estimator_.score(X_test, y_test)

0.993006993006993

In [38]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=2021)
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2021,
 'verbose': 0,
 'warm_start': False}

In [39]:
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.9790209790209791