<a href="https://colab.research.google.com/github/skyworld19/ds-school-advanced/blob/master/ML_09_Ensemble_LAB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Ensemble

#### 위스콘신 유방암 진단(이진 분류)

In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Load the breast-cancer dataset that ships with scikit-learn.
cancer = load_breast_cancer()

# Put the features in a table, append the label as a trailing 'class' column,
# and write everything to 'cancer.csv' so later cells can reload it from disk.
df = pd.DataFrame(cancer.data, columns=cancer.feature_names).assign(**{'class': cancer.target})
df.to_csv('cancer.csv', index=False)

# Names of the target classes.
target_names = cancer.target_names
print(target_names)

['malignant' 'benign']


In [2]:
import pandas as pd

# Reload the table written to 'cancer.csv'.
cancer = pd.read_csv('cancer.csv')
# Every column but the last is a feature; the last column is the label.
X, y = cancer.iloc[:, :-1], cancer.iloc[:, -1]
X.shape, y.shape

((569, 30), (569,))

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split BEFORE scaling. Fitting the scaler on the full data set would let the
# test rows influence the mean/variance used to transform the training rows
# (data leakage), which makes the test score optimistic.
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=0)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both parts.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

[a.shape for a in (x_train, x_test, y_train, y_test)]

[(426, 30), (143, 30), (426,), (143,)]

#### voting 참여 분류기 생성
- https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html

In [14]:
# Look up how to use VotingClassifier (prints its docstring).
from sklearn.ensemble import VotingClassifier
help(VotingClassifier)

Help on class VotingClassifier in module sklearn.ensemble._voting:

class VotingClassifier(sklearn.base.ClassifierMixin, _BaseVoting)
 |  VotingClassifier(estimators, voting='hard', weights=None, n_jobs=None, flatten_transform=True)
 |  
 |  Soft Voting/Majority Rule classifier for unfitted estimators.
 |  
 |  .. versionadded:: 0.17
 |  
 |  Read more in the :ref:`User Guide <voting_classifier>`.
 |  
 |  Parameters
 |  ----------
 |  estimators : list of (str, estimator) tuples
 |      Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones
 |      of those original estimators that will be stored in the class attribute
 |      ``self.estimators_``. An estimator can be set to ``'drop'``
 |      using ``set_params``.
 |  
 |      .. deprecated:: 0.22
 |         Using ``None`` to drop an estimator is deprecated in 0.22 and
 |         support will be dropped in 0.24. Use the string ``'drop'`` instead.
 |  
 |  voting : str, {'hard', 'soft'} (default='hard')
 |      If 'ha

In [15]:
# Look up how to use VotingRegressor (IPython `?` help syntax).
from sklearn.ensemble import VotingRegressor
VotingRegressor?

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Base learners that take part in the vote:
#   - two KNN models with n_neighbors = 3 and 5
#   - one LogisticRegression with max_iter=10000
#   - two decision trees with max_depth = 3 and 5
knn1 = KNeighborsClassifier(n_neighbors=3)
knn2 = KNeighborsClassifier(n_neighbors=5)
lr = LogisticRegression(max_iter=10000)
# random_state is fixed so the trees (and every score that depends on them)
# come out the same on each Restart & Run All.
dt3 = DecisionTreeClassifier(max_depth=3, random_state=0)
dt5 = DecisionTreeClassifier(max_depth=5, random_state=0)

In [11]:
from sklearn.ensemble import VotingClassifier

names = ['knn1', 'knn2', 'lr', 'dt3', 'dt5']
estimator = [knn1, knn2, lr, dt3, dt5]
# Pair every label with its model: [('knn1', knn1), ('knn2', knn2), ...]
estimators = [(label, clf) for label, clf in zip(names, estimator)]

# Hard voting ensemble over the base learners.
hard = VotingClassifier(estimators=estimators, voting='hard')

# Soft voting ensemble over the same base learners.
soft = VotingClassifier(estimators=estimators, voting='soft')

In [13]:
# Ensembles first, then each base learner on its own.
# Stored under a separate name so that `estimators` (the (name, model) pairs
# handed to the voting classifiers) is not rebound to a list of a different
# shape by this cell.
models = {
    'hard': hard,
    'soft': soft,
    'knn1': knn1,
    'knn2': knn2,
    'lr': lr,
    'dt3': dt3,
    'dt5': dt5,
}

# Fit every model on the training split and report train/test accuracy.
for name, model in models.items():
    model.fit(x_train, y_train)
    print(f'{name:4s} Train Accuracy {model.score(x_train, y_train)*100 : .2f}%')
    print(f'{name:4s} Test  Accuracy {model.score(x_test, y_test)*100 : .2f}%')

hard Train Accuracy  99.30%
hard Test  Accuracy  96.50%
soft Train Accuracy  99.77%
soft Test  Accuracy  94.41%
knn1 Train Accuracy  98.83%
knn1 Test  Accuracy  95.10%
knn2 Train Accuracy  98.36%
knn2 Test  Accuracy  95.10%
lr   Train Accuracy  99.06%
lr   Test  Accuracy  95.80%
dt3  Train Accuracy  97.65%
dt3  Test  Accuracy  91.61%
dt5  Train Accuracy  100.00%
dt5  Test  Accuracy  90.21%


### 배깅(Bagging) 방식 - RandomForest
- https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Bagging: fit a random forest (100 trees, max_depth=3, random_state=0) on the
# training split, then display (train score, test score).
# NOTE(review): the original exercise text asked for max_depth=5, but the code
# uses 3 - confirm which one is intended.
model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)
model.fit(x_train, y_train)
model.score(x_train, y_train), model.score(x_test, y_test)

(0.9882629107981221, 0.9440559440559441)

### 부스팅(Boosting) 방식 : GradientBoosting

In [23]:
from sklearn.ensemble import GradientBoostingClassifier

# Boosting: fit a GradientBoostingClassifier (random_state=0) on the training
# split, then display (train score, test score).
model = GradientBoostingClassifier(random_state=0)
model.fit(x_train, y_train)
model.score(x_train, y_train), model.score(x_test, y_test)

(1.0, 0.958041958041958)

### Stacking 방식 : StackingClassifier

In [None]:
StackingClassifier?

In [25]:
from sklearn.ensemble import StackingClassifier

# Stacking: a random forest ('rf') and a gradient-boosting model ('gb') are the
# base estimators; a LogisticRegression combines their outputs as the
# final_estimator.
# The base estimators are seeded so the scores are reproducible across runs.
estimators = [
    ('rf', RandomForestClassifier(random_state=0)),
    ('gb', GradientBoostingClassifier(random_state=0)),
]
final_estimator = LogisticRegression(max_iter=10000, random_state=0)

model = StackingClassifier(estimators=estimators, final_estimator=final_estimator)
model.fit(x_train, y_train)
model.score(x_train, y_train), model.score(x_test, y_test)

(1.0, 0.958041958041958)

### XGBoost (Extreme Gradient Boosting)

In [26]:
!pip install xgboost



#### XGBOOST 분류 실습
- https://xgboost.readthedocs.io/en/latest/index.html
- https://xgboost.readthedocs.io/en/latest/python/python_api.html

In [27]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [None]:
# Show the XGBClassifier docstring (IPython `?` help syntax).
XGBClassifier?

In [29]:
# 1. Load the breast-cancer dataset.
# 2. Split it 75%:25% (train:test) with random_state=0.
cancer = load_breast_cancer()
# The split is intentionally not stratified (no stratify=cancer.target): the
# author noted that the score came out better without it.
x_train, x_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.25, random_state=0
)

In [30]:
# 3. Build an XGBClassifier, train it, and check its performance.
#    Settings: n_estimators=400, learning_rate=0.1, max_depth=3,
#    use_label_encoder=False (deprecated option), eval_metric='mlogloss'.
model = XGBClassifier(
    n_estimators=400,
    learning_rate=0.1,
    max_depth=3,
    use_label_encoder=False,
    eval_metric='mlogloss',
)
model.fit(x_train, y_train)
# Displayed as (train score, test score).
model.score(x_train, y_train), model.score(x_test, y_test)

(1.0, 0.986013986013986)

#### XGBOOST 회귀 실습

In [31]:
import numpy as np
import pandas as pd

# Build the Boston housing table and store it as 'boston.csv'.
# load_boston was removed in scikit-learn 1.2 (the import raises ImportError),
# so fall back to rebuilding the same arrays from the original source file,
# following the recipe given in scikit-learn's removal notice.
try:
    from sklearn.datasets import load_boston
except ImportError:
    # In the raw file each record spans two physical lines:
    # 11 values on the first line, then 3 values (B, LSTAT, MEDV) on the second.
    raw = pd.read_csv('http://lib.stat.cmu.edu/datasets/boston',
                      sep=r'\s+', skiprows=22, header=None)
    features = np.hstack([raw.values[::2, :], raw.values[1::2, :2]])
    prices = raw.values[1::2, 2]
    feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                     'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
else:
    boston = load_boston()
    features, prices, feature_names = boston.data, boston.target, boston.feature_names

df = pd.DataFrame(features, columns=feature_names)
df['PRICE'] = prices
df.to_csv('boston.csv', index=False)

In [32]:
import pandas as pd

# Reload the table written to 'boston.csv'.
boston = pd.read_csv('boston.csv')
# Every column but the last is a feature; the last column is the target.
X, y = boston.iloc[:, :-1], boston.iloc[:, -1]
X.shape, y.shape

((506, 13), (506,))

In [34]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Split the data 75%:25% (train:test) with random_state=0.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Build an XGBRegressor, train it, and check its performance.
#   Settings: n_estimators=1000, learning_rate=0.2, max_depth=3,
#             objective='reg:squarederror'
model = xgb.XGBRegressor(n_estimators=1000,
                         learning_rate=0.2,
                         max_depth=3,
                         objective='reg:squarederror')
model.fit(x_train, y_train)
# Displayed as (train score, test score).
model.score(x_train, y_train), model.score(x_test, y_test)

(0.9999998554210379, 0.7788751773436555)

### LightGBM

In [35]:
!pip install lightgbm



### LightGBM 분류
- https://lightgbm.readthedocs.io/en/latest/
- https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html#lightgbm.LGBMClassifier

In [40]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Prepare the data: reload 'cancer.csv', separate the features from the label
# (last column), and make a stratified train/test split with random_state=0.
cancer = pd.read_csv('cancer.csv')
X, y = cancer.iloc[:, :-1], cancer.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0
)

In [46]:
from lightgbm import LGBMClassifier

# Train an LGBMClassifier and print its train/test scores.
# n_estimators is set to 400 here; try other values as well.
model = LGBMClassifier(n_estimators=400)
model.fit(x_train, y_train)
print('train : {:.3f}, test : {:.3f}'.format(model.score(x_train, y_train),
                                             model.score(x_test, y_test)))

train : 1.000, test : 0.951


In [None]:
# Build an XGBClassifier, train it on the x_train/y_train split currently in
# scope, and print its train/test scores.
#   Settings: n_estimators=400, learning_rate=0.05, max_depth=3,
#   use_label_encoder=False (deprecated option), eval_metric='mlogloss'.
# The split variables in this notebook are lower-case (x_train, x_test);
# X_train / X_test are never defined and would raise NameError.
model = XGBClassifier(n_estimators=400,
                      learning_rate=0.05,
                      max_depth=3,
                      use_label_encoder=False,
                      eval_metric='mlogloss').fit(x_train, y_train)
print(f'train {model.score(x_train, y_train):.3f}, test {model.score(x_test, y_test):.3f}')

In [47]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Prepare the regression data: reload 'boston.csv', separate the features from
# the target (last column), and split with random_state=0.
boston = pd.read_csv('boston.csv')
X, y = boston.iloc[:, :-1], boston.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)

In [48]:
from lightgbm import LGBMRegressor

# Train an LGBMRegressor and print its train/test scores.
# n_estimators=100 here; try other values (tree count, max_depth) as well.
model = LGBMRegressor(n_estimators=100)
model.fit(x_train, y_train)
print('train : {:.3f}, test : {:.3f}'.format(model.score(x_train, y_train),
                                             model.score(x_test, y_test)))

train : 0.976, test : 0.741
