<a href="https://colab.research.google.com/github/seolajo/DALC/blob/main/Ensemble_%EC%8B%A4%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 앙상블(Ensemble)

## Bagging meta-estimator
* bagging은 bootstrap aggregating의 줄임말
* 원래 훈련 데이터셋의 일부를 사용해 여러 모델을 훈련
* 각각의 결과를 결합해 최종 결과를 생성
* 분산을 줄이고 과적합을 막음
* 강력하고 복잡한 모델에서 잘 동작

### 필요한 데이터 셋

In [1]:
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.datasets import load_boston, load_diabetes
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

### 분류 모델

In [2]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [3]:
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

## Bagging을 사용한 분류

데이터셋 불러오기

In [4]:
# Load the three classification datasets (iris, wine, breast cancer) in one go.
iris, wine, cancer = load_iris(), load_wine(), load_breast_cancer()

## KNN
붓꽃 데이터

In [5]:
# Baseline model: standardize the features (zero mean, unit variance) and then
# fit a k-nearest-neighbours classifier. Scaling matters here because KNN is
# distance-based; it is not a universal requirement for every model.
base_model = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Bagging ensemble built on the baseline: 10 estimators, each drawn with 50% of
# the samples and 50% of the features.
# random_state pins the random draws so Restart & Run All reproduces the scores.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
    random_state=42,
)

#### 분류기에 사용되는 매개변수
* n_estimators : 앙상블에 사용할 분류기의 수
* max_samples : 무작위로 뽑을 샘플의 수(0~1사이의 수로 지정하면 비율이 되어, 훈련세트에 곱한 값만큼 샘플링)
* max_features : 각 분류기를 학습시킬 때 무작위로 뽑을 feature의 수(0~1 사이의 수로 지정하면 전체 feature 수에 대한 비율)


In [6]:
# 5-fold cross-validation of the baseline pipeline on iris.
# X is the feature matrix and y the class labels (not a train/test pair);
# cross_validate performs the train/test splitting itself.
cross_val = cross_validate(base_model, iris.data, iris.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.0012207508087158203 (+/- 0.0005087888989232632)
avg score time: 0.002064800262451172 (+/- 0.0003245179591747825)
avg test score: 0.96 (+/- 0.024944382578492935)


In [7]:
# 5-fold cross-validation of the bagging ensemble on iris.
cross_val = cross_validate(bagging_model, iris.data, iris.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.01879086494445801 (+/- 0.0009578473415146687)
avg score time: 0.008879470825195312 (+/- 0.002508421454014624)
avg test score: 0.9533333333333334 (+/- 0.02666666666666666)


와인 데이터

In [8]:
# Baseline for the wine data: scaled k-nearest-neighbours classifier.
base_model = make_pipeline(StandardScaler(), KNeighborsClassifier())
# Bagging ensemble of 10 baselines on 50% of samples / 50% of features each;
# random_state makes the random draws (and therefore the scores) repeatable.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
    random_state=42,
)

In [9]:
# 5-fold cross-validation of the baseline pipeline on the wine data.
cross_val = cross_validate(base_model, wine.data, wine.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.0018657684326171876 (+/- 0.000394191233658201)
avg score time: 0.0037934303283691405 (+/- 0.0007565886760297563)
avg test score: 0.9493650793650794 (+/- 0.037910929811115976)


In [10]:
# 5-fold cross-validation of the bagging ensemble on the wine data.
cross_val = cross_validate(bagging_model, wine.data, wine.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.019149637222290038 (+/- 0.0018615540763482273)
avg score time: 0.007653093338012696 (+/- 0.0002504089168421706)
avg test score: 0.943968253968254 (+/- 0.030290439217464302)


유방암 데이터

In [11]:
# Baseline for the breast-cancer data: scaled k-nearest-neighbours classifier.
base_model = make_pipeline(StandardScaler(), KNeighborsClassifier())
# Bagging ensemble of 10 baselines on 50% of samples / 50% of features each;
# random_state makes the random draws (and therefore the scores) repeatable.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
    random_state=42,
)

In [12]:
# 5-fold cross-validation of the baseline pipeline on the breast-cancer data.
cross_val = cross_validate(base_model, cancer.data, cancer.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.002376413345336914 (+/- 0.00048266635334684926)
avg score time: 0.009030342102050781 (+/- 0.0026339091879440182)
avg test score: 0.9648501785437045 (+/- 0.009609970350036127)


In [13]:
# 5-fold cross-validation of the bagging ensemble on the breast-cancer data.
cross_val = cross_validate(bagging_model, cancer.data, cancer.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.02122673988342285 (+/- 0.0018336854143896511)
avg score time: 0.01530284881591797 (+/- 0.0010609822153585474)
avg test score: 0.9613569321533924 (+/- 0.015257221865679672)


## SVC
붓꽃 데이터

In [14]:
# Baseline for iris: standardized features fed into a support vector classifier.
base_model = make_pipeline(StandardScaler(), SVC())
# Bagging ensemble of 10 baselines on 50% of samples / 50% of features each;
# random_state makes the random draws (and therefore the scores) repeatable.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
    random_state=42,
)

In [15]:
# 5-fold cross-validation of the baseline SVC pipeline on iris.
cross_val = cross_validate(base_model, iris.data, iris.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.002080345153808594 (+/- 0.0003773723914966131)
avg score time: 0.0007339954376220703 (+/- 0.00017940085410036332)
avg test score: 0.9666666666666666 (+/- 0.02108185106778919)


In [16]:
# 5-fold cross-validation of the bagged SVC ensemble on iris.
cross_val = cross_validate(bagging_model, iris.data, iris.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.027127170562744142 (+/- 0.0023767878655427625)
avg score time: 0.0030179500579833986 (+/- 0.00012067752745814188)
avg test score: 0.9333333333333332 (+/- 0.059628479399994376)


와인 데이터

In [17]:
# Baseline for the wine data: standardized features fed into an SVC.
base_model = make_pipeline(StandardScaler(), SVC())
# Bagging ensemble of 10 baselines on 50% of samples / 50% of features each;
# random_state makes the random draws (and therefore the scores) repeatable.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
    random_state=42,
)

In [18]:
# 5-fold cross-validation of the baseline SVC pipeline on the wine data.
cross_val = cross_validate(base_model, wine.data, wine.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.0024512767791748046 (+/- 0.0007476584123443516)
avg score time: 0.0007869243621826172 (+/- 0.0002626643365190286)
avg test score: 0.9833333333333334 (+/- 0.022222222222222233)


In [19]:
# 5-fold cross-validation of the bagged SVC ensemble on the wine data.
cross_val = cross_validate(bagging_model, wine.data, wine.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.029876279830932616 (+/- 0.004179572838757277)
avg score time: 0.0033967971801757814 (+/- 0.0001564702753976469)
avg test score: 0.9776190476190477 (+/- 0.020831783767013237)


유방암 데이터

In [20]:
# Baseline for the breast-cancer data: standardized features fed into an SVC.
base_model = make_pipeline(StandardScaler(), SVC())
# Bagging ensemble of 10 baselines on 50% of samples / 50% of features each;
# random_state makes the random draws (and therefore the scores) repeatable.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
    random_state=42,
)

In [21]:
# 5-fold cross-validation of the baseline SVC pipeline on the breast-cancer data.
cross_val = cross_validate(base_model, cancer.data, cancer.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.007138681411743164 (+/- 0.0014546753214245444)
avg score time: 0.001730203628540039 (+/- 0.00036574525952877635)
avg test score: 0.9736376339077782 (+/- 0.014678541667933545)


In [22]:
# 5-fold cross-validation of the bagged SVC ensemble on the breast-cancer data.
cross_val = cross_validate(bagging_model, cancer.data, cancer.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.03673653602600098 (+/- 0.0020433447713278574)
avg score time: 0.008465814590454101 (+/- 0.0019212908816338668)
avg test score: 0.9595714951094549 (+/- 0.008977387313188354)


## Decision Tree
붓꽃 데이터

In [23]:
# Baseline for iris: standardized features fed into a decision tree.
# The tree is seeded so the stand-alone baseline score is repeatable.
base_model = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=42))
# Bagging ensemble of 10 baselines on 50% of samples / 50% of features each;
# random_state makes the random draws (and therefore the scores) repeatable.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
    random_state=42,
)

In [24]:
# 5-fold cross-validation of the baseline decision-tree pipeline on iris.
cross_val = cross_validate(base_model, iris.data, iris.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.0021383285522460936 (+/- 0.0010987323498172865)
avg score time: 0.0005402565002441406 (+/- 0.00022373669994183684)
avg test score: 0.9600000000000002 (+/- 0.03265986323710903)


In [25]:
# 5-fold cross-validation of the bagged decision-tree ensemble on iris.
cross_val = cross_validate(bagging_model, iris.data, iris.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.033492755889892575 (+/- 0.005894680024705409)
avg score time: 0.002930021286010742 (+/- 0.0010537683750532249)
avg test score: 0.9466666666666667 (+/- 0.03399346342395189)


와인 데이터

In [26]:
# Baseline for the wine data: standardized features fed into a decision tree.
# The tree is seeded so the stand-alone baseline score is repeatable.
base_model = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=42))
# Bagging ensemble of 10 baselines on 50% of samples / 50% of features each;
# random_state makes the random draws (and therefore the scores) repeatable.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
    random_state=42,
)

In [27]:
# 5-fold cross-validation of the baseline decision-tree pipeline on the wine data.
cross_val = cross_validate(base_model, wine.data, wine.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.003222084045410156 (+/- 0.0022195444753828875)
avg score time: 0.0006091594696044922 (+/- 0.0004473677476408026)
avg test score: 0.882063492063492 (+/- 0.0409006687162463)


In [28]:
# 5-fold cross-validation of the bagged decision-tree ensemble on the wine data.
cross_val = cross_validate(bagging_model, wine.data, wine.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.02639470100402832 (+/- 0.004841333553374879)
avg score time: 0.0022956371307373048 (+/- 1.3144282446078763e-05)
avg test score: 0.938095238095238 (+/- 0.04148600288499907)


유방암 데이터

In [29]:
# Baseline for the breast-cancer data: standardized features fed into a decision tree.
# The tree is seeded so the stand-alone baseline score is repeatable.
base_model = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=42))
# Bagging ensemble of 10 baselines on 50% of samples / 50% of features each;
# random_state makes the random draws (and therefore the scores) repeatable.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
    random_state=42,
)

In [30]:
# 5-fold cross-validation of the baseline decision-tree pipeline on the breast-cancer data.
cross_val = cross_validate(base_model, cancer.data, cancer.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.010301494598388672 (+/- 0.0017823769775063378)
avg score time: 0.0007838726043701172 (+/- 0.00022066730578427683)
avg test score: 0.9226362366092221 (+/- 0.018909928670903447)


In [31]:
# 5-fold cross-validation of the bagged decision-tree ensemble on the breast-cancer data.
cross_val = cross_validate(bagging_model, cancer.data, cancer.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.03782873153686524 (+/- 0.004889295759574268)
avg score time: 0.002758502960205078 (+/- 0.00028188566480759065)
avg test score: 0.9332712311752832 (+/- 0.023844853246203612)


## Bagging을 사용한 회귀
### 데이터셋 불러오기

In [32]:
# Load the two regression datasets used by the regression sections.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell (and its import) only runs on older versions — confirm the
# pinned scikit-learn version or switch to another regression dataset.
boston, diabetes = load_boston(), load_diabetes()

## KNN
### 보스턴 주택 가격 데이터

In [33]:
# Baseline for the Boston housing data: scaled k-nearest-neighbours regressor.
base_model = make_pipeline(StandardScaler(), KNeighborsRegressor())
# Bagging ensemble of 10 baselines on 50% of samples / 50% of features each;
# random_state makes the random draws (and therefore the scores) repeatable.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
    random_state=42,
)

In [34]:
# 5-fold cross-validation of the baseline KNN regressor on the Boston housing data.
# No scoring is passed, so test_score is the estimator's own default score().
cross_val = cross_validate(base_model, boston.data, boston.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.0018446922302246093 (+/- 0.001102035608033209)
avg score time: 0.002072429656982422 (+/- 0.0006479740055097421)
avg test score: 0.47357748833823543 (+/- 0.13243123464477455)


In [35]:
# 5-fold cross-validation of the bagged KNN regressor on the Boston housing data.
cross_val = cross_validate(bagging_model, boston.data, boston.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.018879508972167967 (+/- 0.0018086020784023444)
avg score time: 0.012230396270751953 (+/- 0.002684078806724339)
avg test score: 0.4745459014769217 (+/- 0.17851070449847778)


### 당뇨병 데이터

In [36]:
# Baseline for the diabetes data: scaled k-nearest-neighbours regressor.
base_model = make_pipeline(StandardScaler(), KNeighborsRegressor())
# Bagging ensemble of 10 baselines on 50% of samples / 50% of features each;
# random_state makes the random draws (and therefore the scores) repeatable.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
    random_state=42,
)

In [37]:
# 5-fold cross-validation of the baseline KNN regressor on the diabetes data.
cross_val = cross_validate(base_model, diabetes.data, diabetes.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.0018538475036621095 (+/- 0.0006431935793740656)
avg score time: 0.002712869644165039 (+/- 0.0008014978986931604)
avg test score: 0.3689720650295623 (+/- 0.044659049060165365)


In [38]:
# 5-fold cross-validation of the bagged KNN regressor on the diabetes data.
cross_val = cross_validate(bagging_model, diabetes.data, diabetes.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.023009681701660158 (+/- 0.0069039273040999985)
avg score time: 0.00931253433227539 (+/- 0.00034194512935813506)
avg test score: 0.41134345208895234 (+/- 0.04728474132235157)


## SVR
### 보스턴 주택 가격 데이터

In [39]:
# Baseline for the Boston housing data: standardized features fed into an SVR.
base_model = make_pipeline(StandardScaler(), SVR())
# Bagging ensemble of 10 baselines on 50% of samples / 50% of features each;
# random_state makes the random draws (and therefore the scores) repeatable.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
    random_state=42,
)

In [40]:
# 5-fold cross-validation of the baseline SVR pipeline on the Boston housing data.
cross_val = cross_validate(base_model, boston.data, boston.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.01750011444091797 (+/- 0.00512478600577649)
avg score time: 0.002388763427734375 (+/- 4.1927627279263275e-05)
avg test score: 0.17631266230186618 (+/- 0.5224914915128981)


In [41]:
# 5-fold cross-validation of the bagged SVR ensemble on the Boston housing data.
cross_val = cross_validate(bagging_model, boston.data, boston.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.05516996383666992 (+/- 0.0032367436359259983)
avg score time: 0.008722972869873048 (+/- 0.0003133100775772373)
avg test score: 0.13798340767626893 (+/- 0.253427596262223)


### 당뇨병 데이터

In [42]:
# Baseline for the diabetes data: standardized features fed into an SVR.
base_model = make_pipeline(StandardScaler(), SVR())
# Bagging ensemble of 10 baselines on 50% of samples / 50% of features each;
# random_state makes the random draws (and therefore the scores) repeatable.
bagging_model = BaggingRegressor(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
    random_state=42,
)

In [43]:
# 5-fold cross-validation of the baseline SVR pipeline on the diabetes data.
cross_val = cross_validate(base_model, diabetes.data, diabetes.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.010786914825439453 (+/- 0.0017440746553752545)
avg score time: 0.0019298076629638671 (+/- 7.473658487680405e-05)
avg test score: 0.14659936199629434 (+/- 0.02190798003342928)


In [44]:
# 5-fold cross-validation of the bagged SVR ensemble on the diabetes data.
cross_val = cross_validate(bagging_model, diabetes.data, diabetes.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.041228866577148436 (+/- 0.0029948629920508885)
avg score time: 0.007476234436035156 (+/- 0.000730949255694142)
avg test score: 0.06397169657654182 (+/- 0.03107081449212264)


## Random Forest
* sklearn.ensemble 모듈에는 무작위 결정 트리를 기반으로 하는 두 개의 평균화 알고리즘이 존재
    + Random Forest
    + Extra-Trees

In [45]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

## Random Forest 분류

In [46]:
# Random-forest classifier behind a standardizing step.
# random_state seeds the forest's internal randomness so the cross-validation
# scores below are reproducible on Restart & Run All.
model = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(random_state=42),
)

In [47]:
# 5-fold cross-validation of the random-forest pipeline on iris.
cross_val = cross_validate(model, iris.data, iris.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.15072031021118165 (+/- 0.017714418503095658)
avg score time: 0.008526420593261719 (+/- 0.0006457059808429199)
avg test score: 0.9666666666666668 (+/- 0.02108185106778919)


In [48]:
# 5-fold cross-validation of the random-forest pipeline on the wine data.
cross_val = cross_validate(model, wine.data, wine.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.13926634788513184 (+/- 0.0023179440533390317)
avg score time: 0.008147192001342774 (+/- 0.00010585477755789996)
avg test score: 0.9777777777777779 (+/- 0.02721655269759088)


In [49]:
# 5-fold cross-validation of the random-forest pipeline on the breast-cancer data.
cross_val = cross_validate(model, cancer.data, cancer.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.20904269218444824 (+/- 0.009468847176537317)
avg score time: 0.011718511581420898 (+/- 0.003234545069576563)
avg test score: 0.9666200900481291 (+/- 0.012867010404453715)


## Random Forest 회귀

In [50]:
# Random-forest regressor behind a standardizing step.
# random_state seeds the forest's internal randomness so the cross-validation
# scores below are reproducible on Restart & Run All.
model = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(random_state=42),
)

In [51]:
# 5-fold cross-validation of the random-forest regressor on the Boston housing data.
cross_val = cross_validate(model, boston.data, boston.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.31942081451416016 (+/- 0.0024534984868021473)
avg score time: 0.008062267303466797 (+/- 0.00025974163453519085)
avg test score: 0.6200729855294366 (+/- 0.2112490681457827)


In [52]:
# 5-fold cross-validation of the random-forest regressor on the diabetes data.
cross_val = cross_validate(model, diabetes.data, diabetes.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.26297574043273925 (+/- 0.005532609159139478)
avg score time: 0.008093833923339844 (+/- 0.00017167846276766726)
avg test score: 0.4172536564445563 (+/- 0.047710128634865526)


## AdaBoost
* 대표적인 부스팅 알고리즘
* 일련의 약한 모델들을 학습
* 수정된 버전의 데이터를 반복 학습(가중치가 적용된)
* 가중치 투표(또는 합)을 통해 각 모델의 예측 값을 결합
* 첫 단계에서는 원본 데이터를 학습하고 연속적인 반복마다 개별 샘플에 대한 가중치가 수정되고 다시 모델이 학습
    + 잘못 예측된 샘플은 가중치 증가, 올바르게 예측된 샘플은 가중치 감소
    + 각각의 약한 모델들은 예측하기 어려운 샘플에 집중하게 됨

![image.png](attachment:image.png)

In [53]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor

## AdaBoost 분류

In [54]:
# AdaBoost classifier behind a standardizing step.
# random_state seeds the booster so the cross-validation scores below are
# reproducible on Restart & Run All.
model = make_pipeline(
    StandardScaler(),
    AdaBoostClassifier(random_state=42),
)

In [55]:
# 5-fold cross-validation of the AdaBoost classifier pipeline on iris.
cross_val = cross_validate(model, iris.data, iris.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.0742027759552002 (+/- 0.00976212755663566)
avg score time: 0.006656122207641601 (+/- 0.00012840839547360937)
avg test score: 0.9466666666666667 (+/- 0.03399346342395189)


In [56]:
# 5-fold cross-validation of the AdaBoost classifier pipeline on the wine data.
cross_val = cross_validate(model, wine.data, wine.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.07975082397460938 (+/- 0.004126938451489134)
avg score time: 0.0069091796875 (+/- 0.0003235805642433975)
avg test score: 0.8085714285714285 (+/- 0.16822356718459935)


In [57]:
# 5-fold cross-validation of the AdaBoost classifier pipeline on the breast-cancer data.
cross_val = cross_validate(model, cancer.data, cancer.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.1527167797088623 (+/- 0.002044989820938889)
avg score time: 0.00834345817565918 (+/- 0.0012545984530803985)
avg test score: 0.9701133364384411 (+/- 0.019709915473893072)


## AdaBoost 회귀

In [58]:
# AdaBoost regressor behind a standardizing step.
# random_state seeds the booster so the cross-validation scores below are
# reproducible on Restart & Run All.
model = make_pipeline(
    StandardScaler(),
    AdaBoostRegressor(random_state=42),
)

In [59]:
# 5-fold cross-validation of the AdaBoost regressor pipeline on the Boston housing data.
cross_val = cross_validate(model, boston.data, boston.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.10001239776611329 (+/- 0.006485442051864314)
avg score time: 0.004094505310058593 (+/- 0.00012459396352764896)
avg test score: 0.5940988557291916 (+/- 0.2209236978627281)


In [60]:
# 5-fold cross-validation of the AdaBoost regressor pipeline on the diabetes data.
cross_val = cross_validate(model, diabetes.data, diabetes.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.08783388137817383 (+/- 0.0022239148047999337)
avg score time: 0.0044214725494384766 (+/- 0.0005940702675554197)
avg test score: 0.41751180044046443 (+/- 0.031042796842310146)


## Gradient Tree Boosting
* 임의의 미분 가능한 손실함수로 일반화한 부스팅 알고리즘
* 웹 검색, 분류 및 회귀 등 다양한 분야에서 모두 사용 가능

In [61]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

## Gradient Tree Boosting 분류

In [62]:
# Gradient-boosting classifier behind a standardizing step.
# random_state seeds the booster so the cross-validation scores below are
# reproducible on Restart & Run All.
model = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier(random_state=42),
)

In [63]:
# 5-fold cross-validation of the gradient-boosting classifier pipeline on iris.
cross_val = cross_validate(model, iris.data, iris.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.19951448440551758 (+/- 0.017595515190677716)
avg score time: 0.000997018814086914 (+/- 4.542143630983864e-05)
avg test score: 0.96 (+/- 0.024944382578492935)


In [64]:
# 5-fold cross-validation of the gradient-boosting classifier pipeline on the wine data.
cross_val = cross_validate(model, wine.data, wine.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.2365866184234619 (+/- 0.0047326304423223895)
avg score time: 0.0010693073272705078 (+/- 0.000261464210745591)
avg test score: 0.9385714285714286 (+/- 0.032068206474093704)


In [65]:
# 5-fold cross-validation of the gradient-boosting classifier pipeline on the breast-cancer data.
cross_val = cross_validate(model, cancer.data, cancer.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.4282867431640625 (+/- 0.0071630494687577794)
avg score time: 0.0009654045104980468 (+/- 5.971353121877258e-05)
avg test score: 0.9631268436578171 (+/- 0.02027905296701261)


## Gradient Tree Boosting 회귀

In [66]:
# Gradient-boosting regressor behind a standardizing step.
# random_state seeds the booster so the cross-validation scores below are
# reproducible on Restart & Run All.
model = make_pipeline(
    StandardScaler(),
    GradientBoostingRegressor(random_state=42),
)

In [67]:
# 5-fold cross-validation of the gradient-boosting regressor pipeline on the Boston housing data.
cross_val = cross_validate(model, boston.data, boston.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.12183923721313476 (+/- 0.002025959742282059)
avg score time: 0.0009654045104980468 (+/- 3.485298005386255e-05)
avg test score: 0.6748393477170114 (+/- 0.16094566513937658)


In [68]:
# 5-fold cross-validation of the gradient-boosting regressor pipeline on the diabetes data.
cross_val = cross_validate(model, diabetes.data, diabetes.target, cv=5)
# Report mean and standard deviation over the folds for each timing/score array.
for template, metric in (('avg fit time: {} (+/- {})', 'fit_time'),
                         ('avg score time: {} (+/- {})', 'score_time'),
                         ('avg test score: {} (+/- {})', 'test_score')):
    print(template.format(cross_val[metric].mean(), cross_val[metric].std()))

avg fit time: 0.09731192588806152 (+/- 0.005205591985574132)
avg score time: 0.0010747432708740235 (+/- 0.00010599451865250009)
avg test score: 0.4059926887727312 (+/- 0.06886220694482321)


## 투표 기반 모델(Voting Classifier)

* 서로 다른 모델들의 결과를 투표를 통해 결합
* 두가지 방법으로 투표 가능
    + 가장 많이 예측된 클래스를 정답으로 채택(hard voting)
    + 예측된 확률의 가중 평균(soft voting)

In [69]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings(action='ignore') # 에러 메시지 안 뜨게 하기 위함

## Hard Voting

In [70]:
# Three classifiers built on different algorithms, combined by majority vote.
model1 = SVC()
model2 = GaussianNB()
# Seed the forest so its (and the ensemble's) accuracy is reproducible.
model3 = RandomForestClassifier(random_state=42)
vote_model = VotingClassifier(
    # estimators is a list of (name, estimator) pairs
    estimators=[('svc', model1), ('naive', model2), ('forest', model3)],
    # 'hard' = predict the class chosen by most estimators
    voting='hard',
)


In [71]:
# Compare each individual classifier with the voting ensemble on iris, using
# the mean score of 5-fold cross-validation.
for model in (model1, model2, model3, vote_model):
    # type(...).__name__ gives the bare class name directly, instead of slicing
    # it out of the "<class '...'>" repr string.
    model_name = type(model).__name__
    scores = cross_val_score(model, iris.data, iris.target, cv=5)
    print('Accuracy: %0.2f [%s]' %(scores.mean(), model_name))
     

Accuracy: 0.97 [SVC]
Accuracy: 0.95 [GaussianNB]
Accuracy: 0.96 [RandomForestClassifier]
Accuracy: 0.97 [VotingClassifier]


## Soft Voting

In [73]:
# Soft voting combines predicted class probabilities, so SVC must be created
# with probability=True. random_state seeds the SVC and the forest so the
# accuracies below are reproducible.
model1 = SVC(probability=True, random_state=42)
model2 = GaussianNB()
model3 = RandomForestClassifier(random_state=42)
vote_model = VotingClassifier(
    estimators=[('svc', model1), ('naive', model2), ('forest', model3)],
    voting='soft',
    # one weight per estimator, in the same order as `estimators`
    weights=[2, 1, 2],
)

In [74]:
# Compare each individual classifier with the voting ensemble on iris, using
# the mean score of 5-fold cross-validation.
for model in (model1, model2, model3, vote_model):
    # type(...).__name__ gives the bare class name directly, instead of slicing
    # it out of the "<class '...'>" repr string.
    model_name = type(model).__name__
    scores = cross_val_score(model, iris.data, iris.target, cv=5)
    print('Accuracy: %0.2f [%s]' %(scores.mean(), model_name))

Accuracy: 0.97 [SVC]
Accuracy: 0.95 [GaussianNB]
Accuracy: 0.95 [RandomForestClassifier]
Accuracy: 0.96 [VotingClassifier]
