## Boosting 
- 여러 개의 약한 학습기를 순차적으로 학습시켜 예측하면서, 잘못 예측한 데이터에 가중치를 부여하여 오류를 개선해 나가며 학습하는 앙상블 모델
- 앞에서 학습한 모델의 결과를 바탕으로 다음 모델에서 오류를 수정하는 방향으로 점진적으로 나아가는 앙상블 모델

### Classification

In [1]:
import warnings
# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings("ignore")
import pandas as pd 

# Load the breast-cancer data set from the project's Data directory.
data = pd.read_csv("../Data/breast-cancer-wisconsin.csv", encoding="utf-8")
# Features: the columns at positions 1..9 (position 0 is skipped;
# presumably an identifier column - confirm against the CSV).
X = data.iloc[:, 1:10]
# Target as a 1-D Series. scikit-learn estimators expect y with shape
# (n_samples,); a single-column DataFrame makes fit() emit a
# DataConversionWarning and get flattened internally.
y = data['Class']

In [2]:
# Import only the names used here rather than wildcard-importing whole
# modules, so the notebook namespace stays predictable.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Stratify on the target so both splits keep the same class proportions;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=410)

# Learn the min/max from the training data only, then apply that same
# scaling to the test data so no test statistics leak into training.
minmax = MinMaxScaler()
X_scaled_train = minmax.fit_transform(X_train)
X_scaled_test = minmax.transform(X_test)

In [3]:
from sklearn.ensemble import * 

# n_estimators: maximum number of weak learners fitted one after another
# (i.e. the number of boosting rounds).
model = AdaBoostClassifier(random_state=410, n_estimators=100).fit(X_scaled_train, y_train)

# Predictions on the training data are kept for the evaluation cells below;
# the final expression shows the training accuracy as the cell output.
pred_train = model.predict(X=X_scaled_train)
model.score(X=X_scaled_train, y=y_train)

1.0

In [4]:
# Predict on the held-out test split, then display the test accuracy.
pred_test = model.predict(X=X_scaled_test)
model.score(X=X_scaled_test, y=y_test)

0.9649122807017544

In [5]:
# Explicit imports instead of a wildcard import. classification_report is
# imported here too because the following cell uses it without importing it.
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix on the training split, followed by a blank line.
con_train = confusion_matrix(y_train, pred_train)
print(con_train, "\n")

# Confusion matrix on the test split.
con_test = confusion_matrix(y_test, pred_test)
print(con_test)

[[333   0]
 [  0 179]] 

[[108   3]
 [  3  57]]


In [6]:
# Build the per-class precision/recall/F1 reports for both splits first,
# then print them in the same order: training report, blank line, test report.
report_train = classification_report(y_true=y_train, y_pred=pred_train)
report_test = classification_report(y_true=y_test, y_pred=pred_test)

print(report_train, "\n")
print(report_test)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       333
           1       1.00      1.00      1.00       179

    accuracy                           1.00       512
   macro avg       1.00      1.00      1.00       512
weighted avg       1.00      1.00      1.00       512
 

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       111
           1       0.95      0.95      0.95        60

    accuracy                           0.96       171
   macro avg       0.96      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



In [11]:
# Apply the Gradient Boosting ensemble model.
# max_depth=1 limits every tree to a single split.
model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=1,
    random_state=410,
).fit(X_scaled_train, y_train)

# Keep the training predictions for the confusion matrix cell, then show
# the training accuracy as the cell output.
pred_train = model.predict(X=X_scaled_train)
model.score(X=X_scaled_train, y=y_train)

0.9765625

In [12]:
# Predict on the held-out test split, then display the test accuracy.
pred_test = model.predict(X=X_scaled_test)
model.score(X=X_scaled_test, y=y_test)

0.9707602339181286

In [13]:
from sklearn.metrics import * 

# Compute both confusion matrices, then print the training one (followed by
# a blank line) and the test one.
con_train = confusion_matrix(y_true=y_train, y_pred=pred_train)
con_test = confusion_matrix(y_true=y_test, y_pred=pred_test)

print(con_train, "\n")
print(con_test)

[[325   8]
 [  4 175]] 

[[109   2]
 [  3  57]]


### Regression 

In [14]:
import pandas as pd 

# Load the house-price data set from the project's Data directory.
data2 = pd.read_csv("../Data/house_price.csv", encoding="utf-8")
# Features: the columns at positions 1..4 (position 0 is skipped -
# TODO confirm against the CSV which column that is).
X = data2.iloc[:, 1:5]
# Target as a 1-D Series. scikit-learn regressors expect y with shape
# (n_samples,); a single-column DataFrame makes fit() emit a
# DataConversionWarning and get flattened internally.
y = data2['house_value']

In [15]:
from sklearn.model_selection import * 
from sklearn.preprocessing import * 

# Reproducible train/test split of the regression data (no stratification).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=410)

# Learn the min/max from the training data only and reuse that scaling for
# the test data.
minmax = MinMaxScaler()
X_scaled_train = minmax.fit_transform(X_train)
X_scaled_test = minmax.transform(X_test)

In [16]:
# AdaBoost regressor with 100 boosting rounds, fitted on the scaled training data.
model = AdaBoostRegressor(random_state=410, n_estimators=100).fit(X_scaled_train, y_train)

# Keep the training predictions for the RMSE cell, then show the training
# R^2 score as the cell output.
pred_train = model.predict(X=X_scaled_train)
model.score(X=X_scaled_train, y=y_train)

0.46333184986225207

In [17]:
# Predict on the held-out test split, then display the test R^2 score.
pred_test = model.predict(X=X_scaled_test)
model.score(X=X_scaled_test, y=y_test)

0.47739677501438393

In [18]:
import numpy as np 
# Imported explicitly so this cell does not depend on a wildcard import of
# sklearn.metrics made in an earlier, unrelated cell.
from sklearn.metrics import mean_squared_error

# Mean squared error on each split.
MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)

# Report the root of the MSE (RMSE); the labels read "training data RMSE"
# and "test data RMSE".
print("학습 데이터 RMSE : ", np.sqrt(MSE_train))
print("테스트 데이터 RMSE : ", np.sqrt(MSE_test))

학습 데이터 RMSE :  69500.48653917231
테스트 데이터 RMSE :  70300.43621226118


In [20]:
# Gradient Boosting regressor: 100 boosting stages with a 0.1 learning rate,
# fitted on the scaled training data.
model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=410,
).fit(X_scaled_train, y_train)

# Keep the training predictions for the RMSE cell, then show the training
# R^2 score as the cell output.
pred_train = model.predict(X=X_scaled_train)
model.score(X=X_scaled_train, y=y_train)

0.6184228916750144

In [21]:
# Predict on the held-out test split, then display the test R^2 score.
pred_test = model.predict(X=X_scaled_test)
model.score(X=X_scaled_test, y=y_test)

0.599890459947513

In [22]:
import numpy as np 
# Imported explicitly so this cell does not depend on a wildcard import of
# sklearn.metrics made in an earlier, unrelated cell.
from sklearn.metrics import mean_squared_error

# Mean squared error on each split.
MSE_train = mean_squared_error(y_train, pred_train)
MSE_test = mean_squared_error(y_test, pred_test)

# Report the root of the MSE (RMSE); the labels read "training data RMSE"
# and "test data RMSE".
print("학습 데이터 RMSE : ", np.sqrt(MSE_train))
print("테스트 데이터 RMSE : ", np.sqrt(MSE_test))

학습 데이터 RMSE :  58603.842194636214
테스트 데이터 RMSE :  61512.22414392499
