# Stacking
- 개별 알고리즘으로 예측한 데이터를 기반으로 다시 예측을 수행한다는 것
- 개별 알고리즘의 예측 결과 데이터 세트를 최종적인 메타 데이터 세트로 만들어 별도의 ML알고리즘으로 최종 학습.
- 반드시 성능 향상의 보장은 없지만, 일반적으로 성능이 비슷한 모델을 결합해 좀 더 나은 성능 향상을 도출

# cancer ex

In [1]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load scikit-learn's bundled breast cancer dataset and pull out the
# feature matrix and the target vector.
cancer = load_breast_cancer()
X_data, y_label = cancer.data, cancer.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_label,
    test_size=0.2,
    random_state=0,
)

## 개별 모델

In [34]:
# Base learners whose predictions become the input features of the final model.
knn = KNeighborsClassifier(n_neighbors=4, n_jobs=-1)
rf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
# NOTE: dt and ada are created without random_state, so their scores may
# differ slightly from one run to the next.
dt = DecisionTreeClassifier()
ada = AdaBoostClassifier(n_estimators=100)

# Final (meta) model, trained on the base learners' predictions.
# n_jobs is deliberately not passed: n_jobs=8 did not parallelise this fit and
# only produced the "n_jobs" warning emitted after each lr_final.fit call.
lr_final = LogisticRegression(C=10)

In [6]:
# Train every base learner on the full training split (same order as the
# models were declared).
for estimator in (knn, rf, dt, ada):
    estimator.fit(X_train, y_train)

# Class predictions of each fitted learner for the held-out test split.
knn_pred, rf_pred, dt_pred, ada_pred = (
    estimator.predict(X_test) for estimator in (knn, rf, dt, ada)
)

# Report the stand-alone test accuracy of each base learner.
for label, y_hat in (
    ('KNN', knn_pred),
    ('rf', rf_pred),
    ('dt', dt_pred),
    ('ada', ada_pred),
):
    print('{} accuracy : {:.4f}'.format(label, accuracy_score(y_test, y_hat)))

KNN accuracy : 0.9211
rf accuracy : 0.9649
dt accuracy : 0.9123
ada accuracy : 0.9561


## final pred

In [35]:
# Stack the four base-model prediction vectors: one row per model.
pred = np.array([knn_pred, rf_pred, dt_pred, ada_pred])
print(pred.shape)

# Flip the axes so each row is a test sample and each column is one model's
# prediction, which is the (n_samples, n_features) layout fit/predict expect.
pred = pred.T
print(pred.shape)

(4, 114)
(114, 4)


In [36]:
# NOTE: the meta-model is fitted on the base models' test-set predictions
# with y_test as the target, and then scored on those very same rows.
# The accuracy printed here is therefore measured on data the meta-model has
# already seen and should be read as an optimistic figure.
lr_final.fit(pred, y_test)
final = lr_final.predict(pred)

final_accuracy = accuracy_score(y_test, final)
print('final model accuracy : {0:.4f}'.format(final_accuracy))

final model accuracy : 0.9737


  " = {}.".format(effective_n_jobs(self.n_jobs)))


# CV stacking

## step1
각 모델별로 원본 학습/테스트 데이터를 예측한 결과값을 기반으로 메타 모델을 위한 train, test 데이터 생성

In [10]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

In [37]:
def get_stacking_base_dataset(model, X_train_n, y_train_n, X_test_n, n_folds) :
    """Build the meta-model features contributed by a single base model.

    The training rows are split into ``n_folds`` consecutive folds. For each
    fold the model is refit on the remaining folds, then used to predict
    (a) the held-out fold and (b) the whole test set.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is refit in place on every fold, so on return it holds the fit
        from the last fold.
    X_train_n, y_train_n : training features / labels. They are indexed with
        integer index arrays, so they must support that kind of indexing
        (e.g. NumPy arrays).
    X_test_n : test features.
    n_folds : number of folds passed to ``KFold``.

    Returns
    -------
    train_fold_pred : array of shape (n_train_rows, 1)
        Out-of-fold predictions: each training row is predicted by the model
        that did not see it during fitting.
    test_pred_mean : array of shape (n_test_rows, 1)
        Test-set predictions averaged over the ``n_folds`` fitted models.
    """
    # No random_state here: the folds are not shuffled, so a seed would have
    # no effect, and recent scikit-learn versions raise ValueError when
    # random_state is combined with shuffle=False.
    kf = KFold(n_splits=n_folds, shuffle=False)

    # One out-of-fold prediction per training row.
    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    # One column of test-set predictions per fold; averaged after the loop.
    test_pred = np.zeros((X_test_n.shape[0], n_folds))
    print(model.__class__.__name__, 'model start')

    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n)) :
        print('\t fold set : ', folder_counter, 'start')

        X_tr = X_train_n[train_index]
        y_tr = y_train_n[train_index]
        X_te = X_train_n[valid_index]

        model.fit(X_tr, y_tr)
        # Fill only the rows of the held-out fold with this fit's predictions.
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1, 1)
        test_pred[:, folder_counter] = model.predict(X_test_n)

    # Collapse the per-fold test predictions into a single column.
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

In [38]:
# Show the dimensions of the original train / test feature matrices.
for split_name, features in (('X_train', X_train), ('X_test', X_test)):
    print('{} shape : '.format(split_name), features.shape)

X_train shape :  (455, 30)
X_test shape :  (114, 30)


In [39]:
# Produce the 7-fold meta-features (train part, test part) of every base
# learner, processing the learners in the order knn, rf, dt, ada.
(
    (knn_train, knn_test),
    (rf_train, rf_test),
    (dt_train, dt_test),
    (ada_train, ada_test),
) = [
    get_stacking_base_dataset(estimator, X_train, y_train, X_test, 7)
    for estimator in (knn, rf, dt, ada)
]

KNeighborsClassifier model start
	 fold set :  0 start
	 fold set :  1 start
	 fold set :  2 start
	 fold set :  3 start
	 fold set :  4 start
	 fold set :  5 start
	 fold set :  6 start
RandomForestClassifier model start
	 fold set :  0 start
	 fold set :  1 start
	 fold set :  2 start
	 fold set :  3 start
	 fold set :  4 start
	 fold set :  5 start
	 fold set :  6 start
DecisionTreeClassifier model start
	 fold set :  0 start
	 fold set :  1 start
	 fold set :  2 start
	 fold set :  3 start
	 fold set :  4 start
	 fold set :  5 start
	 fold set :  6 start
AdaBoostClassifier model start
	 fold set :  0 start
	 fold set :  1 start
	 fold set :  2 start
	 fold set :  3 start
	 fold set :  4 start
	 fold set :  5 start
	 fold set :  6 start


In [40]:
# Check the shape of the KNN meta-features: train part first, then test part.
for meta_block in (knn_train, knn_test):
    print(meta_block.shape)

(455, 1)
(114, 1)


## step2
step1에서 개별 모델들이 생성한 학습용 데이터와 테스트용 데이터를 모두 스태킹 형태로 합친다.  메타 모델은 최종적으로 생성된 학습 데이터 세트와 원본 학습 데이터의 레이블 데이터를 기반으로 학습한 뒤, 최종적으로 생성된 테스트 데이터 세트를 예측하고, 원본 테스트 데이터의 레이블 데이터를 기반으로 평가

In [41]:
# Place the single-column outputs of the base learners side by side, so that
# every base learner contributes one feature column to the meta-model.
train_columns = [knn_train, rf_train, dt_train, ada_train]
test_columns = [knn_test, rf_test, dt_test, ada_test]
Stack_final_X_train = np.concatenate(train_columns, axis=1)
Stack_final_X_test = np.concatenate(test_columns, axis=1)

# Compare the original feature dimensions with the stacked ones.
print(
    'original train features shape : ', X_train.shape,
    'original test features shape', X_test.shape,
)
print(
    'stacking train features shape : ', Stack_final_X_train.shape,
    'stacking test features shape', Stack_final_X_test.shape,
)

original train features shape :  (455, 30) original test features shape (114, 30)
stacking train features shape :  (455, 4) stacking test features shape (114, 4)


In [42]:
# Train the meta-model on the stacked training features with the original
# training labels, then predict from the stacked test features.
lr_final.fit(Stack_final_X_train, y_train)
stack_final = lr_final.predict(Stack_final_X_test)

# Score the stacked predictions against the original test labels.
stack_accuracy = accuracy_score(y_test, stack_final)
print('final accuracy : {0:.4f}'.format(stack_accuracy))

final accuracy : 0.9737


  " = {}.".format(effective_n_jobs(self.n_jobs)))
