## Stacking 
- 여러 기본 학습기(base learner)의 예측값을 새로운 학습 데이터로 사용해 최종 모델(meta learner)을 학습시킴으로써, 일반화 성능이 높은 모델을 구성하는 앙상블 방법

### Classification

In [1]:
import warnings
# NOTE(review): this silences *every* warning for the rest of the kernel
# session, which can hide real problems; narrow it if possible.
warnings.filterwarnings("ignore")
import pandas as pd

# Load the Wisconsin breast-cancer table from the shared data directory.
data = pd.read_csv("../Data/breast-cancer-wisconsin.csv")

# Features: the columns at positions 1..9 (position 0 is skipped --
# presumably a sample identifier; verify against the CSV header).
X = data.iloc[:, 1:10]

# Target: select the label column as a 1-D Series. Indexing with a list
# (data[['Class']]) would yield an (n_samples, 1) DataFrame, which
# scikit-learn estimators only accept after an implicit reshape that raises
# a DataConversionWarning.
y = data["Class"]

In [2]:
from sklearn.model_selection import *
from sklearn.preprocessing import *

# Hold out a test split; stratify on the label so both splits keep the same
# class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=410,
)

# Learn the min-max range from the training split only, then apply that same
# transform to both splits so no test-set statistics reach the scaler.
minmax = MinMaxScaler().fit(X_train)
X_scaled_train, X_scaled_test = (
    minmax.transform(split) for split in (X_train, X_test)
)

In [12]:
from sklearn.ensemble import * 
import sklearn
from sklearn.svm import * 
from sklearn.linear_model import * 

In [11]:
# Exploratory check: list every attribute of sklearn.ensemble to see which
# ensemble estimators (e.g. the Stacking* classes) this installation exposes.
dir(sklearn.ensemble)

['AdaBoostClassifier',
 'AdaBoostRegressor',
 'BaggingClassifier',
 'BaggingRegressor',
 'BaseEnsemble',
 'ExtraTreesClassifier',
 'ExtraTreesRegressor',
 'GradientBoostingClassifier',
 'GradientBoostingRegressor',
 'HistGradientBoostingClassifier',
 'HistGradientBoostingRegressor',
 'IsolationForest',
 'RandomForestClassifier',
 'RandomForestRegressor',
 'RandomTreesEmbedding',
 'StackingClassifier',
 'StackingRegressor',
 'VotingClassifier',
 'VotingRegressor',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_bagging',
 '_base',
 '_forest',
 '_gb',
 '_gb_losses',
 '_gradient_boosting',
 '_hist_gradient_boosting',
 '_iforest',
 '_stacking',
 '_voting',
 '_weight_boosting']

In [14]:
# Exploratory check: print the StackingClassifier docstring to review its
# constructor parameters (estimators, final_estimator, cv, ...).
help(StackingClassifier)

Help on class StackingClassifier in module sklearn.ensemble._stacking:

class StackingClassifier(sklearn.base.ClassifierMixin, _BaseStacking)
 |  StackingClassifier(estimators, final_estimator=None, *, cv=None, stack_method='auto', n_jobs=None, passthrough=False, verbose=0)
 |  
 |  Stack of estimators with a final classifier.
 |  
 |  Stacked generalization consists in stacking the output of individual
 |  estimator and use a classifier to compute the final prediction. Stacking
 |  allows to use the strength of each individual estimator by using their
 |  output as input of a final estimator.
 |  
 |  Note that `estimators_` are fitted on the full `X` while `final_estimator_`
 |  is trained using cross-validated predictions of the base estimators using
 |  `cross_val_predict`.
 |  
 |  Read more in the :ref:`User Guide <stacking>`.
 |  
 |  .. versionadded:: 0.22
 |  
 |  Parameters
 |  ----------
 |  estimators : list of (str, estimator)
 |      Base estimators which will be stacked 

In [15]:
# Level-0 learners: a 10-tree random forest and a support-vector classifier,
# both seeded with the same value used for the train/test split.
estimators = [
    ("rf", RandomForestClassifier(n_estimators=10, random_state=410)),
    # NOTE(review): the label reads "svr" although the estimator is an SVC.
    ("svr", SVC(random_state=410)),
]

# Level-1 learner: logistic regression stacked on top of the base learners.
# fit() returns the fitted estimator, so construction and fitting are chained.
model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
).fit(X_scaled_train, y_train)

pred_train = model.predict(X_scaled_train)

# Training-split accuracy; left as the last expression so the cell displays it.
model.score(X_scaled_train, y_train)

0.9765625

In [17]:
# Predict on the held-out split (already scaled with the training-set scaler).
pred_test = model.predict(X=X_scaled_test)

# Held-out accuracy; left as the last expression so the cell displays it.
model.score(X=X_scaled_test, y=y_test)

0.9766081871345029

In [18]:
from sklearn.metrics import * 
# Build both confusion matrices first (rows = true class, columns = predicted
# class), then print them: training split, a blank separator, test split.
con_train = confusion_matrix(y_true=y_train, y_pred=pred_train)
con_test = confusion_matrix(y_true=y_test, y_pred=pred_test)

print(con_train, "\n")
print(con_test)

[[325   8]
 [  4 175]] 

[[107   4]
 [  0  60]]


In [20]:
# Per-class precision / recall / F1 on the training split.
report_train = classification_report(y_true=y_train, y_pred=pred_train)
print(report_train)

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       333
           1       0.96      0.98      0.97       179

    accuracy                           0.98       512
   macro avg       0.97      0.98      0.97       512
weighted avg       0.98      0.98      0.98       512



In [21]:
# Per-class precision / recall / F1 on the held-out test split.
report_test = classification_report(y_true=y_test, y_pred=pred_test)
print(report_test)

              precision    recall  f1-score   support

           0       1.00      0.96      0.98       111
           1       0.94      1.00      0.97        60

    accuracy                           0.98       171
   macro avg       0.97      0.98      0.97       171
weighted avg       0.98      0.98      0.98       171



### Regression

In [22]:
import pandas as pd 
# Load the house-price table from the shared data directory.
data2 = pd.read_csv("../Data/house_price.csv", encoding="utf-8")

# Features: the columns at positions 1..4 (position 0 is skipped --
# NOTE(review): confirm against the CSV header which column that is).
X = data2.iloc[:, 1:5]

# Target: select the value column as a 1-D Series. Indexing with a list
# (data2[["house_value"]]) would yield an (n_samples, 1) DataFrame, which
# scikit-learn regressors only accept after an implicit reshape that raises
# a DataConversionWarning.
y = data2["house_value"]

In [23]:
# Hold out a test split with a fixed seed (no stratification here).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=410,
)

# Learn the min-max range from the training split only, then apply that same
# transform to both splits.
minmax = MinMaxScaler().fit(X_train)
X_scaled_train, X_scaled_test = (
    minmax.transform(split) for split in (X_train, X_test)
)

In [24]:
from sklearn.linear_model import * 
from sklearn.neighbors import * 
from sklearn.ensemble import * 

# Level-0 learners: ordinary least squares and k-nearest-neighbours regression.
estimators = [
    ('lr', LinearRegression()),
    ('knn', KNeighborsRegressor()),
]

# Level-1 learner: a seeded 10-tree random forest stacked on the base learners.
# fit() returns the fitted estimator, so construction and fitting are chained.
model = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=10, random_state=410),
).fit(X_scaled_train, y_train)

pred_train = model.predict(X_scaled_train)

# Training-split score (the regressor's default metric, R^2 in scikit-learn);
# left as the last expression so the cell displays it.
model.score(X_scaled_train, y_train)

0.5515068528461464

In [25]:
# Predict on the held-out split (already scaled with the training-set scaler).
pred_test = model.predict(X=X_scaled_test)

# Held-out score; left as the last expression so the cell displays it.
model.score(X=X_scaled_test, y=y_test)

0.46821087868350253

In [26]:
# Exploratory check: print the mean_squared_error docstring to review its
# signature before computing the error metrics below.
help(mean_squared_error)

Help on function mean_squared_error in module sklearn.metrics._regression:

mean_squared_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average', squared=True)
    Mean squared error regression loss.
    
    Read more in the :ref:`User Guide <mean_squared_error>`.
    
    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth (correct) target values.
    
    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.
    
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.
    
    multioutput : {'raw_values', 'uniform_average'} or array-like of shape             (n_outputs,), default='uniform_average'
        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.
    
        'raw_values' :
            Returns a full set of errors in case of multioutput input.
    


In [27]:
import numpy as np 
# Mean squared error on each split; the square root is taken at print time
# so the reported figure is the RMSE, in the same units as the target.
MSE_train = mean_squared_error(y_true=y_train, y_pred=pred_train)
MSE_test = mean_squared_error(y_true=y_test, y_pred=pred_test)

print("학습 데이터 RMSE : ", np.sqrt(MSE_train))
print("테스트 데이터 RMSE : ", np.sqrt(MSE_test))

학습 데이터 RMSE :  63534.97154496316
테스트 데이터 RMSE :  70915.58690460226
