# stacking 算法

## base model 类

In [4]:
from abc import abstractmethod, ABCMeta

# base model
class BaseModel(metaclass=ABCMeta):
    
    @abstractmethod
    def fit(self, x_train, y_train, x_val, y_val):
        pass
    
    @abstractmethod
    def predict(self, x):
        pass
    
    def load_model(self, model_file):
        pass
    
    def save_model(self, model_file):
        pass

## stacking 类

In [60]:
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

# stacking
class Stacking(BaseEstimator, TransformerMixin, RegressorMixin):
    def __init__(self, base_models, meta_model, k_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.k_folds = k_folds
    
    def train(self, x_train, y_train):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        
        kfold = KFold(n_splits=self.k_folds, shuffle=True, random_state=2019)

        # 使用K-fold的方法来进行交叉验证，将每次验证的结果作为新的特征来进行处理
        out_of_fold_predictions = np.zeros((x_train.shape[0], len(self.base_models)))
        
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(x_train, y_train):
                instance = clone(model)
                # 保存 base_model 中每个 model 的实例
                self.base_models_[i].append(instance)
                # 训练 model 实例
                instance.fit(x_train[train_index],  y_train[train_index])
                # 预测 k_fold 中 validation 那部分
                y_pred = instance.predict(x_train[holdout_index])
                # 保存所有 model 的结果作为下层训练数据
                out_of_fold_predictions[holdout_index, i] = y_pred

        # 将交叉验证预测出的结果 和 训练集中的标签值进行训练
        self.meta_model_.fit(out_of_fold_predictions, y_train)
        return self
        
    def predict(self, x_predict):
        # 对于每个 base_model 的 k 个 models 的预测结果求均值作为该 base_model 的预测结果
        # 将所有的 base_model 的预测结果拼接起来作为 meta_model 的 feature
        meta_features = np.column_stack([
            np.column_stack([model.predict(x_predict) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_])
        print(meta_features)
        
        return self.meta_model_.predict(meta_features)

## 保存和加载模型

In [8]:
import joblib

def save_model(cls, model_file):
    joblib.dump(cls, model_file)
    
def load_model(model_file):
    cls = joblib.load(model_file)
    return cls

## stacking 的使用

### 加载数据集

In [27]:
# 数据
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris=load_iris()
print('iris keys: ', iris.keys())

x_data = iris.data
y_data = iris.target

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=2019)
print('x_train shape: ', x_train.shape, ' y_train shape: ', y_train.shape)
print('x_test shape: ', x_test.shape, ' y_test shape: ', y_test.shape)

iris keys:  dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
x_train shape:  (105, 4)  y_train shape:  (105,)
x_test shape:  (45, 4)  y_test shape:  (45,)


### SVM

In [28]:
from sklearn.svm import SVC

svm_clf = SVC(kernel='linear')  # kernel = 'linear'

svm_clf.fit(x_train, y_train)

score = svm_clf.score(x_test, y_test)
print("The score of linear is : %f" % score)

The score of linear is : 1.000000


### LR

In [40]:
from sklearn.linear_model import LogisticRegression as LR

lr_clf = LR(multi_class='ovr')

lr_clf.fit(x_train, y_train)

score = lr_clf.score(x_test, y_test)
print("The score of linear is : %f" % score)

The score of linear is : 0.933333




### LightGBM

In [39]:
import lightgbm as lgb

params_sklearn = {
    'learning_rate':0.1,
    'max_bin':150,
    'num_leaves':32,    
    'max_depth':11,
    
    'reg_alpha':0.1,
    'reg_lambda':0.2,   
     
    'objective':'multiclass',
    'n_estimators':300,
    #'class_weight':weight
}

lgb_clf = lgb.LGBMClassifier(**params_sklearn)

lgb_clf.fit(x_train, y_train)

from sklearn.metrics import accuracy_score

y_pred = lgb_clf.predict(x_test)
print(y_pred, y_test)
print(accuracy_score(y_test, y_pred))

[0 0 2 1 2 0 2 0 1 2 2 2 2 1 0 1 2 1 0 2 0 2 0 1 0 0 1 2 0 0 2 0 0 2 2 0 0
 2 0 2 0 1 0 2 1] [0 0 2 1 2 0 2 0 1 2 2 2 2 1 0 1 2 1 0 2 0 2 0 1 0 0 1 2 0 0 2 0 0 2 2 0 0
 2 0 1 0 1 0 2 1]
0.9777777777777777


### XGBoost

In [45]:
# import xgboost as xgb
from xgboost.sklearn import XGBClassifier

params = {
    'eta': 0.3,
    'max_depth':3,   
    'min_child_weight':1,
    'gamma':0.3, 
    'subsample':0.8,
    'colsample_bytree':0.8,
    'booster':'gbtree',
    'objective': 'multiclass',
    'lambda':1,  
    'seed':2019,
}

xgb_clf = XGBClassifier(**params)

xgb_clf.fit(x_train, y_train)

from sklearn.metrics import accuracy_score

y_pred = xgb_clf.predict(x_test)
print(y_pred, y_test)
print(accuracy_score(y_test, y_pred))

[0 0 2 1 2 0 2 0 1 2 2 2 2 1 0 1 2 1 0 2 0 2 0 1 0 0 1 2 0 0 2 0 0 2 2 0 0
 2 0 2 0 1 0 2 1] [0 0 2 1 2 0 2 0 1 2 2 2 2 1 0 1 2 1 0 2 0 2 0 1 0 0 1 2 0 0 2 0 0 2 2 0 0
 2 0 1 0 1 0 2 1]
0.9777777777777777


### 使用 stacking 

In [61]:
import numpy as np

stacking_model = Stacking(base_models=[lr_clf, lgb_clf, svm_clf], meta_model=xgb_clf, k_folds=5)

stacking_model.train(x_train, y_train)



Stacking(base_models=[LogisticRegression(C=1.0, class_weight=None, dual=False,
                                         fit_intercept=True,
                                         intercept_scaling=1, l1_ratio=None,
                                         max_iter=100, multi_class='ovr',
                                         n_jobs=None, penalty='l2',
                                         random_state=None, solver='warn',
                                         tol=0.0001, verbose=0,
                                         warm_start=False),
                      LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='sp...
         meta_model=XGBClassifier(base_score=0.5, booster='gbtree',
                                  colsample_bylevel=1, colsample_bynode=1,
                                  colsample_bytree=0.8, eta=0.3, gamma=0.3,
                          

In [62]:
from sklearn.metrics import accuracy_score

y_pred = stacking_model.predict(x_test)
print(y_pred, y_test)
print(accuracy_score(y_test, y_pred))

[[0.  0.  0. ]
 [0.  0.  0. ]
 [2.  2.  2. ]
 [1.  1.  1. ]
 [1.6 2.  2. ]
 [0.  0.  0. ]
 [2.  2.  2. ]
 [0.  0.  0. ]
 [2.  1.2 1. ]
 [2.  2.  2. ]
 [1.  1.8 2. ]
 [1.6 2.  2. ]
 [2.  2.  2. ]
 [1.  1.  1. ]
 [0.  0.  0. ]
 [1.  1.  1. ]
 [2.  2.  2. ]
 [1.  1.  1. ]
 [0.  0.  0. ]
 [1.8 2.  2. ]
 [0.  0.  0. ]
 [2.  2.  2. ]
 [0.  0.  0. ]
 [1.  1.  1. ]
 [0.  0.  0. ]
 [0.  0.  0. ]
 [1.  1.  1. ]
 [1.8 2.  2. ]
 [0.  0.  0. ]
 [0.  0.  0. ]
 [2.  2.  2. ]
 [0.  0.  0. ]
 [0.  0.  0. ]
 [2.  2.  2. ]
 [2.  2.  2. ]
 [0.  0.  0. ]
 [0.  0.  0. ]
 [2.  2.  2. ]
 [0.  0.  0. ]
 [1.  1.8 1. ]
 [0.  0.  0. ]
 [1.  1.  1. ]
 [0.  0.  0. ]
 [1.4 2.  2. ]
 [1.8 1.  1. ]]
[0 0 2 1 2 0 2 0 1 2 2 2 2 1 0 1 2 1 0 2 0 2 0 1 0 0 1 2 0 0 2 0 0 2 2 0 0
 2 0 1 0 1 0 2 1] [0 0 2 1 2 0 2 0 1 2 2 2 2 1 0 1 2 1 0 2 0 2 0 1 0 0 1 2 0 0 2 0 0 2 2 0 0
 2 0 1 0 1 0 2 1]
1.0
