ref:9.4 檢測詐騙交易-堆疊法

In [3]:
# 匯入函式庫
# from sklearn.base import BaseEstimator
from copy import deepcopy
from sklearn.model_selection import KFold
import numpy as np


class Stacking:
    """Multi-level stacking ensemble.

    Learners are organised in levels. Level 0 is trained on the original
    features; every later level is trained on the predictions produced by the
    level before it. The last level holds the meta learner(s).

    Attributes:
        learner_levels: The list of levels exactly as passed by the caller.
        level_sizes: Number of learners in each level.
        learners: Deep copies of the supplied learners, grouped by level.
    """

    def __init__(self, learner_levels):
        """Store independent copies of every learner.

        Args:
            learner_levels: A list of levels, each level being a list of
                learner objects. The learners are deep-copied, so the
                caller's instances are never fitted or modified here.
        """
        self.level_sizes = []
        self.learners = []
        self.learner_levels = learner_levels
        for learning_level in self.learner_levels:
            self.level_sizes.append(len(learning_level))
            self.learners.append([deepcopy(learner) for learner in learning_level])

    def fit(self, x, y):
        """Train every level, feeding each one the previous level's predictions.

        For each level, a 3-fold split (no shuffling) is used to collect
        predictions on held-out folds; those predictions become the training
        features of the next level. Afterwards every learner of the level is
        refitted on the level's complete training data.

        Args:
            x: Training features, indexable by sample along the first axis.
                Converted with ``np.asarray`` so that lists and DataFrames
                can be indexed with the integer arrays produced by the split.
            y: Training targets, one per sample; converted the same way.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        n_samples = len(x)

        # Level 0 is trained on the original data.
        meta_data = [x]
        meta_targets = [y]
        for i in range(len(self.learners)):
            # Storage for this level's held-out predictions (one row per
            # learner) and the matching targets.
            data_z = np.zeros((self.level_sizes[i], n_samples))
            target_z = np.zeros(n_samples)

            train_x = meta_data[i]
            train_y = meta_targets[i]

            # meta_index tracks where the current fold's rows are written.
            meta_index = 0
            for train_indices, test_indices in KFold(n_splits=3).split(x):
                fold_rows = slice(meta_index, meta_index + len(test_indices))
                for j, learner in enumerate(self.learners[i]):
                    # Fit on the training folds, predict the held-out fold.
                    learner.fit(train_x[train_indices], train_y[train_indices])
                    data_z[j][fold_rows] = learner.predict(train_x[test_indices])

                target_z[fold_rows] = train_y[test_indices]
                meta_index += len(test_indices)

            # One column per learner: this is the next level's training set.
            meta_data.append(data_z.transpose())
            meta_targets.append(target_z)

            # Refit on the full data so the stored learners have seen all
            # samples of their level.
            for learner in self.learners[i]:
                learner.fit(train_x, train_y)

    def _propagate(self, x, n_levels):
        """Pass ``x`` through the first ``n_levels`` levels.

        Returns an array of shape (n_samples, size of the last processed
        level), or ``x`` as an array when ``n_levels`` is 0.
        """
        level_input = np.asarray(x)
        n_samples = len(level_input)
        for i in range(n_levels):
            data_z = np.zeros((self.level_sizes[i], n_samples))
            for j, learner in enumerate(self.learners[i]):
                data_z[j] = learner.predict(level_input)
            # One column per learner becomes the next level's input.
            level_input = data_z.transpose()
        return level_input

    def predict(self, x):
        """Predict with the fitted stack.

        No cross-validation split is involved at prediction time, so inputs
        of any length that the learners accept are supported.

        Args:
            x: Samples to predict.

        Returns:
            Array of shape (n_samples, number of learners in the last level).
        """
        return self._propagate(x, len(self.learners))

    def predict_proba(self, x):
        """Return probabilities from the last learner of the last level.

        All levels except the last one produce plain predictions, which are
        then passed to ``predict_proba`` of the final learner.

        Args:
            x: Samples to predict.

        Returns:
            Whatever the final learner's ``predict_proba`` returns.
        """
        final_input = self._propagate(x, len(self.learners) - 1)
        learner = self.learners[-1][-1]
        return learner.predict_proba(final_input)

In [14]:
# --- 第 1 部分 ---
# 載入函式庫與資料集
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn import metrics
import numpy as np
# Load the breast cancer dataset; the first 400 samples are used for
# training and the remaining ones for testing.
bc = load_breast_cancer()

train_x, test_x = bc.data[:400], bc.data[400:]
train_y, test_y = bc.target[:400], bc.target[400:]

In [10]:
# --- Part 2 ---
# Build the base learners and the meta learner.
knn = KNeighborsClassifier(n_neighbors=2)
dtr = DecisionTreeClassifier(max_depth=4, random_state=2)
mlpc = MLPClassifier(hidden_layer_sizes=(100,), random_state=2)

# First level: the base classifiers.
base_classifiers = [knn, dtr, mlpc]

# Second level: logistic regression acts as the meta learner.
meta_learners = [LogisticRegression(solver='liblinear')]


In [11]:
# Assemble the two-level stack: base classifiers first, meta learner second.
ensemble = Stacking(learner_levels=[base_classifiers, meta_learners])
ensemble

<__main__.Stacking at 0x1665a61be08>

In [12]:
# Display the learners stored on the ensemble, grouped by level.
ensemble.learners

[[KNeighborsClassifier(n_neighbors=2),
  DecisionTreeClassifier(max_depth=4, random_state=2),
  MLPClassifier(random_state=2)],
 [LogisticRegression(solver='liblinear')]]

In [16]:
# Train the stack, then score it on the held-out samples. The predictions
# are computed once and shared by all three metrics.
ensemble.fit(train_x, train_y)
test_pred = ensemble.predict(test_x)

print('Stacking f1', metrics.f1_score(test_y, test_pred))
print('Stacking recall', metrics.recall_score(test_y, test_pred))
print('Stacking acc', metrics.accuracy_score(test_y, test_pred))



Stacking f1 0.944
Stacking recall 0.9076923076923077
Stacking acc 0.9171597633136095


