In [10]:
# 匯入函式庫
# from sklearn.base import BaseEstimator
from copy import deepcopy
from sklearn.model_selection import KFold
import numpy as np


class Stacking():
    """Multi-level stacking ensemble for learners with a scikit-learn style API.

    ``learner_levels`` is a list of levels; each level is a list of learners
    exposing ``fit(x, y)`` and ``predict(x)``.  Level 0 is trained on the
    original features; every later level is trained on the out-of-fold
    predictions produced by the level before it.

    Parameters
    ----------
    learner_levels : list of lists of learners
        The learners of each level, first level first.  They are deep-copied,
        so the objects passed in are never fitted or modified.
    n_splits : int, default 3
        Number of folds used in ``fit`` to build the out-of-fold predictions
        that become the next level's training data.
    """

    def __init__(self, learner_levels, n_splits=3):
        self.learner_levels = learner_levels
        self.n_splits = n_splits
        # Number of learners in each level.
        self.level_sizes = []
        # Private deep copies of the learners, grouped by level.
        self.learners = []
        for learning_level in self.learner_levels:
            self.level_sizes.append(len(learning_level))
            self.learners.append([deepcopy(learner) for learner in learning_level])

    def fit(self, x, y):
        """Train every level, feeding level i-1 predictions into level i.

        ``x`` and ``y`` are indexed with integer index arrays, so they must
        support that kind of indexing (e.g. NumPy arrays).
        """
        # The first level is trained on the original data.
        meta_data = [x]
        meta_targets = [y]
        n_samples = len(x)
        for i in range(len(self.learners)):
            level_size = self.level_sizes[i]

            # Storage for this level's out-of-fold predictions (one row per
            # learner) and for the matching targets.
            data_z = np.zeros((level_size, n_samples))
            target_z = np.zeros(n_samples)

            # Training set of the current level.
            train_x = meta_data[i]
            train_y = meta_targets[i]

            # Cross-validation: every sample is predicted by learners that
            # did not see it during training.
            KF = KFold(n_splits=self.n_splits)
            for train_indices, test_indices in KF.split(x):
                for j in range(len(self.learners[i])):
                    learner = self.learners[i][j]
                    # Train on the other folds, predict the held-out fold.
                    learner.fit(train_x[train_indices], train_y[train_indices])
                    predictions = learner.predict(train_x[test_indices])
                    # Unshuffled KFold yields contiguous, in-order test folds,
                    # so writing at test_indices fills the row front to back.
                    data_z[j][test_indices] = predictions

                target_z[test_indices] = train_y[test_indices]

            # One row per sample, one column per learner: this becomes the
            # training data of the next level.
            meta_data.append(data_z.transpose())
            meta_targets.append(target_z)

            # Refit the level's learners on the complete training data so
            # they are ready for prediction.
            for learner in self.learners[i]:
                learner.fit(train_x, train_y)

    def _level_predictions(self, level, level_input):
        """Return level ``level``'s predictions as an (n_samples, n_learners) array."""
        data_z = np.zeros((self.level_sizes[level], len(level_input)))
        for j, learner in enumerate(self.learners[level]):
            data_z[j] = learner.predict(level_input)
        return data_z.transpose()

    def predict(self, x):
        """Propagate ``x`` through every level and return the last level's output.

        The result is a 2-D array with one column per learner of the last
        level (a single column when the last level holds one meta learner).
        """
        # The fitted learners are simply applied level by level; no
        # cross-validation is needed (or performed) at prediction time.
        level_input = x
        for i in range(len(self.learners)):
            level_input = self._level_predictions(i, level_input)
        return level_input

    def predict_proba(self, x):
        """Return ``predict_proba`` of the last learner of the last level.

        All levels but the last are applied with ``predict``; their output is
        handed to the final learner's ``predict_proba``.
        """
        level_input = x
        for i in range(len(self.learners) - 1):
            level_input = self._level_predictions(i, level_input)

        learner = self.learners[-1][-1]
        return learner.predict_proba(level_input)

In [11]:
# --- 第 1 部分 ---
# 載入函式庫與資料集
import numpy as np
import pandas as pd

#from stacking_classifier import Stacking
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Fix the global NumPy seed so the split below is reproducible.
np.random.seed(123456)
data = pd.read_csv('../Data/creditcard.csv')

# Rescale Time by its minimum and standard deviation.
time_column = data['Time']
data['Time'] = (time_column - time_column.min()) / time_column.std()

# Standardize Amount with its mean and standard deviation.
amount_column = data['Amount']
data['Amount'] = (amount_column - amount_column.mean()) / amount_column.std()

# Split the data into a 70% training set and a 30% test set.
features = data.drop('Class', axis=1).values
labels = data.Class.values
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)


In [12]:
# --- Part 2 ---
# Build, train and evaluate the two-level stacking ensemble.
base_classifiers = [DecisionTreeClassifier(max_depth = 10),
                    GaussianNB(),
                    LogisticRegression(solver = 'liblinear')]

meta_learners = [LogisticRegression(solver = 'liblinear')]

ensemble = Stacking(learner_levels = [base_classifiers,
                                      meta_learners])
ensemble.fit(x_train, y_train)

# Predict once and reuse the result for both metrics instead of running the
# whole ensemble over the test set twice.
test_predictions = ensemble.predict(x_test)
print('Stacking f1', metrics.f1_score(y_test, test_predictions))
print('Stacking recall', metrics.recall_score(y_test, test_predictions))


Stacking f1 0.814516129032258
Stacking recall 0.7426470588235294


# 評估原始訓練資料

In [7]:
# --- Part 3 ---
# Feature selection: keep only the features whose absolute correlation with
# the target column 'Class' exceeds the threshold.
threshold = 0.1
correlations = data.corr()['Class'].drop('Class')
fs = list(correlations[(abs(correlations) > threshold)].index.values)
fs.append('Class')
# NOTE: this rebinds `data` to the filtered frame (kept as in the original so
# the notebook state is unchanged); re-run the loading cell to get the full
# frame back.
data = data[fs]

x_train_f, x_test_f, y_train_f, y_test_f = train_test_split(data.drop('Class', axis=1).values, 
                                                            data.Class.values, 
                                                            test_size=0.3)
ensemble = Stacking(learner_levels = [base_classifiers,
                                      meta_learners])
ensemble.fit(x_train_f, y_train_f)

# Predict once and reuse the result for both metrics.
test_predictions_f = ensemble.predict(x_test_f)
print('Stacking f1', metrics.f1_score(y_test_f, test_predictions_f))
print('Stacking recall', metrics.recall_score(y_test_f, test_predictions_f))


Stacking f1 0.8253968253968255
Stacking recall 0.7703703703703704


# 另外添加 2 顆決策樹基學習器（最大深度分別為 6 和 7），評估集成在原始訓練資料與過濾低相關性特徵後的資料上的效能

In [8]:
# --- Part 4 ---
# Add more base learners (two extra decision trees of depth 7 and 6).
base_classifiers = [DecisionTreeClassifier(max_depth = 10),
                    DecisionTreeClassifier(max_depth = 7),
                    DecisionTreeClassifier(max_depth = 6),
                    GaussianNB(),
                    LogisticRegression(solver = 'liblinear')]

# Evaluate the same ensemble layout first on the original data, then on the
# data restricted to the correlation-filtered features.
evaluation_sets = [('原始訓練資料:', x_train, y_train, x_test, y_test),
                   ('過濾低相關性的資料:', x_train_f, y_train_f, x_test_f, y_test_f)]

for set_label, fit_x, fit_y, eval_x, eval_y in evaluation_sets:
    ensemble = Stacking(learner_levels = [base_classifiers,
                                          meta_learners])
    ensemble.fit(fit_x, fit_y)
    print(set_label)
    # Predict once per data set and reuse the result for both metrics.
    eval_predictions = ensemble.predict(eval_x)
    print('Stacking f1', metrics.f1_score(eval_y, eval_predictions))
    print('Stacking recall', metrics.recall_score(eval_y, eval_predictions))

原始訓練資料:
Stacking f1 0.8455284552845528
Stacking recall 0.7647058823529411
過濾低相關性的資料:
Stacking f1 0.8494208494208493
Stacking recall 0.8148148148148148


# 除了添加 2 顆決策樹（最大深度分別為 6 和 7）之外，再多堆疊一層基學習器（包含一顆深度為 2 的決策樹和一個線性支援向量機），評估集成在原始訓練資料與過濾低相關性特徵後的資料上的效能

In [9]:
# --- Part 5 ---
# Add an intermediate level between the base learners and the meta learner.
base_classifiers = [DecisionTreeClassifier(max_depth = 10),
                    DecisionTreeClassifier(max_depth = 7),
                    DecisionTreeClassifier(max_depth = 6),
                    GaussianNB(),
                    LogisticRegression(solver = 'liblinear')]

second_learners = [DecisionTreeClassifier(max_depth = 2),
                   LinearSVC()]

# Evaluate the three-level ensemble first on the original data, then on the
# data restricted to the correlation-filtered features.
evaluation_sets = [('原始訓練資料:', x_train, y_train, x_test, y_test),
                   ('過濾低相關性的資料:', x_train_f, y_train_f, x_test_f, y_test_f)]

for set_label, fit_x, fit_y, eval_x, eval_y in evaluation_sets:
    ensemble = Stacking(learner_levels = [base_classifiers,
                                          second_learners,
                                          meta_learners])
    ensemble.fit(fit_x, fit_y)
    print(set_label)
    # Predict once per data set and reuse the result for both metrics.
    eval_predictions = ensemble.predict(eval_x)
    print('Stacking f1', metrics.f1_score(eval_y, eval_predictions))
    print('Stacking recall', metrics.recall_score(eval_y, eval_predictions))

原始訓練資料:
Stacking f1 0.8489795918367347
Stacking recall 0.7647058823529411
過濾低相關性的資料:
Stacking f1 0.840466926070039
Stacking recall 0.8
