In [None]:
# --- 第 1 部分 ---
# 載入函式庫與資料集
from sklearn.datasets import load_digits
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import validation_curve
from sklearn.model_selection import train_test_split
from sklearn import metrics
import warnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Silence every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Seed NumPy's global RNG; the train_test_split call below is given no
# random_state of its own.
np.random.seed(123456)

data = pd.read_csv('../Data/creditcard.csv')

# Rescale the two columns in place: each is shifted and then divided by its
# own standard deviation.
# NOTE(review): Time is shifted by its minimum while Amount is shifted by its
# mean -- confirm that this asymmetry is intended.
data['Time'] = data['Time'].sub(data['Time'].min()).div(data['Time'].std())
data['Amount'] = data['Amount'].sub(data['Amount'].mean()).div(data['Amount'].std())

# Hold out 30% of the rows as the test set; the remaining 70% are for training.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns='Class').values,
    data['Class'].values,
    test_size=0.3,
)

# --- Part 2 ---
# Compute training and cross-validation F1 scores over a range of ensemble sizes.
x, y = x_train, y_train
# random_state is fixed explicitly: with n_jobs=-1 the fits may be dispatched
# to worker processes, where the np.random.seed() call made in this process is
# not guaranteed to apply, so the curve would otherwise vary between runs.
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.2
# and removed in 1.4; it is kept here to match whatever version this notebook
# targets -- confirm and rename when upgrading.
learner = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=10),
                            oob_score=True,  # default is False; True also computes the out-of-bag score
                            random_state=123456)
# Validation curve over ensembles of 5 to 30 base learners.
param_range = list(range(5, 31))
train_scores, test_scores = validation_curve(
    learner, x, y,
    param_name='n_estimators',
    param_range=param_range,
    cv=10,
    scoring="f1",
    n_jobs=-1)

# --- Part 3 ---
# For every value in param_range, reduce the scores along axis 1 to their
# mean and standard deviation.
train_scores_mean, train_scores_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_scores_mean, test_scores_std = test_scores.mean(axis=1), test_scores.std(axis=1)


# --- Part 4 ---
# Draw the validation curves.
plt.figure(figsize=(8, 8))
plt.title('Validation curves')

# One entry per curve, in drawing order: (mean, std, colour, legend label).
curves = [
    (train_scores_mean, train_scores_std, "C1", "Training score"),
    (test_scores_mean, test_scores_std, "C0", "Cross-validation score"),
]

# Shaded band of one standard deviation on either side of each mean.
for curve_mean, curve_std, curve_colour, curve_label in curves:
    plt.fill_between(param_range, curve_mean - curve_std,
                     curve_mean + curve_std, alpha=0.1, color=curve_colour)

# Mean score of each curve, drawn as a marked line.
for curve_mean, curve_std, curve_colour, curve_label in curves:
    plt.plot(param_range, curve_mean, 'o-', color=curve_colour, label=curve_label)

plt.xticks(param_range)
plt.xlabel('Ensemble Size')
plt.ylabel('F1 Score')
plt.legend(loc="best")

In [None]:
# --- Part 5 ---
# Fit the ensemble on the original (unfiltered) training data:
# 7 bagged decision trees, each limited to depth 10.
ensemble = BaggingClassifier(n_estimators=7,
                             base_estimator=DecisionTreeClassifier(max_depth=10))
ensemble.fit(x_train, y_train)

# Predict once and reuse the result for both metrics instead of running the
# whole test set through the ensemble twice.
test_predictions = ensemble.predict(x_test)
print('原始訓練資料:')
print('Bagging f1', metrics.f1_score(y_test, test_predictions))
print('Bagging recall', metrics.recall_score(y_test, test_predictions))


In [None]:
# --- Part 6 ---
# Drop the features that are only weakly correlated with the label, then
# refit the same ensemble on the reduced feature set.
threshold = 0.1

# Column names in the order that was used to build x_train / x_test.
feature_names = data.drop('Class', axis=1).columns
assert len(feature_names) == x_train.shape[1], \
    "`data` no longer matches x_train; re-run the data-loading cell"

# Measure the correlations on the training rows only, so the held-out test
# rows have no influence on which features are kept.
train_frame = pd.DataFrame(x_train, columns=feature_names)
train_frame['Class'] = y_train
correlations = train_frame.corr()['Class'].drop('Class')

# Boolean mask over the feature columns, plus the matching list of names
# (with 'Class' appended, as a column list for the filtered data set).
keep = (correlations.abs() > threshold).values
fs = list(feature_names[keep])
fs.append('Class')

# Reuse the existing split, column-filtered, rather than drawing a new one:
# the scores below are then measured on the same test rows as the unfiltered
# model. `data` itself is left untouched so this cell can be re-run safely.
x_train_f, x_test_f = x_train[:, keep], x_test[:, keep]
y_train_f, y_test_f = y_train, y_test

ensemble = BaggingClassifier(n_estimators=7,
                             base_estimator=DecisionTreeClassifier(max_depth=10))
ensemble.fit(x_train_f, y_train_f)

# Predict once and reuse the result for both metrics.
test_predictions_f = ensemble.predict(x_test_f)
print('過濾低相關性的資料:')
print('Bagging f1', metrics.f1_score(y_test_f, test_predictions_f))
print('Bagging recall', metrics.recall_score(y_test_f, test_predictions_f))


In [None]:
# --- Part 7 ---
# Raise the maximum tree depth from 10 to 15 and repeat both experiments.


def _fit_and_report(title, x_fit, y_fit, x_eval, y_eval, max_depth=15, n_estimators=7):
    """Fit a bagged decision-tree ensemble and print its F1 and recall.

    Parameters
    ----------
    title : str
        Heading printed before the two metric lines.
    x_fit, y_fit
        Features and labels passed to ``fit``.
    x_eval, y_eval
        Features passed to ``predict`` and the labels the predictions are
        scored against.
    max_depth : int
        ``max_depth`` of each ``DecisionTreeClassifier`` base learner.
    n_estimators : int
        Number of base learners in the ``BaggingClassifier``.

    Returns
    -------
    BaggingClassifier
        The fitted ensemble.
    """
    model = BaggingClassifier(n_estimators=n_estimators,
                              base_estimator=DecisionTreeClassifier(max_depth=max_depth))
    model.fit(x_fit, y_fit)
    # Predict once and reuse the result for both metrics.
    predictions = model.predict(x_eval)
    print(title)
    print('Bagging f1', metrics.f1_score(y_eval, predictions))
    print('Bagging recall', metrics.recall_score(y_eval, predictions))
    return model


# Original training data.
ensemble = _fit_and_report('原始訓練資料:', x_train, y_train, x_test, y_test)

# Data with the low-correlation features filtered out.
ensemble = _fit_and_report('過濾低相關性的資料:', x_train_f, y_train_f, x_test_f, y_test_f)