In [None]:
import pandas as pd
# Load the training table; the first CSV column is used as the row index.
train_csv = pd.read_csv(
    "../input/gdrc-2021/train.csv",
    index_col=0,
)
train_csv

In [None]:
# Separate the "flag" label column from the remaining feature columns.
X = train_csv.drop(columns=["flag"])
y = train_csv["flag"]

In [None]:
del train_csv

In [None]:
X

In [None]:
X.describe()

In [None]:
X.shape, y.shape

In [None]:
(y == 1).sum(), (y == 0).sum()

In [None]:
# There are fewer positive samples than negative samples.
train_positive_num = (y == 1).sum()
train_negative_num = (y == 0).sum()

# Negative-to-positive count ratio, rounded to two decimals.
scale_pos_weight = round(train_negative_num / train_positive_num, 2)
scale_pos_weight

# xgboost 基础版

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

In [None]:
seed = 7
test_size = 0.33

In [None]:
# Hold out a `test_size` fraction of the rows for evaluation; `seed` fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

In [None]:
def fit(X, y, params):
    """Build an XGBClassifier from `params` and train it on (X, y).

    Args:
        X: Training features passed straight to `XGBClassifier.fit`.
        y: Training labels passed straight to `XGBClassifier.fit`.
        params: Keyword arguments for the `XGBClassifier` constructor.

    Returns:
        The fitted classifier.
    """
    # The scikit-learn style `fit` returns the estimator itself.
    return XGBClassifier(**params).fit(X, y)

In [None]:
# Baseline configuration: no class re-weighting (scale_pos_weight=1), with AUC
# as the evaluation metric. An alternative 'metric' key was left disabled.
params = dict(
    tree_method="gpu_hist",
    objective="binary:logistic",
    scale_pos_weight=1,
    reg_alpha=1,
    eval_metric="auc",
    use_label_encoder=False,
)

model = fit(X_train, y_train, params)
print(model)

In [None]:
from sklearn import metrics
def predict(model, X, y):
    """Predict labels for X and score them against the true labels y.

    Args:
        model: Fitted binary classifier exposing `predict`, and optionally
            `predict_proba`.
        X: Feature matrix to predict on.
        y: True binary labels for X.

    Returns:
        Tuple `(predictions, accuracy, auc, f1)` where `predictions` is a list
        of predicted labels.
    """
    # Hard class labels drive the threshold-dependent metrics.
    y_pred = model.predict(X)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y, predictions)
    f1 = metrics.f1_score(y, predictions)

    # ROC AUC is a ranking metric: scoring it on 0/1 labels collapses the curve
    # to a single operating point. Use the positive-class probability when the
    # model provides one, and fall back to the hard labels otherwise.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X)[:, 1]
    else:
        scores = predictions
    auc = metrics.roc_auc_score(y, scores)
    return predictions, accuracy, auc, f1

In [None]:
# Score the baseline model on the held-out split; the predictions are discarded.
_, accuracy, auc, f1 = predict(model, X_test, y_test)
print(f"Accuracy: {accuracy * 100.0:.2f}%")
print(f"AUC: {auc}")
print(f"f1: {f1}")
# Accuracy: 96.91% (baseline version, imbalanced)

# 保存和加载模型

In [None]:
import pickle

In [None]:
# save model to file
def save_model(model, name = ""):
    """Pickle `model` to the file `<name>.model`.

    Args:
        model: Any picklable object (here, a fitted classifier).
        name: Path prefix of the output file; ".model" is appended.
    """
    # The context manager flushes and closes the handle even if pickling fails.
    with open(name + ".model", "wb") as model_file:
        pickle.dump(model, model_file)

In [None]:
# save model to file
save_model(model, "basic")

In [None]:
# Load the model back from the file written by save_model(model, "basic").
# pickle.load can execute arbitrary code, so only load files this notebook produced.
with open("basic.model", "rb") as model_file:
    select_model = pickle.load(model_file)

# 可视化树的模型

In [None]:
from graphviz import Digraph
from xgboost import plot_tree

In [None]:
# Draw a tree of the fitted model on an oversized canvas and save it as a PNG.
fig, ax = plt.subplots(figsize=(200, 50))
plot_tree(model, ax=ax)
fig.savefig("Tree from Top to Bottom.png", bbox_inches="tight")

# 交叉验证

交叉验证在选取超参数时非常重要，首先载入 KFold, StratifiedKFold 和 cross_val_score。其中

* KFold 适用于二分类且类别平衡
* StratifiedKFold 适用于多分类或类别不平衡
* cross_val_score 计算一些指标

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

这里用 5 折交叉验证，分别用 KFold 和 StratifiedKFold 来跑。

In [None]:
# # CV model for binary class or balanced class
# kfold = KFold(n_splits=5)
# results = cross_val_score(model, X, y, cv=kfold)
# print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
# # Accuracy: 87.62% (20.59%)
# # Accuracy: 84.88% (19.23%) 用了 scale_pos_weight

In [None]:
# # CV model for multi-class or imbalanced class
# kfold = StratifiedKFold(n_splits=5)
# results = cross_val_score(model, X, y, cv=kfold, scoring="f1")
# # print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
# print("F1: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
# # Accuracy: 96.92% (0.02%)
# # Accuracy: 95.23% (0.04%) 用了 scale_pos_weight
# # f1 85.71% (0.16%)

# 特征选择

In [None]:
from xgboost import plot_importance
from sklearn.feature_selection import SelectFromModel

In [None]:
# feature importance
print(model.feature_importances_)
# # feature
# feature = X.columns.tolist()
# print(feature)

In [None]:
# # manually plot
# df = pd.DataFrame(data=model.feature_importances_, index=feature).T
# df.plot.bar(figsize=(12,6));

In [None]:
# Sweep fixed importance cut-offs. For each one, keep only the features whose
# importance in the already fitted `model` reaches the cut-off, retrain with the
# same `params`, and report the held-out metrics.
# (Alternative: thresholds = np.unique(np.sort(model.feature_importances_)))
thresholds = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
for c in thresholds:
    # prefit=True reuses `model` as-is instead of refitting it.
    selection = SelectFromModel(estimator=model, threshold=c, prefit=True)
    select_X_train = selection.transform(X_train)
    select_X_test = selection.transform(X_test)

    # Retrain on the reduced feature set, then evaluate it.
    selection_model = fit(select_X_train, y_train, params)
    _, accuracy, auc, f1 = predict(selection_model, select_X_test, y_test)

    print(f"Threshold = {c}, n = {select_X_train.shape[1]}, Accuracy: {accuracy}, AUC: {auc}, f1: {f1}")
    
# Threshold = 1e-06, n = 255, Accuracy: 0.9691210575139146, AUC: 0.9210358357663757, f1: 0.8565374324348592
# Threshold = 1e-05, n = 255, Accuracy: 0.9691210575139146, AUC: 0.9210358357663757, f1: 0.8565374324348592
# Threshold = 0.0001, n = 254, Accuracy: 0.969515306122449, AUC: 0.9211297368251411, f1: 0.8580683474599146
# Threshold = 0.001, n = 89, Accuracy: 0.9693568336425479, AUC: 0.9210568475063077, f1: 0.8574408401064518
# Threshold = 0.01, n = 6, Accuracy: 0.9562654607297465, AUC: 0.8923275690520918, f1: 0.7990266602724642
# Threshold = 0.1, n = 1, Accuracy: 0.9201530612244898, AUC: 0.9459398223696546, f1: 0.7243909597886704

In [None]:
# Compromise choice: Threshold = 0.001 (n = 89) removes as many features as
# possible while losing little accuracy.
selection = SelectFromModel(estimator=model, threshold=1e-3, prefit=True)
X_select = selection.transform(X)
(X_select.shape, y.shape)

In [None]:
del selection_model, select_X_train, select_X_test, model

In [None]:
del X_train, X_test, y_train, y_test

# 调节超参数

XGBoost 的设置有三种参数：一般参数，提升参数和学习参数。

* 一般参数 取决于提升器，通常是树或线性模型
* 提升参数 取决于选择的提升器的相关参数
* 学习参数 取决于指定学习任务和相应的学习目标

## 一般参数 (general parameters)

* booster：选择提升器，默认是 gbtree（基于树的模型）
* silent：是否静默运行，默认为 0，即打印运行信息；设为 1 时不打印
* nthread：线程数，默认为最大可用线程数
* num_pbuffer：缓冲区大小，默认为训练实例的数量
* num_feature：特征维度，默认为特征的最高维

## 提升参数 (booster parameters)

* eta：学习率，范围 [0, 1]，默认为 0.3。该参数越小，计算速度越慢；该参数越大，有可能无法收敛
* gamma：控制叶子个数的参数，范围 [0, +∞)，默认为 0。该参数越大，越不容易过拟合
* max_depth：每棵树的最大深度，范围 [0, +∞)，默认为 6。该参数越大，越容易过拟合
* min_child_weight：每个叶子里面的最小权重和，范围 [0, +∞)，默认为 1。该参数越大，越不容易过拟合
* subsample：样本采样比率，范围 (0, 1]，默认为 1。如果取 0.5 代表随机用 50% 的样本集用来训练
* colsample_bytree：列采样比率，范围 (0, 1]，默认为 1。对每棵树的生成用的特征进行列采样，类似于随机森林的列采样
* lambda：L2 正则化参数，范围 [0, +∞)，默认为 1。该参数越大，越不容易过拟合。
* alpha：L1 正则化参数，范围 [0, +∞)，默认为 0。该参数越大，越不容易过拟合。
* scale_pos_weight：控制正反类的平衡参数，范围 [0, +∞)，默认为 1。该参数通常设为“反类的总和/正类的总和”

## 学习参数 (learning parameters)

* objective：损失函数，默认为 reg:linear（新版本中更名为 reg:squarederror）。其他常见类型有：
* reg:logistic – 逻辑回归
* binary:logistic – 二分类概率
* multi:softmax – 多分类
* multi:softprob – 多分类概率
* rank:pairwise – 排序
* base_score：预测分数，默认为 0.5。最初每个样例的预测分数。
* eval_metric：评估指标。该指标用在验证集上，比如回归任务默认的是 rmse；分类任务默认为 error；排序任务默认为 map。其他常见类型有：
* rmse – root mean square error
* mae – mean absolute error
* logloss – negative log-likelihood
* error – binary classification error rate
* merror – multiclass classification error rate
* mlogloss – multiclass logloss
* auc – area under the curve
* map – mean average precision
* seed：随机种子，默认为 0，用于产生可复现的结果

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Base estimator for the searches below. scale_pos_weight and reg_alpha are
# left unset here (the baseline set them explicitly).
params = dict(
    tree_method="gpu_hist",
    objective="binary:logistic",
    eval_metric="auc",
    use_label_encoder=False,
)
model = XGBClassifier(**params)

## GridSearchCV（粗调）
XGBoost 整个过程就是一个按顺序加树的过程，因此树的个数和树的深度绝对算是一组重要的超参数。除此之外，本次给定的数据集存在正负样本不均衡的问题，因此正负样本的权重也被算作是一个超参数

1. 树的个数
1. 树的深度
1. 学习率
1. 正负样本权重

In [None]:
def plot(grid_result, param_grid, param, flods, log = False):
    """Summarise a one-parameter grid search as a mean curve and per-value boxplots.

    Args:
        grid_result: Fitted search object exposing `best_params_`,
            `best_score_` and `cv_results_`.
        param_grid: The grid passed to the search; `param_grid[param]` lists
            the values that were tried.
        param: Name of the single hyper-parameter that was searched.
        flods: Number of CV folds used by the search, i.e. how many
            `split{i}_test_score` entries to read from `cv_results_`.
        log: If True, draw the x-axis of the mean-score curve on a log scale.
    """
    print('最优参数：',grid_result.best_params_)
    print('最佳模型得分：',grid_result.best_score_)

    values = param_grid[param]
    cv_results = grid_result.cv_results_

    plt.figure(figsize=[24,8])

    # Left panel: mean test score for each candidate value.
    plt.subplot(1,2,1)
    plt.plot(values, cv_results["mean_test_score"])
    plt.xlabel(param)
    plt.ylabel("f1")
    if log:
        plt.xscale('log')

    # Right panel: spread of the per-fold scores for each candidate value.
    # Rows are folds and columns are candidates. The fold count comes from
    # `flods` rather than a hard-coded 5, so other fold counts work too.
    split_scores = np.array([cv_results[f"split{i}_test_score"] for i in range(flods)])
    plt.subplot(1,2,2)
    plt.boxplot([split_scores[:, j] for j in range(len(values))], labels = values)
    plt.xlabel(param)
    plt.ylabel("f1")
    plt.show()

### 调树的个数

In [None]:
# Coarse search over the number of trees only (max_depth and learning_rate are
# searched in their own cells).
param_grid = {"n_estimators": range(50, 1000, 100)}

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1",
    n_jobs=-1,
    cv=kfold,
    verbose=3,
)
grid_result = grid_search.fit(X_select, y)

In [None]:
plot(grid_result, param_grid, "n_estimators", flods=5)

### 调树的深度

In [None]:
# Coarse search over the tree depth only (odd depths from 1 to 15).
param_grid = {"max_depth": range(1, 16, 2)}

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1",
    n_jobs=-1,
    cv=kfold,
    verbose=3,
)
grid_result = grid_search.fit(X_select, y)

In [None]:
plot(grid_result, param_grid, "max_depth", flods=5)

### 调学习率

In [None]:
# Coarse search over the learning rate only.
param_grid = {"learning_rate": [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5]}

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1",
    n_jobs=-1,
    cv=kfold,
    verbose=3,
)
grid_result = grid_search.fit(X_select, y)

In [None]:
plot(grid_result, param_grid, "learning_rate", flods=5, log=True)

### 正负样本权重

In [None]:
# Coarse search over the positive-class weight only.
param_grid = {"scale_pos_weight": [1e-2, 1e-1, 1, 2, 5, 10]}

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1",
    n_jobs=-1,
    cv=kfold,
    verbose=3,
)
grid_result = grid_search.fit(X_select, y)

In [None]:
plot(grid_result, param_grid, "scale_pos_weight", flods=5, log=True)

## 确定精调的搜索范围为
* n_estimators : [300, 400]
* max_depth : [8, 12]
* learning_rate : [0.1, 0.5]
* scale_pos_weight : [1, 3]

# skopt（精调）

根据粗调给定的参数范围进行细微调整

In [None]:
# use hyperopt and your search space would be something like this
# space ={
# 'max_depth': hp.choice("max_depth", np.arange(2,15,1)),
# 'min_child_weight': hp.quniform ('min_child_weight', 1, 20, 1),
# 'subsample': hp.uniform ('subsample', 0.5 ,1),
# 'gamma' : hp.uniform ('gamma', 0,0.5),
# 'colsample_bytree' : hp.uniform ('colsample_bytree', 0.4,0.99),
# 'reg_lambda' : hp.uniform ('reg_lambda', 0,10),
# 'reg_alpha':hp.uniform('reg_alpha',10,80),
# 'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
# 'n_estimators':hp.quniform('n_estimators',60,300,1)
# }

# for more details you can see hyperopt documentation

In [None]:
from skopt.space import Real, Integer, Categorical
from skopt import gp_minimize
from skopt.utils import use_named_args
from skopt.plots import plot_objective, plot_evaluations
from skopt import callbacks
from skopt.callbacks import CheckpointSaver

n_features = X_select.shape[1]

# Hyper-parameters to optimise. Each dimension carries its bounds and the
# matching estimator keyword as `name`. The order of this list fixes the order
# of the values in the optimiser's result vector, so it must not change.
# Disabled alternatives: Integer(1, 10) for scale_pos_weight,
# Integer(1, 20) min_child_weight, log-uniform Real(1e-5, 1e0) or a Categorical
# grid for learning_rate, and max_features / min_samples_split / min_samples_leaf.
space = [
    Real(1, 3, name="scale_pos_weight"),
    Integer(8, 12, name="max_depth"),
    Real(0.5, 1, name="subsample"),
    Real(0, 0.5, name="gamma"),
    Real(0.4, 1, name="colsample_bytree"),
    Integer(0, 10, name="reg_lambda"),
    Integer(0, 10, name="reg_alpha"),
    Integer(300, 400, name="n_estimators"),
    Real(0.1, 0.5, name="learning_rate"),
]

# This decorator lets the objective function receive the parameters as keyword
# arguments, which is convenient for passing them on to a scikit-learn style
# estimator.
@use_named_args(space)
def tune_xgbc(**params):
    """Objective for the optimiser: negated mean 5-fold F1 of an XGBClassifier.

    Args:
        **params: One value per dimension in `space`, keyed by dimension name.

    Returns:
        The mean F1 over the folds, rounded to three decimals and negated so
        that minimising the objective maximises F1.
    """
    # Fixed settings shared by every candidate.
    params.update(
        tree_method="gpu_hist",
        objective="binary:logistic",
        use_label_encoder=False,
        eval_metric="auc",
    )

    classifier = XGBClassifier(**params)
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
    fold_scores = cross_val_score(classifier, X_select, y, cv=folds, scoring="f1")
    mean_f1 = fold_scores.mean()
    return -round(mean_f1, 3)

In [None]:
# Checkpoint the optimiser state through a callback; extra keyword arguments
# such as compress are passed to `skopt.dump`.
checkpoint_saver = CheckpointSaver("./checkpoint_tune_xgbc.pkl", compress=9)
res_gp = gp_minimize(
    func=tune_xgbc,
    dimensions=space,
    n_calls=200,
    random_state=0,
    callback=[checkpoint_saver],
    verbose=True,
)

In [None]:
%matplotlib inline
plot_objective(res_gp)
plt.show()

In [None]:
%matplotlib inline
plot_evaluations(res_gp)
plt.show()

In [None]:
# # 加载训练进程继续训练
# from skopt import load
# res = load('./checkpoint_tune_xgbc.pkl')
# res.fun
# x0 = res.x_iters
# y0 = res.func_vals
# checkpoint_saver = CheckpointSaver("./checkpoint_tune_xgbc.pkl", compress=9) # keyword arguments will be passed to `skopt.dump`
# res_gp = gp_minimize(tune_xgbc, space, n_calls=20, random_state=0, callback=[checkpoint_saver], x0=x0, y0=y0, verbose=True)

In [None]:
# 精调最优超参数
res_gp.x

# 提交答案

In [None]:
# # 粗调超参数选择
# params = {"tree_method" : "gpu_hist", 
#           "objective" : "binary:logistic",
#           'n_estimators': 350,
#           'max_depth': 11,
#           'eval_metric':'auc',
#           "use_label_encoder": False}
# model = XGBClassifier(**params)
# kfold = StratifiedKFold(n_splits=5)
# results = cross_val_score(model, X_select, y, cv=kfold, scoring="f1")
# # print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
# print("F1: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
# # F1: 86.17% (0.12%) train all
# # F1: 86.09% (0.22%) train select 

In [None]:
# Best hyper-parameters from the fine-tuning run: pair each search dimension's
# name with the corresponding entry of the optimiser's best point.
params = {dimension.name: value for dimension, value in zip(space, res_gp.x)}
print(params)

# Fixed settings, identical to the ones used inside the tuning objective.
params.update(
    tree_method="gpu_hist",
    objective="binary:logistic",
    use_label_encoder=False,
    eval_metric="auc",
)

model = XGBClassifier(**params)
# NOTE(review): these folds are unshuffled, whereas the tuning objective used
# shuffle=True, random_state=7. The scores are therefore not directly comparable.
kfold = StratifiedKFold(n_splits=5)
results = cross_val_score(model, X_select, y, cv=kfold, scoring="f1")
print(f"F1: {results.mean()*100:.2f}% ({results.std()*100:.2f}%)")

In [None]:
model.fit(X_select, y)

In [None]:
save_model(model, "final")

In [None]:
del X, y, X_select

In [None]:
import gc
gc.collect()

In [None]:
test_csv = pd.read_csv("../input/gdrc-2021/test.csv", index_col=0)
test_csv

In [None]:
X_select = selection.transform(test_csv)
X_select.shape

In [None]:
test_label = pd.read_csv("../input/gdrc-2021/test_label.csv")
test_label

In [None]:
# Score the final model on the test set. `y` is rebound to the predicted labels,
# which are written out as the submission further down.
# NOTE(review): test_label is read with pd.read_csv without index_col or column
# selection. Confirm it holds only the label column, aligned with test_csv rows.
y, accuracy, auc, f1 = predict(model, X_select, test_label)
print(f"Accuracy: {accuracy * 100.0:.2f}%")
print(f"AUC: {auc}")
print(f"f1: {f1}")

In [None]:
test_csv.index

In [None]:
result = pd.DataFrame(data=y, index = test_csv.index)
result

In [None]:
result.to_csv("result.csv")

In [None]:
pd.read_csv("./result.csv", index_col=0)

# 参考文献
https://zhuanlan.zhihu.com/p/33948430

https://zhuanlan.zhihu.com/p/31182879