In [None]:
import numpy as np
import pandas as pd
import os
import warnings
# Suppress every Python warning for the rest of the session (this also hides
# deprecation notices raised by the libraries imported here).
warnings.filterwarnings("ignore")

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

# 3 XGBoost模型训练

In [None]:
# Load the three pre-computed tables: training features, test features and training labels.
train, test, trainlabel = map(
    pd.read_csv,
    (
        "../input/quora-question-pairs-feature-extraction-2/train.csv",
        "../input/quora-question-pairs-feature-extraction-2/test.csv",
        "../input/quora-question-pairs-feature-extraction-2/trainlabel.csv",
    ),
)

In [None]:
# Wrap the training features and labels in XGBoost's DMatrix container,
# which is what xgb.cv / xgb.train consume below.
dtrain = xgb.DMatrix(data=train, label=trainlabel)

## 3.1 训练集和测试集具有不同的分布

留意到训练集的样本分布与测试集的有所不同。

训练集正例的比例是0.3691，将这个训练集均值作为对测试集的预测上传，Public Leaderboard上的得分为0.55410，Private Leaderboard上的得分为0.55525。根据对数损失的计算方法，可以逆推出Public的正例比例为0.174247，private的正例比例为0.176394。为了使得Public和Private榜上的得分尽可能接近，直接取两者平均，得0.175320，作为对测试集正例的估计，基于此对训练集进行适当调整。

In [None]:
# Positive rate of the training set; this constant was submitted as the prediction
# for every test row to obtain the two leaderboard scores used below.
p = 0.369197853026293

# For a constant prediction p on data with positive rate r:
#   logloss = -(r * log(p) + (1 - r) * log(1 - p))
# Solving for r gives r = (logloss + log(1 - p)) / log((1 - p) / p).
log_neg = np.log(1 - p)
log_neg_over_pos = np.log((1 - p) / p)
pos_public, pos_private = (
    (score + log_neg) / log_neg_over_pos for score in (0.55410, 0.55525)
)
average = (pos_public + pos_private) / 2
print(pos_public, pos_private, average)

XGBoost有scale_pos_weight参数可用于调整正负样本的权重。要等价于让训练集的正例比例从0.3691下降至0.1753，应让正例的权重变为原来的0.3632倍。

In [None]:
# Relative weight of a positive row that moves the positive/negative ratio
# from p : (1 - p) to average : (1 - average).
numerator = average * (1 - p)
denominator = (1 - average) * p
w0 = numerator / denominator
print(w0)

另一方面，虽然训练时logloss的计算的确会根据scale_pos_weight有所调整，也就是说scale_pos_weight参数的确可以影响到XGBoost的训练过程，但对于验证过程，用于交叉验证的xgb.cv函数却没有这样的效果，该函数的交叉验证logloss仍然是按正负例等权重的方法计算得到。因此，本文重新定义了一个加权对数损失函数，传给xgb.cv函数的feval参数用于交叉验证的损失计算，而训练过程中的损失函数则无需自定义。

要等价于让正例的比例从0.3691下降到0.1753，而正负例的总量维持不变，应对正例和负例分别施加0.4749和1.3074的权重。

In [None]:
# Per-class weights used by the custom validation metric: w1 rescales positives
# from rate p to rate `average`, w2 rescales negatives from (1 - p) to (1 - average).
w1, w2 = average / p, (1 - average) / (1 - p)
print(w1, w2)

In [None]:
def weighted_log_loss(preds, dtrain, pos_weight=None, neg_weight=None, eps=1e-15):
    """Class-weighted log loss, shaped as an ``feval`` callback for ``xgb.cv``.

    Args:
        preds: Array of predictions, expected to be probabilities in [0, 1]
            (their logs are taken directly).
        dtrain: Object exposing ``get_label()`` that returns the 0/1 labels
            aligned with ``preds``.
        pos_weight: Weight applied to positive rows. Defaults to the
            notebook-level ``w1``.
        neg_weight: Weight applied to negative rows. Defaults to the
            notebook-level ``w2``.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` before taking logs.

    Returns:
        Tuple ``("weighted_logloss", value)`` with the mean weighted loss.
    """
    if pos_weight is None:
        pos_weight = w1
    if neg_weight is None:
        neg_weight = w2
    label = dtrain.get_label()
    # Clip in float64: in float32, 1 - eps rounds back to exactly 1.0 and the
    # upper bound would have no effect.
    prob = np.clip(np.asarray(preds, dtype=np.float64), eps, 1 - eps)
    # Without clipping, a saturated prediction (exactly 0 or 1) produces
    # 0 * log(0) = nan, which turns the whole mean into nan.
    per_row = pos_weight * label * np.log(prob) + neg_weight * (1 - label) * np.log(1 - prob)
    return "weighted_logloss", -np.mean(per_row)

## 3.2 XGBoost参数调整

参数调整的流程如下：

num_boost_round → max_depth & min_child_weight → max_bin(因为有GPU加速) → subsample & colsample_bytree → eta & num_boost_round

## 3.2.1 调整基学习器数量num_boost_round

In [None]:
# Baseline configuration used to find a reasonable number of boosting rounds.
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.1,
    "max_depth": 6,
    "min_child_weight": 1,
    "gamma": 0,
    "subsample": 0.8,
    "colsample_bytree": 0.9,
    "scale_pos_weight": 0.3632,
    "tree_method": "gpu_hist",  # GPU-accelerated histogram algorithm
    "max_bin": 256,
}

# 10-fold cross-validation scored with the custom weighted log loss.
model1 = xgb.cv(
    params,
    dtrain,
    num_boost_round=2000,
    nfold=10,
    feval=weighted_log_loss,
    early_stopping_rounds=200,
    verbose_eval=50,
)

基学习器数量大约为650棵，因为验证集的加权对数损失test-weighted_logloss在650棵树以后出现了上升。

## 3.2.2 调整最大深度max_depth和min_child_weight

交叉验证通常可使用Scikit-Learn的GridSearchCV函数，但我在XGBoost的Scikit-Learn API参数页里没找到GPU相关选项，强行让tree_method="gpu_hist"又报错，所以我怀疑GridSearchCV不支持GPU使用，被迫使用for循环，并用evaluation_list记录下所有的交叉验证历史。

In [None]:
# Parameters held fixed while max_depth and min_child_weight are searched.
fix_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.1,
    "gamma": 0,
    "subsample": 0.8,
    "colsample_bytree": 0.9,
    "scale_pos_weight": 0.3632,
    "tree_method": "gpu_hist",
    "max_bin": 256,
}

# Two sub-grids evaluated in this order: shallow trees with small
# min_child_weight values first, then deeper trees with larger ones.
search_grid = [(depth, child_weight) for depth in [5, 6] for child_weight in [1, 2.5, 4]]
search_grid += [(depth, child_weight) for depth in [7, 8] for child_weight in [4, 5, 6]]

evaluation_list = []
for depth, child_weight in search_grid:
    params = {**fix_params, "max_depth": depth, "min_child_weight": child_weight}
    # evaluation holds the cross-validation metrics of every boosting round
    evaluation = xgb.cv(params, dtrain, num_boost_round = 650, nfold = 6,
                        feval = weighted_log_loss, early_stopping_rounds = 100)
    evaluation_list.append(evaluation)

# One column per parameter combination, taken from the last row of its CV history.
# Columns are labelled with the combination and the number of rows in that history,
# and the table is built with a single concat instead of growing a DataFrame in a loop.
column_labels = [
    f"depth={depth}, mcw={child_weight}, rounds={len(history)}"
    for (depth, child_weight), history in zip(search_grid, evaluation_list)
]
evaluation_panel = pd.concat(
    [history.iloc[-1] for history in evaluation_list], axis = 1, keys = column_labels
)
evaluation_panel

重点观察上表第3行test-weighted_logloss-mean，代表了在验证集上的加权对数损失。第7列至第12列的模型因为早停机制在650棵树前提前停止训练，对比前6列结果可知，第6列最优，对数损失为0.190239，代表了max_depth=6, min_child_weight=4的组合。围绕这个组合继续构建搜索范围。

In [None]:
# Parameters held fixed while the max_depth / min_child_weight search is refined
# around the best combination of the previous round.
fix_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.1,
    "gamma": 0,
    "subsample": 0.8,
    "colsample_bytree": 0.9,
    "scale_pos_weight": 0.3632,
    "tree_method": "gpu_hist",
    "max_bin": 256,
}

search_grid = [
    (depth, child_weight)
    for depth in [5, 6, 7]
    for child_weight in [3, 3.5, 4, 4.5]
]

evaluation_list = []
for depth, child_weight in search_grid:
    params = {**fix_params, "max_depth": depth, "min_child_weight": child_weight}
    evaluation = xgb.cv(params, dtrain, num_boost_round = 650, nfold = 6,
                        feval = weighted_log_loss, early_stopping_rounds = 100)
    evaluation_list.append(evaluation)

# One labelled column per combination (last row of each CV history), built with a
# single concat rather than by growing a DataFrame inside a loop.
column_labels = [
    f"depth={depth}, mcw={child_weight}, rounds={len(history)}"
    for (depth, child_weight), history in zip(search_grid, evaluation_list)
]
evaluation_panel = pd.concat(
    [history.iloc[-1] for history in evaluation_list], axis = 1, keys = column_labels
)
evaluation_panel

从上表可见，第7列最优，max_depth=6, min_child_weight=4的组合的确表现最好。

## 3.2.3 调整直方图最大箱子数max_bin

In [None]:
# Parameters held fixed while the histogram bin count max_bin is searched.
fix_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.1,
    "gamma": 0,
    "subsample": 0.8,
    "colsample_bytree": 0.9,
    "scale_pos_weight": 0.3632,
    "tree_method": "gpu_hist",
    "max_depth": 6,
    "min_child_weight": 4,
}

max_bin_grid = [200, 230, 256, 280]

evaluation_list = []
# The loop variable is named max_bin rather than `bin` so the builtin bin() is not
# shadowed for the rest of the notebook session.
for max_bin in max_bin_grid:
    params = {**fix_params, "max_bin": max_bin}
    evaluation = xgb.cv(params, dtrain, num_boost_round = 650, nfold = 6,
                        feval = weighted_log_loss, early_stopping_rounds = 100)
    evaluation_list.append(evaluation)

# One labelled column per max_bin value (last row of each CV history), built with a
# single concat rather than by growing a DataFrame inside a loop.
column_labels = [
    f"max_bin={max_bin}, rounds={len(history)}"
    for max_bin, history in zip(max_bin_grid, evaluation_list)
]
evaluation_panel = pd.concat(
    [history.iloc[-1] for history in evaluation_list], axis = 1, keys = column_labels
)
evaluation_panel

由上表可见，默认参数max_bin=256为最优值，但箱子数从200变化到230再变化到256时，对数损失先升后降，并不稳定，于是本文在这个范围内又选取了多个值进行尝试。

In [None]:
# Parameters held fixed for the second max_bin round. eta and min_child_weight are
# set to the same values as in the first max_bin round (0.1 and 4) so that the two
# rounds can be compared directly; they were previously 0.08 and 3.5 here, which
# made the cross-round comparison of log losses invalid.
fix_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.1,
    "gamma": 0,
    "subsample": 0.8,
    "colsample_bytree": 0.9,
    "scale_pos_weight": 0.3632,
    "tree_method": "gpu_hist",
    "max_depth": 6,
    "min_child_weight": 4,
}

max_bin_grid = [220, 240, 270]

evaluation_list = []
# The loop variable is named max_bin rather than `bin` so the builtin bin() is not
# shadowed for the rest of the notebook session.
for max_bin in max_bin_grid:
    params = {**fix_params, "max_bin": max_bin}
    evaluation = xgb.cv(params, dtrain, num_boost_round = 650, nfold = 6,
                        feval = weighted_log_loss, early_stopping_rounds = 100)
    evaluation_list.append(evaluation)

# One labelled column per max_bin value (last row of each CV history), built with a
# single concat rather than by growing a DataFrame inside a loop.
column_labels = [
    f"max_bin={max_bin}, rounds={len(history)}"
    for max_bin, history in zip(max_bin_grid, evaluation_list)
]
evaluation_panel = pd.concat(
    [history.iloc[-1] for history in evaluation_list], axis = 1, keys = column_labels
)
evaluation_panel

比较两轮调参的结果，选取max_bin=256的确最优。

## 3.2.4 调整行采样率subsample和列采样率colsample_bytree

In [None]:
# Parameters held fixed while the row (subsample) and column (colsample_bytree)
# sampling rates are searched.
fix_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.1,
    "gamma": 0,
    "scale_pos_weight": 0.3632,
    "tree_method": "gpu_hist",
    "max_depth": 6,
    "min_child_weight": 4,
    "max_bin": 256,
}

search_grid = [(row, col) for row in [0.7, 0.8, 0.9] for col in [0.7, 0.8, 0.9]]

evaluation_list = []
for row, col in search_grid:
    params = {**fix_params, "subsample": row, "colsample_bytree": col}
    evaluation = xgb.cv(params, dtrain, num_boost_round = 650, nfold = 6,
                        feval = weighted_log_loss, early_stopping_rounds = 100)
    evaluation_list.append(evaluation)

# One labelled column per combination (last row of each CV history), built with a
# single concat rather than by growing a DataFrame inside a loop.
column_labels = [
    f"subsample={row}, colsample={col}, rounds={len(history)}"
    for (row, col), history in zip(search_grid, evaluation_list)
]
evaluation_panel = pd.concat(
    [history.iloc[-1] for history in evaluation_list], axis = 1, keys = column_labels
)
evaluation_panel

由上表可见，第6列的组合最优，行采样率取0.8，列采样率取0.9，进一步构造搜索范围。

In [None]:
# Parameters held fixed while the sampling-rate search is refined around the best
# combination of the previous round.
fix_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.1,
    "gamma": 0,
    "scale_pos_weight": 0.3632,
    "tree_method": "gpu_hist",
    "max_depth": 6,
    "min_child_weight": 4,
    "max_bin": 256,
}

search_grid = [(row, col) for row in [0.75, 0.8, 0.85] for col in [0.85, 0.9]]

evaluation_list = []
for row, col in search_grid:
    params = {**fix_params, "subsample": row, "colsample_bytree": col}
    evaluation = xgb.cv(params, dtrain, num_boost_round = 650, nfold = 6,
                        feval = weighted_log_loss, early_stopping_rounds = 100)
    evaluation_list.append(evaluation)

# One labelled column per combination (last row of each CV history), built with a
# single concat rather than by growing a DataFrame inside a loop.
column_labels = [
    f"subsample={row}, colsample={col}, rounds={len(history)}"
    for (row, col), history in zip(search_grid, evaluation_list)
]
evaluation_panel = pd.concat(
    [history.iloc[-1] for history in evaluation_list], axis = 1, keys = column_labels
)
evaluation_panel

最优的行采样率为0.8，列采样率为0.9。

## 3.2.5 调整学习率eta和基学习器数量num_boost_round

In [None]:
# Tuned configuration evaluated with learning rate eta = 0.06.
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.06,
    "gamma": 0,
    "scale_pos_weight": 0.3632,
    "tree_method": "gpu_hist",
    "max_depth": 6,
    "min_child_weight": 4,
    "max_bin": 256,
    "subsample": 0.8,
    "colsample_bytree": 0.9,
}

# 10-fold cross-validation scored with the custom weighted log loss.
model6 = xgb.cv(
    params,
    dtrain,
    num_boost_round=6000,
    nfold=10,
    feval=weighted_log_loss,
    early_stopping_rounds=150,
    verbose_eval=50,
)

In [None]:
# Tuned configuration evaluated with learning rate eta = 0.04.
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.04,
    "gamma": 0,
    "scale_pos_weight": 0.3632,
    "tree_method": "gpu_hist",
    "max_depth": 6,
    "min_child_weight": 4,
    "max_bin": 256,
    "subsample": 0.8,
    "colsample_bytree": 0.9,
}

# 10-fold cross-validation scored with the custom weighted log loss.
model4 = xgb.cv(
    params,
    dtrain,
    num_boost_round=6000,
    nfold=10,
    feval=weighted_log_loss,
    early_stopping_rounds=150,
    verbose_eval=50,
)

In [None]:
# Tuned configuration evaluated with learning rate eta = 0.02.
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.02,
    "gamma": 0,
    "scale_pos_weight": 0.3632,
    "tree_method": "gpu_hist",
    "max_depth": 6,
    "min_child_weight": 4,
    "max_bin": 256,
    "subsample": 0.8,
    "colsample_bytree": 0.9,
}

# 10-fold cross-validation scored with the custom weighted log loss.
model2 = xgb.cv(
    params,
    dtrain,
    num_boost_round=6000,
    nfold=10,
    feval=weighted_log_loss,
    early_stopping_rounds=150,
    verbose_eval=50,
)

## 3.3 训练最终模型并提交

上一节中的模型均在到达允许的最大基学习器数量前早停，观察它们在验证集上的加权对数损失，明显可见当学习率为0.02时最优。验证集的加权对数损失在4450棵树以后开始上升，为了防止过拟合，本文适当减少了基学习器数量，最终同时提交了基学习器数量为3600、3800和4100的模型。

**实测无论是在Public榜单还是Private榜单中，3600的表现都是最好的，**其Public Scores为0.16898，排名626，Private Scores为0.17358，排名626。

In [None]:
# Final configuration used to train the submitted models.
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.02,
    "gamma": 0,
    "scale_pos_weight": 0.3632,
    "tree_method": "gpu_hist",
    "max_depth": 6,
    "min_child_weight": 4,
    "max_bin": 256,
    "subsample": 0.8,
    "colsample_bytree": 0.9,
}

# Test features in XGBoost's DMatrix container (no labels).
dtest = xgb.DMatrix(data=test)

# Original competition test file; its test_id column is used in the submissions.
t = pd.read_csv("../input/quora-question-pairs/test.csv")

In [None]:
# Train on the full training set for 3600 rounds and score the test set.
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=3600)
prediction = model.predict(dtest)

# Submission file: one row per test_id with its predicted duplicate probability.
sub = pd.DataFrame({'test_id': t["test_id"], 'is_duplicate': prediction})
sub.to_csv('submission3600.csv', index=False)

In [None]:
# Train on the full training set for 3800 rounds and score the test set.
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=3800)
prediction = model.predict(dtest)

# Submission file: one row per test_id with its predicted duplicate probability.
sub = pd.DataFrame({'test_id': t["test_id"], 'is_duplicate': prediction})
sub.to_csv('submission3800.csv', index=False)

In [None]:
# Train on the full training set for 4100 rounds and score the test set.
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=4100)
prediction = model.predict(dtest)

# Submission file: one row per test_id with its predicted duplicate probability.
sub = pd.DataFrame({'test_id': t["test_id"], 'is_duplicate': prediction})
sub.to_csv('submission4100.csv', index=False)