# 准备工作

+ 载入必要的包

In [1]:
library(xgboost)
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(pROC)


Attaching package: 'dplyr'

The following object is masked from 'package:xgboost':

    slice

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

"package 'pROC' was built under R version 3.3.3"Type 'citation("pROC")' for a citation.

Attaching package: 'pROC'

The following objects are masked from 'package:stats':

    cov, smooth, var



+ 载入数据集

In [2]:
# Load the training set, drop every row that contains a missing value,
# then remove the first column `X` (the row index written into the CSV).
df_train <- select(
  na.omit(
    read.csv("F:/XGBoost/data/cs-training.csv", stringsAsFactors = FALSE)
  ),
  -X
)

# 一共有8个参数需要调节：
+ 1 eta[默认0.3]
+ 2 nrounds通过xgb.cv和early.stop.round控制
+ 3 max_depth[默认6]
+ 4 min.child.weight[默认1]
+ 5 gamma[默认0]
+ 6 subsample[默认1]
+ 7 colsample_bytree[默认1]
+ 8 scale.pos.weight[默认1]

## 1 在较高的`eta`下，调节`nrounds`参数

In [13]:
# Baseline parameter set used to find a good `nrounds` at a fairly high `eta`.
xgb_params_1 <- list(
  # Binary classification, scored with AUC.
  objective = "binary:logistic",
  eval_metric = "auc",

  # --- Initial values of the parameters that will be tuned later ---
  # Learning rate: start at 0.1.
  eta = 0.1,
  # Best kept within 3-10; anything in 4-6 is a reasonable starting point.
  max.depth = 5,
  # Small value chosen because the classes are imbalanced.
  min.child.weight = 1,
  # Start at 0.
  gamma = 0,
  # Most common starting value; typical range is 0.5-0.9.
  subsample = 0.8,
  # Most common starting value; typical range is 0.5-0.9.
  colsample_bytree = 0.8,
  # Classes are imbalanced; start at 1.
  scale.pos.weight = 1
)

In [79]:
# Step 1: cross-validate the baseline parameters to choose `nrounds`.
# The seed is fixed immediately before the call so the run is reproducible.
set.seed(27)
xgb_1 <- xgb.cv(
  params = xgb_params_1,

  # Features: every column except the target; labels: the target column.
  data = as.matrix(select(df_train, -SeriousDlqin2yrs)),
  label = df_train$SeriousDlqin2yrs,

  # Upper bound on boosting rounds; the notebook relies on early.stop.round
  # together with xgb.cv to settle on the actual number of rounds.
  nrounds = 1000,
  early.stop.round = 50,

  # Fixed settings: 5-fold CV, stratified because the sample is imbalanced.
  nfold = 5,
  stratified = TRUE,
  verbose = TRUE
)

[0]	train-auc:0.814889+0.016538	test-auc:0.810415+0.024144
[1]	train-auc:0.840774+0.002353	test-auc:0.836845+0.005437
[2]	train-auc:0.845075+0.003042	test-auc:0.841190+0.004491
[3]	train-auc:0.847378+0.003305	test-auc:0.843492+0.003870
[4]	train-auc:0.849461+0.001807	test-auc:0.845182+0.005201
[5]	train-auc:0.851984+0.002282	test-auc:0.847325+0.005205
[6]	train-auc:0.853262+0.002314	test-auc:0.848689+0.005162
[7]	train-auc:0.854830+0.002242	test-auc:0.849420+0.005202
[8]	train-auc:0.856047+0.002163	test-auc:0.850001+0.005481
[9]	train-auc:0.856945+0.002388	test-auc:0.850309+0.005584
[10]	train-auc:0.858131+0.002045	test-auc:0.851080+0.005656
[11]	train-auc:0.859345+0.002245	test-auc:0.851359+0.005358
[12]	train-auc:0.860676+0.001945	test-auc:0.852244+0.005809
[13]	train-auc:0.861810+0.001692	test-auc:0.852556+0.005521
[14]	train-auc:0.862849+0.001672	test-auc:0.853108+0.005436
[15]	train-auc:0.863856+0.001513	test-auc:0.853268+0.005626
[16]	train-auc:0.864709+0.001482	test-auc:0.853403

### 在`eta`为0.1时，最优的`nrounds`为77

## 2 给定`eta`、`nrounds`，进行决策树参数调优
`max.depth` 、 `min.child.weight` 、 `gamma` 、 `subsample` 、 `colsample_bytree`

### 2.1 `max.depth` 和 `min.child.weight`参数调优

In [35]:
# Parameter set for step 2.1. `max.depth` and `min.child.weight` are left out
# on purpose: the grid-search loop fills them in for each candidate pair.
xgb_params_2 <- list(
  objective = "binary:logistic",
  eval_metric = "auc",

  # Learning rate fixed at 0.1.
  eta = 0.1,
  # Number of rounds selected in step 1.
  # NOTE(review): `nrounds` is an argument of xgb.cv() itself; confirm that a
  # value stored inside `params` actually takes effect where this list is used.
  nrounds = 77,
  # Start at 0.
  gamma = 0,
  # Most common starting value; typical range is 0.5-0.9.
  subsample = 0.8,
  # Most common starting value; typical range is 0.5-0.9.
  colsample_bytree = 0.8,
  # Classes are imbalanced; start at 1.
  scale.pos.weight = 1
)

In [28]:
# Candidate values for the two tree parameters tuned in step 2.1.
max.depth <- seq(from = 3, to = 9, by = 2)
min.child.weight <- seq(from = 1, to = 5, by = 2)

# One row per (max.depth, min.child.weight) combination.
to_tune <- expand.grid(
  max.depth = max.depth,
  min.child.weight = min.child.weight
)

In [150]:
# Grid search over `max.depth` / `min.child.weight` (step 2.1).
# For every row of `to_tune`, run 5-fold stratified CV with that candidate pair
# and record the test AUC (mean and std) of the last boosting round.

# Placeholder first row (four zeros). Each candidate's row is appended after it;
# the placeholder is expected to be dropped afterwards with `[-1, ]`.
result <- vector(mode = "numeric", length = 4)
names(result) <- c("test.auc.mean", "test.auc.std", "max.depth", "min.child.weight")

# Loop invariants: build the feature matrix and label vector once instead of
# on every iteration.
train_features <- as.matrix(df_train %>% select(-SeriousDlqin2yrs))
train_labels <- df_train$SeriousDlqin2yrs

# xgb.cv() takes the number of boosting rounds through its own `nrounds`
# argument, so the value stored in the parameter list is passed explicitly.
# Falls back to 77 (the value selected in step 1) if the list has no entry.
cv_rounds <- xgb_params_2$nrounds
if (is.null(cv_rounds)) {
  cv_rounds <- 77
}

# seq_len(nrow()) yields an empty sequence for an empty grid.
for (i in seq_len(nrow(to_tune))) {
  xgb_params_2$max.depth <- to_tune$max.depth[i]
  xgb_params_2$min.child.weight <- to_tune$min.child.weight[i]

  # Re-seed before every run so each candidate starts from the same RNG state.
  set.seed(27)
  xgb_2 <- xgb.cv(
    data = train_features,
    label = train_labels,
    # `nrounds` is not a booster parameter; keep it out of `params`.
    params = xgb_params_2[names(xgb_params_2) != "nrounds"],
    nrounds = cv_rounds,

    nfold = 5,
    stratified = TRUE,
    verbose = FALSE,
    # Original author's note: this flag is needed for the AUC table to be
    # available (the result is read through `$dt` below).
    # NOTE(review): confirm against the installed xgboost version.
    prediction = TRUE
  )

  # Per-round CV metrics; only the final round is kept.
  stats <- as.data.frame(xgb_2$dt)
  last_round <- nrow(stats)
  stats_params <- c(
    stats[last_round, "test.auc.mean"],
    stats[last_round, "test.auc.std"],
    to_tune$max.depth[i],
    to_tune$min.child.weight[i]
  )
  names(stats_params) <- c("test.auc.mean", "test.auc.std", "max.depth", "min.child.weight")

  result <- rbind(result, stats_params)
}

In [168]:
# Convert the accumulated score matrix to a data frame and discard its first
# row, which is the all-zero placeholder used to start the accumulation.
result <- as.data.frame(result)
result <- result[-1, ]
result

Unnamed: 0,test.auc.mean,test.auc.std,max.depth,min.child.weight
stats_params,0.854751,0.005685,3,1
stats_params.1,0.855423,0.005471,5,1
stats_params.2,0.853199,0.005482,7,1
stats_params.3,0.850534,0.004927,9,1
stats_params.4,0.854745,0.005654,3,3
stats_params.5,0.855603,0.005375,5,3
stats_params.6,0.853897,0.005362,7,3
stats_params.7,0.850961,0.005332,9,3
stats_params.8,0.854725,0.005492,3,5
stats_params.9,0.855361,0.005456,5,5


In [169]:
# Show the parameter combination with the highest mean test AUC.
result[which.max(result[["test.auc.mean"]]), ]

Unnamed: 0,test.auc.mean,test.auc.std,max.depth,min.child.weight
stats_params.5,0.855603,0.005375,5,3
