In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **一.准备工作**
## 1.1导入相关库

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

## 1.2读取数据

In [None]:
# Load the training and test CSV files of the GiveMeSomeCredit dataset.
data_train = pd.read_csv("../input/GiveMeSomeCredit/cs-training.csv")
data_test = pd.read_csv("../input/GiveMeSomeCredit/cs-test.csv")
# Preview the first rows to check that the data was read successfully
data_train.head()

## 1.3 对照dictionary 简单了解一下数据的含义
####       &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;  SeriousDlqin2yrs:是否有超过90天或更严重的贷款拖欠问题
####       &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;  RevolvingUtilizationOfUnsecuredLines：除去房贷车贷的贷款金额/信用卡总贷款额度
####       &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;  age：借款人的年龄
####       &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;  NumberOfTime30-59DaysPastDueNotWorse:过去两年内借款人发生30-59天的贷款拖欠问题的次数
####       &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;  DebtRatio：负债率（生活花费/总收入）
####       &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;  MonthlyIncome:月收入
####       &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;  NumberOfOpenCreditLinesAndLoans：总共贷过多少款（例如有几个车贷房贷/有几张信用卡）
####       &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;  NumberOfTime60-89DaysPastDueNotWorse:过去两年内借款人发生60-89天的贷款拖欠问题的次数
####       &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;  NumberOfDependents:家属人数（除去自己）

## 1.4查看描述性统计信息 准备对数据进行预处理

In [None]:
# Column dtypes and non-null counts of the training set (reveals which columns have missing values).
data_train.info()

In [None]:
# Summary statistics (count, mean, std, quantiles, min/max) of the numeric training columns.
data_train.describe()

# **二.数据预处理**

第一列应该是序号（ID），原数据集中该列的标题是 "Unnamed: 0"，改一下看着舒服点

In [None]:
# The first CSV column has no header and is read as "Unnamed: 0"; expose it as "ID" in both frames.
id_column_map = {"Unnamed: 0": "ID"}
data_train = data_train.rename(columns=id_column_map)
data_test = data_test.rename(columns=id_column_map)
data_train.head()

对要预测的变量SeriousDlqin2yrs先作一个简单的观察

In [None]:
# Bar chart of the class counts of the target SeriousDlqin2yrs
sns.countplot(x="SeriousDlqin2yrs",data=data_train)

发现数据是极度不平衡的，所以在最后使用随机森林时要注意设置**class_weight**参数来解决数据不平衡的问题

## 2.1缺失值处理

缺失值的处理方法一般包括：

- 直接使用含有缺失值的属性（不处理）；
- 删除含有缺失值的属性；
- 删除含有缺失值的样本；
- 缺失值补全：均值插补、建模预测，中位数填充等 

根据1.4中的描述性统计，训练集和测试集在NumberOfDependents和MonthlyIncome上有空缺值；

下面分别进行分析

In [None]:
# Number of missing values per training column.
data_train.isnull().sum()

### 2.1.1 MonthlyIncome
从上表中可以发现MonthlyIncome缺失值较多（29731/150000$\approx$19.8%）  
所以不能直接删除含有缺失值的样本，考虑填充缺失值。  
因为随机森林不容易过拟合且对于有大量缺失值的数据能进行有效的估计与处理，所以此处我们使用随机森林进行回归预测，用预测值填充缺失的月收入。

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Impute the missing MonthlyIncome values of the training set with a random-forest regressor.
#
# Predictors are selected by name instead of by position so the cell does not silently
# break if the column order changes. Deliberately excluded:
#   - the row ID,
#   - SeriousDlqin2yrs: it is the label, and it is not available when the test set is
#     imputed, so using it here would leak the label into a feature and make the imputed
#     incomes of train and test inconsistent,
#   - NumberOfDependents: it still contains missing values at this point.
income_features = [
    "RevolvingUtilizationOfUnsecuredLines",
    "age",
    "NumberOfTime30-59DaysPastDueNotWorse",
    "DebtRatio",
    "NumberOfOpenCreditLinesAndLoans",
    "NumberOfTimes90DaysLate",
    "NumberRealEstateLoansOrLines",
    "NumberOfTime60-89DaysPastDueNotWorse",
]
data_randomforest = data_train[income_features + ["MonthlyIncome"]]

# Split the rows by whether MonthlyIncome is present.
income_missing = data_train["MonthlyIncome"].isnull()
known = data_randomforest[~income_missing]
unknown = data_randomforest[income_missing]

# Regressor inputs; the target is passed as a 1-D array, which is what scikit-learn expects.
X_know = known[income_features].values
Y_know = known["MonthlyIncome"].values
X_forpredict = unknown[income_features].values

# Fit the imputation model.
rfr = RandomForestRegressor(random_state=0, n_estimators=200, max_depth=3, n_jobs=-1)
rfr.fit(X_know, Y_know)

# Predict and write back. Incomes in the raw data are whole numbers, so round to 0 decimals.
# The guard keeps the cell re-runnable once nothing is missing any more
# (predicting on an empty array raises).
if income_missing.any():
    Y_predicted = rfr.predict(X_forpredict).round(0)
    data_train.loc[income_missing, "MonthlyIncome"] = Y_predicted

# Check that MonthlyIncome has no missing values left.
data_train.info()

对测试集也做类似填充，只需注意测试集中SeriousDlqin2yrs是空缺值（即待预测的量），所以它不能作为随机森林的输入变量

In [None]:
# Impute the missing MonthlyIncome values of the test set with a random-forest regressor.
#
# Predictors are selected by name instead of by position. Excluded: the row ID, the
# (empty) label column SeriousDlqin2yrs, and NumberOfDependents, which still contains
# missing values at this point.
income_features = [
    "RevolvingUtilizationOfUnsecuredLines",
    "age",
    "NumberOfTime30-59DaysPastDueNotWorse",
    "DebtRatio",
    "NumberOfOpenCreditLinesAndLoans",
    "NumberOfTimes90DaysLate",
    "NumberRealEstateLoansOrLines",
    "NumberOfTime60-89DaysPastDueNotWorse",
]
data_randomforest = data_test[income_features + ["MonthlyIncome"]]

# Split the rows by whether MonthlyIncome is present.
income_missing = data_test["MonthlyIncome"].isnull()
known = data_randomforest[~income_missing]
unknown = data_randomforest[income_missing]

# Regressor inputs; the target is passed as a 1-D array, which is what scikit-learn expects.
X_know = known[income_features].values
Y_know = known["MonthlyIncome"].values
X_forpredict = unknown[income_features].values

# Fit the imputation model.
rfr = RandomForestRegressor(random_state=0, n_estimators=200, max_depth=3, n_jobs=-1)
rfr.fit(X_know, Y_know)

# Predict and write back. Incomes in the raw data are whole numbers, so round to 0 decimals.
# The guard keeps the cell re-runnable once nothing is missing any more.
if income_missing.any():
    Y_predicted = rfr.predict(X_forpredict).round(0)
    data_test.loc[income_missing, "MonthlyIncome"] = Y_predicted

# Check that MonthlyIncome has no missing values left.
data_test.info()

### 2.1.2 NumberOfDependents
从上表中可以发现NumberOfDependents缺失值较少（3924/150000$\approx$2.6%）  
所以考虑直接使用fillna函数，用中位数填充缺失值。

In [None]:
# Fill missing NumberOfDependents with each frame's own median.
# The result is assigned back to the column: calling fillna(..., inplace=True) on a column
# selection is chained assignment, which warns in recent pandas and leaves the frame
# unchanged when Copy-on-Write is enabled.
data_train['NumberOfDependents'] = data_train['NumberOfDependents'].fillna(
    data_train['NumberOfDependents'].median())
data_test['NumberOfDependents'] = data_test['NumberOfDependents'].fillna(
    data_test['NumberOfDependents'].median())
data_test.info()

## 2.2重复值处理
直接删除重复值即可

In [None]:
# Drop duplicated records. The row identifier is unique for every row, so it has to be
# left out of the comparison; otherwise no two rows can ever compare equal and
# drop_duplicates() silently removes nothing.
# Both the raw and the renamed identifier column names are excluded.
feature_columns = [col for col in data_train.columns if col not in ("ID", "Unnamed: 0")]
data_train = data_train.drop_duplicates(subset=feature_columns)
data_train.info()

## 2.3异常值处理

异常值的处理方法一般包括：

- 删除含有异常值的样本
- 将异常值视为缺失值，应用缺失值处理方法
- 用平均值来修正
- 不处理

先做一个分位数统计

In [None]:
# Quantile overview of every numeric column. Selecting only 'int64' would hide the float
# columns (DebtRatio, MonthlyIncome, RevolvingUtilizationOfUnsecuredLines,
# NumberOfDependents), which are exactly the ones examined for outliers below.
data_train.select_dtypes('number').describe().transpose()[['min', '25%', '50%', '75%', 'max']]

发现其中不少特征的最大最小值都比较异常，需要进一步处理

先定义两个作图的函数，方便后续观察分析

In [None]:
from sklearn.feature_selection import mutual_info_classif
def plot_distributions_discrete(feature):
    """Draw a two-panel overview of a discrete feature.

    Left panel: density histograms (with KDE) of ``feature`` in the global
    ``data_train`` and ``data_test`` frames, overlaid.
    Right panel: box plot of ``feature`` per ``SeriousDlqin2yrs`` class on
    ``data_train``; its title shows the mutual-information score between the
    feature (treated as discrete) and the label.

    Parameters
    ----------
    feature : str
        Name of a column present in both ``data_train`` and ``data_test``.
    """
    _, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(12,4))

    # Left: train vs. test distribution.
    overlays = (
        (data_train, 'Train', 'steelblue', 0.6),
        (data_test, 'Test', 'gold', 0.25),
    )
    for frame, tag, colour, opacity in overlays:
        sns.histplot(frame[feature], kde=True, label=tag, stat='density',
                     discrete=True, color=colour, alpha=opacity, ax=ax_dist)
    ax_dist.legend()
    ax_dist.set_title('Distr Train set vs Distr Test set')

    # Right: feature distribution per label class.
    sns.boxplot(x='SeriousDlqin2yrs', y=feature, data=data_train, ax=ax_box, palette=['seagreen', 'tan'])

    # Mutual information is computed on the rows where the feature is present.
    observed = data_train[[feature]].dropna()
    labels = data_train.loc[observed.index, 'SeriousDlqin2yrs']
    mi_scores = mutual_info_classif(observed, labels, discrete_features=True,
                                    random_state=0)
    ax_box.set_title('Distribution depending on the SeriousDlqin2yrs\n-> MI Score : ' + str(round(mi_scores[0], 7)))

    plt.suptitle('"{}" distributions'.format(feature), y=1.15)

In [None]:
def plot_distributions_continuous(feature):
    """Draw a two-panel overview of a continuous feature.

    Left panel: kernel density estimates of ``log1p(feature)`` in the global
    ``data_train`` and ``data_test`` frames, overlaid.
    Right panel: box plot of the raw ``feature`` per ``SeriousDlqin2yrs`` class
    on ``data_train``; its title shows the mutual-information score between
    the feature and the label.

    Parameters
    ----------
    feature : str
        Name of a column present in both ``data_train`` and ``data_test``.
    """
    _, axes = plt.subplots(1, 2, figsize=(12,4))

    # `fill=True` replaces the deprecated `shade=True` keyword of seaborn.kdeplot.
    sns.kdeplot(data_train[feature].apply(np.log1p), label='Train',
                color='steelblue', alpha=0.5, fill=True, edgecolor='k', ax=axes[0])
    sns.kdeplot(data_test[feature].apply(np.log1p), label='Test',
                color='gold', alpha=0.3, fill=True, edgecolor='k', ax=axes[0])

    axes[0].legend()
    axes[0].set_title('Distr Train set vs Distr Test set')
    axes[0].set_xlabel('log-{}'.format(feature))

    sns.boxplot(x='SeriousDlqin2yrs', y=data_train[feature],
                data=data_train, ax=axes[1], palette=['seagreen', 'tan'])

    # Mutual information is computed on the rows where the feature is present.
    X = data_train[[feature]].dropna()
    MI = mutual_info_classif(X, data_train.loc[X.index, 'SeriousDlqin2yrs'], random_state=0)
    axes[1].set_title('Distribution depending on the SeriousDlqin2yrs\n-> MI Score : ' + str(round(MI[0], 7)))

    plt.suptitle('"{}" distributions'.format(feature), y=1.15)

### 2.3.1 age异常值处理

In [None]:
# Train/test distribution of age and its relation to the label.
plot_distributions_discrete('age')

从图中可以发现
- 训练集和测试集的年龄分布基本类似（蓝色是训练集，黄色是测试集，在图上叠加起来显示绿色）
- 有严重违约情况的人群年龄整体偏低
- 年龄整体符合正态分布

处理age，根据法律，20岁以上才可以办理信用卡，所以20岁以下可以认为是错误数据，看看数量

In [None]:
# Show the training rows with an age below 20.
data_train[data_train["age"]<20]

发现只有一个案例，直接删除即可

In [None]:
# Keep only the rows with age >= 20; younger ages are treated as erroneous records.
age_is_valid = data_train["age"] >= 20
data_train = data_train.loc[age_is_valid]

### 2.3.2 DebtRatio

In [None]:
# Train/test distribution of DebtRatio; the y-range of the current (last created) axes
# is limited to [0, 2] so the bulk of the box plot is readable.
plot_distributions_continuous('DebtRatio')
plt.gca().set_ylim(0, 2);

- 负债率正常来讲不会太高，主要集中在1附近（左图中取了对数，也就是0附近）
- 考虑根据左图将取对数后大于9的离群点进行处理

In [None]:
# Number of training rows whose DebtRatio exceeds e**9 (i.e. log value above 9).
print(data_train[data_train["DebtRatio"]>np.e**9].shape[0])

占比较小，直接删去即可

In [None]:
# Remove the rows whose DebtRatio exceeds e**9, then show how many rows remain.
debt_ratio_ok = data_train["DebtRatio"] <= np.e**9
data_train = data_train.loc[debt_ratio_ok]
len(data_train)

### 2.3.3 MonthlyIncome
做类似分析和处理

In [None]:
# Train/test distribution of MonthlyIncome; the y-range of the current (last created)
# axes is limited to [0, 2e4].
plot_distributions_continuous('MonthlyIncome')
plt.gca().set_ylim(0, 2e4);

In [None]:
# Number of training rows whose MonthlyIncome exceeds e**11.5.
data_train[data_train["MonthlyIncome"]>np.e**11.5].shape[0]

In [None]:
# Remove the rows whose MonthlyIncome exceeds e**11.5.
# `<=` makes this filter the exact complement of the `>` count in the previous cell
# (the strict `<` would also drop a row sitting exactly on the threshold).
data_train = data_train[data_train["MonthlyIncome"]<=np.e**11.5]
# Show how many rows remain
data_train.shape[0]

### 2.3.4 NumberOfDependents

In [None]:
# Train/test distribution of NumberOfDependents; the y-range of the current (last
# created) axes is limited to [0, 10].
plot_distributions_discrete('NumberOfDependents')
plt.gca().set_ylim(0, 10);

In [None]:
# Number of training rows with more than 10 dependents.
data_train[data_train["NumberOfDependents"]>10].shape[0]

In [None]:
# Keep the rows with at most 10 dependents, then show how many rows remain.
dependents_ok = data_train["NumberOfDependents"] <= 10
data_train = data_train.loc[dependents_ok]
len(data_train)

### 2.3.5  RevolvingUtilizationOfUnsecuredLines

In [None]:
# Train/test distribution of RevolvingUtilizationOfUnsecuredLines; the y-range of the
# current (last created) axes is limited to [0, 3].
plot_distributions_continuous('RevolvingUtilizationOfUnsecuredLines')
plt.gca().set_ylim(0, 3);

In [None]:
# Number of training rows whose revolving utilization exceeds 1.5.
data_train[data_train["RevolvingUtilizationOfUnsecuredLines"]>1.5].shape[0]

In [None]:
# Keep the rows whose revolving utilization is at most 1.5, then show how many rows remain.
utilization_ok = data_train["RevolvingUtilizationOfUnsecuredLines"] <= 1.5
data_train = data_train.loc[utilization_ok]
len(data_train)

### 2.3.6 Pastdue
因为三者的含义类似，所以放在一起分析

In [None]:
# Box plots of the three past-due counters side by side
plt.figure(figsize=(20, 10)) 
data_train[['NumberOfTime30-59DaysPastDueNotWorse', 
          'NumberOfTime60-89DaysPastDueNotWorse',
          'NumberOfTimes90DaysLate']].boxplot()
plt.show()

发现离群点都是类似的，可以将他们一起删除

In [None]:
# Remove the outlier rows of the three past-due counters.
# The threshold is applied to all three columns instead of relying on the outliers
# of the 60-89 and 90+ columns happening to sit on the same rows as the 30-59 ones.
past_due_columns = ['NumberOfTime30-59DaysPastDueNotWorse',
                    'NumberOfTime60-89DaysPastDueNotWorse',
                    'NumberOfTimes90DaysLate']
data_train = data_train[(data_train[past_due_columns] < 95).all(axis=1)]
data_train.shape[0]

### 2.3.7 NumberOfOpenCreditLinesAndLoans

In [None]:
# Train/test distribution of NumberOfOpenCreditLinesAndLoans; the y-range of the current
# (last created) axes is limited to [0, 30].
plot_distributions_discrete('NumberOfOpenCreditLinesAndLoans')
plt.gca().set_ylim(0, 30);

In [None]:
# Keep the rows with fewer than 20 open credit lines/loans, then show how many rows remain.
credit_lines_ok = data_train['NumberOfOpenCreditLinesAndLoans'] < 20
data_train = data_train.loc[credit_lines_ok]
len(data_train)

## 2.4  查看处理后的结果

In [None]:
# Histogram of every column to get a rough picture of the cleaned data
data_train.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Pairwise correlations of all columns, drawn as an annotated heat map
corr = data_train.corr()
plt.figure(figsize=(20, 15))
sns.heatmap(corr, annot=True, fmt='.2g')

从相关系数图中可以看出因变量SeriousDlqin2yrs与
- RevolvingUtilizationOfUnsecuredLines
- NumberOfTime30-59DaysPastDueNotWorse
- NumberOfTime60-89DaysPastDueNotWorse
- NumberOfTimes90DaysLate  
相关性较强

# 三.**变量处理**

为了方便后续处理，先将训练集，测试集上的自变量因变量划分清楚

In [None]:
# Positional split: column 1 is taken as the label and every column from position 2
# onward as a feature (column 0, the row identifier, is left out).
FIRST_FEATURE_POSITION = 2
Y_train = data_train.iloc[:, 1]
X_train = data_train.iloc[:, FIRST_FEATURE_POSITION:]
X_test = data_test.iloc[:, FIRST_FEATURE_POSITION:]
X_test.info()

## 3.1变量分箱
分箱离散化后,可以降低异常值的影响，在分箱后，我们还计算了Woe和IV  

- Woe全称叫Weight of Evidence，常用在风险评估、授信评分卡等领域。

- IV全称是Information value，可通过woe加权求和得到，衡量自变量对应变量的预测能力。

### 3.1.1最优分箱
对于连续变量直接使用最优分箱方法即可

In [None]:
import scipy.stats as stats

def monoto_bin(Y, X, n):
    """Quantile-bin ``X`` with a monotonic trend in ``Y`` and compute WoE / IV.

    Starting from ``n`` quantile buckets, the bucket count is reduced by one per
    iteration until the Spearman correlation between the per-bucket mean of ``X``
    and the per-bucket mean of ``Y`` is no longer smaller than 1 in absolute value.

    Parameters
    ----------
    Y : pandas.Series
        Named label series; ``Y.sum()`` is used as the "good" total and
        ``Y.count() - Y.sum()`` as the "bad" total.
    X : pandas.Series
        Named feature series to be binned.
    n : int
        Initial number of quantile buckets.

    Returns
    -------
    tuple
        ``(d4, iv, cut, woe)``: the per-bucket summary frame sorted by the bucket
        minimum, the information value, the list of cut points (quantiles of ``X``
        rounded to 4 decimals, enclosed by -inf and +inf) and the per-bucket WoE
        values rounded to 3 decimals.
    """
    r = 0
    total_good = Y.sum()
    total_bad = Y.count() - total_good

    # Shrink the number of buckets until the bucket means are perfectly rank-correlated.
    while np.abs(r) < 1:
        frame = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n)})
        grouped = frame.groupby('Bucket', as_index = True)
        bucket_means = grouped.mean()
        r, _p_value = stats.spearmanr(bucket_means.X, bucket_means.Y)
        n -= 1

    # Per-bucket summary: range of X, sum of Y and row count.
    lower_col = 'min_' + X.name
    upper_col = 'max_' + X.name
    summary = pd.DataFrame({lower_col: grouped.min().X})
    summary[upper_col] = grouped.max().X
    summary[Y.name] = grouped.sum().Y
    summary['total'] = grouped.count().Y

    # Share of each class falling into the bucket, and the weight of evidence.
    summary['goodattr'] = summary[Y.name] / total_good
    summary['badattr'] = (summary['total'] - summary[Y.name]) / total_bad
    summary['woe'] = np.log(summary['goodattr'] / summary['badattr'])

    # Information value: WoE weighted by the difference of the two shares.
    iv = ((summary['goodattr'] - summary['badattr']) * summary['woe']).sum()

    d4 = summary.sort_values(by = lower_col).reset_index(drop = True)
    print ("=" * 80)
    print (d4)

    # n was decremented once more after the last binning, so n + 1 is the final bucket count.
    inner_edges = [round(X.quantile(i / (n + 1)), 4) for i in range(1, n + 1)]
    cut = [float('-inf')] + inner_edges + [float('inf')]
    woe = list(d4['woe'].round(3))
    return d4, iv, cut, woe

In [None]:
# Monotonic quantile binning of the four continuous features, starting from 10 buckets.
x1_d,x1_iv,x1_cut,x1_woe = monoto_bin(Y_train,X_train.RevolvingUtilizationOfUnsecuredLines,10)
x2_d,x2_iv,x2_cut,x2_woe = monoto_bin(Y_train,X_train.age,10)
x4_d,x4_iv,x4_cut,x4_woe = monoto_bin(Y_train,X_train.DebtRatio,10)
x5_d,x5_iv,x5_cut,x5_woe = monoto_bin(Y_train,X_train.MonthlyIncome,10)
# The numeric suffix is the position of the feature (e.g. x2 is the second feature, age)

对于不能直接最优分箱的，我们手动设置边界进行分箱，先定义一个分箱函数

In [None]:
def fenxiang(X,Y,CUT):
    """Bin ``X`` at the given edges and compute per-bin WoE and the total IV.

    Parameters
    ----------
    X : pandas.Series
        Feature values to be binned.
    Y : pandas.Series
        Label series; ``Y.sum()`` is used as the "good" total and
        ``Y.count() - Y.sum()`` as the "bad" total.
    CUT : list
        Bin edges passed to ``pandas.cut``.

    Returns
    -------
    tuple
        ``(d4, iv, woe)``: the per-bin summary frame sorted by the bin minimum,
        the information value and the per-bin WoE values rounded to 3 decimals.
    """
    # Tag every row with its bucket and group on it.
    tagged = pd.DataFrame({'X':X,'Y':Y,'Bucket':pd.cut(X,CUT)})
    by_bucket = tagged.groupby('Bucket', as_index = True)

    # Per-bucket summary: range of X, sum / count / mean of Y.
    summary = pd.DataFrame({
        "min": by_bucket.min().X,
        "max": by_bucket.max().X,
        "sum": by_bucket.sum().Y,
        "total": by_bucket.count().Y,
        "rate": by_bucket.mean().Y,
    })

    # WoE: bucket odds relative to the overall odds; IV: WoE weighted by the share difference.
    good = Y.sum()
    bad = Y.count() - good
    bucket_odds = summary['rate'] / (1 - summary['rate'])
    summary['woe'] = np.log(bucket_odds / (good / bad))
    summary['goodattribute'] = summary['sum'] / good
    summary['badattribute'] = (summary['total'] - summary['sum']) / bad
    share_gap = summary['goodattribute'] - summary['badattribute']
    iv = (share_gap * summary['woe']).sum()

    d4 = summary.sort_values(by='min')
    print("=" * 60)
    print(d4)
    woe = list(d4['woe'].round(3))
    return d4, iv,woe

再对剩余变量进行分箱

In [None]:
# Hand-picked bin edges for the features that are binned manually.
# The suffix is the feature position (x3, x6, ...); every list is enclosed by -inf / +inf.
doubling_steps = (0, 1, 2, 4, 8)
cutx3 = [-np.inf, *doubling_steps, np.inf]
cutx6 = [-np.inf, *range(17), np.inf]
cutx7 = [-np.inf, *doubling_steps, np.inf]
cutx8 = [-np.inf, *range(6), np.inf]
cutx9 = [-np.inf, *doubling_steps, np.inf]
cutx10 = [-np.inf, *range(7), np.inf]

In [None]:
# Manual binning of the remaining features; the suffix is the feature position.
x3_d,x3_iv,x3_woe = fenxiang(X_train["NumberOfTime30-59DaysPastDueNotWorse"],Y_train,cutx3)
x6_d,x6_iv,x6_woe = fenxiang(X_train["NumberOfOpenCreditLinesAndLoans"],Y_train,cutx6)
x7_d,x7_iv,x7_woe = fenxiang(X_train["NumberOfTimes90DaysLate"],Y_train,cutx7)
# The 8th feature is NumberRealEstateLoansOrLines (it was binned as
# NumberOfOpenCreditLinesAndLoans a second time, so its IV was never computed).
x8_d,x8_iv,x8_woe = fenxiang(X_train["NumberRealEstateLoansOrLines"],Y_train,cutx8)
x9_d,x9_iv,x9_woe = fenxiang(X_train["NumberOfTime60-89DaysPastDueNotWorse"],Y_train,cutx9)
x10_d,x10_iv,x10_woe = fenxiang(X_train["NumberOfDependents"],Y_train,cutx10)

### 3.2选取变量
查看各个变量的IV值

In [None]:
# Collect the information value of every feature (in feature order) and draw a bar chart.
informationValue = [
    x1_iv, x2_iv, x3_iv, x4_iv, x5_iv,
    x6_iv, x7_iv, x8_iv, x9_iv, x10_iv,
]
plt.figure(figsize=(20, 10))
index = [
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]
index_num = range(len(index))
ax = plt.bar(index_num, informationValue, tick_label=index)
plt.xticks(rotation=90)
plt.show()

IV值可以用来衡量自变量对于因变量的响应能力，具体量化指标如下：
- $\leq$ 0.02: useless for prediction
- 0.02 to 0.1: weak predictor
- 0.1 to 0.3: medium predictor
- 0.3 to 0.5: strong predictor
- $\geq$ 0.5: suspicious or too good to be true  
舍弃掉衡量能力较差的特征



In [None]:
# Remove the feature judged too weak from both feature matrices.
dropped_features = ["NumberOfDependents"]
X_train_last = X_train.drop(columns=dropped_features)
X_test_last = X_test.drop(columns=dropped_features)
X_test_last.head()

# **四.模型预测**

为了评价模型以及后续调参，我们先定义一个ROC曲线绘制函数

In [None]:
# ROC curve plotting helper
def draw_roc(FPR, TPR, label=None):
    """Plot a ROC curve on a new 8x6 figure together with the chance diagonal.

    Parameters
    ----------
    FPR : array-like
        False positive rates (x values).
    TPR : array-like
        True positive rates (y values).
    label : str, optional
        Legend entry for the curve. A legend is drawn only when a label is given;
        previously the label was passed to ``plot`` but never displayed.
    """
    plt.figure(figsize=(8,6))
    plt.plot(FPR, TPR,'b', linewidth=2, label=label)
    plt.plot([0,1],[0,1], "r--")
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive rate")
    if label is not None:
        plt.legend(loc="lower right")

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

## 4.1 RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Fit a class-weighted random forest and report its AUC on the training data.
# random_state makes the result reproducible across runs; n_jobs=-1 only parallelises
# the trees.
forest = RandomForestClassifier(n_estimators=300, max_depth=5, class_weight='balanced',
                                random_state=0, n_jobs=-1)
forest.fit(X_train_last, Y_train)
forest_scores_proba = forest.predict_proba(X_train_last)
forest_scores = forest_scores_proba[:,1]  # probability of the positive class
FPR_forest, TPR_forest, THRESH_forest = roc_curve(Y_train, forest_scores)
AUC_forest=roc_auc_score(Y_train,forest_scores)
draw_roc(FPR_forest, TPR_forest)
print("RF在训练集上的AUC是: {:.5f}%".format(AUC_forest*100))

再通过交叉验证来检验模型的泛化能力  
取 $K$-Fold 中的 $K=10$

In [None]:
# 10-fold cross-validated AUC of the random forest.
# Evaluated on X_train_last, the same feature set the model is fitted on above
# (X_train still contains the dropped NumberOfDependents column).
AUC_forest_cv = cross_val_score(forest, X_train_last, Y_train, cv=10, scoring='roc_auc').mean()
print("RF在训练集上cv的AUC是：{:.5f}%".format(AUC_forest_cv*100))

## 4.2梯度提升

In [None]:
# Fit a gradient-boosting classifier with default hyper-parameters and report its AUC
# on the training data. random_state makes the result reproducible across runs.
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(X_train_last, Y_train)
GBC_scores_proba =GBC.predict_proba(X_train_last)
GBC_scores = GBC_scores_proba[:,1]  # probability of the positive class
FPR_GBC, TPR_GBC, THRESH_GBC = roc_curve(Y_train, GBC_scores)
AUC_GBC=roc_auc_score(Y_train,GBC_scores)
draw_roc(FPR_GBC, TPR_GBC)
print("GBC在训练集上的AUC是: {:.5f}%".format(AUC_GBC*100))

同样通过交叉验证来检验模型的泛化能力  
取 $K$-Fold 中的 $K=10$

In [None]:
# 10-fold cross-validated AUC of the gradient-boosting classifier.
# Evaluated on X_train_last, the same feature set the model is fitted on above
# (X_train still contains the dropped NumberOfDependents column).
AUC_GBC_cv = cross_val_score(GBC, X_train_last, Y_train, cv=10, scoring='roc_auc').mean()
print("GBC在训练集上cv的AUC是：{:.5f}%".format(AUC_GBC_cv*100))

发现在两种方法中，**梯度提升**比随机森林的效果要好  
下面对梯度提升中的参数进行调参

# **五.参数调优**

## 5.1  **n_estimators**调优  

In [None]:
# Tune n_estimators.
# Training AUC can only rise as trees are added, so on its own it cannot reveal
# over-fitting; a stratified hold-out split is therefore scored as well.
# A single model with the largest candidate size is fitted, and staged_predict_proba
# yields the predictions after every boosting stage, so no refit per candidate is needed.
estimators = np.linspace(100, 300, 21).astype(int)
X_fit, X_valid, Y_fit, Y_valid = train_test_split(
    X_train_last, Y_train, test_size=0.2, stratify=Y_train, random_state=0)

GBC = GradientBoostingClassifier(n_estimators=int(estimators.max()), learning_rate=0.1,
                                 max_depth=4, random_state=0)
GBC.fit(X_fit, Y_fit)

candidate_sizes = set(estimators.tolist())
AUCs = []        # AUC (%) on the fitting split, one entry per candidate
AUCs_valid = []  # AUC (%) on the hold-out split, one entry per candidate
nums = []
staged = zip(GBC.staged_predict_proba(X_fit), GBC.staged_predict_proba(X_valid))
for stage, (proba_fit, proba_valid) in enumerate(staged, start=1):
    # Stage k corresponds to a model made of the first k trees.
    if stage not in candidate_sizes:
        continue
    AUCs.append(roc_auc_score(Y_fit, proba_fit[:, 1]) * 100)
    AUCs_valid.append(roc_auc_score(Y_valid, proba_valid[:, 1]) * 100)
    nums.append(stage)

plt.plot(nums, AUCs, label='train')
plt.plot(nums, AUCs_valid, label='validation')
plt.xlabel('n_estimators')
plt.ylabel('AUC (%)')
plt.legend();

可以发现增长越来越缓慢，为了防止过拟合，我们就选取n=250

 ## 5.2 max_depth调优

In [None]:
# Tune max_depth.
# Training AUC keeps rising with deeper trees, so a stratified hold-out split is scored
# as well; the depth at which the validation curve stops improving is the one to pick.
depths = np.linspace(1,12,12).astype(int)
X_fit, X_valid, Y_fit, Y_valid = train_test_split(
    X_train_last, Y_train, test_size=0.2, stratify=Y_train, random_state=0)

AUCs = []        # AUC (%) on the fitting split, one entry per depth
AUCs_valid = []  # AUC (%) on the hold-out split, one entry per depth
maxdepths = []
for depth in depths:
    GBC = GradientBoostingClassifier(n_estimators=250, learning_rate=0.1, max_depth=depth,
                                     random_state=0)
    GBC.fit(X_fit, Y_fit)
    AUCs.append(roc_auc_score(Y_fit, GBC.predict_proba(X_fit)[:, 1]) * 100)
    AUCs_valid.append(roc_auc_score(Y_valid, GBC.predict_proba(X_valid)[:, 1]) * 100)
    maxdepths.append(depth)

plt.plot(maxdepths, AUCs, label='train')
plt.plot(maxdepths, AUCs_valid, label='validation')
plt.xlabel('max_depth')
plt.ylabel('AUC (%)')
plt.legend();

发现提升还是蛮明显的，但后面显然有一些过拟合了....所以取max_depth=5，看一下AUC

In [None]:
# Refit on the full training data with the chosen n_estimators / max_depth and report
# the training AUC. random_state makes the result reproducible across runs.
GBC = GradientBoostingClassifier(n_estimators=250, learning_rate=0.1 ,max_depth=5, random_state=0)
GBC.fit(X_train_last, Y_train)
GBC_scores_proba =GBC.predict_proba(X_train_last)
GBC_scores = GBC_scores_proba[:,1]  # probability of the positive class
FPR_GBC, TPR_GBC, THRESH_GBC = roc_curve(Y_train, GBC_scores)
AUC_GBC=roc_auc_score(Y_train,GBC_scores)
draw_roc(FPR_GBC, TPR_GBC)
print("GBC在训练集上的AUC是: {:.5f}%".format(AUC_GBC*100))

还是有一些过拟合，降低一下learning_rate

In [None]:
# Final model: same size and depth with a lower learning rate, fitted on the full
# training data. This is the model used for the submission below, so random_state is
# fixed to make the submitted predictions reproducible.
GBC = GradientBoostingClassifier(n_estimators=250, learning_rate=0.05 ,max_depth = 5, random_state=0)
GBC.fit(X_train_last, Y_train)
GBC_scores_proba =GBC.predict_proba(X_train_last)
GBC_scores = GBC_scores_proba[:,1]  # probability of the positive class
FPR_GBC, TPR_GBC, THRESH_GBC = roc_curve(Y_train, GBC_scores)
AUC_GBC=roc_auc_score(Y_train,GBC_scores)
draw_roc(FPR_GBC, TPR_GBC)
print("GBC在训练集上的AUC是: {:.5f}%".format(AUC_GBC*100))

感觉还可以了 先交一下试试

# 六.提交结果

In [None]:
# Predict class probabilities for the test set and keep the positive-class column.
submission_proba = GBC.predict_proba(X_test_last)
submission_scores = submission_proba[:, 1]
# Number of predictions (one per test row)
submission_scores.shape

In [None]:
# Build the submission file: one row per prediction, ids numbered from 1.
# The id range is derived from the number of predictions instead of a hard-coded
# 101504, so the DataFrame construction cannot fail on a length mismatch.
ids = np.arange(1, len(submission_scores) + 1)
submission = pd.DataFrame( {'Id': ids, 'Probability': submission_scores})
submission.to_csv('submission.csv', index=False)