In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file below the Kaggle input directory, one full path per line.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. 定义问题

**比赛描述：**
银行在市场经济中起着至关重要的作用。他们决定谁能获得资金，在什么条件下获得资金，并决定是否做出投资决定。为了让市场和社会正常运转，个人和企业需要获得信贷。
信用评分算法对违约概率进行猜测，是银行用来决定是否发放贷款的方法。这项比赛要求参赛者通过预测某人在未来两年内遭遇财务困境的可能性，来提高信用评分的技术水平。
这个竞赛的目标是建立一个模型，借款者可以利用这个模型来帮助做出最佳的财务决策。

**比赛目标：**
训练集包含一些借款人的历史数据并给出了是否有超过90天未还款的不良行为（目标值）标记，训练一个模型判断测试集中的人发生超过90天未还款的不良行为的可能性。


## **导包**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

# 2.获取数据


In [None]:
# Load the labelled training set and the unlabelled test set from the competition input folder.
data_train = pd.read_csv("../input/GiveMeSomeCredit/cs-training.csv")
data_test = pd.read_csv("../input/GiveMeSomeCredit/cs-test.csv")

# 3.数据分析及预处理

## 3.1数据基本情况

查看训练集的基本情况，训练集共有150000条数据：

In [None]:
# Column names, non-null counts and dtypes of the raw training frame.
data_train.info()

先做去重处理，再查看数据，还是150000条。注意此时数据仍包含序号列（Unnamed: 0），去重是按整行比较的，因此这一结果只能说明不存在连序号也相同的重复行。

In [None]:
# Drop rows that are identical in every column, then re-check the row count.
# NOTE(review): the serial-number column ("Unnamed: 0", renamed to ID further down) is
# still part of each row here, so rows that differ only by that number are not treated
# as duplicates — confirm whether de-duplication should ignore that column.
data_train = data_train.drop_duplicates()
data_train.info()

## 3.2基本数据分析

### 3.2.1标签分布情况

训练集的150000条借款人的历史数据中，未违约样本139974条，占样本总量的93.316%，违约样本10026条，占样本总量的6.684%，贷款违约率为6.684%。可以看出该数据集是一个高度不平衡的数据，这在金融风控中是常见的，因为会存在严重违约的用户毕竟是少数。后面使用逻辑回归和随机森林时要注意设置**class_weight**参数来解决数据不平衡的问题.

In [None]:
# Number of rows per target class (SeriousDlqin2yrs value).
data_train['SeriousDlqin2yrs'].value_counts()

In [None]:
# Bar chart of the target class counts.
sns.countplot(x="SeriousDlqin2yrs",data=data_train)

### 3.2.2数据集的特征

提供的数据字典如下

| 字段名 | 描述                                 | 类型                                            |
|----------|--------------------------------------------|------------------------------------------------|
| SeriousDlqin2yrs（目标值）|是否有超过90天或更长时间逾期未还的不良行为|Y/N（1/0）
| RevolvingUtilizationOfUnsecuredLines | 可用额度比值                                   | percentage                                |
| age   | 年龄                               | integer                      |
| NumberOfTime30-59DaysPastDueNotWorse | 逾期30-59天笔数|integer
| DebtRatio      | 负债率                                |  percentage                                              |
| MonthlyIncome      | 月收入                       |       real                                         |
| NumberOfOpenCreditLinesAndLoans    | 信贷数量                      |integer
| NumberOfTimes90DaysLate    | 逾期90天笔数 |                          integer                      |
| NumberRealEstateLoansOrLines   | 固定资产贷款数量                              |             integer                                   |
| NumberOfTime60-89DaysPastDueNotWorse     | 逾期60-89天笔数                             |      integer                                          |
| NumberOfDependents | 家属数量                        | integer |

结合数据字典并使用data_train.head()查看前5行数据，data_train.tail()查看后5行数据，观察数据特征，可以看到全部特征都是数值型的。第一列数据名为“Unnamed: 0”，实则为数据的序号，将列名改为ID。

In [None]:
# First five rows of the training frame.
data_train.head()

In [None]:
# Last five rows of the training frame.
data_train.tail()

In [None]:
# The unnamed first column is exposed under the name "ID" in both frames.
id_column_names = {"Unnamed: 0": "ID"}
data_train = data_train.rename(columns=id_column_names)
data_test = data_test.rename(columns=id_column_names)
data_train.head()

**各个特征的数据类型**
* 四个特征是浮点型
* 八个特征是整型

In [None]:
# Print the dtype / non-null summary of the training frame and then of the test frame,
# separated by a ruler line.
data_train.info()
print('_'*40)
data_test.info()

### 3.2.3缺失值

* 训练集：MonthlyIncome有29731个缺失值，NumberOfDependents有3924个缺失值
* 测试集：MonthlyIncome有20103个缺失值，NumberOfDependents有2626个缺失值

In [None]:
# Missing-value count per column in the training frame.
data_train.isnull().sum()

In [None]:
# Missing-value count per column in the test frame.
data_test.isnull().sum()

### 3.2.4特征的分布

由于全部都是数值型特征，使用describe（）查看特征分布

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of every numeric column.
data_train.describe()

观察到：
* “NumberOfDependents”的75%分位点是1，也就是说至少四分之三的人家属数不超过1，选择用众数填充家属数字段的缺失值。
* 对于“MonthlyIncome”月收入的缺失值，由于月收入与银行信用关系很大，所以使用随机森林回归算法进行缺失值填补。

### 3.2.5众数填补家属数

In [None]:
# Fill missing dependents counts with the most frequent value (the mode).
# The result is assigned back to the column instead of calling fillna(inplace=True) on
# the selected column: that form is chained assignment, which raises a warning on
# recent pandas and leaves the frame unchanged once Copy-on-Write is enabled.
data_train["NumberOfDependents"] = data_train["NumberOfDependents"].fillna(
    data_train["NumberOfDependents"].mode()[0]
)
data_train.info()

In [None]:
# Fill missing dependents counts in the test frame with that frame's own mode.
# The result is assigned back to the column instead of calling fillna(inplace=True) on
# the selected column: that form is chained assignment, which raises a warning on
# recent pandas and leaves the frame unchanged once Copy-on-Write is enabled.
data_test["NumberOfDependents"] = data_test["NumberOfDependents"].fillna(
    data_test["NumberOfDependents"].mode()[0]
)
data_test.info()

### 3.2.6使用随机森林回归填补月收入

月收入与信用情况有很大关系，缺失量很大，不能直接删除，使用随机森林进行回归预测填充。

先对训练集的月收入进行预测填充：

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Columns at positions 1..10 of the training frame are used for the imputation model;
# MonthlyIncome is the regression target, every other selected column is a predictor.
# NOTE(review): position 1 is expected to be the label SeriousDlqin2yrs, so the label
# is used as a predictor here while the test-set imputer cannot use it — confirm this
# asymmetry is intended.
data_randomforest = data_train.iloc[:, list(range(1, 11))]

# Locate MonthlyIncome by name rather than by a hard-coded position.
income_pos = data_randomforest.columns.get_loc("MonthlyIncome")
feature_pos = [i for i in range(data_randomforest.shape[1]) if i != income_pos]

known = data_randomforest[data_randomforest.MonthlyIncome.notnull()].values
unknown = data_randomforest[data_randomforest.MonthlyIncome.isnull()].values

X_know = known[:, feature_pos]
# 1-D target: scikit-learn expects shape (n_samples,), a column vector triggers a
# DataConversionWarning.
Y_know = known[:, income_pos]
X_forpredict = unknown[:, feature_pos]

# Random-forest regression fitted on the rows whose income is known.
rfr = RandomForestRegressor(random_state=0,n_estimators=200,max_depth=3,n_jobs=-1)
rfr.fit(X_know,Y_know)

# Predict the missing incomes (rounded to whole units) and write them back.
Y_predicted = rfr.predict(X_forpredict).round(0)
data_train.loc[(data_train.MonthlyIncome.isnull()), 'MonthlyIncome'] = Y_predicted


data_train.info()

对测试集的月收入填充，测试集中SeriousDlqin2yrs是空缺值，随机森林中的变量要少一个

In [None]:
# Columns at positions 2..10 of the test frame are used for the imputation model;
# MonthlyIncome is the regression target, every other selected column is a predictor.
data_randomforest = data_test.iloc[:, list(range(2, 11))]

# Locate MonthlyIncome by name rather than by a hard-coded position.
income_pos = data_randomforest.columns.get_loc("MonthlyIncome")
feature_pos = [i for i in range(data_randomforest.shape[1]) if i != income_pos]

known = data_randomforest[data_randomforest.MonthlyIncome.notnull()].values
unknown = data_randomforest[data_randomforest.MonthlyIncome.isnull()].values

X_know = known[:, feature_pos]
# 1-D target: scikit-learn expects shape (n_samples,), a column vector triggers a
# DataConversionWarning.
Y_know = known[:, income_pos]
X_forpredict = unknown[:, feature_pos]

# Random-forest regression fitted on the test rows whose income is known.
rfr = RandomForestRegressor(random_state=0,n_estimators=200,max_depth=3,n_jobs=-1)
rfr.fit(X_know,Y_know)

# Predict the missing incomes (rounded to whole units) and write them back.
Y_predicted = rfr.predict(X_forpredict).round(0)
data_test.loc[(data_test.MonthlyIncome.isnull()), 'MonthlyIncome'] = Y_predicted

data_test.info()

### 3.2.7年龄分布情况

在特征分布里看到年龄的最小值为0，但根据常识，小于18岁是不能在银行办理贷款业务的。
画出箱图和条形图可以看到年龄分布情况如下：

In [None]:
# Box plot (left) and histogram (right) of the age column, placed in the top row of a
# 2x2 grid.
age_fig = plt.figure(figsize=(10, 8))

box_ax = age_fig.add_subplot(2, 2, 1)
sns.boxplot(data=data_train['age'], ax=box_ax)
box_ax.set_ylabel('age')

hist_ax = age_fig.add_subplot(2, 2, 2)
sns.histplot(data_train['age'], ax=hist_ax)
hist_ax.set_xlabel('age')

根据箱图显示，年龄有0值，看一下年龄小于18岁的数据

In [None]:
# Rows whose age is below 18.
data_train[data_train['age']<18]

只有一行数据年龄小于18，为0岁，显然是异常值，将这行数据删除。

In [None]:
# Remove the rows with age 0, then show whatever rows below 18 are left.
data_train = data_train.loc[data_train['age'] != 0]
data_train.loc[data_train['age'] < 18]

### 3.2.8离散型数据

**家属数量NumberOfDependents、信贷数量NumberOfOpenCreditLinesAndLoans、固定资产贷款数量NumberRealEstateLoansOrLines**的分布情况如下：

In [None]:
# Density histograms (with KDE overlay) of the three count-valued features, one per panel.
# Each plot is given a label so that legend() has an entry to show; calling legend()
# on an axes without labelled artists only produces a warning and an empty legend.
discrete_columns = [
    'NumberOfDependents',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberRealEstateLoansOrLines',
]
_, axes = plt.subplots(1, 3, figsize=(12,4))
for ax, column in zip(axes, discrete_columns):
    sns.histplot(data_train[column], kde=True, stat='density', discrete=True,
                 color='steelblue', alpha=0.6, label=column, ax=ax)
    ax.legend()

家属数量大于10的属于异常值，有2条数据，做删除处理。

In [None]:
# Number of rows with more than 10 dependents.
data_train[data_train["NumberOfDependents"]>10].shape[0]

In [None]:
# Keep rows with at most 10 dependents, then confirm that none above 10 remain.
data_train = data_train.loc[data_train["NumberOfDependents"] <= 10]
len(data_train.loc[data_train["NumberOfDependents"] > 10])

信贷数量大于30的属于异常值，有354条数据，做删除处理。

In [None]:
# Number of rows with more than 30 open credit lines and loans.
data_train[data_train["NumberOfOpenCreditLinesAndLoans"]>30].shape[0]

In [None]:
# Keep rows with at most 30 open credit lines and loans, then confirm none above 30 remain.
data_train = data_train.loc[data_train["NumberOfOpenCreditLinesAndLoans"] <= 30]
len(data_train.loc[data_train["NumberOfOpenCreditLinesAndLoans"] > 30])

固定资产贷款数量大于10的属于异常值，有76条数据，做删除处理。

In [None]:
# Number of rows with more than 10 real-estate loans or lines.
data_train[data_train["NumberRealEstateLoansOrLines"]>10].shape[0]

In [None]:
# Keep rows with at most 10 real-estate loans or lines, then confirm none above 10 remain.
data_train = data_train.loc[data_train["NumberRealEstateLoansOrLines"] <= 10]
len(data_train.loc[data_train["NumberRealEstateLoansOrLines"] > 10])

### 3.2.9连续型数据

**负债率DebtRatio、可用额度比值RevolvingUtilizationOfUnsecuredLines、月收入MonthlyIncome**取对数画出分布情况如下：

In [None]:
# Kernel-density plots of log1p-transformed continuous features, one per panel.
# `fill=True` replaces the deprecated `shade=True`, and each curve carries a label so
# that legend() has an entry to display.
log_columns = ['DebtRatio', 'RevolvingUtilizationOfUnsecuredLines', 'MonthlyIncome']
_, axes = plt.subplots(1, 3, figsize=(12,4))
for ax, column in zip(axes, log_columns):
    sns.kdeplot(data_train[column].apply(np.log1p), color='g', alpha=0.5, fill=True,
                edgecolor='k', label=column, ax=ax)
    ax.legend()

可以看到负债率数据对数集中在9之前，查看对数大于9的数据量如下，共有307条数据，将这些离群点删除处理

In [None]:
# Number of rows whose DebtRatio exceeds e**9.
data_train[data_train["DebtRatio"]>np.e**9].shape[0]

In [None]:
# Keep rows whose DebtRatio is at most e**9, then confirm none above that remain.
data_train = data_train.loc[data_train["DebtRatio"] <= np.e**9]
len(data_train.loc[data_train["DebtRatio"] > np.e**9])

可以看到可用额度比值数据对数集中在1.5之前，查看对数大于1.5的数据量如下，共有599条数据，将这些离群点删除处理

In [None]:
# Number of rows whose revolving utilisation exceeds e**1.5 — the same threshold the
# following filter applies (the previous `> 1.5` counted a different set of rows than
# the one actually removed).
data_train[data_train["RevolvingUtilizationOfUnsecuredLines"]>np.e**1.5].shape[0]

In [None]:
# Keep rows whose revolving utilisation is at most e**1.5, then confirm none above remain.
data_train = data_train.loc[data_train["RevolvingUtilizationOfUnsecuredLines"] <= np.e**1.5]
len(data_train.loc[data_train["RevolvingUtilizationOfUnsecuredLines"] > np.e**1.5])

可以看到月收入数据对数集中在12之前，查看对数大于12的数据量如下，共有32条数据，将这些离群点删除处理

In [None]:
# Number of rows whose MonthlyIncome exceeds e**12.
data_train[data_train["MonthlyIncome"]>np.e**12].shape[0]

In [None]:
# Keep rows whose MonthlyIncome is at most e**12, then confirm none above that remain.
# `<=` makes the kept set the exact complement of the `>` check below and matches the
# other outlier filters in this notebook.
data_train = data_train[data_train["MonthlyIncome"]<=np.e**12]
data_train[data_train["MonthlyIncome"]>np.e**12].shape[0]

### 3.2.10三种逾期情况分析

从特征分布中可以看到逾期30-59笔数、逾期60-89笔数、逾期90天笔数的最大值都是98，常识来说显然这是异常情况，没有人能够有这么多逾期贷款。画出三者的箱图，可以看到都存在90笔以上的逾期数据。

In [None]:
# Side-by-side box plots of the three past-due counters (30-59, 60-89 and 90+ days).
x1, x2, x3 = (
    data_train[column]
    for column in (
        'NumberOfTime30-59DaysPastDueNotWorse',
        'NumberOfTime60-89DaysPastDueNotWorse',
        'NumberOfTimes90DaysLate',
    )
)
fig = plt.figure(figsize=(20, 15))
ax = fig.add_subplot(2, 2, 1)
ax.boxplot([x1, x2, x3])
ax.set_xticklabels(["30-59d num","60-89d num","90+d num"], fontsize=20)

统计三者90以上的数据：

In [None]:
# Shape of the subset with more than 90 occurrences of 30-59 days past due.
data_train[data_train['NumberOfTime30-59DaysPastDueNotWorse']>90].shape

In [None]:
# Shape of the subset with more than 90 occurrences of 60-89 days past due.
data_train[data_train['NumberOfTime60-89DaysPastDueNotWorse']>90].shape

In [None]:
# Shape of the subset with more than 90 occurrences of 90+ days late.
data_train[data_train['NumberOfTimes90DaysLate']>90].shape

这三种情况的数据维度相同，都是（269，12），猜测这三个异常指标出现在相同的行

In [None]:
# Shape of the subset where all three past-due counters exceed 90 at the same time.
data_train[(data_train['NumberOfTime30-59DaysPastDueNotWorse']>90) & (data_train['NumberOfTime60-89DaysPastDueNotWorse']>90) & (data_train['NumberOfTimes90DaysLate']>90)].shape

可以看到数据的维度仍为（269，12），将这些异常行删掉。

In [None]:
# Keep rows with at most 90 occurrences of 30-59 days past due, then confirm none above remain.
data_train = data_train.loc[data_train["NumberOfTime30-59DaysPastDueNotWorse"] <= 90]
len(data_train.loc[data_train["NumberOfTime30-59DaysPastDueNotWorse"] > 90])

## 3.3 相关性分析 

查看各个特征与目标值SeriousDlqin2yrs的相关性

In [None]:
# Pairwise correlation matrix of all columns, then the correlations with the target
# printed from highest to lowest.
corr_matrix = data_train.corr()
print(corr_matrix["SeriousDlqin2yrs"].sort_values(ascending=False))

与目标量SeriousDlqin2yrs相关性排名前四的特征为：
* NumberOfTimes90DaysLate 
* RevolvingUtilizationOfUnsecuredLines
* NumberOfTime30-59DaysPastDueNotWorse
* NumberOfTime60-89DaysPastDueNotWorse

画出相关性热力图

In [None]:
# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(12,12))
# The mask is True on and below the diagonal, so only the upper triangle is drawn.
mask = np.tril(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix,xticklabels=corr_matrix.columns,yticklabels=corr_matrix.columns,cmap='Blues',annot=True,mask = mask)

从结果可以看到，在SeriousDlqin2yrs行，RevolvingUtilizationOfUnsecuredLines、NumberOfTime30-59DaysPastDueNotWorse、NumberOfTime60-89DaysPastDueNotWorse、NumberOfTimes90DaysLate的颜色较深，表示这四个特征与目标值相关性很大。有过违约记录的人很可能会发生逾期90天以上不还款的违约行为，这与我们的常识也是一致的。信用卡可用额度比值越大，也就是可用信用额度越少，越容易没有钱偿还贷款，越容易发生违约行为。

# 4.变量分箱

分箱离散化后,可以降低异常值的影响。分箱有等宽分箱、等频分箱、最优分箱。

计算Woe(Weight of Evidence，常用在风险评估、授信评分卡等领域)和IV(Information value，可通过woe加权求和得到，衡量自变量对应变量的预测能力)

In [None]:
## Split into independent variables (features) and the dependent variable (target)
# Features are every column from position 2 onward; the training target is the column
# at position 1.
# NOTE(review): this presumes the layout ID, SeriousDlqin2yrs, features... in both
# frames — confirm against the info() output.
X_train = data_train.iloc[:,2:]
Y_train = data_train.iloc[:,1]
X_test = data_test.iloc[:,2:]

## 4.1 最优分箱

连续变量使用最优分箱

In [None]:
import scipy.stats as stats

def monoto_bin(Y, X, n):
    """Quantile-bin a numeric feature so that the target rate is monotonic across bins.

    Starting with ``n`` equal-frequency buckets, the bucket count is lowered by one
    until the Spearman correlation between the per-bucket mean of ``X`` and the
    per-bucket mean of ``Y`` reaches +/-1 (or is undefined). WoE and IV are then
    computed on the final bucketing. The per-bucket table is also printed.

    Parameters
    ----------
    Y : pandas.Series
        Binary 0/1 target; its ``name`` labels the per-bucket sum column.
    X : pandas.Series
        Numeric feature; its ``name`` is used for the ``min_``/``max_`` columns.
    n : int
        Initial number of quantile buckets.

    Returns
    -------
    d4 : pandas.DataFrame
        One row per bucket, ordered by bucket minimum, with the columns
        ``min_<X>``, ``max_<X>``, ``<Y>``, ``total``, ``goodattr``, ``badattr``, ``woe``.
    iv : float
        Information value summed over the buckets.
    cut : list
        Bin edges: ``-inf``, the inner quantiles of ``X`` rounded to 4 decimals, ``inf``.
    woe : list
        Per-bucket WoE rounded to 3 decimals, in the row order of ``d4``.

    Notes
    -----
    ``goodattr`` is the share of ``Y == 1`` rows falling into a bucket and ``badattr``
    the share of ``Y == 0`` rows; the column names are kept for compatibility.
    """
    total_good = Y.sum()
    total_bad = Y.count() - total_good

    # Treat |r| within a tiny tolerance of 1 as monotonic: an exact `< 1` comparison can
    # miss a perfectly monotonic bucketing when the correlation carries rounding error.
    tol = 1e-9
    r = 0
    while np.abs(r) < 1 - tol:
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n)})
        d2 = d1.groupby('Bucket', as_index=True, observed=False)
        bucket_means = d2.mean()  # aggregated once and reused for both series
        r, p = stats.spearmanr(bucket_means.X, bucket_means.Y)
        # After the loop n is one less than the bucket count actually used.
        n = n - 1

    # Per-bucket summary of the final bucketing.
    d3 = pd.DataFrame({'min_' + X.name: d2['X'].min()})
    d3['max_' + X.name] = d2['X'].max()
    d3[Y.name] = d2['Y'].sum()
    d3['total'] = d2['Y'].count()

    # WoE
    d3['goodattr'] = d3[Y.name] / total_good
    d3['badattr'] = (d3['total'] - d3[Y.name]) / total_bad
    d3['woe'] = np.log(d3['goodattr'] / d3['badattr'])

    # IV
    iv = ((d3['goodattr'] - d3['badattr']) * d3['woe']).sum()
    d4 = (d3.sort_values(by='min_' + X.name)).reset_index(drop=True)
    print ("=" * 80)
    print (d4)

    # n + 1 buckets were used, so the inner edges are the i/(n+1) quantiles.
    cut = [float('-inf')]
    cut.extend(round(X.quantile(i / (n + 1)), 4) for i in range(1, n + 1))
    cut.append(float('inf'))
    woe = list(d4['woe'].round(3))
    return d4,iv,cut,woe

In [None]:
# Monotonic quantile binning of the four continuous features, each starting from 10
# buckets; every call yields the bucket table, the IV, the bin edges and the WoE list.
x1_d,x1_iv,x1_cut,x1_woe = monoto_bin(Y_train,X_train.RevolvingUtilizationOfUnsecuredLines,10)
x2_d,x2_iv,x2_cut,x2_woe = monoto_bin(Y_train,X_train.age,10)
x4_d,x4_iv,x4_cut,x4_woe = monoto_bin(Y_train,X_train.DebtRatio,10)
x5_d,x5_iv,x5_cut,x5_woe = monoto_bin(Y_train,X_train.MonthlyIncome,10)

## 4.2 离散变量分箱

对离散型变量进行手动设置边界进行分箱

In [None]:
## Binning helper for manually chosen bin edges
def binning(X,Y,CUT):
    """Bin ``X`` on the edges ``CUT`` and compute per-bucket WoE plus the overall IV.

    Parameters
    ----------
    X : pandas.Series
        Feature values to be bucketed with ``pd.cut``.
    Y : pandas.Series
        Binary 0/1 target aligned with ``X``.
    CUT : list
        Bin edges handed to ``pd.cut``.

    Returns
    -------
    d4 : pandas.DataFrame
        One row per bucket sorted by bucket minimum, with the columns ``min``, ``max``,
        ``sum``, ``total``, ``rate``, ``woe``, ``goodattribute``, ``badattribute``.
    iv : float
        Information value summed over the buckets.
    woe : list
        Per-bucket WoE rounded to 3 decimals, in the row order of ``d4``.

    The bucket table is also printed.
    """
    frame = pd.DataFrame({'X': X, 'Y': Y, 'Bucket': pd.cut(X, CUT)})
    grouped = frame.groupby('Bucket', as_index = True)
    x_by_bucket = grouped['X']
    y_by_bucket = grouped['Y']

    d3 = pd.DataFrame({
        "min": x_by_bucket.min(),
        "max": x_by_bucket.max(),
        "sum": y_by_bucket.sum(),
        "total": y_by_bucket.count(),
        "rate": y_by_bucket.mean(),
    })

    # Overall counts of Y == 1 ("good" in this code's naming) and Y == 0 rows.
    good = Y.sum()
    bad = Y.count() - good

    # WoE: log of the bucket odds relative to the overall odds.
    bucket_odds = d3['rate'] / (1 - d3['rate'])
    overall_odds = good / bad
    d3['woe'] = np.log(bucket_odds / overall_odds)

    # IV: WoE weighted by the difference of the two class shares per bucket.
    d3['goodattribute'] = d3['sum'] / good
    d3['badattribute'] = (d3['total'] - d3['sum']) / bad
    share_gap = d3['goodattribute'] - d3['badattribute']
    iv = (share_gap * d3['woe']).sum()

    d4 = d3.sort_values(by='min')
    print("=" * 60)
    print(d4)
    woe = list(d4['woe'].round(3))
    return d4, iv,woe

In [None]:
# Hand-picked bin edges for the count-valued features; the x<N> numbering follows the
# feature order used for the IV chart.
cutx3 = [-np.inf, 0, 1, 2, 4, 8, np.inf]
cutx6 = [-np.inf, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, np.inf]
cutx7 = [-np.inf, 0, 1, 2, 4, 8, np.inf]
cutx8 = [-np.inf, 0, 1, 2, 3, 4, 5, np.inf]
cutx9 = [-np.inf, 0, 1, 2, 4, 8, np.inf]
cutx10 = [-np.inf, 0, 1, 2, 3, 4, 5, 6, np.inf]

## Binning
x3_d,x3_iv,x3_woe = binning(X_train["NumberOfTime30-59DaysPastDueNotWorse"],Y_train,cutx3)
x6_d,x6_iv,x6_woe = binning(X_train["NumberOfOpenCreditLinesAndLoans"],Y_train,cutx6)
x7_d,x7_iv,x7_woe = binning(X_train["NumberOfTimes90DaysLate"],Y_train,cutx7)
# x8 is NumberRealEstateLoansOrLines; it previously re-binned
# NumberOfOpenCreditLinesAndLoans, so the IV shown under the real-estate label was wrong.
x8_d,x8_iv,x8_woe = binning(X_train["NumberRealEstateLoansOrLines"],Y_train,cutx8)
x9_d,x9_iv,x9_woe = binning(X_train["NumberOfTime60-89DaysPastDueNotWorse"],Y_train,cutx9)
x10_d,x10_iv,x10_woe = binning(X_train["NumberOfDependents"],Y_train,cutx10)

## 4.3 各个变量IV值

In [None]:
# Collect the IV of every feature (x1..x10, in feature order) and draw them as a
# horizontal bar chart labelled with the feature names.
informationValue = [
    x1_iv, x2_iv, x3_iv, x4_iv, x5_iv,
    x6_iv, x7_iv, x8_iv, x9_iv, x10_iv,
]
plt.figure(figsize=(20, 10))
index = [
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]
index_num = range(len(index))
ax = plt.barh(index_num, informationValue, tick_label=index)
plt.xticks(rotation=90)
plt.show()

IV值可以用来衡量自变量对于因变量的响应能力，具体量化指标如下：
- $\leq$0.02:useless for prediction
- 0.02 to 0.1:Weak predictor
- 0.1 to 0.3: Medium predictor
- 0.3 to 0.5: Strong predictor 
- $\geq$0.5 Suspicious or too good to be true  
舍弃掉衡量能力较差的特征



In [None]:
# Final feature sets: NumberOfDependents is removed from both the train and test features.
X_train_last = X_train.drop(columns=["NumberOfDependents"])
X_test_last = X_test.drop(columns=["NumberOfDependents"])
X_test_last.head()

# 5.模型预测

ROC曲线绘制函数

In [None]:
# ROC curve plotting helper
def draw_roc(FPR, TPR, label=None):
    """Draw a ROC curve on a new 8x6 figure, with the chance diagonal for reference.

    Parameters
    ----------
    FPR : array-like
        False positive rates (x axis).
    TPR : array-like
        True positive rates (y axis).
    label : str, optional
        Legend text for the curve. A legend is drawn only when a label is given;
        previously the label was attached to the line but never displayed.
    """
    plt.figure(figsize=(8,6))
    plt.plot(FPR, TPR,'b', linewidth=2, label=label)
    # Dashed red diagonal: the curve of a classifier that guesses at random.
    plt.plot([0,1],[0,1], "r--")
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive rate")
    if label is not None:
        plt.legend(loc="lower right")

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

## 5.1逻辑回归

In [None]:
from sklearn.linear_model import LogisticRegressionCV
# Logistic regression with built-in cross-validated regularisation and balanced class
# weights. The model is scored on the same rows it was fitted on, so the AUC printed
# here is an in-sample figure.
logit = LogisticRegressionCV(class_weight='balanced')
logit.fit(X_train_last, Y_train)
logit_scores_proba = logit.predict_proba(X_train_last)
# Column 1 holds the predicted probability of the positive class.
logit_scores = logit_scores_proba[:,1]
FPR_logit, TPR_logit, THRESH_logit = roc_curve(Y_train, logit_scores)
AUC_logit=roc_auc_score(Y_train,logit_scores)
draw_roc(FPR_logit, TPR_logit)
print("logit-AUC: {:.5f}%".format(AUC_logit*100))

## 5.2 随机森林

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with balanced class weights. random_state is fixed (as for the
# imputation regressors earlier in the notebook) so that re-running the notebook
# reproduces the same forest and the same AUC.
# The model is scored on the same rows it was fitted on, so this AUC is in-sample.
forest = RandomForestClassifier(n_estimators=300, max_depth=5, class_weight='balanced', random_state=0)
forest.fit(X_train_last, Y_train)
forest_scores_proba = forest.predict_proba(X_train_last)
# Column 1 holds the predicted probability of the positive class.
forest_scores = forest_scores_proba[:,1]
FPR_forest, TPR_forest, THRESH_forest = roc_curve(Y_train, forest_scores)
AUC_forest=roc_auc_score(Y_train,forest_scores)
draw_roc(FPR_forest, TPR_forest)
print("RF-AUC: {:.5f}%".format(AUC_forest*100))

## 5.3 AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# AdaBoost with default settings. The model is scored on the same rows it was fitted
# on, so the AUC printed here is an in-sample figure.
ABC = AdaBoostClassifier()
ABC.fit(X_train_last, Y_train)
ABC_scores_proba =ABC.predict_proba(X_train_last)
# Column 1 holds the predicted probability of the positive class.
ABC_scores = ABC_scores_proba[:,1]
FPR_ABC, TPR_ABC, THRESH_ABC = roc_curve(Y_train, ABC_scores)
AUC_ABC=roc_auc_score(Y_train,ABC_scores)
draw_roc(FPR_ABC, TPR_ABC)
print("ABC-AUC: {:.5f}%".format(AUC_ABC*100))

## 5.4 GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with default settings. The model is scored on the same rows it was
# fitted on, so the AUC printed here is an in-sample figure.
GBC = GradientBoostingClassifier()
GBC.fit(X_train_last, Y_train)
GBC_scores_proba =GBC.predict_proba(X_train_last)
# Column 1 holds the predicted probability of the positive class.
GBC_scores = GBC_scores_proba[:,1]
FPR_GBC, TPR_GBC, THRESH_GBC = roc_curve(Y_train, GBC_scores)
AUC_GBC=roc_auc_score(Y_train,GBC_scores)
draw_roc(FPR_GBC, TPR_GBC)
print("GBC-AUC: {:.5f}%".format(AUC_GBC*100))

梯度提升算法效果最好，选择对梯度提升算法参数进行优化。

# 6.参数优化

## 6.1 n_estimators 

In [None]:
# Training AUC as a function of the number of boosting stages (100, 110, ..., 300).
# A single model with the largest stage count is fitted and its staged predictions are
# read off at each count of interest, instead of refitting 21 separate models.
# NOTE(review): the AUC is measured on the training rows themselves — confirm that an
# in-sample score is the intended basis for choosing n_estimators.
estimators=np.linspace(100,300,21).astype(int)
wanted_stages = set(estimators.tolist())

GBC = GradientBoostingClassifier(n_estimators=int(estimators.max()), learning_rate=0.1 ,max_depth=4)
GBC.fit(X_train_last, Y_train)

AUCs=[]
nums=[]
# staged_predict_proba yields the class probabilities after stage 1, 2, ... in order.
for stage, GBC_scores_proba in enumerate(GBC.staged_predict_proba(X_train_last), start=1):
    if stage not in wanted_stages:
        continue
    GBC_scores = GBC_scores_proba[:,1]
    AUC_GBC=roc_auc_score(Y_train,GBC_scores)
    AUCs.append(AUC_GBC*100)
    nums.append(stage)
# ROC points of the full model, kept so the same names are defined as before.
FPR_GBC, TPR_GBC, THRESH_GBC = roc_curve(Y_train, GBC_scores)
plt.plot(nums,AUCs)

estimator取250

## 6.2 max_depth

In [None]:
# Training AUC as a function of tree depth (1..12) with 250 boosting stages.
depths = np.linspace(1,12,12).astype(int)
AUCs, maxdepths = [], []
for depth in depths:
    GBC = GradientBoostingClassifier(learning_rate=0.1, n_estimators=250, max_depth=depth)
    # fit() returns the estimator itself, so the prediction can be chained onto it.
    GBC_scores_proba = GBC.fit(X_train_last, Y_train).predict_proba(X_train_last)
    GBC_scores = GBC_scores_proba[:, 1]
    FPR_GBC, TPR_GBC, THRESH_GBC = roc_curve(Y_train, GBC_scores)
    AUC_GBC = roc_auc_score(Y_train, GBC_scores)
    maxdepths.append(depth)
    AUCs.append(AUC_GBC * 100)
plt.plot(maxdepths, AUCs)

maxdepths取6

通过10-折交叉验证估计GBC模型的AUC

In [None]:
# Gradient boosting with the chosen settings (250 stages, depth 6, learning rate 0.1).
# The model is scored on the same rows it was fitted on, so this AUC is in-sample.
GBC = GradientBoostingClassifier(n_estimators=250, learning_rate=0.1 ,max_depth = 6)
GBC.fit(X_train_last, Y_train)
GBC_scores_proba =GBC.predict_proba(X_train_last)
# Column 1 holds the predicted probability of the positive class.
GBC_scores = GBC_scores_proba[:,1]
FPR_GBC, TPR_GBC, THRESH_GBC = roc_curve(Y_train, GBC_scores)
AUC_GBC=roc_auc_score(Y_train,GBC_scores)
draw_roc(FPR_GBC, TPR_GBC)
print("GBC-AUC: {:.5f}%".format(AUC_GBC*100))

In [None]:
# 10-fold cross-validated AUC of the model above, computed on X_train_last — the same
# feature set the model is fitted on (X_train still contains the dropped
# NumberOfDependents column, so it would evaluate a different model).
AUC_GBC_cv = cross_val_score(GBC, X_train_last, Y_train, cv=10, scoring='roc_auc').mean()
print("GBC-10折交叉验证-AUC：{:.5f}%".format(AUC_GBC_cv*100))

过拟合，所以将学习率设为0.05

In [None]:
# Same configuration with the learning rate halved to 0.05.
# The model is scored on the same rows it was fitted on, so this AUC is in-sample.
GBC = GradientBoostingClassifier(n_estimators=250, learning_rate=0.05 ,max_depth = 6)
GBC.fit(X_train_last, Y_train)
GBC_scores_proba =GBC.predict_proba(X_train_last)
# Column 1 holds the predicted probability of the positive class.
GBC_scores = GBC_scores_proba[:,1]
FPR_GBC, TPR_GBC, THRESH_GBC = roc_curve(Y_train, GBC_scores)
AUC_GBC=roc_auc_score(Y_train,GBC_scores)
draw_roc(FPR_GBC, TPR_GBC)
print("GBC-AUC: {:.5f}%".format(AUC_GBC*100))

In [None]:
# 10-fold cross-validated AUC of the model above, computed on X_train_last — the same
# feature set the model is fitted on (X_train still contains the dropped
# NumberOfDependents column, so it would evaluate a different model).
AUC_GBC_cv = cross_val_score(GBC, X_train_last, Y_train, cv=10, scoring='roc_auc').mean()
print("GBC-10折交叉验证-AUC：{:.5f}%".format(AUC_GBC_cv*100))

# 7.提交结果

In [None]:
# Score the test set with the final model and write the submission file.
submission_proba = GBC.predict_proba(X_test_last)
# Column 1 holds the predicted probability of the positive class.
submission_scores = submission_proba[:, 1]
# Ids run 1..N, with N taken from the number of predictions instead of a hard-coded
# row count, so the two columns always have the same length.
ids = np.arange(1, len(submission_scores) + 1)
submission = pd.DataFrame( {'Id': ids, 'Probability': submission_scores})
submission.to_csv('submission_gbc.csv', index=False)