In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **一.准备工作**
## 1.1导入相关库

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

## 1.2读取数据

In [None]:
# Load the training and test CSVs of the GiveMeSomeCredit dataset from the input directory.
data_train = pd.read_csv("../input/GiveMeSomeCredit/cs-training.csv")
data_test = pd.read_csv("../input/GiveMeSomeCredit/cs-test.csv")
# Show the first 5 rows of the training set
data_train.head()

## 1.3 数据含义
#### SeriousDlqin2yrs:是否有超过90天或更严重的贷款拖欠问题
#### RevolvingUtilizationOfUnsecuredLines：除去房贷车贷的贷款金额/信用卡总贷款额度
#### age：借款人年龄
#### NumberOfTime30-59DaysPastDueNotWorse:过去两年内30-59天的贷款拖欠问题的次数
#### DebtRatio：负债率（生活花费/总收入）
#### MonthlyIncome:月收入
#### NumberOfOpenCreditLinesAndLoans：开放式信贷和贷款数量，开放式贷款（分期付款如汽车贷款或抵押贷款）和信贷（如信用卡）的数量
#### NumberOfTime60-89DaysPastDueNotWorse:过去两年内60-89天的贷款拖欠问题的次数
#### NumberOfDependents:家属人数（除去自己）

## 1.4描述性统计

In [None]:
# data_train.info()

后续创建特征时可能需要考虑int和float混杂这一问题

In [None]:
# data_train.describe()

RevolvingUtilizationOfUnsecuredLines 和 DebtRatio 最大值都远大于1，age 最小值为0，应该都有异常值

MonthlyIncome 和 NumberOfDependents 不足150000，有缺失值

大体上感觉许多变量都是长尾分布


具体观察一下各变量分布

In [None]:
# plt.figure(figsize=(15,20))
# for i in range(2,12):
#     curax = plt.subplot(6,2,i-1)
#     sns.distplot(data_train.iloc[:,i],ax=curax)

除了age基本都是长尾分布，但异常值问题严重

观察要预测的变量SeriousDlqin2yrs分布

In [None]:
# # 观察预测变量SeriousDlqin2yrs分布
# sns.countplot(x="SeriousDlqin2yrs",data=data_train)

发现数据不平衡，后续建模预测需要注意

# **二.数据清理**

列名重命名

In [None]:
# Rename the verbose original columns to short aliases (same mapping for train and test).
colnames = dict([
    ('Unnamed: 0', 'ID'),
    ('SeriousDlqin2yrs', 'Isdlq'),
    ('RevolvingUtilizationOfUnsecuredLines', 'Revol'),
    ('NumberOfTime30-59DaysPastDueNotWorse', 'Num30-59late'),
    ('NumberOfOpenCreditLinesAndLoans', 'Numopen'),
    ('NumberOfTimes90DaysLate', 'Num90late'),
    ('NumberRealEstateLoansOrLines', 'Numestate'),
    ('NumberOfTime60-89DaysPastDueNotWorse', 'Num60-89late'),
    ('NumberOfDependents', 'Numdepend'),
])
# rename() returns a new frame; columns missing from the mapping keep their names.
data_train = data_train.rename(columns=colnames)
data_test = data_test.rename(columns=colnames)
data_train.head()

## 2.1异常值处理

异常值的处理会影响后续缺失值填充，遂先逐个分析异常值

### 2.1.1 Age


In [None]:
# #Age数据分布情况
# fig,[(ax1,ax2),(ax3,ax4)]=plt.subplots(2,2,figsize=(15,10))

# sns.distplot(data_train['age'],ax=ax1)
# ax1.set_xlim(-10,120)
# ax1.set_title('train')
# sns.distplot(data_test['age'],ax=ax3)
# ax3.set_xlim(-10,120)
# ax3.set_title('test')
# sns.boxplot(y='age',data=data_train,ax=ax2)
# ax2.set_ylim(-10,120)
# sns.boxplot(y='age',data=data_test,ax=ax4)
# ax4.set_ylim(-10,120)

年龄基本符合正态分布，但需要去除异常值，比较三倍标准差和箱型图异常值阈值

In [None]:
# Three-sigma rule: ages further than 3 standard deviations from the mean count as outliers.
age_col = data_train['age']
age_mean = age_col.mean()
age_std = age_col.std()
age_lowlimit_std, age_uplimit_std = age_mean - 3*age_std, age_mean + 3*age_std
print('异常值下限：',age_lowlimit_std,'异常值上限：',age_uplimit_std)

# Rows falling below / above the two limits, and their share of the training set in percent.
age_lowlimitd = data_train[age_col < age_lowlimit_std]
age_uplimitd = data_train[age_col > age_uplimit_std]
n_rows = len(data_train)
print('异常值下限比例：{0}%'.format(len(age_lowlimitd)*100/n_rows),
      '异常值上限比例：{0}%'.format(len(age_uplimitd)*100/n_rows))

In [None]:
# #箱型图 IQR
# age_1q,age_3q = data_train['age'].quantile([0.25,0.75])
# age_iqr = age_3q - age_1q
# age_lowlimit_iqr = age_1q - 1.5*age_iqr
# age_uplimit_iqr = age_3q + 1.5*age_iqr
# print('异常值下限：',age_lowlimit_iqr,'异常值上限：',age_uplimit_iqr)
# age_lowlimitd=data_train.loc[data_train['age']<age_lowlimit_iqr,:]
# age_uplimitd=data_train.loc[data_train['age']>age_uplimit_iqr,:]
# print('异常值下限比例：{0}%'.format(age_lowlimitd.shape[0]*100/data_train.shape[0]),
#      '异常值上限比例：{0}%'.format(age_uplimitd.shape[0]*100/data_train.shape[0]))

发现对于下限数据删除基本无区别，为保留尽可能多的数据，按三倍标准差剔除异常值

In [None]:
# Keep only rows whose age lies inside the three-sigma band (both bounds inclusive).
data_train = data_train[data_train["age"].between(age_lowlimit_std, age_uplimit_std)]

In [None]:
# data_age=data_train.loc[data_train['age']>0,['age','Isdlq']]
# data_age.loc[(data_age['age']>18)&(data_age['age']<40),'age'] = 1
# data_age.loc[(data_age['age']>=40)&(data_age['age']<60),'age'] = 2 
# data_age.loc[(data_age['age']>=60)&(data_age['age']<80),'age'] = 3
# data_age.loc[(data_age['age']>=80),'age'] = 4
# age_Isdlq=data_age.groupby('age')['Isdlq'].sum()
# age_total=data_age.groupby('age')['Isdlq'].count()
# age_Isratio=age_Isdlq/age_total
# age_Isratio.plot(kind='bar',figsize=(8,6),color='#4682B4')

分组分析发现年龄越高，违约率越低

### 2.1.2 RevolvingUtilizationOfUnsecuredLines（Revol）


由探索性分析中分布图可知，Revol尾过长，使用对数坐标轴

In [None]:
# #Revol数据分布情况
# fig,[(ax1,ax2),(ax3,ax4)]=plt.subplots(2,2,figsize=(15,10))

# sns.distplot(data_train['Revol'],ax=ax1,bins = [0.001,0.01,0.1,1,10,100,1000])
# ax1.set_xscale('log')
# ax1.set_title('train')
# sns.distplot(data_test['Revol'],ax=ax3,bins = [0.001,0.01,0.1,1,10,100,1000])
# ax3.set_title('test')
# ax3.set_xscale('log')
# sns.boxplot(y='Revol',data=data_train,ax=ax2)
# ax2.set_yscale('log')
# sns.boxplot(y='Revol',data=data_test,ax=ax4)
# ax4.set_yscale('log')

按理说Revol应该在0-1之间，上图也表明大部分数据确实如此，但考虑到银行实际操作中的问题，大于1也不无可能。现在的关键问题是要如何设置异常值阈值。

由于极端值太大，对标准差影响很大，看一下箱型图给出的阈值

In [None]:
# #箱型图 IQR
# Revol_1q,Revol_3q = data_train['Revol'].quantile([0.25,0.75])
# Revol_iqr = Revol_3q - Revol_1q
# Revol_lowlimit_iqr = Revol_1q - 1.5*Revol_iqr
# Revol_uplimit_iqr = Revol_3q + 1.5*Revol_iqr
# print('异常值下限：',Revol_lowlimit_iqr,'异常值上限：',Revol_uplimit_iqr)
# Revol_lowlimitd=data_train.loc[data_train['Revol']<Revol_lowlimit_iqr,:]
# Revol_uplimitd=data_train.loc[data_train['Revol']>Revol_uplimit_iqr,:]
# print('异常值下限比例：{0}%'.format(Revol_lowlimitd.shape[0]*100/data_train.shape[0]),
#      '异常值上限比例：{0}%'.format(Revol_uplimitd.shape[0]*100/data_train.shape[0]))
# sns.countplot(x="Isdlq",data=Revol_uplimitd)

如描述统计中所示，违约样本（Isdlq=1）很少，应尽可能珍惜，不确定按照箱型图删除离群值是否会损失太多违约样本，进一步分析大于1的数据。

In [None]:
# #将数据分为两部分，小于1和大于1的部分
# data1=data_train.loc[data_train['Revol']<1,:]
# data2=data_train.loc[data_train['Revol']>=1,:]
# #看一下两部分数据分布情况
# fig=plt.figure(figsize=(20,6))
# ax1=fig.add_subplot(1,2,1)
# ax2=fig.add_subplot(1,2,2)
# sns.distplot(data1['Revol'],ax=ax1,bins=1000)
# sns.distplot(data2['Revol'],ax=ax2,bins=10000)
# #ax1.set_xscale('log')
# ax2.set_xscale('log')

可见大于1的数据绝大多数都在100乃至10以内，进一步分析。

In [None]:
# fig,[[ax1,ax2],[ax3,ax4]]=plt.subplots(2,2,figsize=(20,10))
# sns.distplot(data_train.loc[(data_train['Revol']>=1)&(data_train['Revol']<10),'Revol'],bins=100,ax=ax1)
# sns.distplot(data_train.loc[(data_train['Revol']>=1)&(data_train['Revol']<100),'Revol'],bins=100,ax=ax2)
# sns.distplot(data_train.loc[(data_train['Revol']>=10)&(data_train['Revol']<100),'Revol'],bins=100,ax=ax3)
# sns.distplot(data_train.loc[data_train['Revol']>=100,'Revol'],bins=100,ax=ax4)

似乎可以将范围进一步缩小到3以内，进一步分析违约率和Revol的关系来确定阈值

In [None]:
# #将区间分为（0.667-1），(1-1.333),（1.333-1.667），（1.667-2），（2-2.333），（2.333-2.667），（2.667-3），
# #（3-10），（10-100）看一下违约率情况
# data_1=data_train.loc[(data_train['Revol']>=0.667)&(data_train['Revol']<1),:]
# Is_1=data_1.loc[data_1['Isdlq']==1,:].shape[0]*100/data_1.shape[0]

# data_2=data_train.loc[(data_train['Revol']>=1)&(data_train['Revol']<1.333),:]
# Is_2=data_2.loc[data_2['Isdlq']==1,:].shape[0]*100/data_2.shape[0]

# data_3=data_train.loc[(data_train['Revol']>=1.333)&(data_train['Revol']<1.667),:]
# Is_3=data_3.loc[data_3['Isdlq']==1,:].shape[0]*100/data_3.shape[0]

# data_4=data_train.loc[(data_train['Revol']>=1.667)&(data_train['Revol']<2),:]
# Is_4=data_4.loc[data_4['Isdlq']==1,:].shape[0]*100/data_4.shape[0]

# data_5=data_train.loc[(data_train['Revol']>=2)&(data_train['Revol']<2.333),:]
# Is_5=data_5.loc[data_5['Isdlq']==1,:].shape[0]*100/data_5.shape[0]

# data_6=data_train.loc[(data_train['Revol']>=2.333)&(data_train['Revol']<2.667),:]
# Is_6=data_6.loc[data_6['Isdlq']==1,:].shape[0]*100/data_6.shape[0]

# data_7=data_train.loc[(data_train['Revol']>=2.667)&(data_train['Revol']<3),:]
# Is_7=data_7.loc[data_7['Isdlq']==1,:].shape[0]*100/data_7.shape[0]

# data_8=data_train.loc[(data_train['Revol']>=3)&(data_train['Revol']<10),:]
# Is_8=data_8.loc[data_8['Isdlq']==1,:].shape[0]*100/data_8.shape[0]

# data_9=data_train.loc[(data_train['Revol']>=10)&(data_train['Revol']<100),:]
# Is_9=data_9.loc[data_9['Isdlq']==1,:].shape[0]*100/data_9.shape[0]

# print('0.667-1违约率为：{0}%'.format(Is_1),
#      '1-1.333违约率为：{0}%'.format(Is_2),
#      '1.333-1.667违约率为：{0}%'.format(Is_3), 
#      '1.667-2违约率为：{0}%'.format(Is_4),
#      '2-2.333违约率为：{0}%'.format(Is_5),
#      '2.333-2.667违约率为：{0}%'.format(Is_6),
#      '2.667-3违约率为：{0}%'.format(Is_7),
#      '3-10违约率为：{0}%'.format(Is_8),
#      '10-100违约率为：{0}%'.format(Is_9))

# print('2以上比例：{0}%'.format(data_train.loc[data_train['Revol']>2,:].shape[0]*100/data_train.shape[0]))

可见在1-2范围内不仅包含了绝大部分数据，而且在此范围内Revol越大违约率越高，与Revol变量内涵相一致，而大于2之后Revol越高违约率反而下降。大于2的数据占0.24%，相比于箱型图方法其实没有多抢救下很多数据。但再扩大保留范围不太合适。


（后续改进中发现，对偏态分布进行对数转化是更好的分析办法；如下图所示，也表明以2为阈值较好。）

In [None]:
# sns.kdeplot(data_train['Revol'].apply(np.log1p), label='Train',
#                 color='steelblue', alpha=0.5, shade=True, edgecolor='k')
# plt.xlabel('log-{}'.format('Revol'))
# plt.xticks(range(12))
# np.e**1-1

综上结果，删除Revol大于2的数据。

In [None]:
# Drop rows whose Revol exceeds 2, the cutoff chosen in the analysis above.
data_train = data_train.loc[data_train["Revol"].le(2)]

### 2.1.3 DebtRatio



In [None]:
# #数据分布情况
# fig,[(ax1,ax2),(ax3,ax4)]=plt.subplots(2,2,figsize=(15,10))

# sns.distplot(data_train['DebtRatio'],ax=ax1,bins = [0.0001,0.001,0.01,0.1,1,10,100,1000,10**4,10**5,10**6])
# ax1.set_xscale('log')
# ax1.set_title('train')
# sns.distplot(data_test['DebtRatio'],ax=ax3,bins = [0.0001,0.001,0.01,0.1,1,10,100,1000,10**4,10**5,10**6])
# ax3.set_title('test')
# ax3.set_xscale('log')
# sns.boxplot(y='DebtRatio',data=data_train,ax=ax2)
# ax2.set_yscale('log')
# sns.boxplot(y='DebtRatio',data=data_test,ax=ax4)
# ax4.set_yscale('log')

可见负债率与Revol情况较为类似，采取相似的方法处理。

In [None]:
# #箱型图 IQR
# DebtRatio_1q,DebtRatio_3q = data_train['DebtRatio'].quantile([0.25,0.75])
# DebtRatio_iqr = DebtRatio_3q - DebtRatio_1q
# DebtRatio_lowlimit_iqr = DebtRatio_1q - 1.5*DebtRatio_iqr
# DebtRatio_uplimit_iqr = DebtRatio_3q + 1.5*DebtRatio_iqr
# print('异常值下限：',DebtRatio_lowlimit_iqr,'异常值上限：',DebtRatio_uplimit_iqr)
# DebtRatio_lowlimitd=data_train.loc[data_train['DebtRatio']<DebtRatio_lowlimit_iqr,:]
# DebtRatio_uplimitd=data_train.loc[data_train['DebtRatio']>DebtRatio_uplimit_iqr,:]
# print('异常值下限比例：{0}%'.format(DebtRatio_lowlimitd.shape[0]*100/data_train.shape[0]),
#      '异常值上限比例：{0}%'.format(DebtRatio_uplimitd.shape[0]*100/data_train.shape[0]))

箱型图鉴定出异常值竟然达到20%，这显然不合理，进行对数转化进一步分析。

In [None]:
# sns.kdeplot(data_train['DebtRatio'].apply(np.log1p), label='Train',
#                 color='steelblue', alpha=0.5, shade=True, edgecolor='k')
# plt.xlabel('log-{}'.format('DebtRatio'))
# plt.xticks(range(15))
# (np.e**9.1)-1

可见被箱型图视为异常值的20%分布在e^1-e^9左右的长尾内

In [None]:
# Cutoff in original units for log1p(DebtRatio) = 9.1, read off the log-scale density plot.
debtthreshold = (np.e**9.1)-1
# Rows above the cutoff: report their share of the data and plot their label counts.
above_threshold = data_train.loc[data_train['DebtRatio']>debtthreshold,:]
print('阈值以上比例：{0}%'.format(above_threshold.shape[0]*100/data_train.shape[0]))
sns.countplot(x="Isdlq",data=above_threshold)

e^9.1之外数据占比很小，直接删除

In [None]:
# Keep only rows strictly below the DebtRatio cutoff.
data_train = data_train.loc[data_train['DebtRatio'].lt(debtthreshold)]

### 2.1.4 Pastdue

这三者含义类似，放在一起分析

In [None]:
# # 查看一下三者的箱型图
# plt.figure(figsize=(15,6)) 
# data_train[['Num30-59late', 
#           'Num60-89late',
#           'Num90late']].boxplot()
# plt.show()

In [None]:
# fig,[ax1,ax2,ax3]=plt.subplots(3,1,figsize=(15,10))

# sns.distplot(data_train['Num30-59late'],ax=ax1)
# sns.distplot(data_train['Num60-89late'],ax=ax2)
# sns.distplot(data_train['Num90late'],ax=ax3)

可见这三个变量模式非常相似，且显然90以上为离群值

In [None]:
# A row is kept only if all three past-due counters are below 90.
late_cols = ['Num30-59late', 'Num60-89late', 'Num90late']
data_train = data_train[data_train[late_cols].lt(90).all(axis=1)]

### 2.1.5 NumberOfOpenCreditLinesAndLoans(Numopen)

In [None]:
# # 观察分布形态
# fig,[(ax1,ax2),(ax3,ax4)]=plt.subplots(2,2,figsize=(15,10))

# sns.distplot(data_train['Numopen'],ax=ax1)
# ax1.set_title('train')
# sns.distplot(data_test['Numopen'],ax=ax3)
# ax3.set_title('test')
# sns.boxplot(y='Numopen',data=data_train,ax=ax2)
# sns.boxplot(y='Numopen',data=data_test,ax=ax4)

In [None]:
# # Check whether clear outliers remain after a log transform
# fig,[ax1,ax2]=plt.subplots(1,2,figsize=(15,5))

# sns.distplot(data_train['Numopen'].apply(np.log1p),ax=ax1)
# ax1.set_title('train')
# ax1.set_xlabel('log-Numopen')
# sns.distplot(data_test['Numopen'].apply(np.log1p),ax=ax2)
# ax2.set_title('test')
# ax2.set_xlabel('log-Numopen')

综上考虑认为数据本身为偏态分布，无需删除处理

### 2.1.6 NumberRealEstateLoansOrLines(Numestate)


In [None]:
# # 观察对数转化后分布形态
# fig,[(ax1,ax2),(ax3,ax4)]=plt.subplots(2,2,figsize=(15,10))

# sns.distplot(data_train['Numestate'].apply(np.log1p),ax=ax1)
# ax1.set_title('train')
# ax1.set_xlabel('log-Numestate')
# sns.distplot(data_test['Numestate'].apply(np.log1p),ax=ax3)
# ax3.set_title('test')
# ax3.set_xlabel('log-Numestate')
# sns.boxplot(y='Numestate',data=data_train,ax=ax2)
# sns.boxplot(y='Numestate',data=data_test,ax=ax4)


依旧是连续的长尾分布，尽可能多保存信息，只将大于50的删去

In [None]:
# Keep only rows with fewer than 50 real-estate loans or lines.
data_train = data_train.loc[data_train['Numestate'].lt(50)]

### 2.1.7 NumberOfDependents(Numdepend)

In [None]:
# # 观察对数转化后分布形态
# fig,[(ax1,ax2),(ax3,ax4)]=plt.subplots(2,2,figsize=(15,10))

# sns.distplot(data_train['Numdepend'].apply(np.log1p),ax=ax1)
# ax1.set_title('train')
# ax1.set_xlabel('log-Numdepend')
# sns.distplot(data_test['Numdepend'].apply(np.log1p),ax=ax3)
# ax3.set_title('test')
# ax3.set_xlabel('log-Numdepend')
# sns.boxplot(y='Numdepend',data=data_train,ax=ax2)
# sns.boxplot(y='Numdepend',data=data_test,ax=ax4)

同样是类似的情况，仅将15以上的删去

In [None]:
# Numdepend contains missing values; a plain `< 15` filter would drop those rows too,
# so rows with a missing value are kept explicitly.
numdepend = data_train['Numdepend']
data_train = data_train[numdepend.lt(15) | numdepend.isna()]

### 2.1.8 MonthlyIncome

In [None]:
# # 观察对数转化后分布形态
# fig,[(ax1,ax2),(ax3,ax4)]=plt.subplots(2,2,figsize=(15,10))

# sns.distplot(data_train['MonthlyIncome'].apply(np.log1p),ax=ax1)
# ax1.set_title('train')
# ax1.set_xlabel('log-MonthlyIncome')
# sns.distplot(data_test['MonthlyIncome'].apply(np.log1p),ax=ax3)
# ax3.set_title('test')
# ax3.set_xlabel('log-MonthlyIncome')
# sns.boxplot(y='MonthlyIncome',data=data_train,ax=ax2)
# sns.boxplot(y='MonthlyIncome',data=data_test,ax=ax4)

同样类似，删去1*10^6以上的

In [None]:
# Keep rows with income below 1,000,000; rows with a missing income are kept for later imputation.
monthly_income = data_train['MonthlyIncome']
data_train = data_train[monthly_income.lt(1000000) | monthly_income.isna()]

## 2.2缺失值处理

根据描述性统计，训练集和测试集在NumberOfDependents和MonthlyIncome上有空缺值；

下面分别进行分析

In [None]:
# data_train.isnull().sum()

### 2.2.1 NumberOfDependents

从上表中可以发现NumberOfDependents缺失值较少（3924/150000$\approx$2.6%）  
所以考虑直接使用fillna函数，用中位数填充缺失值。
先对其进行处理，有利于后续对月收入的预测填充

In [None]:
# Fill missing Numdepend with the median of the same data set.
# The result is assigned back to the column: calling fillna(..., inplace=True) on a
# column selection is a chained in-place operation, which pandas warns about and which
# leaves the frame unchanged once Copy-on-Write is enabled.
data_train['Numdepend'] = data_train['Numdepend'].fillna(data_train['Numdepend'].median())
data_test['Numdepend'] = data_test['Numdepend'].fillna(data_test['Numdepend'].median())
# data_test.info()

### 2.2.2 MonthlyIncome
从上表中可以发现MonthlyIncome缺失值较多（29731/150000$\approx$19.8%）  
所以不能直接删除含有缺失值的样本，考虑填充缺失值。  
因为随机森林不容易过拟合且对于有大量缺失值的数据能进行有效的估计与处理，所以此处我们使用随机森林进行回归预测，

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Take every column except column 0 (the row ID).
# NOTE(review): the selection appears to include the Isdlq label, which the test set
# cannot supply for its own imputation — confirm this is intended.
data_randomforest = data_train.iloc[:,[1,2,3,4,5,6,7,8,9,10,11]]

# Split the rows by whether MonthlyIncome is present.
known = data_randomforest[data_randomforest.MonthlyIncome.notnull()].values
unknown = data_randomforest[data_randomforest.MonthlyIncome.isnull()].values

# Locate the target column by name and use every other column as a feature.
# The same feature positions are used for fitting and for prediction, so the target
# can no longer end up in the training features while a different column is used
# at prediction time.
income_pos = data_randomforest.columns.get_loc('MonthlyIncome')
feature_pos = [pos for pos in range(data_randomforest.shape[1]) if pos != income_pos]

X_know = known[:, feature_pos]
# 1-D target, the shape scikit-learn regressors expect for a single output.
Y_know = known[:, income_pos]
X_forpredict = unknown[:, feature_pos]

简单调参，确定最佳参数

In [None]:
# estimators=np.linspace(100,300,21).astype(int)
# scores=[]
# nums=[]
# for estimator in estimators:
#     rfr = RandomForestRegressor(random_state=0,n_estimators=estimator,max_depth=3,n_jobs=-1)
#     rfr.fit(X_know,Y_know)
#     scores.append(rfr.score(X_know,Y_know)*100)
#     nums.append(estimator)
# plt.plot(nums,scores)

可见estimator为260效果最好

In [None]:
# max_depths=np.linspace(1,10,10).astype(int)
# scores=[]
# nums=[]
# for max_depth in max_depths:
#     rfr = RandomForestRegressor(random_state=0,n_estimators=260,max_depth=max_depth,n_jobs=-1)
#     rfr.fit(X_know,Y_know)
#     scores.append(rfr.score(X_know,Y_know)*100)
#     nums.append(max_depth)
# plt.plot(nums,scores)

可见逐渐增长至6左右开始基本平稳。

In [None]:
# Fit the random-forest regressor on the rows whose income is known.
rfr = RandomForestRegressor(n_estimators=260, max_depth=6, n_jobs=-1, random_state=0)
rfr.fit(X_know, Y_know)

# Predict the missing incomes, rounded to 0 decimals because the recorded incomes are whole numbers.
income_missing = data_train['MonthlyIncome'].isnull()
Y_predicted = rfr.predict(X_forpredict).round(0)

# Write the predictions back into the rows that had no income.
data_train.loc[income_missing, 'MonthlyIncome'] = Y_predicted

# Check that the fill worked.
data_train.info()

对测试集也做类似填充，只需注意测试集中SeriousDlqin2yrs是空缺值，所以随机森林中传入的变量要少一个

In [None]:
# Select columns 2..11 of the test set: column 0 is the ID and column 1 is skipped as well
# (the notebook text says the label is empty in the test set).
data_randomforest = data_test.iloc[:, list(range(2, 12))]

# Split the rows by whether MonthlyIncome is present.
income_known = data_randomforest['MonthlyIncome'].notnull()
known = data_randomforest[income_known].values
unknown = data_randomforest[~income_known].values

# Within this selection the target sits at position 4; every other column is a feature.
income_col = 4
X_know = np.delete(known, income_col, axis=1)
Y_know = known[:, income_col:income_col + 1]
X_forpredict = np.delete(unknown, income_col, axis=1)

# Fit the random-forest regressor on the rows whose income is known.
rfr = RandomForestRegressor(n_estimators=260, max_depth=6, n_jobs=-1, random_state=0)
rfr.fit(X_know, Y_know)

# Predict the missing incomes, rounded to 0 decimals because the recorded incomes are whole numbers.
Y_predicted = rfr.predict(X_forpredict).round(0)

# Write the predictions back into the rows that had no income.
data_test.loc[data_test['MonthlyIncome'].isnull(), 'MonthlyIncome'] = Y_predicted

# Check that the fill worked.
data_test.info()

## 2.3重复值处理
直接删除重复值即可

In [None]:
# Compare rows on every column except ID. ID is the renamed 'Unnamed: 0' column of the CSV
# (presumably a unique row number — verify), so including it would make every row look
# distinct and no duplicate would ever be removed.
dedup_cols = [col for col in data_train.columns if col != 'ID']
data_train = data_train.drop_duplicates(subset=dedup_cols)
data_train.info()

## 2.4  查看处理后的结果

In [None]:
# data_train.describe()

In [None]:
# #大致数据分布情况
# data_train.hist(bins=50, figsize=(20,15))
# plt.show()

In [None]:
# # 检查数据的相关性
# corr = data_train.corr()
# plt.figure(figsize=(20, 15))
# sns.heatmap(corr, annot=True, fmt='.2g')

从相关系数图中可以看出因变量SeriousDlqin2yrs与
- RevolvingUtilizationOfUnsecuredLines
- NumberOfTime30-59DaysPastDueNotWorse
- NumberOfTime60-89DaysPastDueNotWorse
- NumberOfTimes90DaysLate  
相关性较强

# 三.**变量处理**

## 3.1 构造新特征
- 通过负债率和月收入可以计算出月花费
- 三个延迟天数变量模式类似，彼此相关，且都与因变量相关较高，尝试加总构造新变量
- 计算人均月收入（平均给自己和家人子女等）
- 计算人均月花费（可能大手大脚的消费模式更可能违约？）
- 计算贷款总数（开放贷款+车贷房贷）
- 某种程度上人和贷款很类似，都是每个月要固定花钱，计算（人+贷款）均月收入

In [None]:
# Derived features for the training set. assign() evaluates the keyword arguments in order,
# so later features can use earlier ones (Monthlypayment, AllNumloan).
data_train = data_train.assign(
    # monthly payment = income * debt ratio
    Monthlypayment=lambda df: df['MonthlyIncome'] * df['DebtRatio'],
    # total number of past-due events across the three counters
    AllNumlate=lambda df: df['Num30-59late'] + df['Num60-89late'] + df['Num90late'],
    # income / payment per household member (dependents + the borrower)
    incomperhuman=lambda df: df['MonthlyIncome'] / (df['Numdepend'] + 1),
    payperhuman=lambda df: df['Monthlypayment'] / (df['Numdepend'] + 1),
    # total number of loans and credit lines
    AllNumloan=lambda df: df['Numopen'] + df['Numestate'],
    # income per "channel" (each loan and each household member)
    incomperchannel=lambda df: df['MonthlyIncome'] / (df['AllNumloan'] + df['Numdepend'] + 1),
)

In [None]:
# Derived features for the test set. assign() evaluates the keyword arguments in order,
# so later features can use earlier ones (Monthlypayment, AllNumloan).
data_test = data_test.assign(
    # monthly payment = income * debt ratio
    Monthlypayment=lambda df: df['MonthlyIncome'] * df['DebtRatio'],
    # total number of past-due events across the three counters
    AllNumlate=lambda df: df['Num30-59late'] + df['Num60-89late'] + df['Num90late'],
    # income / payment per household member (dependents + the borrower)
    incomperhuman=lambda df: df['MonthlyIncome'] / (df['Numdepend'] + 1),
    payperhuman=lambda df: df['Monthlypayment'] / (df['Numdepend'] + 1),
    # total number of loans and credit lines
    AllNumloan=lambda df: df['Numopen'] + df['Numestate'],
    # income per "channel" (each loan and each household member)
    incomperchannel=lambda df: df['MonthlyIncome'] / (df['AllNumloan'] + df['Numdepend'] + 1),
)

In [None]:
# # 再次检查数据的相关性
# corr = data_train.corr()
# plt.figure(figsize=(20, 15))
# sns.heatmap(corr, annot=True, fmt='.2g')

似乎除了AllNumlate都跟因变量相关很低，进一步分析来确定保留哪些。

## 3.2 特征选择

分箱离散化后,可以降低异常值的影响，在分箱后，进一步计算Woe和IV  

- Woe全称叫Weight of Evidence，常用在风险评估、授信评分卡等领域。

- IV全称是Information value，可通过woe加权求和得到，衡量自变量对应变量的预测能力。

通过IV来比较要保留哪些变量

In [None]:
# Keep the un-binned data for later use. An explicit copy is required: a plain assignment
# would only create a second name for the same frame, so any in-place binning of
# data_train would also change ori_data_train.
ori_data_train = data_train.copy()

In [None]:
# #Revol分箱
# data_train.loc[(data_train['Revol']<1),'Revol']=0
# data_train.loc[(data_train['Revol']>1)&(data_train['Revol']<=2),'Revol']=1

# #DebtRatio分箱
# data_train.loc[(data_train['DebtRatio']<1),'DebtRatio']=0
# data_train.loc[(data_train['DebtRatio']>1)&(data_train['DebtRatio']<2),'DebtRatio']=1
# data_train.loc[(data_train['DebtRatio']>=2),'DebtRatio']=2

# #Num30-59late/Num60-89late/Num90late/Numestate/Numdepend
# data_train.loc[(data_train['Num30-59late']>=8), 'Num30-59late'] = 8
# data_train.loc[(data_train['Num60-89late']>=7), 'Num60-89late'] = 7
# data_train.loc[(data_train['Num90late']>=10), 'Num90late'] = 10
# data_train.loc[(data_train['Numestate']>=8), 'Numestate'] = 8
# data_train.loc[(data_train['Numdepend']>=7), 'Numdepend'] = 7

# #AllNumlate分箱
# data_train.loc[(data_train['AllNumlate']>1),'AllNumlate']=1#分为逾期和未逾期两种情况

# #AllNumloan分箱
# data_train.loc[(data_train['AllNumloan']>10), 'AllNumloan'] = 2
# data_train.loc[(data_train['AllNumloan']>5)&(data_train['AllNumloan']<=10),'AllNumloan']=1
# data_train.loc[(data_train['AllNumloan']>-1)&(data_train['AllNumloan']<=5),'AllNumloan']=0

In [None]:
# def bin_woe(tar, var, n=None, cat=None):
#     """
#     连续自变量分箱,woe,iv变换
#     tar:target目标变量
#     var:进行woe,iv转换的自变量
#     n:分组数量
#     """
#     total_bad = tar.sum()
#     total_good =tar.count()-total_bad
#     totalRate = total_good/total_bad
    
#     if cat == 's':
#         msheet = pd.DataFrame({tar.name:tar,var.name:var,'var_bins':pd.qcut(var, n, duplicates='drop')})
#         grouped = msheet.groupby(['var_bins'])
#     elif (cat == 'd') and (n is None):
#         msheet = pd.DataFrame({tar.name:tar,var.name:var})
#         grouped = msheet.groupby([var.name])
        
#     groupBad = grouped.sum()[tar.name]
#     groupTotal = grouped.count()[tar.name]
#     groupGood = groupTotal - groupBad
#     groupRate = groupGood/groupBad
#     groupBadRate = groupBad/groupTotal
#     groupGoodRate = groupGood/groupTotal

#     woe = np.log(groupRate/totalRate)
#     iv = np.sum((groupGood/total_good-groupBad/total_bad)*woe)
    
#     if cat == 's':
#         new_var, cut = pd.qcut(var, n, duplicates='drop',retbins=True, labels=woe.tolist())
#     elif cat == 'd':
#         dictmap = {}
#         for x in woe.index:
#             dictmap[x] = woe[x]
#         new_var, cut = var.map(dictmap), woe.index
    
#     return woe.tolist(), iv, cut, new_var

In [None]:
# # 确定变量类型，连续变量还是离散变量
# dvar = ['Revol','DebtRatio','Num30-59late', 'Num60-89late','Num90late','AllNumlate','Withdepend',
#         'Numestate','Numdepend','AllNumlate','AllNumloan']
# svar = ['MonthlyIncome','age','Monthlypayment','Numopen','incomperhuman','payperhuman','incomperchannel']


# # 可视化woe得分和iv得分
# def woe_vs(data):
#     cutdict = {}
#     ivdict = {}
#     woe_dict = {}
#     woe_var = pd.DataFrame()
#     for var in data.columns:
#         if var in dvar:
#             woe, iv, cut, new = bin_woe(data['Isdlq'], data[var], cat='d')
#             woe_dict[var] = woe
#             woe_var[var] = new
#             ivdict[var] = iv
#             cutdict[var] = cut
#         elif var in svar:
#             woe, iv, cut, new = bin_woe(data['Isdlq'], data[var], n=5, cat='s')
#             woe_dict[var] = woe
#             woe_var[var] = new
#             ivdict[var] = iv
#             cutdict[var] = cut
            
#     ivdict = sorted(ivdict.items(), key=lambda x:x[1], reverse=False)
#     iv_vs = pd.DataFrame([x[1] for x in ivdict],index=[x[0] for x in ivdict],columns=['IV'])
#     ax = iv_vs.plot(kind='barh',
#                     figsize=(12,12),
#                     title='Feature IV',
#                     fontsize=10,
#                     width=0.8,
#                     color='#00688B')
#     ax.set_ylabel('Features')
#     ax.set_xlabel('IV of Features')
#     ax.set_xticks(np.linspace(0,1.2,25))
    
#     return ivdict, woe_var, woe_dict, cutdict

# # woe转化
# ivinfo, woe_data, woe_dict, cut_dict = woe_vs(data_train)

IV值量化指标如下：

≤0.02:useless for prediction

0.02 to 0.1:Weak predictor

0.1 to 0.3:Medium predictor

0.3 to 0.5:Strong predictor

≥0.5 Suspicious or too good to be true

筛选出IV值大于0.05的变量：

'Num30-59late'，'Num60-89late'，'Num90late'，'AllNumlate'，'Revol'，'age'，'incomperhuman'，'MonthlyIncome'，'Numestate'



In [None]:
# Features retained after the IV screening. One shared list keeps the training and test
# matrices on exactly the same columns, in the same order.
selected_features = ['Num30-59late','Num60-89late','Num90late','AllNumlate','Revol','age','incomperhuman','MonthlyIncome','Numestate']

X_train_last = ori_data_train[selected_features]
# X_train_last = ori_data_train.drop(["Numdepend",'ID','Isdlq'],axis = 1)
# All features (everything except the ID and the label).
X_train = ori_data_train.drop(['ID','Isdlq'],axis = 1)
# Select the label by name rather than by column position.
Y_train = ori_data_train['Isdlq']
# X_test_last = data_test.drop(["Numdepend",'ID','Isdlq'],axis = 1)
X_test_last = data_test[selected_features]

In [None]:
# data_test.isnull().sum()

# **四.模型预测**

为了评价模型以及后续调参，我们先定义一个ROC曲线绘制函数

In [None]:
# ROC曲线绘制
def draw_roc(FPR, TPR, label=None):
    """Plot a ROC curve on a new figure.

    Parameters
    ----------
    FPR, TPR : array-like
        False-positive and true-positive rates, e.g. as returned by
        ``sklearn.metrics.roc_curve``.
    label : str, optional
        Name of the curve. When given, a legend is drawn so the label is
        actually visible; when omitted, no legend is added.
    """
    plt.figure(figsize=(8,6))
    plt.plot(FPR, TPR,'b', linewidth=2, label=label)
    # Dashed diagonal: the ROC of a random classifier, for reference.
    plt.plot([0,1],[0,1], "r--")
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive rate")
    # Without a legend the label passed to plot() is never displayed.
    if label is not None:
        plt.legend(loc="lower right")

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

## 4.1 RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# forest = RandomForestClassifier(n_estimators=300, max_depth=5, class_weight='balanced')
# forest.fit(X_train_last, Y_train)
# forest_scores_proba = forest.predict_proba(X_train_last)
# forest_scores = forest_scores_proba[:,1]
# FPR_forest, TPR_forest, THRESH_forest = roc_curve(Y_train, forest_scores)
# AUC_forest=roc_auc_score(Y_train,forest_scores)
# draw_roc(FPR_forest, TPR_forest)
# print("RF在训练集上的AUC是: {:.5f}%".format(AUC_forest*100))

再通过交叉验证来检验模型的泛化能力  
取$K—Fold$中的$K=10$

In [None]:
# AUC_forest_cv = cross_val_score(forest, X_train, Y_train, cv=10, scoring='roc_auc').mean()
# print("RF在训练集上cv的AUC是：{:.5f}%".format(AUC_forest_cv*100))

## 4.2梯度提升

In [None]:
# GBC = GradientBoostingClassifier()
# GBC.fit(X_train_last, Y_train)
# GBC_scores_proba =GBC.predict_proba(X_train_last)
# GBC_scores = GBC_scores_proba[:,1]
# FPR_GBC, TPR_GBC, THRESH_GBC = roc_curve(Y_train, GBC_scores)
# AUC_GBC=roc_auc_score(Y_train,GBC_scores)
# draw_roc(FPR_GBC, TPR_GBC)
# print("GBC在训练集上的AUC是: {:.5f}%".format(AUC_GBC*100))

同样通过交叉验证来检验模型的泛化能力  
取$K—Fold$中的$K=10$

In [None]:
# AUC_GBC_cv = cross_val_score(GBC, X_train, Y_train, cv=10, scoring='roc_auc').mean()
# print("GBC在训练集上cv的AUC是：{:.5f}%".format(AUC_GBC_cv*100))

发现在两种方法中，**梯度提升**比随机森林的效果要好  
下面对梯度提升中的参数进行调参

# **五.参数调优**

## 5.1  **n_estimators**调优  

In [None]:
# estimators=np.linspace(100,300,21).astype(int)
# AUCs=[]
# nums=[]
# for estimator in estimators:
#     GBC = GradientBoostingClassifier(n_estimators=estimator, learning_rate=0.1 ,max_depth=4)
#     GBC.fit(X_train_last, Y_train)
#     GBC_scores_proba =GBC.predict_proba(X_train_last)
#     GBC_scores = GBC_scores_proba[:,1]
#     FPR_GBC, TPR_GBC, THRESH_GBC = roc_curve(Y_train, GBC_scores)
#     AUC_GBC=roc_auc_score(Y_train,GBC_scores)
#     AUCs.append(AUC_GBC*100)
#     nums.append(estimator)
# plt.plot(nums,AUCs)

可以发现增长越来越缓慢，为了防止过拟合，我们就选取n=250

 ## 5.2 max_depth调优

In [None]:
# depths = np.linspace(1,12,12).astype(int)
# AUCs=[]
# maxdepths=[]
# for depth in depths:
#     GBC = GradientBoostingClassifier(n_estimators=250, learning_rate=0.1 ,max_depth=depth)
#     GBC.fit(X_train_last, Y_train)
#     GBC_scores_proba =GBC.predict_proba(X_train_last)
#     GBC_scores = GBC_scores_proba[:,1]
#     FPR_GBC, TPR_GBC, THRESH_GBC = roc_curve(Y_train, GBC_scores)
#     AUC_GBC=roc_auc_score(Y_train,GBC_scores)
#     AUCs.append(AUC_GBC*100)
#     maxdepths.append(depth)
# plt.plot(maxdepths,AUCs)

提升比较明显，但为避免过拟合，取max_depth=5，看一下AUC

In [None]:
# GBC = GradientBoostingClassifier(n_estimators=250, learning_rate=0.1 ,max_depth=5)
# GBC.fit(X_train_last, Y_train)
# GBC_scores_proba =GBC.predict_proba(X_train_last)
# GBC_scores = GBC_scores_proba[:,1]
# FPR_GBC, TPR_GBC, THRESH_GBC = roc_curve(Y_train, GBC_scores)
# AUC_GBC=roc_auc_score(Y_train,GBC_scores)
# draw_roc(FPR_GBC, TPR_GBC)
# print("GBC在训练集上的AUC是: {:.5f}%".format(AUC_GBC*100))

还是有一些过拟合，降低一下learning_rate

In [None]:
# Final model. random_state is fixed so that a re-run of the notebook reproduces the same
# model and submission, in line with the seeded RandomForestRegressor used for imputation.
GBC = GradientBoostingClassifier(n_estimators=250, learning_rate=0.05 ,max_depth = 5, random_state=0)
GBC.fit(X_train_last, Y_train)
# Column 1 of predict_proba holds the probability of the positive class (label 1).
GBC_scores_proba =GBC.predict_proba(X_train_last)
GBC_scores = GBC_scores_proba[:,1]
# ROC curve and AUC on the training data itself (not a held-out estimate).
FPR_GBC, TPR_GBC, THRESH_GBC = roc_curve(Y_train, GBC_scores)
AUC_GBC=roc_auc_score(Y_train,GBC_scores)
draw_roc(FPR_GBC, TPR_GBC)
print("GBC在训练集上的AUC是: {:.5f}%".format(AUC_GBC*100))

# 六.提交结果

In [None]:
# Predict class probabilities for the test set with the fitted model.
submission_proba = GBC.predict_proba(X_test_last)
# Keep column 1 of the probability matrix as the submitted score.
submission_scores = submission_proba[:, 1]
# Display the number of scores as a sanity check.
submission_scores.shape

In [None]:
# Ids run from 1 to the number of predictions. Deriving the length from the scores avoids a
# hard-coded row count, which would raise a length-mismatch error for any other test set size.
ids = np.arange(1, len(submission_scores) + 1)
submission = pd.DataFrame( {'Id': ids, 'Probability': submission_scores})
submission.to_csv('submission.csv', index=False)