In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 实验流程

1. 实验背景介绍
2. 获取实验数据
3. 探索性数据分析
4. 数据预处理
5. 建立模型预测
6. 向Kaggle提供结果

# 1.实验背景介绍

## 1.1问题描述

银行在市场经济中扮演着重要的角色。他们决定谁可以在什么条件下得到贷款。

尝试帮助银行建立一个信用评分算法用来判断银行是否应该发放贷款，它对借贷人违约概率进行猜测。要求通过预测借贷人在未来两年内遭遇财务困境的可能性，来提高信用评分算法的水平。

目标是建立一个基于借款人信息的最佳财务决策模型。数据集共提供了约25万借款人的历史数据，其中训练集15万条、测试集约10万条。

## 1.2数据描述

In [None]:
pip install xlrd

In [None]:
import xlrd
import pandas as pd
Dict_df = pd.read_excel('../input/GiveMeSomeCredit/Data Dictionary.xls') 
Dict_df

## 1.3各字段含义

| 字段名 | 定义    | 值  |
|----------|--------------------------------------------|------------------------------------------------|
| SeriousDlqin2yrs|个人经历了超过90天的拖欠或更糟的情况(优质客户或劣质客户以此结果表示是否借贷)| Y = yes / N = No |
| RevolvingUtilizationOfUnsecuredLines	 |信用卡和个人信贷余额的总余额，减去房地产和没有分期付款的债务（如：汽车贷款）除以信用总额度(可用信用额度比值) | 百分比 |     
| age   | 借款人借款时年龄   | 整数 |
| NumberOfTime30-59DaysPastDueNotWorse | 借款人逾期30~59天的次数，但过去2年没有更差的信用记录| 整数|
| DebtRatio | 每月债务支付、赡养费和生活费用之和除以月总收入（负债比率） |百分比 |      
| MonthlyIncome    | 月收入  |   总数   |
| NumberOfOpenCreditLinesAndLoans | 开放贷款的数量（如汽车贷款或抵押贷款）和信用额度（如信用卡）  | 整数 |
| NumberOfTimes90DaysLate   | 借款人逾期90天或以上的次数 |整数 |
| NumberRealEstateLoansOrLines   | 抵押贷款和房地产贷款的数量，包括房屋净值信贷额度    | 整数  |
| NumberOfTime60-89DaysPastDueNotWorse| 借款人逾期60~89天的次数，但在过去2年没有更差的信用记录   | 整数 |
| NumberOfDependents | 不包括自己在内的家属（配偶，子女等）人数 |整数 |

# 2.获取实验数据

**导包**

In [None]:
# 数据整理和分析
import pandas as pd
import numpy as np
import random as rnd
from collections import Counter
from pandas import Series,DataFrame
import scipy.stats as stats

# 可视化
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# 机器学习
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier as XGB

**导入数据**

In [None]:
## 获取实验数据
train_df = pd.read_csv('../input/GiveMeSomeCredit/cs-training.csv')
test_df = pd.read_csv('../input/GiveMeSomeCredit/cs-test.csv')
combine = [train_df, test_df]

**观察数据**

In [None]:
train_df.info()
print('\n'+'_'*40+'\n')
test_df.info()

In [None]:
train_df.head()

In [None]:
train_df.tail()

In [None]:
test_df.head()

In [None]:
test_df.tail()

* 训练集数据共有150000条，测试集数据共有101503条
* 可以看到特征的数据类型主要是整型和浮点型，数据处理相对较为容易
* 在训练集中 MonthlyIncome， NumberOfDependents 字段存在缺失值
* 在测试集中 MonthlyIncome， NumberOfDependents 两个字段存在缺失值
* `Unnamed: 0` 字段为标识符，作为借款人的身份标识信息，不参与建模

# 3.探索性数据分析

## 3.1 观察数值型特征的经验分布


数据集的经验分布有助于我们对数据集进行初步观察，判断经验分布能否代表真实分布

In [None]:
train_df.describe(percentiles=[.25, .50, .60, .70, .75, .80, .99])

从百分比可以看出：
* age出现了最小值为0的异常情况，需要进行处理
* NumberOfTime30-59DaysPastDueNotWorse, NumberOfTime60-89DaysPastDueNotWorse,NumberOfTimes90DaysLate分布都非常不平衡，可能存在异常值
* 从百分位数来看，RevolvingUtilizationOfUnsecuredLines，DebtRatio分布也非常不平衡，99%时与最大值相差甚远，说明存在异常值
* RevolvingUtilizationOfUnsecuredLines 和 DebtRatio 按照正常情况来看，分布应该在0-1之间，超过1显然说明信用透支了，或负债透支了

## 3.2观察分类结果 SeriousDlqin2yrs

In [None]:
plt.figure(figsize=[6, 4])
sns.countplot(x=train_df['SeriousDlqin2yrs'])

In [None]:
train_df.groupby('SeriousDlqin2yrs')['Unnamed: 0'].count().reset_index()

* 可以发现分类结果严重不平衡：未违约（SeriousDlqin2yrs=0）与违约（SeriousDlqin2yrs=1）样本的比例接近14:1，数据不平衡会让监督学习算法过多关注多数类，使分类性能下降
* 故在训练时应注意选择集成模型或带正则化的回归模型，同时可以对多数类（未违约）进行欠采样，对少数类（违约）进行过采样

### **分析特征与标签的关系**

## 3.3  分析 age

In [None]:
plt.figure(figsize=[14, 10])
plt.subplot(221)
sns.boxplot(data=train_df['age'])
plt.ylabel('age')
plt.subplot(222)
sns.histplot(train_df['age'])
plt.xlabel('age')

* 从箱线图可以看到age存在异常值，出现了等于0岁或接近0岁的情况以及超过或接近100岁的情况
* 但从右侧图中可以看到age大体上符合正态分布

将年龄进行分段按照成年后的青壮年，中年，老年，高龄进行分段，各20年为跨度，观察各年龄阶段的违约率

In [None]:
# Default rate (%) per 20-year age band. Bands are left-closed ([18, 40), ...),
# so ages below 18 fall outside every band and are excluded.
age_bins = [18, 40, 60, 80, np.inf]
age_labels = ['18-40', '40-60', '60-80', '>80']
age_band = pd.cut(train_df['age'], bins=age_bins, labels=age_labels, right=False)

# observed=False keeps an empty band in the table (as NaN) instead of raising
# ZeroDivisionError the way a manual count/len computation would.
temp_df = (
    train_df['SeriousDlqin2yrs'].eq(1)
    .groupby(age_band, observed=False)
    .mean()
    .mul(100)
    .rename('SeriousDlqin2yrs_per')
    .reset_index()
)
temp_df

- 可以发现在18-40这一年龄阶段违约率最高。但随着年龄增加，违约率逐渐降低

## 3.4  分析 RevolvingUtilizationOfUnsecuredLines 和 DebtRatio

**`RevolvingUtilizationOfUnsecuredLines`**

该字段含义可以理解为信用可用额度比值

In [None]:
plt.figure(figsize=[6, 4])
sns.boxplot(data=train_df['RevolvingUtilizationOfUnsecuredLines'])
plt.ylabel('RevolvingUtilizationOfUnsecuredLines')

信用可用额度比值按照含义理解应该在0-1之间，超出1表明信用透支。但从箱线图中可以看出出现了一些非常大的值如50000等，可能为异常值需要进行处理

首先将RevolvingUtilizationOfUnsecuredLines分为大于1的部分和小于1的部分，再将大于1的部分细分，观察大于1的部分，尝试找出异常值的范围

In [None]:
plt.figure(figsize=(12,5))
plt.subplot(121)
sns.histplot(train_df.loc[train_df['RevolvingUtilizationOfUnsecuredLines']<1,'RevolvingUtilizationOfUnsecuredLines'],bins=10)
plt.subplot(122)
sns.histplot(train_df.loc[train_df['RevolvingUtilizationOfUnsecuredLines']>=1,'RevolvingUtilizationOfUnsecuredLines'],bins=10)

In [None]:
fig,[[ax1,ax2],[ax3,ax4]]=plt.subplots(2,2,figsize=(16,8))
sns.histplot(train_df.loc[(train_df['RevolvingUtilizationOfUnsecuredLines']>=1)&(train_df['RevolvingUtilizationOfUnsecuredLines']<100),'RevolvingUtilizationOfUnsecuredLines'],bins=10,ax=ax1)
sns.histplot(train_df.loc[(train_df['RevolvingUtilizationOfUnsecuredLines']>=100)&(train_df['RevolvingUtilizationOfUnsecuredLines']<1000),'RevolvingUtilizationOfUnsecuredLines'],bins=10,ax=ax2)
sns.histplot(train_df.loc[(train_df['RevolvingUtilizationOfUnsecuredLines']>=1000)&(train_df['RevolvingUtilizationOfUnsecuredLines']<10000),'RevolvingUtilizationOfUnsecuredLines'],bins=10,ax=ax3)
sns.histplot(train_df.loc[train_df['RevolvingUtilizationOfUnsecuredLines']>=10000,'RevolvingUtilizationOfUnsecuredLines'],bins=10,ax=ax4)

将信用可用额度比值进行分段处理，查看各段的违约率

In [None]:
# Default rate (%) per utilisation band: [0,1), [1,10), [10,20), [20,100),
# [100,1000), [1000,10000) and >=10000.
util_bins = [0, 1, 10, 20, 100, 1000, 10000, np.inf]
util_labels = ['0-1', '1-10', '10-20', '20-100', '100-1000', '1000-10000', '>10000']
util_band = pd.cut(
    train_df['RevolvingUtilizationOfUnsecuredLines'],
    bins=util_bins, labels=util_labels, right=False,
)

# observed=False keeps an empty band in the table (as NaN) instead of raising
# ZeroDivisionError the way a manual count/len computation would.
temp_df = (
    train_df['SeriousDlqin2yrs'].eq(1)
    .groupby(util_band, observed=False)
    .mean()
    .mul(100)
    .rename('SeriousDlqin2yrs_per')
    .reset_index()
)
temp_df

可以看出RevolvingUtilizationOfUnsecuredLines大于1后，属于信用透支范围，违约率开始上升，信用可用额度比值在10-20之间违约率达到最大值，超过20后违约率开始下降，超过1000后开始急剧下降。猜测异常值阈值应该在20-100之间取得

**`DebtRatio`**

DebtRatio 字段含义表示负债率，但从下图可以看出出现了超过50000以上的值，应该为异常值

In [None]:
plt.figure(figsize=[6, 4])
sns.boxplot(data=train_df['DebtRatio'])
plt.ylabel('DebtRatio')

将负债率DebtRatio分为大于1和小于1的部分，将大于1的部分进行细分，观察大于1的部分，尝试找出异常值的范围

In [None]:
plt.figure(figsize=(12,5))
plt.subplot(121)
sns.histplot(train_df.loc[train_df['DebtRatio']<1,'DebtRatio'],bins=10)
plt.subplot(122)
sns.histplot(train_df.loc[train_df['DebtRatio']>=1,'DebtRatio'],bins=10)
plt.xlabel('DebtRatio')

In [None]:
fig,[[ax1,ax2],[ax3,ax4]]=plt.subplots(2,2,figsize=(20,10))
sns.histplot(train_df.loc[(train_df['DebtRatio']>=1)&(train_df['DebtRatio']<2),'DebtRatio'],bins=10,ax=ax1)
sns.histplot(train_df.loc[(train_df['DebtRatio']>=2)&(train_df['DebtRatio']<10),'DebtRatio'],bins=10,ax=ax2)
sns.histplot(train_df.loc[(train_df['DebtRatio']>=10)&(train_df['DebtRatio']<200),'DebtRatio'],bins=10,ax=ax3)
sns.histplot(train_df.loc[(train_df['DebtRatio']>=200)&(train_df['DebtRatio']<1000),'DebtRatio'],bins=10,ax=ax4)

将负债率进行分段处理，查看各段的违约率

In [None]:
# Default rate (%) per debt-ratio band: [0,1), [1,2), [2,10), [10,50),
# [50,200), [200,1000) and >=1000.
debt_bins = [0, 1, 2, 10, 50, 200, 1000, np.inf]
debt_labels = ['0-1', '1-2', '2-10', '10-50', '50-200', '200-1000', '>1000']
debt_band = pd.cut(train_df['DebtRatio'], bins=debt_bins, labels=debt_labels, right=False)

# observed=False keeps an empty band in the table (as NaN) instead of raising
# ZeroDivisionError the way a manual count/len computation would.
temp_df = (
    train_df['SeriousDlqin2yrs'].eq(1)
    .groupby(debt_band, observed=False)
    .mean()
    .mul(100)
    .rename('SeriousDlqin2yrs_per')
    .reset_index()
)
temp_df

从各阶段的违约率可以看出，负债率为1-2的时候违约率最高，超过2以后违约率开始分布均衡，同时与负债率为0-1时的违约率大致相同。

### 3.5  分析 MonthlyIncome

In [None]:
plt.figure(figsize=[8, 6])
sns.boxplot(data=train_df['MonthlyIncome'])
plt.ylabel('MonthlyIncome')

将MonthlyIncome分为大于等于200000的部分，和小于200000的部分

In [None]:
plt.figure(figsize=(12,5))
plt.subplot(121)
sns.histplot(train_df.loc[train_df['MonthlyIncome']<200000,'MonthlyIncome'],bins=10)
plt.subplot(122)
sns.histplot(train_df.loc[train_df['MonthlyIncome']>=200000,'MonthlyIncome'],bins=10)
plt.xlabel('MonthlyIncome')

* 从之前的数据集的信息中可以看到 MonthlyIncome 存在较多的缺失值
* 可以看到 MonthlyIncome 字段大部分在200000以内
* 但存在一些异常值，存在一部分大于200000的值，需要进行处理

### 3.6  分析 NumberOfTime30-59DaysPastDueNotWorse字段, NumberOfTime60-89DaysPastDueNotWorse字段 和 NumberOfTimes90DaysLate字段

In [None]:
plt.figure(figsize=[16, 20])
plt.subplot(331)
sns.boxplot(data=train_df['NumberOfTime30-59DaysPastDueNotWorse'])
plt.ylabel('NumberOfTime30-59DaysPastDueNotWorse')
plt.subplot(332)
sns.boxplot(data=train_df['NumberOfTime60-89DaysPastDueNotWorse'])
plt.ylabel('NumberOfTime60-89DaysPastDueNotWorse')
plt.subplot(333)
sns.boxplot(data=train_df['NumberOfTimes90DaysLate'])
plt.ylabel('NumberOfTimes90DaysLate')

- NumberOfTime30-59DaysPastDueNotWorse 字段,  NumberOfTime60-89DaysPastDueNotWorse 字段 和 NumberOfTimes90DaysLate 字段 都存在离群点，存在大于90的异常值
- 可以看到上述三个字段的值的分布大多都为0

### 3.7 分析 NumberRealEstateLoansOrLines

该字段含义为 抵押贷款和房地产贷款的数量

In [None]:
plt.figure(figsize=[14, 10])
plt.subplot(221)
sns.boxplot(data=train_df['NumberRealEstateLoansOrLines'])
plt.ylabel('NumberRealEstateLoansOrLines')
plt.subplot(222)
sns.countplot(x=train_df['NumberRealEstateLoansOrLines'])
plt.xlabel('NumberRealEstateLoansOrLines')

从箱线图中可以看出有一个明显大于50的离群点，在异常值处理时需要注意

In [None]:
# 统计NumberRealEstateLoansOrLines各值的数量
train_df['NumberRealEstateLoansOrLines'].value_counts()

将大于8的数据和8合并后看一下违约率的情况

In [None]:
# Build a capped COPY for this plot. A plain `data_df = train_df` only aliases
# the frame, so capping through it would permanently overwrite the real values
# in train_df and hide the outliers from the later outlier-handling step.
data_df = train_df.assign(
    NumberRealEstateLoansOrLines=train_df['NumberRealEstateLoansOrLines'].clip(upper=8)
)
LoansOrLines_sum = data_df.groupby(['NumberRealEstateLoansOrLines'])['SeriousDlqin2yrs'].sum()
LoansOrLines_total = data_df.groupby(['NumberRealEstateLoansOrLines'])['SeriousDlqin2yrs'].count()
LoansOrLines_ratio = LoansOrLines_sum / LoansOrLines_total
LoansOrLines_ratio.plot(kind='bar', figsize=(8, 6))

* 抵押贷款和房地产贷款的贷款数量为0时并不是违约率最低的，抵押贷款和房地产贷款数量为1时违约率最低。
* 随着抵押贷款和房地产贷款数量增加，违约率也随着增高。

### 3.8  分析 NumberOfDependents

该字段含义为不包括自己在内的家属（配偶，子女等）人数

In [None]:
plt.figure(figsize=[14, 10])
plt.subplot(221)
sns.boxplot(data=train_df['NumberOfDependents'])
plt.ylabel('NumberOfDependents')
plt.subplot(222)
sns.countplot(x=train_df['NumberOfDependents'])
plt.xlabel('NumberOfDependents')

In [None]:
# 统计NumberOfDependents的非空值
train_df['NumberOfDependents'].value_counts()

可以看到大部分借款人都没有家属，在填补缺失值时，可以填补中位数或0

### 3.9 分析  NumberOfOpenCreditLinesAndLoans

In [None]:
plt.figure(figsize=[12, 10])
plt.subplot(211)
sns.boxplot(data=train_df['NumberOfOpenCreditLinesAndLoans'])
plt.ylabel('NumberOfOpenCreditLinesAndLoans')
plt.subplot(212)
sns.countplot(x=train_df['NumberOfOpenCreditLinesAndLoans'])
plt.xlabel('NumberOfOpenCreditLinesAndLoans')

可以看出 NumberOfOpenCreditLinesAndLoans 大于36的部分数据较少，将大于36的部分合并观察各部分的违约率

In [None]:
# Build a capped COPY for this plot. A plain `data_df = train_df` only aliases
# the frame, so capping through it would permanently overwrite the real values
# in train_df and hide the outliers from the later outlier-handling step.
data_df = train_df.assign(
    NumberOfOpenCreditLinesAndLoans=train_df['NumberOfOpenCreditLinesAndLoans'].clip(upper=36)
)
Loans_sum = data_df.groupby(['NumberOfOpenCreditLinesAndLoans'])['SeriousDlqin2yrs'].sum()
Loans_total = data_df.groupby(['NumberOfOpenCreditLinesAndLoans'])['SeriousDlqin2yrs'].count()
Loans_ratio = Loans_sum / Loans_total
Loans_ratio.plot(kind='bar', figsize=(12, 6))

* 可以看到 开放贷款的数量（如汽车贷款或抵押贷款）和信用额度（如信用卡）NumberOfOpenCreditLinesAndLoans为0时并非违约率最低
* 开放贷款的数量和信用额度从0到8，违约率逐渐下降
* 开放贷款的数量和信用额度从8到15，违约率开始逐渐上升
* 开放贷款的数量和信用额度从16之后，违约率开始逐渐波动

# 4.数据预处理

## 4.1缺失值处理

* 训练集缺失：MonthlyIncome, NumberOfDependents
* 测试集缺失：MonthlyIncome, NumberOfDependents 

In [None]:
train_df.isnull().sum()

In [None]:
test_df.isnull().sum()

**NumberOfDependents**

NumberOfDependents 亲属值缺失较少，根据之前分析可以采用中位数或0填充

In [None]:
combine = [train_df, test_df]
for data in combine:
    # Assign the filled column back to the frame. Calling fillna(inplace=True)
    # on a column selection is chained assignment: newer pandas warns about it
    # and, with copy-on-write enabled, it no longer updates the parent frame.
    # Each frame is filled with its own median.
    data['NumberOfDependents'] = data['NumberOfDependents'].fillna(
        data['NumberOfDependents'].median()
    )

**MonthlyIncome**

MonthlyIncome的缺失值较多，故根据变量之间的关系，采取随机森林的方式来填补缺失值

In [None]:
# Impute missing MonthlyIncome with a random forest fitted separately on each
# data set, using the other borrower attributes as predictors.
# Columns are selected by name (same columns and order as positions
# 2,3,4,5,7,8,9,10,11 of the raw frames) so the cell does not depend on the
# column layout staying unchanged.
income_features = [
    'RevolvingUtilizationOfUnsecuredLines', 'age',
    'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio',
    'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]
for data in combine:
    income_missing = data['MonthlyIncome'].isnull()
    if not income_missing.any():
        # Nothing to impute (e.g. the cell is being re-run); predicting on an
        # empty array would raise.
        continue

    Known = data.loc[~income_missing]
    Unknown = data.loc[income_missing]

    Random_forest = RandomForestRegressor(max_depth=3, random_state=0, n_estimators=200, n_jobs=-1)
    Random_forest.fit(Known[income_features].values, Known['MonthlyIncome'].values)
    fillvalue = Random_forest.predict(Unknown[income_features].values).round(0)

    data.loc[income_missing, 'MonthlyIncome'] = fillvalue

检查是否还存在缺失值

In [None]:
train_df.isnull().sum()

In [None]:
test_df.isnull().sum()

**检查是否存在重复值**

In [None]:
# 查看训练集中是否存在重复值
print(train_df.drop(['Unnamed: 0'], axis=1).duplicated().value_counts())

In [None]:
# 删除作为借款人身份标识信息的列
for data in combine:
    data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [None]:
# 删除重复值
train_df.drop_duplicates(inplace=True)

## 4.2异常值处理

**age**

In [None]:
train_df.loc[train_df['age'] < 18]

存在一个年龄等于0的，选择直接删除

In [None]:
train_df = train_df[train_df['age'] > 0]

**NumberOfTime30-59DaysPastDueNotWorse, NumberOfTime60-89DaysPastDueNotWorse, NumberOfTimes90DaysLate**

删除大于90的异常值

In [None]:
train_df=train_df[train_df['NumberOfTime30-59DaysPastDueNotWorse']<90]
train_df=train_df[train_df['NumberOfTime60-89DaysPastDueNotWorse']<90]
train_df=train_df[train_df['NumberOfTimes90DaysLate']<90]

**NumberRealEstateLoansOrLines, NumberOfOpenCreditLinesAndLoans 和 NumberOfDependents**

In [None]:
train_df=train_df[train_df['NumberRealEstateLoansOrLines']<16]
train_df=train_df[train_df['NumberOfDependents']<10]
train_df=train_df[train_df['NumberOfOpenCreditLinesAndLoans']<37]

**RevolvingUtilizationOfUnsecuredLines 和 DebtRatio**

In [None]:
train_df=train_df[train_df['RevolvingUtilizationOfUnsecuredLines']<100]
train_df=train_df[train_df['DebtRatio']<50000]

**MonthlyIncome**

In [None]:
train_df=train_df[train_df['MonthlyIncome']<200000]

## 4.3特征工程

**构造新特征**

Notalone: 是否有亲属

In [None]:
combine = [train_df, test_df]
for data in combine:
    data['Notalone']=data['NumberOfDependents']
    data.loc[(data['Notalone']>0),'Notalone']=1
    data['Notalone']=data['Notalone'].astype('int64')

Latecount: 逾期还款的总次数（逾期30-59天、60-89天、90天及以上三类次数之和）

In [None]:
for data in combine:
    data['Latecount']=data['NumberOfTime30-59DaysPastDueNotWorse']+data['NumberOfTime60-89DaysPastDueNotWorse']+data['NumberOfTimes90DaysLate']
    data['Latecount']=data['Latecount'].astype('int64')

Monthlypayment： 每个月要归还的额度

In [None]:
for data in combine:
    data['Monthlypayment']=data['DebtRatio']*data['MonthlyIncome']
    data['MonthlyIncome']=data['MonthlyIncome'].astype('int64')
    data['Monthlypayment']=data['Monthlypayment'].astype('int64')

In [None]:
train_df.head()

**相关分析**

计算特征之间的相关系数

In [None]:
corr=train_df.corr()
plt.figure(figsize=(12,6))
sns.heatmap(corr,annot=True,linewidths=.3,cmap='YlGnBu')

- 从热力图可以发现'NumberOfTime30-59DaysPastDueNotWorse'和'Latecount'两个特征的相关系数为0.82，'DebtRatio'和'Monthlypayment'两个特征的相关系数为0.9，'NumberOfDependents' 和 'Notalone'两个特征的相关系数为0.83
- SeriousDlqin2yrs 与多个特征存在一定的关系
- 计算变量的IV值，两个变量相关性较高时，选择IV值高的变量。

**分箱**

连续变量特征分箱

In [None]:
import scipy.stats as stats

def monoto_bin(Y, X, n):
    """Quantile-bin X, shrinking the bin count until the bins are monotonic in Y.

    Starting from ``n`` quantile buckets (``pd.qcut``), the bucket count is
    reduced by one per iteration until the Spearman correlation between the
    per-bucket mean of X and the per-bucket mean of Y is +/-1 (or NaN).
    Weight of evidence (WoE) and information value (IV) are then computed on
    the final bucketing.

    Parameters
    ----------
    Y : pandas.Series
        Binary target; ``Y.sum()`` is used as the count of Y == 1 rows.
        ``Y.name`` becomes the per-bucket event-count column.
    X : pandas.Series
        Numeric feature; ``X.name`` is used for the ``min_``/``max_`` columns,
        so the series must be named.
    n : int
        Initial number of quantile buckets.

    Returns
    -------
    d4 : pandas.DataFrame
        One row per bucket, ordered by the bucket minimum, with the columns
        min_<X>, max_<X>, <Y>, total, goodattr, badattr, woe.
    iv : float
        Information value of the final bucketing.
    cut : list of float
        Cut points ``[-inf, q_1, ..., q_k, inf]`` taken from the quantiles of X
        (rounded to 4 decimals) for the final bucket count.
    woe : list of float
        Per-bucket WoE rounded to 3 decimals, in bucket order.
    """
    r = 0
    # "good" follows the original column naming: it is the number of Y == 1 rows.
    total_good = Y.sum()
    total_bad = Y.count() - total_good
    # Compare against 1 with a small tolerance: the coefficient is a float and
    # a perfectly monotonic ranking can come back as 0.9999999999999998.
    # A NaN coefficient also ends the loop (the comparison is False).
    while np.abs(r) < 1 - 1e-9:
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n)})
        # observed=False pins the behaviour the code relied on implicitly.
        d2 = d1.groupby('Bucket', as_index=True, observed=False)
        bucket_means = d2.mean()
        r, p = stats.spearmanr(bucket_means.X, bucket_means.Y)
        n = n - 1  # after the loop, the bucketing in use has n + 1 buckets

    d3 = pd.DataFrame({
        'min_' + X.name: d2.min().X,
        'max_' + X.name: d2.max().X,
        Y.name: d2.sum().Y,
        'total': d2.count().Y,
    })

    # Share of all Y == 1 rows / all Y == 0 rows that fall into each bucket.
    events = d3[Y.name].astype(float)
    non_events = (d3['total'] - d3[Y.name]).astype(float)
    d3['goodattr'] = events / total_good
    d3['badattr'] = non_events / total_bad

    # WoE = log(goodattr / badattr). A bucket containing only one class would
    # give log(0) or a division by zero (+/-inf) and make the IV infinite, so a
    # zero count is replaced by a 0.5 pseudo-count for the WoE only. Buckets
    # with both classes present are unaffected.
    smoothed_events = events.where(events > 0, 0.5)
    smoothed_non_events = non_events.where(non_events > 0, 0.5)
    d3['woe'] = np.log((smoothed_events / total_good) / (smoothed_non_events / total_bad))

    # IV: how strongly the feature separates the two classes overall.
    iv = ((d3['goodattr'] - d3['badattr']) * d3['woe']).sum()
    d4 = (d3.sort_values(by='min_' + X.name)).reset_index(drop=True)
    print("=" * 80)
    print(d4)

    # n interior quantile cut points for the n + 1 buckets in use.
    cut = [float('-inf')]
    for i in range(1, n + 1):
        qua = X.quantile(i / (n + 1))
        cut.append(round(qua, 4))
    cut.append(float('inf'))
    woe = list(d4['woe'].round(3))
    return d4, iv, cut, woe

age,RevolvingUtilizationOfUnsecuredLines,DebtRatio 实现自动优化分箱

In [None]:
dfx1,ivx1,cutx1,woex1 = monoto_bin(train_df['SeriousDlqin2yrs'],train_df['RevolvingUtilizationOfUnsecuredLines'],n = 10)
dfx2,ivx2,cutx2,woex2 = monoto_bin(train_df['SeriousDlqin2yrs'],train_df['age'],n = 10)
dfx4,ivx4,cutx4,woex4 = monoto_bin(train_df['SeriousDlqin2yrs'],train_df['DebtRatio'],n = 10)

对于不能用最优分段的变量，采用自定义分箱，进行等距分段。

In [None]:
def self_bin(Y,X,cat):
    """Bin X on user-supplied edges and compute per-bin WoE and the overall IV.

    Parameters
    ----------
    Y : pandas.Series
        Binary target; ``Y.sum()`` is used as the count of Y == 1 rows.
    X : pandas.Series
        Numeric feature to bin.
    cat : list of float
        Bin edges passed to ``pd.cut`` (right-closed intervals).

    Returns
    -------
    d4 : pandas.DataFrame
        One row per bin, in bin order, with the columns min, max, sum, total,
        rate, woe, goodattribute, badattribute.
    iv : float
        Information value over the non-empty bins.
    woe : list of float
        Per-bin WoE rounded to 3 decimals, aligned with the intervals of
        ``cat``; NaN for a bin that contains no rows.
    """
    # "good" follows the original naming: it is the number of Y == 1 rows.
    good = Y.sum()
    bad = Y.count() - good
    d1 = pd.DataFrame({'X': X, 'Y': Y, 'Bucket': pd.cut(X, cat)})
    # observed=False keeps empty bins in the result so that the rows (and the
    # returned WoE list) always line up one-to-one with the intervals of `cat`.
    d2 = d1.groupby('Bucket', as_index=True, observed=False)
    d3 = pd.DataFrame({
        'min': d2.min().X,
        'max': d2.max().X,
        'sum': d2.sum().Y,
        'total': d2.count().Y,
        'rate': d2.mean().Y,
    })

    # WoE = log(odds inside the bin / overall odds). A bin holding only one
    # class would give log(0) or a division by zero (+/-inf) and make the IV
    # infinite, so a zero count is replaced by a 0.5 pseudo-count for the WoE
    # only. Bins with no rows at all stay NaN and are skipped by the IV sum.
    events = d3['sum'].astype(float)
    non_events = (d3['total'] - d3['sum']).astype(float)
    smoothed_events = events.where(events > 0, 0.5)
    smoothed_non_events = non_events.where(non_events > 0, 0.5)
    bin_woe = np.log((smoothed_events / smoothed_non_events) / (good / bad))
    d3['woe'] = bin_woe.where(d3['total'] > 0)

    d3['goodattribute'] = d3['sum'] / good
    d3['badattribute'] = (d3['total'] - d3['sum']) / bad
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()

    # The grouped result is already in bin order. Sorting by 'min' would move
    # empty bins (NaN minimum) to the end and break the alignment with `cat`.
    d4 = d3
    print("=" * 60)
    print(d4)
    woe = list(d4['woe'].round(3))
    return d4, iv,woe

In [None]:
pinf = float('inf')#正无穷大
ninf = float('-inf')#负无穷大
cutx3 = [ninf, 0, 1, 2, 3, 4, 5, 6, 7, 8, pinf]
cutx5 = [ninf,1000,2000,3000,4000,5000,6000,7500,9500,12000,18000,25000,pinf]
cutx6 = [ninf, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, pinf]
cutx7 = [ninf, 0, 1, 2, 3, 4, 5, 6, 7, 8, pinf]
cutx8 = [ninf, 0, 1, 2, 3, 4, 5, 6, 7, 8, pinf]
cutx9 = [ninf, 0, 1, 2, 3, 4, 5, 6, 7, 8, pinf]
cutx10 = [ninf, 0, 1, 2, 3, 4, 5, 6, pinf]
cutx11 = [ninf, 0, pinf]
cutx12 = [ninf, 0, 1, 2, 3, 4, 6, 8, 10, 12, pinf]
cutx13 = [ninf,1000,2000,3000,4000,5000,6000,7500,9500,12000,18000, 25000,pinf]
dfx3, ivx3,woex3 = self_bin(train_df['SeriousDlqin2yrs'],train_df['NumberOfTime30-59DaysPastDueNotWorse'],cutx3)
dfx5, ivx5,woex5 = self_bin(train_df['SeriousDlqin2yrs'],train_df['MonthlyIncome'],cutx5)
dfx6, ivx6,woex6 = self_bin(train_df['SeriousDlqin2yrs'],train_df['NumberOfOpenCreditLinesAndLoans'],cutx6) 
dfx7, ivx7,woex7 = self_bin(train_df['SeriousDlqin2yrs'],train_df['NumberOfTimes90DaysLate'],cutx7)
dfx8, ivx8,woex8 = self_bin(train_df['SeriousDlqin2yrs'],train_df['NumberRealEstateLoansOrLines'],cutx8) 
dfx9, ivx9,woex9 = self_bin(train_df['SeriousDlqin2yrs'],train_df['NumberOfTime60-89DaysPastDueNotWorse'],cutx9)
dfx10, ivx10,woex10 = self_bin(train_df['SeriousDlqin2yrs'],train_df['NumberOfDependents'],cutx10)
dfx11, ivx11,woex11 = self_bin(train_df['SeriousDlqin2yrs'],train_df['Notalone'],cutx11) 
dfx12, ivx12,woex12 = self_bin(train_df['SeriousDlqin2yrs'],train_df['Latecount'],cutx12)
dfx13, ivx13,woex13 = self_bin(train_df['SeriousDlqin2yrs'],train_df['Monthlypayment'],cutx13)

In [None]:
ivlist=[ivx1,ivx2,ivx3,ivx4,ivx5,ivx6,ivx7,ivx8,ivx9,ivx10,ivx11,ivx12,ivx13]#各变量IV
index=['RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse',
       'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 
       'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 
       'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents', 'Notalone', 'Latecount',
       'Monthlypayment']
fig = plt.figure(figsize=(12,6))
ax= fig.add_subplot(111)
x = np.arange(len(index))+1
ax.bar(x, ivlist, width=0.4)
ax.set_xticks(x)  
ax.set_xticklabels(index, rotation=0, fontsize=12)
ax.set_ylabel('IV(Information Value)', fontsize=14)

for a, b in zip(x, ivlist):
    plt.text(a, b + 0.01, '%.4f' % b, ha = 'center', fontsize = 10)
plt.xticks(rotation=90) 
plt.show()

### 4.4降维

In [None]:
for data in combine:
    data.drop(['NumberOfDependents','DebtRatio','Notalone','Monthlypayment','NumberOfTime30-59DaysPastDueNotWorse'], axis=1, inplace=True)

In [None]:
train_df.head()


# 5.建立模型预测

**需要注意的是，模型的评价标准为AUC，即ROC曲线下方的面积大小**

In [None]:
# ROC曲线绘制
def draw_roc(FPR, TPR, label=None):
    """Plot a ROC curve on a new 8x6 figure, with the chance diagonal for reference.

    Parameters
    ----------
    FPR, TPR : array-like
        False-positive and true-positive rates, e.g. as returned by ``roc_curve``.
    label : str, optional
        Legend entry for the curve. A legend is drawn only when a label is given.
    """
    plt.figure(figsize=(8,6))
    plt.plot(FPR, TPR,'b', linewidth=2, label=label)
    plt.plot([0,1],[0,1], "r--")  # diagonal = performance of a random classifier
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive rate")
    if label is not None:
        # Without a legend the label passed to plt.plot is never displayed.
        plt.legend(loc="lower right")

In [None]:
X_train = train_df.drop(['SeriousDlqin2yrs'], axis=1)
Y_train = train_df['SeriousDlqin2yrs']
X_test = test_df.drop(['SeriousDlqin2yrs'], axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

* 首先不进行超参数调节，直接对各个算法使用默认的参数拟合整个训练集，计算训练集上的AUC。
* 再通过10折交叉验证来估计模型的泛化能力。
* 主要采用集成模型和树模型。

**Logistic Regression**

In [None]:
LR = LogisticRegression(solver='lbfgs', max_iter=1000)
LR.fit(X_train, Y_train)
LR_scores_proba =LR.predict_proba(X_train)
LR_scores = LR_scores_proba[:,1]
FPR_LR, TPR_LR, THRESH_LR = roc_curve(Y_train, LR_scores)
AUC_LR=roc_auc_score(Y_train,LR_scores)
draw_roc(FPR_LR, TPR_LR)
print("LR在训练集上的AUC是: {:.5f}%".format(AUC_LR*100))
AUC_LR_cv = cross_val_score(LR, X_train, Y_train, cv=10, scoring='roc_auc').mean()
print("LR在训练集上cv的AUC是：{:.5f}%".format(AUC_LR_cv*100))

**RandomForest 随机森林**

In [None]:
RF = RandomForestClassifier()
RF.fit(X_train, Y_train)
RF_scores_proba =RF.predict_proba(X_train)
RF_scores = RF_scores_proba[:,1]
FPR_RF, TPR_RF, THRESH_RF = roc_curve(Y_train, RF_scores)
AUC_RF=roc_auc_score(Y_train,RF_scores)
draw_roc(FPR_RF, TPR_RF)
print("RF在训练集上的AUC是: {:.5f}%".format(AUC_RF*100))
AUC_RF_cv = cross_val_score(RF, X_train, Y_train, cv=10, scoring='roc_auc').mean()
print("RF在训练集上cv的AUC是：{:.5f}%".format(AUC_RF_cv*100))

**XGBoost**

In [None]:
from xgboost import XGBClassifier as XGB

xgb = XGB()
xgb.fit(X_train, Y_train)
xgb_scores_proba =xgb.predict_proba(X_train)
xgb_scores = xgb_scores_proba[:,1]
FPR_xgb, TPR_xgb, THRESH_xgb = roc_curve(Y_train, xgb_scores)
AUC_xgb=roc_auc_score(Y_train,xgb_scores)
draw_roc(FPR_xgb, TPR_xgb)
print("xgb在训练集上的AUC是: {:.5f}%".format(AUC_xgb*100))
AUC_xgb_cv = cross_val_score(xgb, X_train, Y_train, cv=10, scoring='roc_auc').mean()
print("xgb在训练集上cv的AUC是：{:.5f}%".format(AUC_xgb_cv*100))

**GradientBoost 梯度提升法**

In [None]:
GBC = GradientBoostingClassifier()
GBC.fit(X_train, Y_train)
GBC_scores_proba =GBC.predict_proba(X_train)
GBC_scores = GBC_scores_proba[:,1]
FPR_GBC, TPR_GBC, THRESH_GBC = roc_curve(Y_train, GBC_scores)
AUC_GBC=roc_auc_score(Y_train,GBC_scores)
draw_roc(FPR_GBC, TPR_GBC)
print("GBC在训练集上的AUC是: {:.5f}%".format(AUC_GBC*100))
AUC_GBC_cv = cross_val_score(GBC, X_train, Y_train, cv=10, scoring='roc_auc').mean()
print("GBC在训练集上cv的AUC是：{:.5f}%".format(AUC_GBC_cv*100))

In [None]:
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost', 'GradientBoost'],
    'Score': [AUC_LR,AUC_RF,AUC_xgb,AUC_GBC],
    'CV-Score': [AUC_LR_cv,AUC_RF_cv,AUC_xgb_cv,AUC_GBC_cv]
})
models.sort_values(by='CV-Score', ascending=False)

**超参数调节**

选择GradientBoost

调节 n_estimators

In [None]:
# n_estimators = 60
#from sklearn.model_selection import GridSearchCV, StratifiedKFold
#GBC = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=300,min_samples_leaf=20,
#                                 max_depth=8,max_features='sqrt', subsample=0.8,random_state=10)                               
#gbc_param_grid = {
#    'n_estimators':range(20,81,10)
#}
#kfold = StratifiedKFold(n_splits=10)
#gsGBC = GridSearchCV(GBC, param_grid=gbc_param_grid, cv=kfold, scoring="roc_auc", verbose=1)
#gsGBC.fit(X_train, Y_train)
#gsGBC.best_params_, gsGBC.best_score_

调节 max_depth

In [None]:
# max_depth = 7
#from sklearn.model_selection import GridSearchCV, StratifiedKFold
#GBC = GradientBoostingClassifier(learning_rate=0.1 ,n_estimators = 60, min_samples_leaf=20,
#                                 max_features='sqrt', subsample=0.8,random_state=10)
#gbc_param_grid = {
#    'max_depth':range(3,12,2), 
#    'min_samples_split':range(100,801,200)
#}
#kfold = StratifiedKFold(n_splits=10)
#gsGBC = GridSearchCV(GBC, param_grid=gbc_param_grid, cv=kfold, scoring="roc_auc", verbose=1)
#gsGBC.fit(X_train, Y_train)
#gsGBC.best_params_, gsGBC.best_score_

调节 min_samples_split 和 min_samples_leaf

In [None]:
# min_samples_split=1800 min_samples_leaf=60
#from sklearn.model_selection import GridSearchCV, StratifiedKFold
#GBC = GradientBoostingClassifier(learning_rate=0.1 ,n_estimators = 60, max_depth=7,
#                                 max_features='sqrt', subsample=0.8,random_state=10)                               
#gbc_param_grid = {
#    'min_samples_split':range(800,1900,200), 
#    'min_samples_leaf':range(40,81,10)
#}
#kfold = StratifiedKFold(n_splits=10)
#gsGBC = GridSearchCV(GBC, param_grid=gbc_param_grid, cv=kfold, scoring="roc_auc", verbose=1)
#gsGBC.fit(X_train, Y_train)
#gsGBC.best_params_, gsGBC.best_score_

调节 max_features

In [None]:
# max_features = 3
#from sklearn.model_selection import GridSearchCV, StratifiedKFold
#GBC = GradientBoostingClassifier(learning_rate=0.1 ,n_estimators = 60, max_depth=7, min_samples_leaf = 60,
#                                 min_samples_split = 1800, subsample=0.8,random_state=10)
#gbc_param_grid = {
#    'max_features':range(1,20,2)
#}
#kfold = StratifiedKFold(n_splits=10)
#gsGBC = GridSearchCV(GBC, param_grid=gbc_param_grid, cv=kfold, scoring="roc_auc", verbose=1)
#gsGBC.fit(X_train, Y_train)
#gsGBC.best_params_, gsGBC.best_score_

调节subsample

In [None]:
# subsample = 0.8
#from sklearn.model_selection import GridSearchCV, StratifiedKFold
#GBC = GradientBoostingClassifier(learning_rate=0.1 ,n_estimators = 60, max_depth=7, min_samples_leaf = 60,
#                                 max_features=3, min_samples_split = 1800, random_state=10)
#gbc_param_grid = {
#    'subsample':[0.6,0.7,0.75,0.8,0.85,0.9,0.95]
#}

# Cross validate model with Kfold stratified cross val
#kfold = StratifiedKFold(n_splits=10)
#gsGBC = GridSearchCV(GBC, param_grid=gbc_param_grid, cv=kfold, scoring="roc_auc", verbose=1)
#gsGBC.fit(X_train, Y_train)
#gsGBC.best_params_, gsGBC.best_score_

In [None]:
GBC = GradientBoostingClassifier(learning_rate=0.01 ,n_estimators = 1200, max_depth=7, min_samples_leaf = 60,
                                 max_features=3, min_samples_split = 1800,subsample=0.8, random_state=10)
GBC.fit(X_train, Y_train)
GBC_scores_proba =GBC.predict_proba(X_train)
GBC_scores = GBC_scores_proba[:,1]
AUC_GBC=roc_auc_score(Y_train,GBC_scores)
print("GBC在训练集上的AUC是: {:.5f}%".format(AUC_GBC*100))
AUC_GBC_cv = cross_val_score(GBC, X_train, Y_train, cv=10, scoring='roc_auc').mean()
print("GBC在训练集上cv的AUC是：{:.5f}%".format(AUC_GBC_cv*100))

In [None]:
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'feature importances': GBC.feature_importances_
})
feature_importance.sort_values(by='feature importances', ascending=False)

查看各个特征在模型中的重要性，可以看到Latecount, NumberOfTimes90DaysLate, RevolvingUtilizationOfUnsecuredLines比较重要


# 6.向Kaggle提供结果

In [None]:
submission_proba = GBC.predict_proba(X_test)
submission_scores = submission_proba[:, 1]
submission_scores.shape

In [None]:
# Ids are the 1-based row numbers of the test set. The count is taken from the
# predictions so the cell keeps working if the test set size changes.
ids = np.arange(1, len(submission_scores) + 1)
submission = pd.DataFrame({'Id': ids, 'Probability': submission_scores})
submission.to_csv('submission.csv', index=False)