In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 七步流程

1. 定义问题
2. 获取训练数据和测试数据
3. 整理、准备、清洗数据
4. 分析、发现模式、探索数据
5. 建模、预测、求解问题
6. 可视化、报告、呈现问题求解步骤和最终结论
7. 提交

# 1. 定义问题

训练集包含一些贷款人的样本并给出了是否违约的标记，训练一个模型预测测试集中贷款人的违约概率。

# 2. 获取数据

**导包**

In [None]:
# 数据整理和分析
import pandas as pd
import numpy as np
import random as rnd

# 可视化
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# 机器学习
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

**2.1 导入数据**

In [None]:
# Load the training and test CSVs; the first CSV column is used as the row index.
train_df = pd.read_csv('../input/give-me-some-credit-dataset/cs-training.csv',index_col=[0])
test_df = pd.read_csv('../input/give-me-some-credit-dataset/cs-test.csv',index_col=[0])

**2.2 查看数据集信息**

**数据集中包含哪些特征：**

| 字段名 | 定义 |
|----------|--------------------------------------------|
| SeriousDlqin2yrs | 是否逾期 |
| RevolvingUtilizationOfUnsecuredLines | 信用卡和个人信用额度的总余额(不含房地产贷款和汽车贷款等分期贷款) / 信用额度总和 |
| age | 贷款人年龄 |
| NumberOfTime30-59DaysPastDueNotWorse | 过去2年，借款人逾期30-59天的次数 |
| DebtRatio | 负债比率 |
| MonthlyIncome | 月收入 |
| NumberOfOpenCreditLinesAndLoans | 未偿还贷款数量，开放式贷款（分期付款如汽车贷款或抵押贷款）和信贷（如信用卡）的数量 |
| NumberOfTimes90DaysLate | 借款人逾期90天或以上的次数 |
| NumberRealEstateLoansOrLines | 抵押贷款和房地产贷款的数量 |
| NumberOfTime60-89DaysPastDueNotWorse | 过去2年，借款人逾期60-89天的次数 |
| NumberOfDependents | 不包括本人在内的家属数量(配偶，子女等) |

In [None]:
# Preview the first rows of the training set.
train_df.head()

In [None]:
# Preview the first rows of the test set.
test_df.head()

In [None]:
# (rows, columns) of the training set.
train_df.shape

In [None]:
# (rows, columns) of the test set.
test_df.shape

In [None]:
# Data type of every training column.
train_df.dtypes

In [None]:
# Data type of every test column.
test_df.dtypes

**哪些特征是分类型(定性)的**

不仅应该识别出哪些是分类型特征，最好能细分为标称型、序数型

- 标称型特征有：SeriousDlqin2yrs
- 此数据集中没有序数型特征

**哪些特征是数值型(定量)的**

根据取值还应该区分连续的和离散的
- 连续型特征有：RevolvingUtilizationOfUnsecuredLines、DebtRatio、MonthlyIncome
- 离散型特征有：age、NumberOfTime30-59DaysPastDueNotWorse、NumberOfOpenCreditLinesAndLoans、NumberOfTimes90DaysLate、NumberRealEstateLoansOrLines、NumberOfTime60-89DaysPastDueNotWorse、NumberOfDependents(均为整数计数)

**哪些特征是混合类型的**

- 此数据集中无特征是混合类型的

**哪些特征的值不规范**

- 此数据集中没有取值不规范的特征

**哪些特征具有缺失值**

这些缺失值后续需要处理

- 训练集：MonthlyIncome、NumberOfDependents有缺失值
- 测试集：MonthlyIncome、NumberOfDependents有缺失值

In [None]:
# Number of missing values per column in the training set.
train_df.isnull().sum()

In [None]:
# Number of missing values per column in the test set.
test_df.isnull().sum()

**哪些特征具有异常值(离群值)**
- 除了标称型特征SeriousDlqin2yrs外，所有特征都具有异常值

In [None]:
#异常值检测
from collections import Counter
def detect_outliers(df):
    """
    Find the columns of ``df`` that contain at least one IQR outlier.

    A value is an outlier for its column when it lies more than 1.5 * IQR
    below the first quartile or above the third quartile. Quartiles are
    computed on the non-missing values only; missing values are never
    counted as outliers. Columns with no non-missing values are skipped.

    df: a dataframe of the dataset (columns must be numeric)

    return: list of column labels (in column order) that contain at least
            one outlier. Note: these are column names, not row indices.
    """
    outlier_columns = []

    for col in df.columns:
        values = df[col].dropna()
        if values.empty:
            # No data to compute quartiles from, so nothing can be an outlier.
            continue

        q1, q3 = np.percentile(values, [25, 75])
        outlier_step = 1.5 * (q3 - q1)  # 1.5 times the interquartile range

        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)
        if is_outlier.any():
            outlier_columns.append(col)

    return outlier_columns
    
# Names of the training columns in which detect_outliers found at least one IQR outlier.
outliers = detect_outliers(train_df)
print(outliers)

**2.3 查看描述统计信息**

In [None]:
# Summary statistics of the training columns.
train_df.describe()

In [None]:
# Summary statistics of the test columns.
test_df.describe()

从上述结果来看，训练集数据有以下特征：
* age特征出现了0，不合理，我们用中位数替换
* NumberOfTime30-59DaysPastDueNotWorse、NumberOfTime60-89DaysPastDueNotWorse、NumberOfTimes90DaysLate的最大值都是98，应该探索一下三者的相关性

In [None]:
# Column dtypes and non-null counts of the training set.
train_df.info()

从以上结果可以看出：
* MonthlyIncome、NumberOfDependents具有缺失值
* age最小值为0，应该用中位数替换
* RevolvingUtilizationOfUnsecuredLines正常来讲取值应为0-1之间，而其最大值为50708，很有可能含有异常值
* NumberOfTime30-59DaysPastDueNotWorse、DebtRatio、MonthlyIncome、NumberOfOpenCreditLinesAndLoans、NumberOfTimes90DaysLate、NumberRealEstateLoansOrLines、NumberOfTime60-89DaysPastDueNotWorse、NumberOfDependents列的最大值远大于四分之三位值，说明这些特征也很可能含有异常值
* 探索NumberOfTime30-59DaysPastDueNotWorse、NumberOfTimes90DaysLate、NumberOfTime60-89DaysPastDueNotWorse之间的相关性


# 3. 探索性数据分析（EDA）

**3.1 初步清洗**

In [None]:
# Shorten the verbose original column names of the training set.
# DebtRatio and MonthlyIncome are not in the mapping, so they keep their names.
colnames = dict([
    ('SeriousDlqin2yrs', 'IsDlq'),
    ('RevolvingUtilizationOfUnsecuredLines', 'Revol'),
    ('age', 'Age'),
    ('NumberOfTime30-59DaysPastDueNotWorse', 'Num30-59late'),
    ('NumberOfOpenCreditLinesAndLoans', 'NumOpen'),
    ('NumberOfTimes90DaysLate', 'Num90late'),
    ('NumberRealEstateLoansOrLines', 'NumEstate'),
    ('NumberOfTime60-89DaysPastDueNotWorse', 'Num60-89late'),
    ('NumberOfDependents', 'NumDepend'),
])
train_df.rename(columns=colnames, inplace=True)
train_df.head()

In [None]:
# Give the test set the same short column names as the training set.
# Reuses the `colnames` mapping built in the training-set rename cell instead of
# repeating it, so the two frames can never drift apart in naming.
test_df.rename(columns=colnames, inplace=True)
test_df.head()

**3.2 EDA分析**

**IsDlq**

In [None]:
# Class balance of the target. The column is passed by keyword because newer
# seaborn releases no longer accept it as the first positional argument.
sns.countplot(x='IsDlq', data=train_df)

In [None]:
# Count defaulters (IsDlq == 1) and non-defaulters (IsDlq == 0), then print the
# share of defaulters as a percentage rounded to two decimals.
is_bad = train_df['IsDlq'] == 1
is_good = train_df['IsDlq'] == 0
badNum = int(is_bad.sum())
goodNum = int(is_good.sum())
bad_share = round(badNum * 100 / (goodNum + badNum), 2)
print('违约比:{0}%'.format(bad_share))

**训练集中，违约的贷款人占比约为6.68%，可以看出分类结果是极其不平衡的。数据不平衡时，分类算法容易过多关注多数类，使少数类(违约)的识别性能下降。因此，后续采用带类别权重的逻辑回归模型和集成模型(梯度提升树)训练数据，并用AUC评估模型**

**Revol**

In [None]:
# Scatter of Revol (x) against Age (y) to see how Revol values are spread.
figure, ax = plt.subplots(figsize=(8, 6))
ax.scatter(train_df['Revol'], train_df['Age'])
ax.grid()

In [None]:
# Box plot of Revol on a 10x6 figure.
fig=plt.figure(figsize=(10,6))
sns.boxplot(y=train_df['Revol'])

正常情况下，Revol的取值在0-1之间，超出1表明透支。但从上方的箱线图可以看出有相当一部分数据超出5000，猜测这是因为这部分数据没有除以分母信用卡额度。

用四分位数初步筛选出异常值：

In [None]:
# Outlier fences for Revol: anything further than 1.5 * IQR from the quartiles.
revol_values = train_df['Revol'].dropna()  # quartiles are computed without missing values
Q1, Q3 = np.percentile(revol_values, [25, 75])
IQR = Q3 - Q1  # interquartile range, i.e. the length of the box in a box plot
outlier_step = 1.5 * IQR
revol_lowlimit, revol_uplimit = Q1 - outlier_step, Q3 + outlier_step
print('异常值下限：{0}，异常值上限：{1}'.format(revol_lowlimit,revol_uplimit))

接下来将Revol的取值区间分为0-1与大于1两部分，查看数据分布情况

In [None]:
# Split the rows into Revol < 1 and Revol >= 1
data1=train_df.loc[train_df['Revol']<1,:]
data2=train_df.loc[train_df['Revol']>=1,:]
# Plot the Revol distribution of each part side by side (left: < 1, right: >= 1)
fig=plt.figure(figsize=(20,6))
ax1=fig.add_subplot(1,2,1)
ax2=fig.add_subplot(1,2,2)
sns.distplot(data1['Revol'],ax=ax1,bins=10)
sns.distplot(data2['Revol'],ax=ax2,bins=10)

对大于1的数据部分进行细分：[1,100),[100,1000),[1000,10000),10000以上，并查看各个区间的分布

In [None]:
# 2x2 grid of Revol distributions for the ranges [1,100), [100,1000), [1000,10000) and >= 10000.
fig,[[ax1,ax2],[ax3,ax4]]=plt.subplots(2,2,figsize=(20,10))
sns.distplot(train_df.loc[(train_df['Revol']>=1)&(train_df['Revol']<100),'Revol'],bins=10,ax=ax1)
sns.distplot(train_df.loc[(train_df['Revol']>=100)&(train_df['Revol']<1000),'Revol'],bins=10,ax=ax2)
sns.distplot(train_df.loc[(train_df['Revol']>=1000)&(train_df['Revol']<10000),'Revol'],bins=10,ax=ax3)
sns.distplot(train_df.loc[train_df['Revol']>=10000,'Revol'],bins=10,ax=ax4)

可以看到，大于1的记录绝大部分集中在1-20之间，接下来，对以下区间分别查看违约率情况

In [None]:
# Default rate (percentage of rows with IsDlq == 1) inside each Revol interval.
# Intervals are left-closed / right-open: [0,1), [1,10), [10,20), [20,100),
# [100,1000), [1000,10000) and [10000, inf).
revol_edges = [0, 1, 10, 20, 100, 1000, 10000, np.inf]
revol_labels = ['0-1', '1-10', '10-20', '20-100', '100-1000', '1000-10000', '10000-51000']
revol_bins = pd.cut(train_df['Revol'], bins=revol_edges, labels=revol_labels, right=False)

# The share of defaulters is the mean of the indicator IsDlq == 1. An interval
# with no rows yields NaN instead of raising ZeroDivisionError.
revol_dlq_rate = (train_df['IsDlq'] == 1).groupby(revol_bins, observed=False).mean() * 100

for label, rate in revol_dlq_rate.items():
    print('{0}违约率为：{1}%'.format(label, rate))

可以看出在Revol大于1时，违约率开始上升，10-20之间违约率达到峰值，超过20后开始下降，超过1000后开始恢复正常（与0-1的违约率一致），说明20左右的值可能为异常值上限的阈值。可以将Revol超过20的值都定义为异常值。

**Age**

In [None]:
# Age distribution: density plot on the left, box plot on the right
fig,[ax1,ax2]=plt.subplots(1,2,figsize=(20,6))
sns.distplot(train_df['Age'],ax=ax1)
sns.boxplot(y='Age',data=train_df,ax=ax2)

In [None]:
# Outlier fences for Age: anything further than 1.5 * IQR from the quartiles.
age_values = train_df['Age'].dropna()  # quartiles are computed without missing values
Q1, Q3 = np.percentile(age_values, [25, 75])
IQR = Q3 - Q1  # interquartile range, i.e. the length of the box in a box plot
outlier_step = 1.5 * IQR
age_lowlimit, age_uplimit = Q1 - outlier_step, Q3 + outlier_step
print('异常值下限：{0}，异常值上限：{1}'.format(age_lowlimit,age_uplimit))

从上图的分布来看，Age的分布基本符合正态分布，但仍含有异常值：年龄不可能为0岁，可以用中位数代替；年龄超过四分位数上限96岁的还是有一定的比例，年龄最大为109岁，可以判断为噪声。

基于常识，银行应该只能给成年人贷款，因此查看Age值小于18的数据：

In [None]:
# Rows whose Age is below 18.
train_df.loc[train_df['Age']<18]

从上述结果可以看到，只有一个记录Age值小于18，且为0，这是不可能的，我们用中位数替代。


查看退休与否的违约率情况

In [None]:
# Default rate for two age groups: 1 = ages 18-59, 2 = ages 60 and above.
data_age = train_df.loc[train_df['Age'] >= 18, ['Age', 'IsDlq']]
print(len(data_age))

# Both masks are taken from the raw ages before any group code is written back.
below_sixty = (data_age['Age'] >= 18) & (data_age['Age'] < 60)
sixty_plus = data_age['Age'] >= 60
data_age.loc[below_sixty, 'Age'] = 1
data_age.loc[sixty_plus, 'Age'] = 2

dlq_by_group = data_age.groupby('Age')['IsDlq']
age_IsDlq = dlq_by_group.sum()
age_total = dlq_by_group.count()
age_Isratio = age_IsDlq / age_total
age_Isratio.plot(kind='bar', figsize=(8, 6), color='#4682B4')

分组查看各年龄段违约率情况

In [None]:
# Default rate per age decade. Ages are recoded in place on a working frame:
# 1 = [20,30), 2 = [30,40), ..., 8 = [90,100), 9 = 100 and above.
data_age = train_df.loc[train_df['Age'] > 0, ['Age', 'IsDlq']]
for code, lower in enumerate(range(20, 100, 10), start=1):
    in_decade = (data_age['Age'] >= lower) & (data_age['Age'] < lower + 10)
    data_age.loc[in_decade, 'Age'] = code
data_age.loc[data_age['Age'] >= 100, 'Age'] = 9

dlq_by_decade = data_age.groupby('Age')['IsDlq']
age_Isdlq = dlq_by_decade.sum()
age_total = dlq_by_decade.count()
age_Isratio = age_Isdlq / age_total
age_Isratio.plot(kind='bar', figsize=(8, 6))

可以看出20-30岁违约率最高，随着年龄增长，违约率降低。但是当年龄大于100时，违约率又显著提高了

**MonthlyIncome**

In [None]:
# MonthlyIncome distribution: kernel density estimate on the left, box plot on the right
fig,[ax1,ax2]=plt.subplots(1,2,figsize=(20,6))
sns.kdeplot(train_df['MonthlyIncome'],ax=ax1)
sns.boxplot(y='MonthlyIncome',data=train_df,ax=ax2)

In [None]:
# Number of missing MonthlyIncome values and their share of all training rows (in percent)
M_null=train_df['MonthlyIncome'].isnull().sum()
print('缺失值数量：{0}，缺失值比率：{1}%'.format(M_null,M_null*100/train_df.shape[0]))

我们认为MonthlyIncome与Age(是否退休)的相关性可能较大，因此按照退休年龄划分数据集，并探索其与月收入的相关程度

In [None]:
# Compare the mean monthly income of borrowers aged 18-60 (inclusive) with
# that of borrowers older than 60.
is_working_age = train_df['Age'].between(18, 60)  # inclusive on both ends
is_senior = train_df['Age'] > 60
working = train_df.loc[is_working_age]
senior = train_df.loc[is_senior]
working_income_mean, senior_income_mean = (
    working['MonthlyIncome'].mean(),
    senior['MonthlyIncome'].mean(),
)
print("未退休月收入：{0},退休月收入：{1}".format(working_income_mean,senior_income_mean))

从上述结果可以看出，退休与否的月收入差距并不大，因此用月收入均值对MonthlyIncome的空值进行填充

**DebtRatio**

In [None]:
# DebtRatio distribution: kernel density estimate on the left, box plot on the right
fig,[ax1,ax2]=plt.subplots(1,2,figsize=(20,6))
sns.kdeplot(train_df['DebtRatio'],ax=ax1)
sns.boxplot(y=train_df['DebtRatio'],ax=ax2)

由于DebtRatio取值范围跨度较大，因此先将取值区间分为0-1以及大于1，分组查看数据分布情况

In [None]:
# Split rows into DebtRatio < 1 and DebtRatio >= 1 and plot each distribution side by side.
Debt1=train_df.loc[train_df['DebtRatio']<1,:]
Debt2=train_df.loc[train_df['DebtRatio']>=1,:]
fig,[ax1,ax2]=plt.subplots(1,2,figsize=(20,6))
sns.distplot(Debt1['DebtRatio'],ax=ax1)
sns.distplot(Debt2['DebtRatio'],ax=ax2)

In [None]:
# Zoom into DebtRatio >= 1 with progressively tighter upper bounds.
# The four subsets are nested: [1,1000), [1,200), [1,10) and [1,2).
Debt3=train_df.loc[(train_df['DebtRatio']>=1)&(train_df['DebtRatio']<1000),:]
Debt4=train_df.loc[(train_df['DebtRatio']>=1)&(train_df['DebtRatio']<200),:]
Debt5=train_df.loc[(train_df['DebtRatio']>=1)&(train_df['DebtRatio']<10),:]
Debt6=train_df.loc[(train_df['DebtRatio']>=1)&(train_df['DebtRatio']<2),:]

# One distribution plot per subset on a 2x2 grid
fig,[[ax1,ax2],[ax3,ax4]]=plt.subplots(2,2,figsize=(20,10))
sns.distplot(Debt3['DebtRatio'],ax=ax1)
sns.distplot(Debt4['DebtRatio'],ax=ax2)
sns.distplot(Debt5['DebtRatio'],ax=ax3)
sns.distplot(Debt6['DebtRatio'],ax=ax4)

可以看到，在1-2区间，数据呈现较平滑趋势。查看以下各区间的违约率情况：

In [None]:
# Default rate (percentage of rows with IsDlq == 1) inside each DebtRatio interval.
# Intervals are left-closed / right-open: [0,1), [1,2), [2,10), [10,50),
# [50,200), [200,1000) and [1000, inf).
debt_edges = [0, 1, 2, 10, 50, 200, 1000, np.inf]
debt_labels = ['0-1', '1-2', '2-10', '10-50', '50-200', '200-1000', '1000以上']
debt_bins = pd.cut(train_df['DebtRatio'], bins=debt_edges, labels=debt_labels, right=False)

# The share of defaulters is the mean of the indicator IsDlq == 1. An interval
# with no rows yields NaN instead of raising ZeroDivisionError.
debt_dlq_rate = (train_df['IsDlq'] == 1).groupby(debt_bins, observed=False).mean() * 100

for label, rate in debt_dlq_rate.items():
    print('{0}违约率为：{1}%'.format(label, rate))

我们看到1-2的违约率达到最高，超过2以后违约率开始稳定。因此在特征工程中，我们把2作为DebtRatio异常值上限的阈值，并把大于2的数据和0-1的数据进行合并。

**NumOpen**

In [None]:
# NumOpen distribution: distribution plot on the left, box plot on the right
fig,[ax1,ax2]=plt.subplots(1,2,figsize=(20,6))
sns.distplot(train_df['NumOpen'],ax=ax1)
sns.boxplot(y=train_df['NumOpen'],ax=ax2)

In [None]:
# Number of rows for each NumOpen value. The series is passed as x= because
# newer seaborn releases treat the first positional argument as `data`.
figure=plt.figure(figsize=(12,6))
sns.countplot(x=train_df['NumOpen'])

可以看出大于36的数据过少，把大于36的数据和36合并，并查看违约率情况

In [None]:
# Default rate per NumOpen value, with everything above 36 merged into 36.
# The cap is applied to a separate series used only for this chart, so this
# exploratory step does not alter the training features; the test set is
# never capped, and capping train_df here would make the two sets inconsistent.
numopen_capped = train_df['NumOpen'].clip(upper=36)
Numopen_dlq = train_df['IsDlq'].groupby(numopen_capped).sum()
Numopen_total = train_df['IsDlq'].groupby(numopen_capped).count()
Numopen_dlqratio = Numopen_dlq / Numopen_total
Numopen_dlqratio.plot(kind='bar', figsize=(12, 6), color='#4682B4')

**NumEstate**

In [None]:
# NumEstate distribution: distribution plot on the left, box plot on the right
fig,[ax1,ax2]=plt.subplots(1,2,figsize=(20,6))
sns.distplot(train_df['NumEstate'],ax=ax1)
sns.boxplot(y=train_df['NumEstate'],ax=ax2)

可以看到NumEstate>50的点为明显异常的点，应该处理（后续在数据清洗中用中位数替换）

In [None]:
# Number of rows for each NumEstate value. The series is passed as x= because
# newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=train_df['NumEstate'])

In [None]:
# Default rate per NumEstate value, with everything above 8 merged into 8.
# The cap is applied to a separate series used only for this chart. Capping
# train_df in place here would change the training features (the test set is
# never capped) and would also hide the values above 50 from the later
# outlier-replacement step.
numestate_capped = train_df['NumEstate'].clip(upper=8)
Numestate_dlq = train_df['IsDlq'].groupby(numestate_capped).sum()
Numestate_total = train_df['IsDlq'].groupby(numestate_capped).count()
Numestate_dlqratio = Numestate_dlq / Numestate_total
Numestate_dlqratio.plot(kind='bar', figsize=(8, 6), color='#4682B4')

从NumOpen和NumEstate的违约率分布可以看出，贷款数量为0时并不是违约率最低的，不动产贷款数量为1时违约率最低，随着贷款数量增加，违约率也随着增高。

**NumDepend**

In [None]:
# NumDepend distribution: count plot on the left, box plot on the right.
# The series is passed as x= because newer seaborn releases treat the first
# positional argument of countplot as `data`.
fig,[ax1,ax2]=plt.subplots(1,2,figsize=(20,6))
sns.countplot(x=train_df['NumDepend'],ax=ax1)
sns.boxplot(y=train_df['NumDepend'],ax=ax2)

In [None]:
# Number of missing NumDepend values and their share of all training rows (in percent)
D_nullNum=train_df['NumDepend'].isnull().sum()
print('缺失值数量：{0}，缺失值比率：{1}%'.format(D_nullNum,D_nullNum*100/train_df.shape[0]))

查看MonthlyIncome和NumDepend的缺失是否有关联

In [None]:
# Number of training rows where both NumDepend and MonthlyIncome are missing.
train_df.loc[(train_df['NumDepend'].isnull())&(train_df['MonthlyIncome'].isnull()),:].shape[0]

可以看出缺失NumDepend值的记录也同样缺失MonthlyIncome。接下来查看NumDepend不缺失，MonthlyIncome缺失的数据分布

In [None]:
# NumDepend values among rows where MonthlyIncome is missing but NumDepend is present.
# The series is passed as x= because newer seaborn releases treat the first
# positional argument of countplot as `data`.
MNullDNot=train_df.loc[(train_df['NumDepend'].notnull())&(train_df['MonthlyIncome'].isnull()),:]
sns.countplot(x=MNullDNot['NumDepend'])

可以看出NumDepend不缺失，MonthlyIncome缺失的记录中NumDepend大多取值0，因此将NumDepend的缺失值填充为0

**Num30-59late、Num60-89late、Num90late**

In [None]:
# Count plots of the three past-due counters, side by side.
# Each series is passed as x= because newer seaborn releases treat the first
# positional argument of countplot as `data`.
fig,[ax1,ax2,ax3]=plt.subplots(1,3,figsize=(20,6))
sns.countplot(x=train_df['Num30-59late'],ax=ax1)
sns.countplot(x=train_df['Num60-89late'],ax=ax2)
sns.countplot(x=train_df['Num90late'],ax=ax3)

In [None]:
# Box plots of the three past-due counters, side by side.
fig,[ax1,ax2,ax3]=plt.subplots(1,3,figsize=(20,6))
sns.boxplot(y='Num30-59late',data=train_df,ax=ax1)
sns.boxplot(y='Num60-89late',data=train_df,ax=ax2)
sns.boxplot(y='Num90late',data=train_df,ax=ax3)

从上图中可以看出大于20为明显的异常值

# 4. 数据清洗

**4.1 数据预处理**

根据上述EDA分析结果，需要对训练集进行以下处理：

**4.1.1 缺失值处理**

* MonthlyIncome：用均值填充缺失值
* NumDepend：用0填充缺失值

In [None]:
# NumDepend: fill missing values with the number 0. Filling with the string
# '0' would turn the whole column into object dtype and mix text with numbers.
train_df['NumDepend'] = train_df['NumDepend'].fillna(0)

# MonthlyIncome: fill missing values with the column mean.
train_df['MonthlyIncome'] = train_df['MonthlyIncome'].fillna(train_df['MonthlyIncome'].mean())

**4.1.2 异常值处理**

* Age：Age为0的数据用中位数填充
* NumEstate：大于50为异常值,用中位数填充
* Num30-59late、Num60-89late、Num90late：大于20为异常值，用中位数填充

In [None]:
# Age: a value of 0 is replaced by the column median.
age_median = train_df['Age'].median()
train_df.loc[train_df['Age'] == 0, 'Age'] = age_median

# Past-due counters: values above 20 are replaced by the column median.
for late_col in ('Num30-59late', 'Num60-89late', 'Num90late'):
    late_median = train_df[late_col].median()  # taken before anything is replaced
    train_df.loc[train_df[late_col] > 20, late_col] = late_median

# NumEstate: values above 50 are replaced by the column median.
estate_median = train_df['NumEstate'].median()
train_df.loc[train_df['NumEstate'] > 50, 'NumEstate'] = estate_median

In [None]:
# Column dtypes and non-null counts of the training set after cleaning.
train_df.info()

In [None]:
# Clean the test set with the same rules as the training set.
# Note: the fill values (mean / median) are computed from the test set itself.

# NumDepend: fill missing values with the number 0. Filling with the string
# '0' would turn the whole column into object dtype and mix text with numbers.
test_df['NumDepend'] = test_df['NumDepend'].fillna(0)
# MonthlyIncome: fill missing values with the column mean.
test_df['MonthlyIncome'] = test_df['MonthlyIncome'].fillna(test_df['MonthlyIncome'].mean())
# Age: a value of 0 is replaced by the column median.
test_df.loc[test_df['Age'] == 0, 'Age'] = test_df['Age'].median()
# Past-due counters: values above 20 are replaced by the column median.
test_df.loc[test_df['Num30-59late']>20,'Num30-59late']=test_df['Num30-59late'].median()
test_df.loc[test_df['Num60-89late']>20,'Num60-89late']=test_df['Num60-89late'].median()
test_df.loc[test_df['Num90late']>20,'Num90late']=test_df['Num90late'].median()

# NumEstate: values above 50 are replaced by the column median.
test_df.loc[test_df['NumEstate']>50,'NumEstate']=test_df['NumEstate'].median()

In [None]:
# Column dtypes and non-null counts of the test set after cleaning.
test_df.info()

**4.2 特征工程**

**4.2.1 特征选择**

根据相关系数查看各变量相关性

In [None]:
# Pairwise correlation matrix of the training columns, drawn as an annotated heatmap.
corr=train_df.corr()
plt.figure(figsize=(14,12))
sns.heatmap(corr,annot=True,linewidths=.3)

从上图可以看出：
* IsDlq与Num30-59late、Num60-89late、Num90late相关性较大
* Num30-59late、Num60-89late、Num90late三者较为相关
* NumOpen与NumEstate相关性很大

**4.2.2 特征提取**

In [None]:
# Derived features, added identically to the training and the test frame:
#   AllNumlate = sum of the three past-due counters
#   AllLoans   = real-estate loans + open credit lines and loans
for frame in (train_df, test_df):
    frame['AllNumlate'] = frame['Num30-59late'] + frame['Num60-89late'] + frame['Num90late']
    frame['AllLoans'] = frame['NumEstate'] + frame['NumOpen']

# 5. 建立模型，预测求解

In [None]:
# Separate the target column IsDlq from the feature columns in both frames.
# NOTE(review): the IsDlq column of the test set is presumably empty (labels
# to be predicted) - confirm; y_test_df is not used by the later cells.
y_train_df = train_df['IsDlq']
X_train_df = train_df.drop(columns=['IsDlq'])
y_test_df = test_df['IsDlq']
X_test_df = test_df.drop(columns=['IsDlq'])

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Hold out part of the labelled data for validation. The held-out part is
# named X_test / y_test here, but it comes from the training file.
X_train, X_test, y_train, y_test = train_test_split(X_train_df, y_train_df, random_state=111)

# Standardise the features: the scaler is fitted on the training split only
# and then applied unchanged to the validation split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# L1-penalised logistic regression with balanced class weights.
log = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=1.0,
    class_weight='balanced',
    max_iter=500,
    random_state=111,
)
log.fit(X_train_scaled, y_train)

# 6. 模型评估

In [None]:
# 定义画ROC图的函数
def plot_roc_curve(fpr, tpr, label=None):
    """
    Draw a ROC curve on a new 8x6 figure.

    fpr:   false positive rates (x values)
    tpr:   true positive rates (y values), same length as ``fpr``
    label: optional legend text for the curve; when given, a legend is drawn
           so the label is actually visible on the figure

    Also draws the dashed diagonal from (0, 0) to (1, 1) as a reference line
    and fixes both axes to the range [0, 1].
    """
    plt.figure(figsize=(8,6))
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0,1],[0,1], "k--")  # diagonal reference line
    plt.axis([0,1,0,1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive rate")
    if label is not None:
        # Without a legend the label passed to plt.plot is never displayed.
        plt.legend(loc="lower right")
    
# Predict on the (scaled) training split: one probability column per class
log_scores_proba = log.predict_proba(X_train_scaled)
# Keep the probability of IsDlq == 1 (default)
log_scores = log_scores_proba[:,1]
fpr_log, tpr_log, thresh_log = roc_curve(y_train, log_scores) # false/true positive rates from the true labels and predicted scores

# Plot the ROC curve and print the AUC on the training split
plot_roc_curve(fpr_log,tpr_log)
print('AUC Score : ', (roc_auc_score(y_train,log_scores)))

In [None]:
# Predict on the (scaled) validation split: one probability column per class
log_scores_proba_val = log.predict_proba(X_test_scaled)
# Keep the probability of IsDlq == 1 (default)
log_scores_val = log_scores_proba_val[:,1]
fpr_log_val, tpr_log_val, thresh_logit_val = roc_curve(y_test, log_scores_val)

# Plot the ROC curve and print the AUC on the validation split
plot_roc_curve(fpr_log_val,tpr_log_val)
print('AUC Score :', (roc_auc_score(y_test,log_scores_val)))

通过ROC曲线和AUC值看到，AUC的值为0.81，说明模型的区分能力还可以。

**梯度提升法分类**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting classifier, fitted on the unscaled training split
gbc_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05 ,max_depth=4,  random_state=42)
gbc_clf.fit(X_train,y_train)
# Probability of IsDlq == 1 on the training split, then ROC curve and AUC on that same split
gbc_clf_proba = gbc_clf.predict_proba(X_train)
gbc_clf_scores = gbc_clf_proba[:,1]
fpr_gbc, tpr_gbc, thresh_gbc = roc_curve(y_train, gbc_clf_scores)
plot_roc_curve(fpr_gbc,tpr_gbc)
print('AUC Score :', roc_auc_score(y_train, gbc_clf_scores))

In [None]:
# Probability of IsDlq == 1 on the validation split, then ROC curve and AUC.
gbc_val_proba = gbc_clf.predict_proba(X_test)
gbc_val_scores = gbc_val_proba[:,1]
# The curve must be built from the validation labels and scores so that it
# matches the AUC printed below (not from the training labels and scores).
fpr_gbc_val, tpr_gbc_val, thresh_gbc_val = roc_curve(y_test, gbc_val_scores)
plot_roc_curve(fpr_gbc_val,tpr_gbc_val)
print('AUC Score :', roc_auc_score(y_test, gbc_val_scores))

# 7. 提交

In [None]:
# Predict the default probability for every test row with the gradient boosting model.
submission_proba = gbc_clf.predict_proba(X_test_df)
submission_scores = submission_proba[:, 1]  # probability of IsDlq == 1

# Ids run from 1 to the number of predictions. Deriving the range from the
# predictions keeps both columns the same length for any test-set size.
ids = np.arange(1, len(submission_scores) + 1)
submission = pd.DataFrame({
    'Id': ids,
    'Probability': submission_scores
})
submission.to_csv('submission.csv', index=False)