In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 一、获取数据并观察

1. 导入其他必要的库

In [None]:
import matplotlib.pyplot as plt
from  sklearn.ensemble import RandomForestRegressor
# from pandas import Series,DataFrame
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

2. 直接读入的train_data最前面有Unnamed一列，值是索引，因此要去掉这一列

In [None]:
# Load the training CSV. Its first column is an unnamed copy of the row index,
# so it is dropped by position before inspecting the frame.
train_data = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-training.csv')
train_data = train_data.iloc[:,1:]
train_data.info()
train_data.head()

In [None]:
# Load the test CSV and drop its first (positional) column, mirroring the training set.
test_data = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-test.csv')
test_data = test_data.iloc[:,1:]
test_data.info()
test_data.head()

查看变量分位数

In [None]:
# Summary statistics with extra percentiles (1%, 10%, 90%, 99%), transposed so each variable is a row.
train_data.describe([0.01, 0.10, 0.25, 0.5, 0.75, 0.90, 0.99]).T

# 二、数据预处理
这部分涉及缺失值处理、重复值处理和异常值处理
# 2.1 缺失值处理
缺失值的处理方法一般包括：
* 直接使用含有缺失值的属性（不处理）；
* 删除含有缺失值的属性；
* 删除含有缺失值的样本：`dropna()`
* 缺失值补全：均值插补、建模预测等等
利用相似性填补：最大/最小/平均值

查看train集中的缺失数据

In [None]:
# Number of missing values in each training column.
train_data.isnull().sum()

# 2.1.1 MonthlyIncome
缺失较多，用已知数据训练RandomForestRegressor并用它预测
* 对train集和test集分别处理（**因为test集中下标为1的列是NAN，从下标2开始取**）
* 训练集和测试集都不能包含最后一列（下标10）属性，否则会报错

In [None]:
# Impute MonthlyIncome in the training set.
# MonthlyIncome (position 5) is moved to the front; positions 0-4 and 6-9 follow as predictors.
# Position 10 is left out (the original note says adding it raises an error).
# NOTE(review): position 0 is used as the target elsewhere in this notebook, so the
# label appears to be one of the predictors of the imputer — confirm this is intended.
mData = train_data.iloc[:,[5,0,1,2,3,4,6,7,8,9]]
# iloc selects by position, loc selects by label

train_known = mData[mData.MonthlyIncome.notnull()].values
train_unknown = mData[mData.MonthlyIncome.isnull()].values
train_X = train_known[:,1:]
train_y = train_known[:,0]

# Fit on the rows whose income is known, then predict the missing ones
rfr = RandomForestRegressor(random_state=0,n_estimators=200,max_depth=3,n_jobs=-1)
rfr.fit(train_X,train_y)
predicted_y = rfr.predict(train_unknown[:,1:]).round(0) # round to whole numbers

# Write the predictions into the missing cells
train_data.loc[train_data.MonthlyIncome.isnull(),'MonthlyIncome'] = predicted_y

In [None]:
# Impute MonthlyIncome in the test set with a separate forest fitted on the test rows
# whose income is known.
mData = test_data.iloc[:,[5,0,1,2,3,4,6,7,8,9]]

test_known = mData[mData.MonthlyIncome.notnull()].values
test_unknown = mData[mData.MonthlyIncome.isnull()].values
# Features start at position 2: position 1 of mData is reported to be all-NaN in the test set
test_X = test_known[:,2:]
test_y = test_known[:,0]

# Fit on the known rows, then predict the missing ones
rfr = RandomForestRegressor(random_state=0,n_estimators=200,max_depth=3,n_jobs=-1)
rfr.fit(test_X,test_y)
predicted_y = rfr.predict(test_unknown[:,2:]).round(0) # round to whole numbers

# Write the predictions into the missing cells
test_data.loc[test_data.MonthlyIncome.isnull(),'MonthlyIncome'] = predicted_y

# 2.1.2 NumberOfDependents
缺失不多
* 训练集中可以直接删除，也可以用中值填充
* 测试集中用中值填充

In [None]:
# Fill missing NumberOfDependents with each set's own median
# (alternative for the training set: train_data = train_data.dropna()).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained form can act on a
# temporary object and leave the frame unchanged (it is a no-op under pandas
# copy-on-write).
train_data['NumberOfDependents'] = train_data['NumberOfDependents'].fillna(
    train_data['NumberOfDependents'].median()
)

test_data['NumberOfDependents'] = test_data['NumberOfDependents'].fillna(
    test_data['NumberOfDependents'].median()
)

结果：缺失值处理后的训练集和测试集

In [None]:
# Re-check missing values per training column after imputation.
train_data.isnull().sum()

In [None]:
# Re-check missing values per test column after imputation.
test_data.isnull().sum()

# 2.2 重复值处理
直接删除

In [None]:
# Remove fully duplicated training rows (the first occurrence is kept by default).
train_data = train_data.drop_duplicates()

# 2.3 异常值处理

# 2.3.1 发现异常值
查看是否有异常值（不在$[Q1-1.5IQR, Q3+1.5IQR]$内的值），离群点检测

In [None]:
# Summary statistics of the training set, used to spot implausible minima and maxima.
train_data.describe()

In [None]:
# Same summary for the test set.
test_data.describe()

发现有如下问题：
1. 存在非法数据：属性age：train集中存在0
2. 存在很大的异常值
* 属性DebtRatio：train集和test集中均有很大（329664；268326）的异常值
* 属性NumberOfDependents：train集和test集中均有很大（20；43）的异常值
* 属性MonthlyIncome：train集和test集中均有很大（10e+6）的异常值
* 属性NumberOfTime30-59DaysPastDueNotWorse、NumberOfTime60-89DaysPastDueNotWorse、NumberOfTimes90DaysLate：train集和test集中均有98异常值
* 属性NumberRealEstateLoansOrLines：train集和test集中均有很大（54；37）的异常值
* 属性RevolvingUtilizationOfUnsecuredLines：train集和test集中均有很大（50708；21821）的异常值
* 属性NumberOfOpenCreditLinesAndLoans：有较大的异常值

# 2.3.2 查看变量相关性

In [None]:
# Pairwise correlation of all training columns, drawn as an annotated heatmap.
corr = train_data.corr()
plt.figure(figsize=(19, 15))
sns.heatmap(corr, annot=True, fmt='.2g')

发现NumberOfTime30-59DaysPastDueNotWorse、NumberOfTime60-89DaysPastDueNotWorse、NumberOfTimes90DaysLate三者相关性较大，同时对预测值SeriousDlqin2yrs的影响也较大

# 2.3.3 查看分类结果

In [None]:
# Class balance of the target SeriousDlqin2yrs.
sns.countplot(x="SeriousDlqin2yrs",data=train_data)

发现分类结果是极其不平衡的。数据不平衡会让监督学习算法过多关注多数类，使分类性能下降；因为数据足够多，后面采用欠采样，并使用带正则化的回归模型和集成模型。

# 2.3.4 删除或替换异常值

1. train集中的age：必须大于0，但是存在为0的值，直接删除异常值

In [None]:
# Drop rows whose age is not positive (an age of 0 is invalid).
train_data = train_data[train_data.age>0]

2. train集中的DebtRatio：先用箱线图查看点分布，再以99%分位数附近的阈值（4979.05）为界，直接删除不低于该阈值的样本（注意：代码是删除样本，而不是盖帽替换）

In [None]:
# Box plot of DebtRatio, then keep only rows strictly below the hard-coded cutoff 4979.05.
# NOTE(review): the cutoff is presumably the 99th percentile read from an earlier
# describe() output — confirm, or compute it with .quantile(0.99).
datatemp1=train_data["DebtRatio"]
datatemp1.plot(kind='box',title='DebtRatio Distribution',sym='r+');
train_data=train_data[train_data['DebtRatio']<4979.05]

3. train集中的NumberOfDependents处理

In [None]:
# Box plot of NumberOfDependents, then keep only rows with fewer than 4 dependents.
datatemp2=train_data["NumberOfDependents"]
datatemp2.plot(kind='box',title='NumberOfDependents Distribution',sym='r+');
train_data=train_data[train_data['NumberOfDependents']<4.00]

4. train集中的MonthlyIncome处理

In [None]:
# Box plot of MonthlyIncome, then keep only rows with income strictly below 25000.
datatemp3=train_data["MonthlyIncome"]
datatemp3.plot(kind='box',title='MonthlyIncome Distribution',sym='r+');
train_data=train_data[train_data['MonthlyIncome']<25000.00]

5.  train集 & test集中的
* NumberOfTime30-59DaysPastDueNotWorse
* NumberOfTime60-89DaysPastDueNotWorse
* NumberOfTimes90DaysLate

**作图查看分布**（发现三个属性都有相同的两组异常值，因此只需要针对某个属性删除即可）

In [None]:
# Quick box plot of the columns at positions 3, 7 and 9
# (the three past-due counters, per the column-index notes later in this notebook).
train_box = train_data.iloc[:,[3,7,9]]
train_box.boxplot()

In [None]:
# Detailed version: one box plot per past-due counter, side by side.
fig,axes = plt.subplots(1,3)
color = dict(boxes='DarkGreen', whiskers='DarkOrange',
              medians='DarkBlue', caps='Red')
# boxes = the box body, whiskers = the whisker lines
# medians = the median line, caps = the end caps of the whiskers

datatemp1=train_data[["NumberOfTime30-59DaysPastDueNotWorse","NumberOfTimes90DaysLate","NumberOfTime60-89DaysPastDueNotWorse"]]
datatemp1.plot(kind='box',ax=axes,subplots=True,
              title='3 Different boxplots',color=color,sym='r+')# sym sets the marker used for outliers
axes[0].set_ylabel('NumberOfTime30-59DaysPastDueNotWorse')
axes[1].set_ylabel('NumberOfTimes90DaysLate')
axes[2].set_ylabel('NumberOfTime60-89DaysPastDueNotWorse')

fig.subplots_adjust(wspace=3,hspace=1)  # widen the spacing between subplots

# Distinct values taken by each of the three counters
print(np.unique(datatemp1["NumberOfTime30-59DaysPastDueNotWorse"]))
print(np.unique(datatemp1["NumberOfTimes90DaysLate"]))
print(np.unique(datatemp1["NumberOfTime60-89DaysPastDueNotWorse"]))

train集中：直接删除

In [None]:
# Keep only rows with fewer than 4 occurrences of 30-59 days past due.
# NOTE(review): this removes every value >= 4, not only the extreme outlier codes — confirm the threshold.
train_data = train_data[train_data['NumberOfTime30-59DaysPastDueNotWorse']<4.00]

test集中：对NumberOfTimes90DaysLate，如果大于等于96，就用中值填充，否则就保留原来的值

In [None]:
# Reference group: test rows with no open credit lines and no real-estate loans.
masktest = (test_data['NumberOfOpenCreditLinesAndLoans'] == 0) & (test_data['NumberRealEstateLoansOrLines'] == 0)

# Replace NumberOfTimes90DaysLate values >= 96 with the reference group's median.
# The median does not depend on the row, so it is computed once instead of being
# recomputed inside a per-row lambda; it is only evaluated when an outlier
# exists, which matches when the original code evaluated it.
late90 = test_data['NumberOfTimes90DaysLate']
late90_outlier = late90 >= 96
if late90_outlier.any():
    late90_fill = int(late90[masktest].median())
    test_data['NumberOfTimes90DaysLate'] = late90.mask(late90_outlier, late90_fill)

6. train集中的NumberRealEstateLoansOrLines

In [None]:
# Box plot of NumberRealEstateLoansOrLines to inspect its outliers.
datatemp5=train_data["NumberRealEstateLoansOrLines"]
datatemp5.plot(kind='box',title='NumberRealEstateLoansOrLines Distribution',sym='r+');

In [None]:
# Keep only rows with fewer than 4 real-estate loans or lines.
train_data = train_data[train_data['NumberRealEstateLoansOrLines'] < 4.00]

7. train集中的RevolvingUtilizationOfUnsecuredLines

In [None]:
# Box plot of RevolvingUtilizationOfUnsecuredLines to inspect its outliers.
datatemp6=train_data["RevolvingUtilizationOfUnsecuredLines"]
datatemp6.plot(kind='box',title='RevolvingUtilizationOfUnsecuredLines Distribution',sym='r+');

In [None]:
# Apply the three remaining range filters together and store the result in
# train_data, the frame every later cell reads. Previously each filter was taken
# from the unfiltered train_data and assigned to train_df, so each assignment
# overwrote the last and none of them reached the data used downstream.
train_data = train_data[
    (train_data['RevolvingUtilizationOfUnsecuredLines'] < 1.09)
    & (train_data['NumberOfOpenCreditLinesAndLoans'] < 24.00)
    & (train_data['age'] < 87.00)
]
# Keep the name this cell used to define, now pointing at the filtered frame.
train_df = train_data

# 查看处理后的结果

In [None]:
# Histogram of every training column for a rough view of the distributions.
train_data.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Summary statistics of the training set after outlier handling.
train_data.describe()

In [None]:
# Summary statistics of the test set after outlier handling.
test_data.describe()

# 2.4. 数据切分
为了使得能够更好地检验模型效果，我们将数据切分化为训练集和测试集。测试集取原数据的30%

In [None]:
from sklearn.model_selection import train_test_split
# The target is the first column; all remaining columns are features.
y = train_data.iloc[:,0]
X = train_data.iloc[:,1:]
# Hold out 30% of the rows. This rebinds train_X/test_X/train_y/test_y, which the
# imputation cells earlier used for NumPy arrays.
train_X,test_X,train_y,test_y = train_test_split(X,y,test_size =0.3,random_state=0)
# Re-attach the target as the first column of each split.
ntrain_data = pd.concat([train_y,train_X],axis=1)
ntest_data = pd.concat([test_y,test_X],axis=1)

# 三、探索性分析

查看数据分布，判断是否符合统计假设
* monthlyIncome如下，符合正态分布

In [None]:
# Histogram of MonthlyIncome in the training split (70 bins).
# NOTE(review): the y axis of a histogram is a count, although the label says "value of".
plt.figure(figsize=(15,5))
plt.hist(ntrain_data.MonthlyIncome,bins=70,alpha=0.8,rwidth=0.9)
plt.title("MonthlyIncome distribution")
plt.ylabel('value of MonthlyIncome', fontsize=12)
plt.xlabel('MonthlyIncome', fontsize=12)
plt.show()

In [None]:
# Distribution plot of MonthlyIncome.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider histplot/displot.
MI = ntrain_data['MonthlyIncome']
sns.distplot(MI)

* age如下，符合正态分布

In [None]:
# Histogram of age in the training split (50 bins).
plt.figure(figsize=(15,5))
plt.hist(ntrain_data.age,bins=50,alpha=0.8,rwidth=0.9)
plt.title("age distribution")
plt.ylabel('value of age', fontsize=12)
plt.xlabel('age', fontsize=12)
plt.show()

In [None]:
# Distribution plot of age.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider histplot/displot.
age = ntrain_data['age']
sns.distplot(age)

# 四、变量选择
# 4.1 分箱处理
常见分箱方法有：
1.  等距分段（间隔固定）
2.  等深分段（每组数量固定）
3.  最优分段（将连续的变量分为若干段）【有重复值时，需要设置qcut函数的duplicates参数】

In [None]:
# 分箱函数
def mono_bin(Y, X, n=10):
    """Bin a variable into quantile buckets whose mean target is monotonic in X.

    Starting from ``n`` quantile buckets, the bucket count is reduced by one
    until the Spearman correlation between the per-bucket mean of ``X`` and the
    per-bucket mean of ``Y`` reaches +/-1 or becomes undefined (NaN).

    Parameters
    ----------
    Y : pandas.Series
        Binary (0/1) target aligned with ``X``.
    X : pandas.Series
        Variable to bin.
    n : int, default 10
        Number of quantile buckets requested on the first attempt.

    Returns
    -------
    d4 : pandas.DataFrame
        One row per bucket, ordered by bucket minimum, with columns
        ``min, max, sum, total, rate, woe, goodattribute, badattribute``.
    iv : float
        Information value summed over the buckets.
    cut : list of float
        Bucket edges: ``-inf``, the interior edges rounded to 4 decimals, ``+inf``.
        Always has exactly ``len(woe) + 1`` entries.
    woe : list of float
        WOE per bucket (rounded to 3 decimals), in the same order as ``d4``.
    """
    # Following the original naming, "good" counts rows with Y == 1 and "bad"
    # counts the rest; the output column names depend on this convention.
    good = Y.sum()
    bad = Y.count() - good

    bins = n
    while True:
        # retbins=True returns the edges qcut really used. With
        # duplicates="drop" there can be fewer buckets than requested, so the
        # edges must come from here rather than be recomputed from `bins`.
        buckets, edges = pd.qcut(X, bins, duplicates="drop", retbins=True)
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": buckets})
        d2 = d1.groupby('Bucket', as_index=True)
        means = d2.mean()
        if len(means) < 2:
            # A single bucket has no rank correlation; stop here.
            break
        r, _ = stats.spearmanr(means.X, means.Y)
        # Stop when monotonic (|r| == 1), when r is NaN, or when no further
        # reduction is possible.
        if not np.abs(r) < 1 or bins <= 1:
            break
        bins -= 1

    d3 = pd.DataFrame({
        'min': d2.X.min(),
        'max': d2.X.max(),
        'sum': d2.Y.sum(),
        'total': d2.Y.count(),
        'rate': d2.Y.mean(),
    })
    d3['woe'] = np.log((d3['rate'] / good) / ((1 - d3['rate']) / bad))
    d3['goodattribute'] = d3['sum'] / good
    d3['badattribute'] = (d3['total'] - d3['sum']) / bad
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()
    d4 = d3.sort_values(by='min').reset_index(drop=True)
    woe = list(d4['woe'].round(3))

    # The outermost edges are opened to +/-inf so unseen values fall in a bucket.
    cut = [float('-inf')]
    cut.extend(round(float(edge), 4) for edge in edges[1:-1])
    cut.append(float('inf'))
    return d4, iv, cut, woe

# 4.1.1 连续变量

优先选择最优分段方法，在连续变量的分布不满足最优分段的要求时，再考虑对连续变量进行等距分段。

In [None]:
# Monotonic quantile binning of the four continuous variables on the training split.
# Each call returns the per-bucket table, the IV, the cut points and the WOE list.
x1_d,x1_iv,x1_cut,x1_woe = mono_bin(train_y,train_X.RevolvingUtilizationOfUnsecuredLines)
x2_d,x2_iv,x2_cut,x2_woe = mono_bin(train_y,train_X.age)
x4_d,x4_iv,x4_cut,x4_woe = mono_bin(train_y,train_X.DebtRatio)
x5_d,x5_iv,x5_cut,x5_woe = mono_bin(train_y,train_X.MonthlyIncome)

# 4.1.2 离散变量
无法通过这种方式分箱，使用人工选择的方式进行

**NumberOfTime30-59DaysPastDueNotWorse：**
cutx3 = [-inf, 0, 1, 3, 5, +inf] 

In [None]:
def woe_value(d1):
    """Compute the WOE table, IV and WOE list for a manually bucketed variable.

    Parameters
    ----------
    d1 : pandas.DataFrame
        Must contain the columns ``X`` (variable value), ``Y`` (binary 0/1
        target) and ``Bucket`` (bucket label of each row).

    Returns
    -------
    d4 : pandas.DataFrame
        One row per observed bucket, ordered by bucket minimum, with columns
        ``min, max, sum, total, rate, woe, goodattribute, badattribute``.
    iv : float
        Information value summed over the buckets.
    woe : list of float
        WOE per bucket (rounded to 3 decimals), in the same order as ``d4``.

    Notes
    -----
    The overall totals are taken from ``d1['Y']`` rather than from a global
    ``train_y``, so the function no longer depends on notebook state. The two
    agree whenever every training row is assigned to a bucket.
    """
    d2 = d1.groupby('Bucket', as_index=True)
    # Following the original naming, "good" counts rows with Y == 1.
    good = d1['Y'].sum()
    bad = d1['Y'].count() - good
    d3 = pd.DataFrame({
        'min': d2.X.min(),
        'max': d2.X.max(),
        'sum': d2.Y.sum(),
        'total': d2.Y.count(),
        'rate': d2.Y.mean(),
    })
    d3['woe'] = np.log((d3['rate'] / good) / ((1 - d3['rate']) / bad))
    d3['goodattribute'] = d3['sum'] / good
    d3['badattribute'] = (d3['total'] - d3['sum']) / bad
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()
    d4 = d3.sort_values(by='min').reset_index(drop=True)
    woe = list(d4['woe'].round(3))
    return d4, iv, woe

In [None]:
# Index 3: NumberOfTime30-59DaysPastDueNotWorse
# Buckets are right-closed: (-inf,0], (0,1], (1,3], (3,5], (5,+inf).
# pd.cut labels every row in one step. The previous version copied the numeric
# column and overwrote slices of it with strings through .loc, which triggers
# SettingWithCopyWarning and relies on dtype upcasting that pandas deprecates.
x3_cut = [float('-inf'),0,1,3,5,float('+inf')]
x3_labels = ["(-inf,0]", "(0,1]", "(1,3]", "(3,5]", "(5,+inf)"]

d1 = pd.DataFrame({"X": train_X['NumberOfTime30-59DaysPastDueNotWorse'], "Y": train_y})
d1 = d1.dropna(subset=['X']).assign(
    # astype(str) keeps only the observed buckets as groups
    Bucket=lambda frame: pd.cut(frame['X'], bins=x3_cut, labels=x3_labels).astype(str)
)

x3_d,x3_iv,x3_woe= woe_value(d1)

**NumberOfOpenCreditLinesAndLoans:**
cutx6 = [-inf, 1, 2, 3, 5, +inf]

In [None]:
# Index 6: NumberOfOpenCreditLinesAndLoans
# Fix: this cell previously binned NumberOfTime30-59DaysPastDueNotWorse (copy-paste
# slip), so x6_iv/x6_woe described the wrong variable.
# Buckets are right-closed: (-inf,1], (1,2], (2,3], (3,5], (5,+inf).
x6_cut = [float('-inf'),1,2,3,5,float('+inf')]
x6_labels = ["(-inf,1]", "(1,2]", "(2,3]", "(3,5]", "(5,+inf)"]

d1 = pd.DataFrame({"X": train_X['NumberOfOpenCreditLinesAndLoans'], "Y": train_y})
d1 = d1.dropna(subset=['X']).assign(
    # astype(str) keeps only the observed buckets as groups
    Bucket=lambda frame: pd.cut(frame['X'], bins=x6_cut, labels=x6_labels).astype(str)
)

x6_d,x6_iv,x6_woe= woe_value(d1)

**NumberOfTimes90DaysLate:**
cutx7 = [-inf, 0, 1, 3, 5, +inf]

In [None]:
# Index 7: NumberOfTimes90DaysLate
# Fix: this cell previously binned NumberOfTime30-59DaysPastDueNotWorse (copy-paste
# slip), so x7_iv/x7_woe described the wrong variable.
# Buckets are right-closed: (-inf,0], (0,1], (1,3], (3,5], (5,+inf).
x7_cut = [float('-inf'),0,1,3,5,float('+inf')]
x7_labels = ["(-inf,0]", "(0,1]", "(1,3]", "(3,5]", "(5,+inf)"]

d1 = pd.DataFrame({"X": train_X['NumberOfTimes90DaysLate'], "Y": train_y})
d1 = d1.dropna(subset=['X']).assign(
    # astype(str) keeps only the observed buckets as groups
    Bucket=lambda frame: pd.cut(frame['X'], bins=x7_cut, labels=x7_labels).astype(str)
)

x7_d,x7_iv,x7_woe= woe_value(d1)

**NumberRealEstateLoansOrLines:**
cutx8 = [-inf, 0,1,2, 3, +inf]

In [None]:
# Index 8: NumberRealEstateLoansOrLines
# Fix: this cell previously binned NumberOfTime30-59DaysPastDueNotWorse (copy-paste
# slip), so x8_iv/x8_woe described the wrong variable.
# Buckets are right-closed: (-inf,0], (0,1], (1,2], (2,3], (3,+inf).
x8_cut = [float('-inf'),0,1,2,3,float('+inf')]
x8_labels = ["(-inf,0]", "(0,1]", "(1,2]", "(2,3]", "(3,+inf)"]

d1 = pd.DataFrame({"X": train_X['NumberRealEstateLoansOrLines'], "Y": train_y})
d1 = d1.dropna(subset=['X']).assign(
    # astype(str) keeps only the observed buckets as groups
    Bucket=lambda frame: pd.cut(frame['X'], bins=x8_cut, labels=x8_labels).astype(str)
)

x8_d,x8_iv,x8_woe= woe_value(d1)

**NumberOfTime60-89DaysPastDueNotWorse：**
cutx9 = [-inf, 0, 1, 3, +inf]

In [None]:
# Index 9: NumberOfTime60-89DaysPastDueNotWorse
# Fix: this cell previously binned NumberOfTime30-59DaysPastDueNotWorse (copy-paste
# slip), so x9_iv/x9_woe described the wrong variable.
# Buckets are right-closed: (-inf,0], (0,1], (1,3], (3,+inf).
x9_cut = [float('-inf'),0,1,3,float('+inf')]
x9_labels = ["(-inf,0]", "(0,1]", "(1,3]", "(3,+inf)"]

d1 = pd.DataFrame({"X": train_X['NumberOfTime60-89DaysPastDueNotWorse'], "Y": train_y})
d1 = d1.dropna(subset=['X']).assign(
    # astype(str) keeps only the observed buckets as groups
    Bucket=lambda frame: pd.cut(frame['X'], bins=x9_cut, labels=x9_labels).astype(str)
)

x9_d,x9_iv,x9_woe= woe_value(d1)

**NumberOfDependents:**
cutx10 = [-inf, 0, 1, 2, 3, 5, +inf]

In [None]:
# Index 10: NumberOfDependents
# Fix: this cell previously binned NumberOfTime30-59DaysPastDueNotWorse (copy-paste
# slip), so x10_iv/x10_woe described the wrong variable.
# Buckets are right-closed: (-inf,0], (0,1], (1,2], (2,3], (3,5], (5,+inf).
x10_cut = [float('-inf'),0,1,2,3,5,float('+inf')]
x10_labels = ["(-inf,0]", "(0,1]", "(1,2]", "(2,3]", "(3,5]", "(5,+inf)"]

d1 = pd.DataFrame({"X": train_X['NumberOfDependents'], "Y": train_y})
d1 = d1.dropna(subset=['X']).assign(
    # astype(str) keeps only the observed buckets as groups
    Bucket=lambda frame: pd.cut(frame['X'], bins=x10_cut, labels=x10_labels).astype(str)
)

x10_d,x10_iv,x10_woe= woe_value(d1)

在分箱的过程中，同时计算了WOE（Weight of Evidence）和IV(Information Value)
* 前者在建立逻辑回归模型时需要将所有的变量转为WOE
* 后者则可以很好的展示变量的预测能力。
之前检查过变量之间的相关性，NumberOfTime30-59DaysPastDueNotWorse、NumberOfTimes90DaysLate和NumberOfTime60-89DaysPastDueNotWorse这三个特征对于我们所要预测的值有较大影响

In [None]:
# Show the correlation matrix again, with short x-axis labels x0..x10
# (one per column, in column order) and the full column names on the y axis.
corr = train_data.corr()
xticks = ['x0','x1','x2','x3','x4','x5','x6','x7','x8','x9','x10']
yticks = list(corr.index)
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
sns.heatmap(corr, annot=True, cmap='rainbow', ax=ax1, annot_kws={'size': 5,  'color': 'blue'})
ax1.set_xticklabels(xticks, rotation=0, fontsize=10)
ax1.set_yticklabels(yticks, rotation=0, fontsize=10)
plt.show()

In [None]:
# Collect the IV of every variable (x1..x10, in order) and show them as a bar chart.
informationValue = [
    x1_iv, x2_iv, x3_iv, x4_iv, x5_iv,
    x6_iv, x7_iv, x8_iv, x9_iv, x10_iv,
]

index = ['x%d' % k for k in range(1, 11)]
index_num = range(len(index))
ax = plt.bar(index_num, informationValue, tick_label=index)
plt.show()

通过IV值判断变量预测能力的标准是：

* 小于0.02: unpredictive

* 0.02 to 0.1: weak

* 0.1 to 0.3: medium

* 0.3 to 0.5: strong

* 大于0.5: suspicious
可以看到，对于X4，X5的IV值都比较低（但是理论上X6，X8，以及X10也应该比较低）（？哪里处理不对吗）

舍弃这些预言能力较差的特征
<!-- 可以看到，对于X4，X5的IV值都比较低，因此可以舍弃这些预言能力较差的特征 -->
# 4.2 WOE转换
将所有的需要的特征woe化，并将不需要的特征舍弃，仅保留WOE转码后的变量

In [None]:
def trans_woe(var, var_name, x_woe, x_cut):
    """Add a ``<var_name>_woe`` column that maps each value to its bucket's WOE.

    ``var`` is modified in place and also returned. Buckets are right-closed:
    the first WOE applies to values ``<= x_cut[1]``, each middle WOE ``i`` to
    ``(x_cut[i], x_cut[i+1]]``, and the last WOE to everything above
    ``x_cut[len(x_woe) - 1]``.
    """
    woe_name = var_name + '_woe'
    last = len(x_woe) - 1
    values = var[var_name]
    for pos, weight in enumerate(x_woe):
        if pos == 0:
            rows = values <= x_cut[1]
        elif pos < last:
            rows = (values > x_cut[pos]) & (values <= x_cut[pos + 1])
        else:
            # Last WOE value: open-ended upper bucket
            rows = values > x_cut[last]
        var.loc[rows, woe_name] = weight
    return var
 
# Column names of the five variables kept for the scorecard.
x1_name = 'RevolvingUtilizationOfUnsecuredLines'
x2_name = 'age'
x3_name = 'NumberOfTime30-59DaysPastDueNotWorse'
x7_name = 'NumberOfTimes90DaysLate'
x9_name = 'NumberOfTime60-89DaysPastDueNotWorse'
 
# Append one *_woe column per kept variable to the training features.
train_X = trans_woe(train_X,x1_name,x1_woe,x1_cut)
train_X = trans_woe(train_X,x2_name,x2_woe,x2_cut)
train_X = trans_woe(train_X,x3_name,x3_woe,x3_cut)
train_X = trans_woe(train_X,x7_name,x7_woe,x7_cut)
train_X = trans_woe(train_X,x9_name,x9_woe,x9_cut)

In [None]:
# Keep only the last five columns, i.e. the *_woe columns appended in the previous cell.
# NOTE(review): not idempotent — re-running this cell after it has run once is harmless,
# but re-running the previous cell afterwards fails because the raw columns are gone.
train_X = train_X.iloc[:,-5:]
train_X

# 五、模型分析

In [None]:
# Use names other than train/test to avoid confusion with the cross-validation splits below.
# X/y: cleaned training features and target; W/z: the same for the competition test set.
X = train_data.drop(['SeriousDlqin2yrs'],axis=1)
y = train_data['SeriousDlqin2yrs']
W = test_data.drop(['SeriousDlqin2yrs'],axis=1)
z = test_data['SeriousDlqin2yrs']

# 5.1 逻辑回归分类

In [None]:
# Logistic regression baseline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Split into training and validation parts (default test size)
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=111)

# Logistic regression with an L1 penalty; C is the inverse regularization strength
logit = LogisticRegression(random_state=111, solver='saga', penalty='l1', class_weight='balanced', C=1.0, max_iter=500)

# Fit the scaler on the training part only
scaler = StandardScaler().fit(X_train)

# Standardize X_train and X_test with the same scaler
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the logistic regression
logit.fit(X_train_scaled, y_train)

# Class probabilities for every training sample
logit_scores_proba = logit.predict_proba(X_train_scaled)

# Probability of class 1
logit_scores = logit_scores_proba[:,1]

查看结果

In [None]:
# 画图
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on a new 12x10 figure, with the diagonal as reference."""
    plt.figure(figsize=(12, 10))
    axes = plt.gca()
    axes.plot(fpr, tpr, linewidth=2, label=label)
    axes.plot([0, 1], [0, 1], "k--")  # chance-level diagonal
    axes.axis([0, 1, 0, 1])
    axes.set_xlabel("False Positive Rate")
    axes.set_ylabel("True Positive rate")

In [None]:
# roc_curve takes the true labels and the scores and returns the false positive
# rate, the true positive rate and the thresholds
fpr_logit, tpr_logit, thresh_logit = roc_curve(y_train, logit_scores)

# Plot the curve and print the AUC (both computed on the training part)
plot_roc_curve(fpr_logit,tpr_logit)
print('AUC Score : ', (roc_auc_score(y_train,logit_scores)))

# 5.2 优化正则化系数
* 采用LogisticRegressionCV来交叉验证选择正则化系数C

In [None]:
from sklearn.linear_model import LogisticRegressionCV
# Choose C by cross-validation over the listed candidates. This rebinds `logit`.
logit = LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 1, 10, 100], penalty='l1', solver='saga', max_iter=500, class_weight='balanced', random_state=111)

# Fit the logistic regression
logit.fit(X_train_scaled, y_train)

print(logit.C_)

In [None]:
# Class probabilities for every training sample
logit_scores_proba = logit.predict_proba(X_train_scaled)

# Probability of class 1
logit_scores = logit_scores_proba[:,1]

# False positive rate, true positive rate and thresholds from labels and scores
fpr_logit, tpr_logit, thresh_logit = roc_curve(y_train, logit_scores)

# Plot the curve and print the AUC (both computed on the training part)
plot_roc_curve(fpr_logit,tpr_logit)
print('AUC Score : ', (roc_auc_score(y_train,logit_scores)))

之前是0.817现在反而变成了0.816，虽然采用了balanced权重，但是效果还是不理想

# 5.3降采样处理

In [None]:
# Under-sampling utility
from imblearn.under_sampling import RandomUnderSampler

# Counter tallies how often each value occurs
from collections import Counter
print('Original dataset shape :', Counter(y))

In [None]:
# Create the sampler with a fixed seed
rus = RandomUnderSampler(random_state=111)

# Resample and print the class counts after resampling
X_resampled, y_resampled = rus.fit_resample(X, y)
print('Resampled dataset shape:', Counter(y_resampled))

In [None]:
# Split the resampled data into training and validation parts
from sklearn.model_selection import train_test_split
X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(X_resampled, y_resampled, random_state=111)
X_train_rus.shape, y_train_rus.shape

In [None]:
# Logistic regression on the under-sampled data.
# NOTE(review): the model is fitted and scored on the whole X_resampled (unscaled),
# not on the X_train_rus/X_test_rus split made in the previous cell — confirm this is intended.
logit_resampled = LogisticRegression(random_state=111, solver='saga', penalty='l1', class_weight='balanced', C=1.0, max_iter=500)

logit_resampled.fit(X_resampled, y_resampled)
logit_resampled_proba_res = logit_resampled.predict_proba(X_resampled)
logit_resampled_scores = logit_resampled_proba_res[:, 1]
fpr_logit_resampled, tpr_logit_resampled, thresh_logit_resampled = roc_curve(y_resampled, logit_resampled_scores)
plot_roc_curve(fpr_logit_resampled, tpr_logit_resampled)
print('AUC score: ', roc_auc_score(y_resampled, logit_resampled_scores))

发现AUC更低了
# 5.4 随机森林分类

In [None]:
# Random forest on the under-sampled training part (gradient boosting is imported here for later cells)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
forest = RandomForestClassifier(n_estimators=300, random_state=111, max_depth=5, class_weight='balanced')
forest.fit(X_train_rus, y_train_rus)
y_scores_prob = forest.predict_proba(X_train_rus)
y_scores = y_scores_prob[:, 1]
# ROC curve and AUC on the data the forest was trained on
fpr, tpr, thresh = roc_curve(y_train_rus, y_scores)
plot_roc_curve(fpr, tpr)
print('AUC score:', roc_auc_score(y_train_rus, y_scores))

In [None]:
# Validation: score the forest on the held-out part of the under-sampled data
y_test_proba = forest.predict_proba(X_test_rus)
y_scores_test = y_test_proba[:, 1]
fpr_test, tpr_test, thresh_test = roc_curve(y_test_rus, y_scores_test)
plot_roc_curve(fpr_test, tpr_test)
print('AUC Score:', roc_auc_score(y_test_rus, y_scores_test))

In [None]:
# 看看随机森林法对各个特征的重视程度
def plot_feature_importances(model):
    """Horizontal bar chart of ``model.feature_importances_``.

    Bars are labelled with the column names of the notebook-level frame ``X``.
    """
    n_features = X.shape[1]
    positions = np.arange(n_features)
    plt.figure(figsize=(10, 8))
    plt.barh(positions, model.feature_importances_, align='center')
    plt.yticks(positions, X.columns)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')
    plt.ylim(-1, n_features)

plot_feature_importances(forest)

# 5.5 梯度提升法分类

In [None]:
# Gradient boosted trees, fitted on the (unscaled, not resampled) training part
gbc_clf = GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, max_depth=8, random_state=112)
gbc_clf.fit(X_train, y_train)
gbc_clf_proba = gbc_clf.predict_proba(X_train)
gbc_clf_scores = gbc_clf_proba[:, 1]
# ROC curve and AUC on the data the model was trained on
fpr_gbc, tpr_gbc, thresh_gbc = roc_curve(y_train, gbc_clf_scores)
plot_roc_curve(fpr_gbc, tpr_gbc)
print('AUC Score:', roc_auc_score(y_train, gbc_clf_scores))

梯度提升法分类的结果是最好的

In [None]:
# AUC of the gradient boosting model on the held-out validation part
gbc_val_proba = gbc_clf.predict_proba(X_test)
gbc_val_scores = gbc_val_proba[:, 1]
print('AUC score:', roc_auc_score(y_test, gbc_val_scores))

调节参数

In [None]:
# Smaller gradient boosting model (200 trees, depth 4) used for the submission.
gbc_clf_submission = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05 ,max_depth=4,  random_state=42)
gbc_clf_submission.fit(X_train,y_train)
gbc_clf_proba = gbc_clf_submission.predict_proba(X_train)
gbc_clf_scores = gbc_clf_proba[:,1]
gbc_val_proba = gbc_clf_submission.predict_proba(X_test)
gbc_val_scores = gbc_val_proba[:,1]
fpr_gbc, tpr_gbc, thresh_gbc = roc_curve(y_train, gbc_clf_scores)
# First line: AUC on the training part; second line: AUC on the validation part
print('AUC Score :', roc_auc_score(y_train, gbc_clf_scores))
print('AUC Score :', roc_auc_score(y_test, gbc_val_scores))

In [None]:
# Feature importances of the first gradient boosting model.
# NOTE(review): this plots gbc_clf, not the tuned gbc_clf_submission — confirm which one is meant.
plot_feature_importances(gbc_clf)

# 5.6 用statsmodels包来建立逻辑回归模型

In [None]:
import statsmodels.api as sm
# Logit model on the WOE-encoded training features plus an intercept column.
# Note: this rebinds `logit`, which earlier cells used for the scikit-learn models.
X1=sm.add_constant(train_X)
logit=sm.Logit(train_y,X1)
result=logit.fit()
print(result.summary())

导入测试集的数据，画出ROC曲线来判断模型的准确性
1. 对测试集进行woe转化

In [None]:
# WOE-encode the held-out split with the cut points and WOE values learned on the training split.
test_X = trans_woe(test_X,x1_name,x1_woe,x1_cut)
test_X = trans_woe(test_X,x2_name,x2_woe,x2_cut)
test_X = trans_woe(test_X,x3_name,x3_woe,x3_cut)
test_X = trans_woe(test_X,x7_name,x7_woe,x7_cut)
test_X = trans_woe(test_X,x9_name,x9_woe,x9_cut)
 
# Keep only the five *_woe columns appended above
test_X = test_X.iloc[:,-5:]

2. 拟合模型，画出ROC曲线得到AUC值

In [None]:
from sklearn import metrics
# Score the held-out split with the fitted Logit model and draw its ROC curve.
X3 = sm.add_constant(test_X)
resu = result.predict(X3)
fpr, tpr, threshold = metrics.roc_curve(test_y, resu)
rocauc = metrics.auc(fpr, tpr)
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('TPR')
plt.xlabel('FPR')
plt.show()

AUC约为0.85，还可以

# 六、建立评分卡
# 6.1 评分标准
$\log(odds) = \log \frac{p_{good}}{p_{bad}}$

$Score = offset + factor * log(odds)$

在建立标准评分卡之前，我们需要选取几个评分卡参数：基础分值、 PDO（比率翻倍的分值）和好坏比。 这里， 我们取600分为基础分值，PDO为20 （每高20分好坏比翻一倍），好坏比取20。
# 6.2 建立评分卡

In [None]:
# Scorecard scaling constants
# p is the factor: 20 points (the PDO) per doubling of the odds
p = 20/np.log(2)
# q is the offset: base score 600 at odds of 20
q = 600 - 20*np.log(20)/np.log(2)
 
def get_score(coe,woe,factor):
    """Return the score of each bucket: ``coe * w * factor`` rounded to 0 decimals.

    ``coe`` is the variable's regression coefficient, ``woe`` its list of
    per-bucket WOE values and ``factor`` the scorecard scaling factor.
    """
    return [round(coe * w * factor, 0) for w in woe]
 
# Hard-coded coefficients: x_coe[0] is used as the intercept, x_coe[1:] for x1, x2, x3, x7, x9.
# NOTE(review): presumably copied from an earlier result.summary() output — they go
# stale whenever the data preparation or binning changes; consider reading result.params.
x_coe = [2.6084,0.6327,0.5151,0.5520,0.5747,0.4074]
baseScore = round(q + p * x_coe[0], 0)

In [None]:
# Per-bucket scores for each scorecard variable (coefficient x WOE x factor).
x1_score = get_score(x_coe[1], x1_woe, p)
x2_score = get_score(x_coe[2], x2_woe, p)
x3_score = get_score(x_coe[3], x3_woe, p)
x7_score = get_score(x_coe[4], x7_woe, p)
x9_score = get_score(x_coe[5], x9_woe, p)

# 6.3 计算评分

In [None]:
import copy
cut_t = [x1_cut,x2_cut,x3_cut,x7_cut,x9_cut]
def compute_score(x, cuts=None, scores=None, base=None):
    """Return the total scorecard points for one applicant.

    Parameters
    ----------
    x : sequence of numbers
        Values of x1, x2, x3, x7 and x9, in that order.
    cuts : list of list of float, optional
        Cut points per variable (ascending, starting at -inf). Defaults to the
        notebook-level ``cut_t``.
    scores : list of list of float, optional
        Per-bucket scores per variable. Defaults to
        ``[x1_score, x2_score, x3_score, x7_score, x9_score]``.
    base : float, optional
        Base score. Defaults to the notebook-level ``baseScore``.

    Returns
    -------
    float
        ``base`` plus the score of the bucket each value falls into.

    Raises
    ------
    ValueError
        If ``x``, ``cuts`` and ``scores`` do not have the same length.

    Notes
    -----
    Buckets are right-closed, ``(cuts[k], cuts[k+1]]``. The previous version
    referenced an undefined name ``score`` and, by inserting the value into the
    cut list, added two bucket scores whenever the value equalled a cut point.
    """
    from bisect import bisect_left

    # Resolve the notebook-level defaults at call time.
    if cuts is None:
        cuts = cut_t
    if scores is None:
        scores = [x1_score, x2_score, x3_score, x7_score, x9_score]
    if base is None:
        base = baseScore
    if not (len(x) == len(cuts) == len(scores)):
        raise ValueError("x, cuts and scores must have the same length")

    tot_score = base
    for value, edges, bucket_scores in zip(x, cuts, scores):
        # bisect_left finds the first edge >= value, so the bucket index is one less.
        bucket = bisect_left(edges, value) - 1
        # When there are fewer scores than buckets the last score covers the
        # remaining range, matching the open-ended last bucket of trans_woe.
        bucket = min(max(bucket, 0), len(bucket_scores) - 1)
        tot_score = bucket_scores[bucket] + tot_score
    return tot_score

In [None]:
# x_score=[0.3, 44, 3, 3, 5]
# compute_score(x_score)

# 七、数据输出

In [None]:
# Predict class probabilities for the competition test features and keep the class-1 column.
submission_proba = gbc_clf_submission.predict_proba(W)
submission_scores = submission_proba[:, 1]
submission_scores.shape

In [None]:
# Shape of the test feature frame, to compare with the number of predictions above.
W.shape

In [None]:
# One Id per prediction, starting at 1. The count is taken from the predictions
# instead of a hard-coded 101504, so the frame still builds if the number of
# test rows changes.
ids = np.arange(1, len(submission_scores) + 1)
submission = pd.DataFrame({'Id': ids, 'Probability': submission_scores})
submission.to_csv('submission.csv', index=False)