In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

from  sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt #作图
import seaborn as sns

# 一、数据导入

In [None]:
# 读取训练集和测试集数据
train_data = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-training.csv')
test_data = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-test.csv')

In [None]:
print(train_data.info())
train_data.head(5)

In [None]:
print(test_data.info())
test_data.head(5)

# 二、清洗数据

In [None]:
data = train_data

In [None]:
data.describe()

In [None]:
# 列重命名
data.rename(columns={'Unnamed: 0':'ID'}, inplace=True)

## 1、缺失值处理

缺失值指的是现有数据集中某个或某些属性的值是不完全的。

缺失值的处理方法一般包括：
1. 直接使用含有缺失值的属性（不处理）；
2. 删除含有缺失值的属性（该方法在包含缺失值的属性仅仅包含**极少量**有效值时是有效的）；
3. 直接删除含有缺失值的样本；
4. 缺失值补全：均值插补、建模预测等等

In [None]:
data.isnull().sum()

可以看出
1. 变量MonthlyIncome 缺失值较多，不能直接删除样本，同时缺失值也没有多到能直接删除属性，所以需要补全缺失值，这里使用随机森林预测
2. 变量NumberOfDependents的缺失值较少，这里就直接删除含缺失值的样本

In [None]:
# 用随机森林对缺失值预测填充函数
def set_missing(df):
    """Fill missing ``MonthlyIncome`` values with random-forest predictions.

    A ``RandomForestRegressor`` is fitted on the rows where ``MonthlyIncome``
    is present and then used to predict the rows where it is missing. The
    predictions are rounded to whole numbers and written back into ``df``.

    Columns are addressed by position: only the first 12 columns are used,
    position 6 is treated as the value to predict and positions 1-5 and 7-10
    are the predictors. Position 11 is not used as a predictor.

    NOTE(review): in the frame this notebook passes in, position 1 appears to
    be the modelling target (``SeriousDlqin2yrs``); using it as a predictor
    would leak the label into the imputed feature -- confirm and consider
    removing it from ``feature_positions``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``MonthlyIncome`` column and the positional layout
        described above. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with missing ``MonthlyIncome`` values filled.
    """
    feature_positions = [1, 2, 3, 4, 5, 7, 8, 9, 10]
    income_position = 6

    missing_mask = df.MonthlyIncome.isnull()
    if not missing_mask.any():
        # Nothing to impute. Without this guard the regressor would be asked
        # to predict on an empty matrix, which scikit-learn rejects.
        return df

    # Work on the first 12 columns only, split by whether income is known.
    process_df = df.iloc[:, list(range(12))]
    known = process_df[~missing_mask].values
    unknown = process_df[missing_mask].values

    X = known[:, feature_positions]
    # 1-D target: a column vector here triggers a DataConversionWarning.
    y = known[:, income_position]

    rfr = RandomForestRegressor(random_state=0,
                                n_estimators=200, max_depth=3, n_jobs=-1)
    rfr.fit(X, y)

    # Predict the missing incomes and round them to whole numbers.
    predicted = rfr.predict(unknown[:, feature_positions]).round(0)
    print(predicted)
    print(len(predicted))

    # Write the predictions back into the rows that were missing.
    df.loc[missing_mask, 'MonthlyIncome'] = predicted
    return df

In [None]:
data=set_missing(data)#用随机森林填补比较多的缺失值
data=data.dropna()#删除比较少的缺失值
data=data.drop_duplicates()#删除重复项
# data.to_csv(r'MissingData.csv',index=False)
data.shape

## 2、异常值处理 

异常值指在数据集中存在的不合理的值，又称离群点，比如年龄小于0，或者不符合正态分布的数据。

异常值的处理方法一般包括：
1. 删除含有异常值的样本
2. 将异常值视为缺失值，应用缺失值处理方法
3. 用平均值来修正
4. 不处理

In [None]:
data.describe()

可以从这个结果看到年龄的最小值为0，是不合理的值，且这样的样本很少，所以直接删除异常的样本

In [None]:
data = data[data['age']> 0]

In [None]:
data.hist(bins=50, figsize=(20,15))
plt.show()

可以看出DebtRatio、NumberOfTime30-59DaysPastDueNotWorse、NumberOfTime60-89DaysPastDueNotWorse、NumberOfTimes90DaysLate、NumberRealEstateLoansOrLines、RevolvingUtilizationOfUnsecuredLines的数据分布情况比较异常，应该有一些极端的数值影响了分布图像的呈现。下面分别用箱型图查看一下数值的分布，删去极端值。

### 2.1 DebtRatio

In [None]:
datatemp2=data["DebtRatio"]
datatemp2.plot(kind='box',title='DebtRatio Distribution',sym='r+');

In [None]:
# DebtRatio异常值的数量
print(data[data['DebtRatio'] > 8000].count()) 

可以看出（相较于150000的总样本数量）DebtRatio的异常值很少，不影响整体数据，于是选择删去

In [None]:
data = data[data['DebtRatio'] < 8000]

### 2.2 MonthlyIncome

In [None]:
datatemp3=data["MonthlyIncome"]
datatemp3.plot(kind='box',title='MonthlyIncome Distribution',sym='r+');

In [None]:
print(data[data['MonthlyIncome'] > 50000].count()) 

同理，MonthlyIncome的异常值也较少，删去。

In [None]:
data = data[data['MonthlyIncome'] < 50000]

### 2.3 NumberOfDependents

In [None]:
datatemp4=data["NumberOfDependents"]
datatemp4.plot(kind='box',title='NumberOfDependents Distribution',sym='r+');

In [None]:
print(data[data['NumberOfDependents'] > 10].count()) 

同理，删去。

In [None]:
data = data[data['NumberOfDependents'] < 10]

### 2.4 NumberOfTime30-59DaysPastDueNotWorse

In [None]:
datatemp5=data["NumberOfTime30-59DaysPastDueNotWorse"]
datatemp5.plot(kind='box',title='NumberOfTime30-59DaysPastDueNotWorse Distribution',sym='r+');

In [None]:
print(data[data['NumberOfTime30-59DaysPastDueNotWorse'] > 20].count())

同理，删去。

In [None]:
#剔除异常值
data = data[data['NumberOfTime30-59DaysPastDueNotWorse'] < 20]

### 2.5 NumberOfTime60-89DaysPastDueNotWorse

In [None]:
datatemp6=data["NumberOfTime60-89DaysPastDueNotWorse"]
datatemp6.plot(kind='box',title='NumberOfTime60-89DaysPastDueNotWorse Distribution',sym='r+');

分布正常，可能是前面的操作删除了该属性的异常值样本。

### 2.6 NumberOfTimes90DaysLate

In [None]:
datatemp7=data["NumberOfTimes90DaysLate"]
datatemp7.plot(kind='box',title='NumberOfTimes90DaysLate Distribution',sym='r+');

分布正常，可能是前面的操作删除了该属性的异常值样本。

### 2.7 NumberRealEstateLoansOrLines

In [None]:
datatemp8=data["NumberRealEstateLoansOrLines"]
datatemp8.plot(kind='box',title='NumberRealEstateLoansOrLines Distribution',sym='r+');

In [None]:
print(data[data['NumberRealEstateLoansOrLines'] > 30].count())

同理，删去。

In [None]:
#剔除异常值
data = data[data['NumberRealEstateLoansOrLines'] < 30]

### 2.8 RevolvingUtilizationOfUnsecuredLines

In [None]:
datatemp9=data["RevolvingUtilizationOfUnsecuredLines"]
datatemp9.plot(kind='box',title='RevolvingUtilizationOfUnsecuredLines Distribution',sym='r+');

In [None]:
print(data[data['RevolvingUtilizationOfUnsecuredLines'] > 3].count()) 

同理，删去。

In [None]:
data = data[data['RevolvingUtilizationOfUnsecuredLines'] < 3]

## 3、处理后数据

总体以直方图呈现的数据分布情况

In [None]:

data=data.drop(["ID"],axis=1)
data.hist(bins=50, figsize=(20,15))
plt.show()

# 三、探索性分析

客户收入和年龄分布如下图所示，可以看到两个变量都大致呈正态分布，符合统计分析的假设。

In [None]:
plt.figure(figsize=(15,5))
plt.hist(data.MonthlyIncome,bins=70,alpha=0.8,rwidth=0.9)
plt.title("MonthlyIncome distribution")
plt.ylabel('value of MonthlyIncome', fontsize=12)
plt.xlabel('MonthlyIncome', fontsize=12)
 
plt.show()

In [None]:
MI = data['MonthlyIncome']
# `sns.distplot` is deprecated; a density-scaled histogram with a KDE overlay
# is the replacement seaborn recommends for the same figure.
sns.histplot(MI, kde=True, stat="density", kde_kws={"cut": 3});

In [None]:
plt.figure(figsize=(15,5))
plt.hist(data.age,bins=50,alpha=0.8,rwidth=0.9)
plt.title("age distribution")
plt.ylabel('value of age', fontsize=12)
plt.xlabel('age', fontsize=12)
 
plt.show()

In [None]:
age = data['age']
# `sns.distplot` is deprecated; a density-scaled histogram with a KDE overlay
# is the replacement seaborn recommends for the same figure.
sns.histplot(age, kde=True, stat="density", kde_kws={"cut": 3});

# 四、变量选择

通过WOE分析方法，即，通过比较指标分箱和对应分箱的违约概率来确定指标是否符合经济意义。

## 1、分箱处理

 变量分箱（binning）是对连续变量离散化（discretization）的一种称呼。
    
 信用评分卡开发中一般有常用的等距分段、等深分段、最优分段。

 其中等距分段（Equal length intervals）是指分段的区间是一致的，比如年龄以十年作为一个分段；

 等深分段（Equal frequency intervals）是先确定分段数量，然后令每个分段中数据数量大致相等；

 最优分段（Optimal Binning）又叫监督离散化（supervised discretization），使用递归划分（Recursive Partitioning）将连续变量分为分段，背后是一种基于条件推断查找较佳分组的算法。

 我们首先选择对连续变量进行最优分段，在连续变量的分布不满足最优分段的要求时，再考虑对连续变量进行等距分段。最优分箱的代码如下：

In [None]:
from sklearn.model_selection import train_test_split
Y = data['SeriousDlqin2yrs']
X = data.iloc[:, 1:]
#测试集占比30%
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
train = pd.concat([Y_train, X_train], axis=1)
test = pd.concat([Y_test, X_test], axis=1)
test.to_csv('TestData.csv',index=False)

In [None]:
# 定义自动分箱函数
 
from scipy import stats
def mono_bin(Y, X, n = 20):
    """Bin ``X`` into quantile buckets and tabulate the event rate and WOE per bucket.

    Starting from ``n`` quantile buckets, the bucket count is lowered by one
    per pass until the Spearman correlation between the per-bucket mean of
    ``X`` and the per-bucket mean of ``Y`` has absolute value 1. A NaN
    correlation also ends the loop, because ``abs(nan) < 1`` is false.

    Note: this notebook defines ``mono_bin`` a second time further down, with
    additional return values; that definition replaces this one once its cell
    has been run.

    Parameters
    ----------
    Y : pandas.Series
        Outcome per row; its sum is used as the event count (presumably 0/1 flags).
    X : pandas.Series
        Variable to be binned.
    n : int, default 20
        Number of quantile buckets to try first.

    Returns
    -------
    pandas.DataFrame
        One row per bucket, ordered by the bucket minimum, with columns
        ``min``, ``max``, ``sum``, ``total``, ``rate`` and ``woe``. The table
        is also printed.
    """
    # Although the original code called these "good"/"bad", Y.sum() counts
    # the rows flagged 1 and the remainder is everything else.
    event_total = Y.sum()
    non_event_total = Y.count() - event_total

    rho = 0
    while np.abs(rho) < 1:
        # duplicates="drop" is required: quantile edges often repeat and the
        # default ("raise") makes qcut fail. The trade-off is that fewer
        # buckets than requested may come back.
        binned = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n, duplicates="drop")})
        grouped = binned.groupby('Bucket', as_index = True)
        rho, _ = stats.spearmanr(grouped.X.mean(), grouped.mean().Y)
        n = n - 1

    # Per-bucket summary built from the last grouping tried.
    summary = pd.DataFrame({'min': grouped.min().X})
    summary['max'] = grouped.max().X
    summary['sum'] = grouped.sum().Y
    summary['total'] = grouped.count().Y
    summary['rate'] = grouped.mean().Y
    # WOE: log of the bucket odds relative to the overall odds.
    summary['woe'] = np.log((summary['rate'] / (1 - summary['rate'])) / (event_total / non_event_total))

    ordered = summary.sort_values(by='min').reset_index(drop=True)
    print("=" * 60)
    print(ordered)
    return ordered

 将每个变量都尝试分箱

In [None]:
mono_bin(data.SeriousDlqin2yrs,data['DebtRatio'])

In [None]:
mono_bin(data.SeriousDlqin2yrs,data.RevolvingUtilizationOfUnsecuredLines)

In [None]:
mono_bin(data.SeriousDlqin2yrs,data.age)

In [None]:
mono_bin(data.SeriousDlqin2yrs,data.MonthlyIncome)

In [None]:
mono_bin(data.SeriousDlqin2yrs,data['NumberOfTime30-59DaysPastDueNotWorse'])

In [None]:
mono_bin(data.SeriousDlqin2yrs,data['NumberOfTime60-89DaysPastDueNotWorse'])

In [None]:
mono_bin(data.SeriousDlqin2yrs,data['NumberOfTimes90DaysLate'])

In [None]:
mono_bin(data.SeriousDlqin2yrs,data.NumberRealEstateLoansOrLines)

In [None]:
mono_bin(data.SeriousDlqin2yrs,data.NumberOfDependents)

In [None]:
mono_bin(data.SeriousDlqin2yrs,data.NumberOfOpenCreditLinesAndLoans)

从上面每个变量的分箱情况可以看出，DebtRatio、RevolvingUtilizationOfUnsecuredLines、age、MonthlyIncome是可以得出有效的分箱的，NumberOfTime30-59DaysPastDueNotWorse、NumberOfDependents的分箱非常不均匀，也不是符合最优分箱的，而NumberOfOpenCreditLinesAndLoans、NumberOfTimes90DaysLate、NumberOfTime60-89DaysPastDueNotWorse、NumberRealEstateLoansOrLines最终结果只分得一个组，那么针对不能最优分箱的变量，下面将进行自定义分箱。


In [None]:
# 连续变量离散化
pinf = float('inf')#正无穷大
ninf = float('-inf')#负无穷大
 
cutx6 = [ninf, 1, 2, 3, 5, pinf]
cutx7 = [ninf, 0, 1, 3, 5, pinf]
cutx8 = [ninf, 0,1,2, 3, pinf]
cutx9 = [ninf, 0, 1, 3, pinf]

## 2、WOE

WoE分析， 是对指标分箱、计算各个档位的WoE值并观察WoE值随指标变化的趋势。其中WoE的数学定义是:
$$
woe=ln(goodattribute/badattribute)
$$
在进行分析时，我们需要对各指标从小到大排列，并计算出相应分档的WoE值。其中正向指标越大，WoE值越小；反向指标越大，WoE值越大。正向指标的WoE值负斜率越大，反向指标的正斜率越大，则说明指标区分能力好。WoE值趋近于直线，则意味指标判断能力较弱。若正向指标和WoE正相关趋势、反向指标同WoE出现负相关趋势，则说明此指标不符合经济意义，则应当予以去除。


## 3、相关性分析

相关性分析进行初步的检查，进一步检查模型的IV作为变量筛选的依据。

In [None]:
# 数据集各变量的相关性。

corr = data.corr()#计算各变量的相关性系数
xticks = ['x0','x1','x2','x3','x4','x5','x6','x7','x8','x9','x10']#x轴标签
yticks = list(corr.index)#y轴标签
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
# 通过seaborn包，调用heatmap()绘图函数进行绘制相关性图
sns.heatmap(corr, annot=True, cmap='rainbow', ax=ax1, annot_kws={'size': 9, 'weight': 'bold', 'color': 'blue'})#绘制相关性系数热力图
ax1.set_xticklabels(xticks, rotation=0, fontsize=10)
ax1.set_yticklabels(yticks, rotation=0, fontsize=10)
plt.show()

 从热力图可以看出，各变量之间的相关性很小。

接下来，进一步计算每个变量的Information Value（IV）。IV指标一般用来衡量自变量的预测能力。

其公式为：
$$
IV=sum((goodattribute-badattribute)*ln(goodattribute/badattribute))
$$
通过IV值判断变量预测能力的标准是：
* < 0.02: unpredictive
* 0.02 to 0.1: weak
* 0.1 to 0.3:  medium
* 0.3 to 0.5:  strong
* \> 0.5:      suspicious


In [None]:
# IV的实现放在mono_bin()函数里面，代码实现如下：
 
# 定义自动分箱函数
def mono_bin(Y, X, n = 20):
    """Quantile-bin ``X`` and return the bucket table, IV, cut points and WOE list.

    Starting from ``n`` quantile buckets, the bucket count is lowered by one
    per pass until the Spearman correlation between the per-bucket means of
    ``X`` and ``Y`` has absolute value 1. A NaN correlation also ends the
    loop, because ``abs(nan) < 1`` is false.

    The cut points are taken from the edges of the buckets that were actually
    produced. Recomputing them from ``X.quantile`` (as this function used to
    do) goes wrong whenever ``duplicates="drop"`` collapses repeated quantile
    edges: the cut list then contains repeated values and no longer has one
    interval per WOE entry.

    Parameters
    ----------
    Y : pandas.Series
        Outcome per row; its sum is used as the event count (presumably 0/1 flags).
    X : pandas.Series
        Variable to be binned.
    n : int, default 20
        Number of quantile buckets to try first.

    Returns
    -------
    d4 : pandas.DataFrame
        One row per bucket in ascending bucket order with columns ``min``,
        ``max``, ``sum``, ``total``, ``rate``, ``woe``, ``goodattribute`` and
        ``badattribute``. The table is also printed.
    iv : float
        Sum over buckets of ``(goodattribute - badattribute) * woe``.
    cut : list of float
        ``[-inf, <interior bucket edges rounded to 4 decimals>, inf]``; always
        one element longer than ``woe``.
    woe : list of float
        Per-bucket WOE rounded to 3 decimals, in the same order as ``d4``.
        A bucket whose rate is exactly 0 or 1 yields an infinite WOE.
    """
    # Although named "good"/"bad", `good` counts the rows flagged 1 in Y and
    # `bad` counts the remaining rows.
    good = Y.sum()
    bad = Y.count() - good

    r = 0
    while np.abs(r) < 1:
        # retbins=True exposes the edges that survive duplicates="drop".
        buckets, edges = pd.qcut(X, n, duplicates="drop", retbins=True)
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": buckets})
        # observed=False keeps one group per bucket, independent of the
        # pandas default for categorical groupers.
        d2 = d1.groupby('Bucket', as_index=True, sort=True, observed=False)
        r, _ = stats.spearmanr(d2.mean().X, d2.mean().Y)
        n = n - 1

    d3 = pd.DataFrame({'min': d2.min().X})
    d3['max'] = d2.max().X
    d3['sum'] = d2.sum().Y
    d3['total'] = d2.count().Y
    d3['rate'] = d2.mean().Y
    # WOE: log of the bucket odds relative to the overall odds.
    d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (good / bad))
    d3['goodattribute'] = d3['sum'] / good
    d3['badattribute'] = (d3['total'] - d3['sum']) / bad
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()

    # The groups are already in ascending bucket order, which equals the
    # order by `min` for non-empty buckets.
    d4 = d3.reset_index(drop=True)
    print("=" * 60)
    print(d4)

    # Interior edges of the final bucketing, with open-ended outer bins.
    cut = [float('-inf')]
    cut.extend(round(float(edge), 4) for edge in edges[1:-1])
    cut.append(float('inf'))

    woe = list(d4['woe'].round(3))
    return d4, iv, cut, woe
 
#自定义分箱函数
def self_bin(Y,X,cat):
    """Bin ``X`` on caller-supplied edges and return the bucket table, IV and WOE list.

    Buckets are the right-closed intervals produced by ``pd.cut(X, cat)``.
    The result always has one row (and one WOE entry) per interval of
    ``cat``, in the order of ``cat``. An interval that contains no rows keeps
    its position and gets NaN statistics, so the WOE list stays aligned with
    the cut points.

    Parameters
    ----------
    Y : pandas.Series
        Outcome per row; its sum is used as the event count (presumably 0/1 flags).
    X : pandas.Series
        Variable to be binned.
    cat : sequence of float
        Ascending bin edges passed to ``pd.cut``.

    Returns
    -------
    d4 : pandas.DataFrame
        One row per interval (indexed by interval) with columns ``min``,
        ``max``, ``sum``, ``total``, ``rate``, ``woe``, ``goodattribute`` and
        ``badattribute``. The table is also printed.
    iv : float
        Sum over buckets of ``(goodattribute - badattribute) * woe``; NaN
        terms from empty buckets are skipped by the pandas sum.
    woe : list of float
        Per-interval WOE rounded to 3 decimals, in the order of ``cat``.
    """
    # Although named "good"/"bad", `good` counts the rows flagged 1 in Y and
    # `bad` counts the remaining rows.
    good = Y.sum()
    bad = Y.count() - good

    d1 = pd.DataFrame({'X': X, 'Y': Y, 'Bucket': pd.cut(X, cat)})
    # observed=False keeps empty intervals; sort=True returns them in
    # interval order.
    d2 = d1.groupby('Bucket', as_index=True, sort=True, observed=False)

    d3 = pd.DataFrame({'min': d2.min().X})
    d3['max'] = d2.max().X
    d3['sum'] = d2.sum().Y
    d3['total'] = d2.count().Y
    d3['rate'] = d2.mean().Y
    # WOE: log of the bucket odds relative to the overall odds.
    d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (good / bad))
    d3['goodattribute'] = d3['sum'] / good
    d3['badattribute'] = (d3['total'] - d3['sum']) / bad
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()

    # Keep interval order. Sorting by `min` would move an empty interval
    # (NaN min) to the end and shift every later WOE value by one position.
    d4 = d3
    print("=" * 60)
    print(d4)
    woe = list(d4['woe'].round(3))
    return d4, iv, woe

In [None]:
dfx1, ivx1,cutx1,woex1 = mono_bin(data.SeriousDlqin2yrs, data.RevolvingUtilizationOfUnsecuredLines,n=10)
dfx2, ivx2,cutx2,woex2 = mono_bin(data.SeriousDlqin2yrs, data.age, n=10)
dfx4, ivx4,cutx4,woex4 =mono_bin(data.SeriousDlqin2yrs, data.DebtRatio, n=20)
dfx5, ivx5,cutx5,woex5 =mono_bin(data.SeriousDlqin2yrs, data.MonthlyIncome, n=10)
 
# 连续变量离散化
cutx3 = [ninf, 0, 1, 3, 5, pinf]
cutx6 = [ninf, 1, 2, 3, 5, pinf]
cutx7 = [ninf, 0, 1, 3, 5, pinf]
cutx8 = [ninf, 0,1,2, 3, pinf]
cutx9 = [ninf, 0, 1, 3, pinf]
cutx10 = [ninf, 0, 1, 2, 3, 5, pinf]
dfx3, ivx3,woex3 = self_bin(data.SeriousDlqin2yrs, data['NumberOfTime30-59DaysPastDueNotWorse'], cutx3)
dfx6, ivx6,woex6= self_bin(data.SeriousDlqin2yrs, data['NumberOfOpenCreditLinesAndLoans'], cutx6)
dfx7, ivx7,woex7 = self_bin(data.SeriousDlqin2yrs, data['NumberOfTimes90DaysLate'], cutx7)
dfx8, ivx8,woex8 = self_bin(data.SeriousDlqin2yrs, data['NumberRealEstateLoansOrLines'], cutx8)
dfx9, ivx9,woex9 = self_bin(data.SeriousDlqin2yrs, data['NumberOfTime60-89DaysPastDueNotWorse'], cutx9)
dfx10, ivx10,woex10 = self_bin(data.SeriousDlqin2yrs, data['NumberOfDependents'], cutx10)

In [None]:
# 生成的IV图代码：
 
ivlist=[ivx1,ivx2,ivx3,ivx4,ivx5,ivx6,ivx7,ivx8,ivx9,ivx10]#各变量IV
index=['x1','x2','x3','x4','x5','x6','x7','x8','x9','x10']#x轴的标签
fig1 = plt.figure(1)
ax1 = fig1.add_subplot(1, 1, 1)
x = np.arange(len(index))+1
ax1.bar(x, ivlist, width=0.4)#生成柱状图
ax1.set_xticks(x)
ax1.set_xticklabels(index, rotation=0, fontsize=12)
ax1.set_ylabel('IV(Information Value)', fontsize=14)
#在柱状图上添加数字标签
for a, b in zip(x, ivlist):
    plt.text(a, b + 0.01, '%.4f' % b, ha='center', va='bottom', fontsize=10)
plt.show()

输出的各变量IV值如上图。

可以看出，DebtRatio(x4)、MonthlyIncome(x5)、NumberOfOpenCreditLinesAndLoans(x6)、NumberRealEstateLoansOrLines(x8)和NumberOfDependents(x10)变量的IV值明显较低，所以予以删除。

# 五、模型分析

证据权重（Weight of Evidence,WOE）转换可以将Logistic回归模型转变为标准评分卡格式。引入WOE转换的目的并不是为了提高模型质量，只是一些变量不应该被纳入模型，这或者是因为它们不能增加模型值，或者是因为与其模型相关系数有关的误差较大，其实建立标准信用评分卡也可以不采用WOE转换。这种情况下，Logistic回归模型需要处理更大数量的自变量。尽管这样会增加建模程序的复杂性，但最终得到的评分卡都是一样的。

在建立模型之前，我们需要将筛选后的变量转换为WoE值，便于信用评分。

## 1、WOE转换

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn import metrics
import statsmodels.api as sm
import math
 
def trans_woe(var,var_name,x_woe,x_cut):
    """Add a ``<var_name>_woe`` column holding the WOE of the bin each value falls in.

    Bins are right-closed: the first WOE applies to values ``<= x_cut[1]``,
    each middle WOE ``i`` to values in ``(x_cut[i], x_cut[i + 1]]`` and the
    last WOE to values above ``x_cut[len(x_woe) - 1]``. Rows for which no
    comparison is true (such as missing values) are not assigned.

    Parameters
    ----------
    var : pandas.DataFrame
        Frame containing ``var_name``. It is modified in place.
    var_name : str
        Name of the column to encode.
    x_woe : sequence of float
        WOE value per bin, in ascending bin order.
    x_cut : sequence of float
        Bin edges; ``x_cut[i]`` and ``x_cut[i + 1]`` bound bin ``i``.

    Returns
    -------
    pandas.DataFrame
        The same ``var`` object, with the WOE column added or updated.
    """
    woe_name = var_name + '_woe'
    last = len(x_woe) - 1
    for position, weight in enumerate(x_woe):
        values = var[var_name]
        if position == 0:
            # Lowest bin: everything up to and including the first interior edge.
            in_bin = values <= x_cut[1]
        elif position < last:
            in_bin = (values > x_cut[position]) & (values <= x_cut[position + 1])
        else:
            # Highest bin: everything above the last interior edge.
            in_bin = values > x_cut[last]
        var.loc[in_bin, woe_name] = weight
    return var
 
x1_name = 'RevolvingUtilizationOfUnsecuredLines'
x2_name = 'age'
x3_name = 'NumberOfTime30-59DaysPastDueNotWorse'
x7_name = 'NumberOfTimes90DaysLate'
x9_name = 'NumberOfTime60-89DaysPastDueNotWorse'

X_train = trans_woe(X_train,x1_name,woex1,cutx1)
X_train = trans_woe(X_train,x2_name,woex2,cutx2)
X_train = trans_woe(X_train,x3_name,woex3,cutx3)
X_train = trans_woe(X_train,x7_name,woex7,cutx7)
X_train = trans_woe(X_train,x9_name,woex9,cutx9)

## 2、Logistic模型建立和模型检验

直接调用statsmodels包来实现逻辑回归

In [None]:
# Save the WOE-encoded training features as a notebook artifact.
X_train.to_csv('WoeData.csv', index=False)
#6.2 Fit the logistic regression
# The saved file is deliberately not read back into `data`: doing so replaced
# the cleaned working frame with a frame nothing used, so earlier cells could
# no longer be re-run.
# Dependent variable
Y=train['SeriousDlqin2yrs']
# Independent variables: drop every raw column so that only the *_woe columns remain
X=X_train.drop(['RevolvingUtilizationOfUnsecuredLines','age','NumberOfTime30-59DaysPastDueNotWorse','NumberOfTimes90DaysLate','NumberOfTime60-89DaysPastDueNotWorse','DebtRatio','MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents'],axis=1)
X1=sm.add_constant(X)
Y1=Y.to_frame(name='SeriousDlqin2yrs')
logit=sm.Logit(Y,X1)
result=logit.fit()
print(result.summary())
X1.columns

逻辑回归各变量都已通过显著性检验，满足要求

使用在建模开始阶段预留的test数据进行检验。通过ROC曲线和AUC来评估模型的拟合能力。
在Python中，可以利用sklearn.metrics，它能方便比较两个分类器，自动计算ROC和AUC。

In [None]:
#6.3 Model validation

# Add the WOE columns to the held-out features. X_test keeps its raw columns,
# so this cell can be run again without failing on columns it had discarded.
X_test = trans_woe(X_test,x1_name,woex1,cutx1)
X_test = trans_woe(X_test,x2_name,woex2,cutx2)
X_test = trans_woe(X_test,x3_name,woex3,cutx3)
X_test = trans_woe(X_test,x7_name,woex7,cutx7)
X_test = trans_woe(X_test,x9_name,woex9,cutx9)

# Dependent variable
Y_test = test['SeriousDlqin2yrs']
# Independent variables: select the WOE columns by name, in the order in which
# they were added to X_train, instead of assuming they are the last five columns.
woe_columns = [name + '_woe' for name in (x1_name, x2_name, x3_name, x7_name, x9_name)]
X_test_woe = X_test[woe_columns]
X3 = sm.add_constant(X_test_woe)
resu = result.predict(X3)  # predicted probabilities
fpr, tpr, threshold = metrics.roc_curve(Y_test, resu)
rocauc = metrics.auc(fpr, tpr)  # area under the ROC curve
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)  # ROC curve
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('真正率')
plt.xlabel('假正率')
plt.show()

AUC值为0.85，表明该模型的预测效果不错，区分好坏客户的能力较强（注意：AUC衡量的是模型的排序区分能力，并不等同于分类正确率）。

# 六、评估结果

评分标准
$$
a=log(p_{good}/p_{bad})
$$
$$
Score = offset + factor * log(odds)
$$

取600分为基础分值，PDO(比率翻倍的分值)为20（每高20分好坏比翻一倍），好坏比取20。

In [None]:
p = 20 / math.log(2)
q = 600 - 20 * math.log(20) / math.log(2)
#7.2 部分评分
#计算分数函数 
def get_score(coe,woe,factor):
    """Convert the WOE values of one variable into score points.

    Each score is ``coe * w * factor`` rounded to zero decimals. The list is
    printed before it is returned.

    Parameters
    ----------
    coe : float
        Coefficient applied to every WOE value.
    woe : iterable of float
        WOE value per bin.
    factor : float
        Scaling factor applied to every product.

    Returns
    -------
    list of float
        One rounded score per WOE value, in input order.
    """
    scores = [round(coe * w * factor, 0) for w in woe]
    print(scores)
    return scores
   
 
 
# Coefficients used for scoring: coe[0] feeds the base score and coe[1:] are
# paired with the per-variable WOE lists further down (presumably the intercept
# followed by the five WOE coefficients of the fitted model).
# NOTE(review): these values are typed in by hand and will go stale whenever the
# fit changes -- confirm them (including the sign of the first entry) against
# `result.params`.
coe = [2.6138,0.6228,0.4894,0.5596,0.5747 ,0.4248]# taken from the earlier logistic regression result
# Base score: offset `q` plus `p` times the first coefficient, rounded to a whole number.
baseScore = round(q + p * coe[0], 0)
print(baseScore)

In [None]:
# 各项部分分数
x1 = get_score(coe[1], woex1, p)
x2 = get_score(coe[2], woex2, p)
x3 = get_score(coe[3], woex3, p)
x7 = get_score(coe[4], woex7, p)
x9 = get_score(coe[5], woex9, p)

根据变量来计算分数

In [None]:
def compute_score(series,cut,score):
    """Look up the score of the bin each value of ``series`` falls in.

    Bins are right-closed, ``(cut[i], cut[i + 1]]``, the same convention that
    ``pd.cut``/``pd.qcut`` and ``trans_woe`` use in this notebook. A value
    lying exactly on an edge therefore gets the score of the bin that ends at
    that edge. The previous implementation compared with ``>=`` and so
    assigned such values the score of the next bin up.

    Values are read positionally, so ``series`` may carry any index.

    Parameters
    ----------
    series : pandas.Series or sequence of float
        Values to score.
    cut : sequence of float
        Ascending bin edges; the first and last entries are the outer bounds
        (``-inf`` and ``inf`` where this notebook builds them) and are not
        used for the comparison.
    score : sequence
        Score per bin, ``score[i]`` belonging to ``(cut[i], cut[i + 1]]``.

    Returns
    -------
    list
        ``score`` entries, one per value of ``series``, in input order.
    """
    values = np.asarray(series, dtype=float)
    interior_edges = np.asarray(cut[1:-1], dtype=float)
    # side='left' counts the interior edges strictly below each value, which
    # is the index of the right-closed bin that contains it.
    bin_index = np.searchsorted(interior_edges, values, side='left')
    return [score[position] for position in bin_index]

In [None]:
test1 = pd.read_csv('TestData.csv')
test1['BaseScore']=np.zeros(len(test1))+baseScore
test1['x1'] = compute_score(test1['RevolvingUtilizationOfUnsecuredLines'], cutx1, x1)
test1['x2'] = compute_score(test1['age'], cutx2, x2)
test1['x3'] = compute_score(test1['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, x3)
test1['x7'] = compute_score(test1['NumberOfTimes90DaysLate'], cutx7, x7)
test1['x9'] = compute_score(test1['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, x9)
test1['Score'] = test1['x1'] + test1['x2'] + test1['x3'] + test1['x7'] +test1['x9']  + baseScore
test1.to_csv(r'ScoreData.csv', index=False)

In [None]:
test1.head(20)