In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 七步流程

1. 定义问题
2. 获取训练数据和测试数据
3. 整理、准备、清洗数据
4. 分析、发现模式、探索数据
5. 建模、预测、求解问题
6. 可视化、报告、呈现问题求解步骤和最终结论
7. 提交

# 1. 定义问题

基于客户数据，预测客户在未来两年内陷入财务危机的概率，以此改善银行信用评分的质量。


**导包**

In [None]:
#加载包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# 2.获取训练数据

In [None]:
data_train = pd.read_csv('../input/givemesc/cs-training.csv')

# 3. 整理、准备、清洗数据
## 3.1. 查看数据

In [None]:
data_train.info()

**查看Data Dictionary.xls文件可知，各数据含义如下所示：**

|                变量名                |                    定义                    |     数据类型      |
| :----------------------------------: | :----------------------------------------: | :---------------: |
|      SeriousDlqin2yrs（目标值）      | 是否有超过90天或更长时间逾期未还的不良行为 | Y/N（0为好1为坏） |
| RevolvingUtilizationOfUnsecuredLines |                可用额度比值                |    percentage     |
|                 age                  |                    年龄                    |      integer      |
| NumberOfTime30-59DaysPastDueNotWorse |              逾期30-59天笔数               |      integer      |
|              DebtRatio               | 负债比率（每月偿还债务、赡养费、生活费用之和除以月总收入） |    percentage     |
|            MonthlyIncome             |                   月收入                   |       real        |
|   NumberOfOpenCreditLinesAndLoans    |                  信贷数量                  |      integer      |
|       NumberOfTimes90DaysLate        |                逾期90天笔数                |      integer      |
|     NumberRealEstateLoansOrLines     |               固定资产贷款量               |      integer      |
| NumberOfTime60-89DaysPastDueNotWorse |              逾期60-89天笔数               |      integer      |
|          NumberOfDependents          |                  家属数量                  |      integer      |

In [None]:
data_train.head(5)

In [None]:
data_train.describe()

**查看哪些变量具有缺失值**

In [None]:
data_train.isnull().sum()

**可以看出以下结论：**
1. 数据缺失值较少
2. Unnamed: 0 列为序号，可以直接删去
3. NumberOfDependents缺失值较少，在训练过程中可以直接删去，测试集时可以使用众数填充
4. MonthlyIncome缺失值较多，不宜直接删除，可以使用模型预测值进行填充

## 3.2. 数据清洗
### 3.2.1. 直接删去序号列

In [None]:
# Remove the "Unnamed: 0" column; errors="ignore" keeps this cell re-runnable
# once the column is already gone (a plain drop would raise KeyError).
data_train = data_train.drop(columns=["Unnamed: 0"], errors="ignore")

### 3.2.2. 随机森林填补MonthlyIncome缺失值

In [None]:
from sklearn.ensemble import RandomForestRegressor

def fill_missing(data, to_fill, exclude=('NumberOfDependents',)):
    """Impute the missing values of one column with a random-forest regressor.

    A RandomForestRegressor is fitted on the rows where ``to_fill`` is present,
    using every other column except those named in ``exclude``, and its rounded
    predictions replace the missing entries.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied and never modified.
    to_fill : str
        Name of the column whose missing values are imputed.
    exclude : str or iterable of str, optional
        Columns left out of the feature set (e.g. because they still contain
        missing values). Names that are not present in ``data`` are ignored.
        Defaults to ``('NumberOfDependents',)``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the missing ``to_fill`` values filled in. When
        nothing is missing the copy is returned unchanged.
    """
    df = data.copy()
    missing = df[to_fill].isnull()

    # Nothing to impute: return early instead of predicting on an empty frame,
    # which the regressor rejects.
    if not missing.any():
        return df

    if isinstance(exclude, str):
        exclude = (exclude,)
    skipped = {to_fill, *exclude}
    # Column order is preserved so the fitted model is reproducible
    features = [name for name in df.columns if name not in skipped]

    X = df.loc[:, features]
    rfr = RandomForestRegressor(random_state=22, n_estimators=200, max_depth=3, n_jobs=-1)
    rfr.fit(X.loc[~missing], df.loc[~missing, to_fill])
    df.loc[missing, to_fill] = rfr.predict(X.loc[missing]).round()
    return df

### 3.2.3. 填补MonthlyIncome，并直接删除'NumberOfDependents'缺失的行

In [None]:
# Impute MonthlyIncome first, then discard every row that still holds a missing value.
data_train = fill_missing(data_train, 'MonthlyIncome').dropna()
data_train.shape

### 3.2.4. 异常值处理
age 字段中包含有为 0 的值，通常认为该值为异常值，查看数据可以发现仅有一条数据年龄为0，因此可以直接删除

In [None]:
# Keep only rows with a positive age; an age of 0 is treated as invalid.
data_train = data_train[data_train['age'] > 0]

import matplotlib.pyplot as plt
# The three past-due counters, inspected together for outliers
columns = ['NumberOfTime30-59DaysPastDueNotWorse',
          'NumberOfTime60-89DaysPastDueNotWorse',
          'NumberOfTimes90DaysLate']
# Horizontal box plot of the three counters
data_train.loc[:, columns].plot.box(vert=False)

从业务上考虑，不应当出现这样高的逾期次数，这里同样删除这些异常数据

In [None]:
# Keep only rows whose value is below 90 in every past-due counter listed in
# `columns` (defined in the previous code cell); larger counts are treated as outliers.
for col in columns:
    data_train = data_train.loc[data_train[col] < 90]

**封装一个数据处理函数用于之后处理测试数据**

In [None]:
def deal_with_data(df_ori):
    """Return a cleaned copy of a raw frame; the input is left untouched.

    Steps: remove the "Unnamed: 0" column, fill missing NumberOfDependents
    with that column's most frequent value, then impute MonthlyIncome through
    fill_missing.
    """
    cleaned = df_ori.copy().drop(columns=["Unnamed: 0"])
    most_common = cleaned['NumberOfDependents'].mode()[0]
    cleaned['NumberOfDependents'] = cleaned['NumberOfDependents'].fillna(most_common)
    return fill_missing(cleaned, 'MonthlyIncome')

# 4. 分析、发现模式、探索数据
## 4.1. 查看年龄分布

In [None]:
data_train['age'].plot.hist(bins=30);

## 4.2. 使用小于 99% 的分位数的数据查看收入分布

In [None]:
income = data_train['MonthlyIncome']
# Take the cutoff from the data (99% quantile) instead of a hard-coded number,
# so the plot stays correct if the cleaning steps above change.
income_cap = income.quantile(0.99)
income.loc[income < income_cap].plot.hist(bins=50)

## 4.3. 查看变量之间的相关性

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
corr = data_train.corr()
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corr, annot=True, vmax=1, square=True, cmap='Blues', ax=ax)
plt.show()

# 5.建模、预测、求解问题
## 5.1. 数据分箱
### 5.1.1. 定义数据分箱函数，查找最优分箱

In [None]:
# 根据woe以及IV值定义分箱函数
import numpy as np
import pandas as pd
import scipy

def auto_bin(DF, X, Y, n=5, iv=True, detail=False,q=20):
    """Bin a numeric column by merging adjacent quantile bins with a chi-square test.

    The column is first cut into at most ``q`` equal-frequency bins. Bins that
    contain only one of the two classes are merged into a neighbour. After
    that, the pair of adjacent bins whose class counts give the largest
    chi-square p-value is merged repeatedly until ``n`` bins remain.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame holding at least the columns ``X`` and ``Y``.
    X : str
        Name of the column to bin.
    Y : str
        Name of the label column; rows are counted where it equals 0 and 1.
    n : int
        Number of bins to keep.
    iv : bool
        Print the IV value after every merge.
    detail : bool
        Print the full bin table after every merge.
    q : int
        Number of initial quantile bins (duplicate edges are dropped, so
        fewer bins may be produced).

    Returns
    -------
    pandas.DataFrame
        One row per bin with the columns ``min``, ``max``, ``count_0``,
        ``count_1``, ``total``, ``percentage``, ``bad_rate`` and ``woe``.
        Intervals are open on the left and closed on the right.
    """
    # Local import: a bare ``import scipy`` does not guarantee that the
    # ``scipy.stats`` submodule has been loaded.
    from scipy.stats import chi2_contingency

    DF = DF[[X, Y]].copy()

    # Equal-frequency initial cut (pd.qcut); pd.cut would give equal-width bins
    DF["qcut"], bins = pd.qcut(DF[X], retbins=True, q=q, duplicates="drop")
    # observed=False keeps bins in which a class never occurs (count 0), so
    # both count series stay aligned with the bin edges.
    count_y0 = DF.loc[DF[Y] == 0].groupby(by="qcut", observed=False)[Y].count()
    count_y1 = DF.loc[DF[Y] == 1].groupby(by="qcut", observed=False)[Y].count()
    # Each entry: (lower edge, upper edge, number of 0s, number of 1s)
    num_bins = [*zip(bins, bins[1:], count_y0, count_y1)]

    def get_woe(num_bins):
        # Build the per-bin table including the WOE of every bin
        columns = ["min", "max", "count_0", "count_1"]
        df = pd.DataFrame(num_bins, columns=columns)

        df["total"] = df.count_0 + df.count_1
        df["percentage"] = df.total / df.total.sum()
        df["bad_rate"] = df.count_1 / df.total
        df["woe"] = np.log((df.count_0 / df.count_0.sum()) /
                           (df.count_1 / df.count_1.sum()))
        return df

    def get_iv(bins_df):
        # IV = sum over bins of (share of 0s - share of 1s) * WOE
        rate = ((bins_df.count_0 / bins_df.count_0.sum()) -
                (bins_df.count_1 / bins_df.count_1.sum()))
        return np.sum(rate * bins_df.woe)

    def merge_with_next(pos):
        # Replace bins pos and pos+1 by their union
        left, right = num_bins[pos], num_bins[pos + 1]
        num_bins[pos:pos + 2] = [(left[0], right[1],
                                  left[2] + right[2],
                                  left[3] + right[3])]

    # Make sure every bin contains both classes. The first bin merges forward,
    # any other bin merges backward. The loop is bounded by the number of bins
    # rather than by a fixed iteration count, so it also works for q > 20.
    while len(num_bins) > 1:
        single_class = next(
            (pos for pos, item in enumerate(num_bins) if 0 in item[2:]), None)
        if single_class is None:
            break
        merge_with_next(0 if single_class == 0 else single_class - 1)

    # Merge the most similar adjacent pair until only n bins remain.
    # pvs starts empty so the final print cannot hit an undefined name when
    # the initial cut already yields n bins or fewer.
    pvs = []
    while len(num_bins) > max(n, 1):
        # Index 1 of the chi2_contingency result is the p-value (index 0 is chi2)
        pvs = [chi2_contingency([num_bins[pos][2:], num_bins[pos + 1][2:]])[1]
               for pos in range(len(num_bins) - 1)]

        # The largest p-value marks the two bins that differ the least
        merge_with_next(pvs.index(max(pvs)))

        bins_df = get_woe(num_bins)
        if iv:
            print(f"{X} 分{len(num_bins):2}组 IV 值: ",get_iv(bins_df))
        if detail:
            print(bins_df)

    # p-values of the last merge round (nothing to show if no merge happened)
    if pvs:
        print("\n".join(map(lambda x:f"{x:.16f}",pvs)))
    return get_woe(num_bins)

划分训练集和验证集

In [None]:
from sklearn.model_selection import train_test_split
# Features are every column after the first; the label is the first column
# (presumably SeriousDlqin2yrs — verify against the column order of data_train)
X = data_train.iloc[:, 1:]
y = data_train.iloc[:, 0]
# Hold out 30% of the rows for validation; random_state fixes the split
X_train, X_vali, y_train, y_vali = train_test_split(X, y, test_size=0.3, random_state=0)
# Re-attach the label as the first column of each subset
df_train = pd.concat([y_train, X_train], axis=1)
df_vali = pd.concat([y_vali, X_vali], axis=1)

对每一个分组进行分析，选择合适的分箱数

In [None]:
df_train.columns

# Variables that are not binned automatically; their edges are chosen by hand
hand_bins = {"NumberOfTime30-59DaysPastDueNotWorse":[0,1,2,13],
             "NumberOfTimes90DaysLate":[0,1,2,17],
             "NumberRealEstateLoansOrLines":[0,1,2,4,54],
             "NumberOfTime60-89DaysPastDueNotWorse":[0,1,2,8]}

# Number of bins to keep for each automatically binned variable
auto_col_bins = {"RevolvingUtilizationOfUnsecuredLines":5,
                 "age":5,
                 "DebtRatio":5,
                 "MonthlyIncome":6,
                 "NumberOfOpenCreditLinesAndLoans":4,
                 "NumberOfDependents":3}

# Guarantee full coverage: the first edge becomes -np.inf and the last edge np.inf.
# NOTE(review): v[1:-1] also drops the leading 0 edge, so with pd.cut's default
# right-closed intervals the values 0 and 1 share the first bin (-inf, 1] —
# confirm this is intended.
hand_bins = {k:[-np.inf,*v[1:-1],np.inf] for k,v in hand_bins.items()}

# Trial run on "age" to inspect the number of bins and the resulting intervals
age_bins_df = auto_bin(df_train, "age", "SeriousDlqin2yrs", n=5, iv=True,detail=False,q=20)
age_bins_df

In [None]:
# Bin edges of every automatically binned column, keyed by column name
bins_of_col = {}
for col, n_keep in auto_col_bins.items():
    print(col)
    bins_df = auto_bin(df_train, col, "SeriousDlqin2yrs", n=n_keep, iv=False, detail=False, q=20)
    # Distinct lower and upper edges of the final bins, in ascending order
    bins_list = sorted(set(bins_df["min"]) | set(bins_df["max"]))
    # Open both ends so that any value falls into some bin
    bins_list[0] = -np.inf
    bins_list[-1] = np.inf
    bins_of_col[col] = bins_list
bins_of_col

In [None]:
# 合并手动分箱数据 
bins_of_col.update(hand_bins)

### 5.1.2. 变量筛选

In [None]:
# Compute the IV of one column for a given set of bin edges
def get_iv(df,col,y,bins):
    """Return ``(iv, bins_df)`` for column ``col`` cut at ``bins``.

    ``bins_df`` has one row per bin, the row counts of label 0 and label 1 as
    columns ``0`` and ``1``, and the bin's WOE in column ``woe``. ``iv`` is the
    sum over bins of (share of 0s - share of 1s) * WOE.
    """
    binned = df[[col, y]].copy()
    binned["cut"] = pd.cut(binned[col], bins)
    # Rows: bins; columns: label values; cells: number of rows
    bins_df = binned.groupby("cut")[y].value_counts().unstack()
    share_0 = bins_df[0] / bins_df[0].sum()
    share_1 = bins_df[1] / bins_df[1].sum()
    bins_df["woe"] = np.log(share_0 / share_1)
    iv = np.sum((share_0 - share_1) * bins_df.woe)
    return iv, bins_df

# IV value of every binned column, keyed by column name
info_values = {}
# Per-bin table (counts and WOE) of every binned column, keyed by column name
woe_values = {}
for col in bins_of_col:
    iv_woe = get_iv(df_train, col, "SeriousDlqin2yrs", bins_of_col[col])
    info_values[col], woe_values[col] = iv_woe

In [None]:
def plt_iv(info_values):
    """Draw a horizontal bar chart of the values in ``info_values``.

    Every bar is labelled with its key on the y axis and annotated with its
    value rounded to two decimals.
    """
    labels, heights = zip(*info_values.items())
    positions = range(len(labels))
    plt.barh(positions, heights)
    plt.yticks(positions, labels)
    for row, height in enumerate(heights):
        # Place the annotation at the bar's end, slightly below its centre
        plt.text(height, row - .2, f"{height:.2f}")
plt_iv(info_values)

In [None]:
bins_of_col

可以看出 NumberRealEstateLoansOrLines 和 NumberOfDependents 变量的 IV 值明显较低，所以予以删除。DebtRatio、MonthlyIncome、NumberOfOpenCreditLinesAndLoans 等变量可以考虑删除，也可以予以保留。

## 5.2. Logistic回归
### 5.2.1 WOE转换
通过生成的分箱和 WOE 数据，将训练集中各变量的取值转换为对应分箱的 WOE 值

In [None]:
# Replace every training value by the WOE of the bin it falls into;
# the label column is copied over unchanged.
model_woe = pd.DataFrame(index=df_train.index)
for col in bins_of_col:
    model_woe[col] = pd.cut(df_train[col],bins_of_col[col]).map(woe_values[col]["woe"])
model_woe["SeriousDlqin2yrs"] = df_train["SeriousDlqin2yrs"]

### 5.2.2. 构建回归模型

In [None]:
# Fit the logistic regression with statsmodels
import statsmodels.api as sm

data = model_woe.copy()
# Dependent variable
endog = data['SeriousDlqin2yrs']
# Predictors: all WOE columns except the label and the two variables
# excluded from the model
X = data.drop(["SeriousDlqin2yrs",
               "NumberRealEstateLoansOrLines",
               "NumberOfDependents"],axis=1)
# Independent variables, with a constant column added for the intercept
exog = sm.add_constant(X)
logit = sm.Logit(endog,exog)
result = logit.fit()
result.summary()

# 6. 可视化、报告、呈现问题求解步骤和最终结论
## 6.1. 模型检验

In [None]:
# Map the validation rows to WOE values using the bins and WOE tables
# built from the training split.
vali_woe = pd.DataFrame(index=df_vali.index)
for col in bins_of_col:
    vali_woe[col] = pd.cut(df_vali[col],bins_of_col[col]).map(woe_values[col]["woe"])
vali_woe["SeriousDlqin2yrs"] = df_vali["SeriousDlqin2yrs"]
vali_Y = vali_woe['SeriousDlqin2yrs']
# Same predictors as the fitted model
vali_X = vali_woe.drop(["SeriousDlqin2yrs",
                        "NumberRealEstateLoansOrLines",
                        "NumberOfDependents"],axis=1)
vali_exog = sm.add_constant(vali_X)
vali_proba = result.predict(vali_exog)
# Two-column probability table: column 0 = P(label 0), column 1 = P(label 1).
# Built directly from the prediction instead of inserting a DataFrame as a
# column value, which DataFrame.insert does not document as supported.
vali_proba_df = pd.DataFrame({0: 1 - vali_proba, 1: vali_proba})

import scikitplot as skplt
# plot_roc expects one probability column per class

skplt.metrics.plot_roc(vali_Y,
                       vali_proba_df,
                       plot_micro=False, plot_macro=False);

## 6.2. 预测测试数据
### 6.2.1. 读入测试数据

In [None]:
data_test=pd.read_csv('../input/givemesc/cs-test.csv')

首先删除空列（目标列）SeriousDlqin2yrs

In [None]:
# Remove the SeriousDlqin2yrs column from the test data; errors="ignore"
# keeps this cell re-runnable once the column is already gone.
data_test = data_test.drop(columns=['SeriousDlqin2yrs'], errors="ignore")

查看测试数据信息

In [None]:
data_test.info()

处理测试数据

In [None]:
data_test=deal_with_data(data_test)

检查测试数据空值是否都被填充

In [None]:
data_test.info()

### 6.2.2. 对测试数据进行分箱，并计算WOE值

In [None]:
# Replace every test value by the WOE of the bin it falls into, using the
# bin edges in bins_of_col and the WOE tables in woe_values.
test = pd.DataFrame(index=data_test.index)
for col in bins_of_col:
    test[col] = pd.cut(data_test[col],bins_of_col[col]).map(woe_values[col]["woe"])

In [None]:
test.head()

### 6.2.3. 根据模型预测

In [None]:
# Drop the two variables that were excluded from the fitted model
test_X = test.drop(["NumberRealEstateLoansOrLines",
                        "NumberOfDependents"],axis=1)
# Add the constant column, as was done for the training predictors
test_exog = sm.add_constant(test_X)
test_proba = result.predict(test_exog)

In [None]:
# Submission table: Id is the row index plus one, Probability is the model output
ans = pd.DataFrame({'Id': test.index + 1, 'Probability': test_proba}, index=test.index)
ans

# 7. 提交结果

In [None]:
# Write the submission file to the working directory, without the index column
ans.to_csv("./submission.csv",index=False)