# 一：导入数据

导入相关模块并读取数据

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import os, datetime, sys, random, time
 
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
 
plt.style.use('fivethirtyeight')
%matplotlib inline
 
from scipy import stats, special
import shap                # 
 
import warnings
warnings.filterwarnings('ignore')
 
# Read the training and test CSV files into DataFrames.
train_data = pd.read_csv("../input/GiveMeSomeCredit/cs-training.csv")
test_data = pd.read_csv("../input/GiveMeSomeCredit/cs-test.csv")

# Preview the first rows of the training frame.
train_preview = train_data.head()
print(train_preview)

显示train数据集的基本信息：

In [None]:
# Print a summary of the training frame via DataFrame.info().
train_data.info()

通过查看可以知道，数据共有150000条，其中 MonthlyIncome 和 NumberOfDependents 两列存在空值，需要进行处理。

其中各个变量的含义如下：
1. SeriousDlqin2yrs：好坏用户
1.  RevolvingUtilizationOfUnsecuredLines：除了房贷车贷之外的信用卡账面金额（即贷款金额）/信用卡总额度
1. age：贷款人年龄
1. NumberOfTime30-59DaysPastDueNotWorse：30-59天逾期但不糟糕次数
1. DebtRatio：负债比率
1. MonthlyIncome：月收入
1. NumberOfOpenCreditLinesAndLoans：开放式信贷和贷款数量，开放式贷款（分期付款如汽车贷款或抵押贷款）和信贷（如信用卡）的数量
1. NumberOfTimes90DaysLate：借款者有90天或更高逾期的次数
1. NumberRealEstateLoansOrLines：不动产贷款或额度数量
1. NumberOfTime60-89DaysPastDueNotWorse：60-89天逾期但不糟糕次数
1. NumberOfDependents：不包括本人在内的家属数量

# 二：数据清洗与探索

因为id数据无用，这里先删除；

In [None]:
# The "Unnamed: 0" column only carries the row id, so drop it from the
# training frame ...
dev_train = train_data.drop(columns="Unnamed: 0")
# ... and from the test frame as well.
dev_test = test_data.drop(columns="Unnamed: 0")

查看各列数据分布情况：

In [None]:
# Summary statistics for every column of the training frame.
train_summary = dev_train.describe()
print(train_summary)

通过上边的数据分析发现：
* SeriousDlqin2yrs：的分布不是均衡的，这代表正负样本的比例有显著失衡

* RevolvingUtilizationOfUnsecuredLines：的最大值和最小值很极端，但均值却很小，代表数据离散值较多。

* age: 最小值有0，应该属于异常值，可能是空值导致的。最大值109也是不正常的。

* NumberOfTime30-59DaysPastDueNotWorse, NumberOfTimes90DaysLate, NumberOfTime60-89DaysPastDueNotWorse三种的最大值都是98，标准差近似，可能具有相关性

由此，开始进行可视化分析：

In [None]:
# Check whether the two classes of the target are balanced.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
# Left panel: share of each class as a pie chart (pandas built-in plotting).
dev_train['SeriousDlqin2yrs'].value_counts().plot.pie(
    explode=[0, 0.1], autopct="%1.1f%%", ax=axes[0])
axes[0].set_title("SeriousDlqin2yrs")
# Right panel: row count per class. The column is passed as x=... because
# seaborn >= 0.12 no longer accepts it as a positional argument.
sns.countplot(x="SeriousDlqin2yrs", data=dev_train, ax=axes[1])
axes[1].set_title("SeriousDlqin2yrs")
plt.show()

通过上边的两个图可以看出正负样本失衡严重，这可以考虑通过欠采样解决；

正相关性分析：

In [None]:
# Box plots of the three past-due counters, drawn in one figure.
past_due_columns = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
]
dev_train.boxplot(column=past_due_columns, figsize=(15, 5))

进一步分析：以下三个特征NumberOfTime30-59DaysPastDueNotWorse, NumberOfTimes90DaysLate, NumberOfTime60-89DaysPastDueNotWorse三者大于80的值

In [None]:
# For each past-due counter, print how often every value at or above 80 occurs.
for past_due_column in (
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
):
    extreme_values = dev_train.loc[dev_train[past_due_column] >= 80, past_due_column]
    print(extreme_values.value_counts())

发现结果相同

再分析当 NumberOfTime30-59DaysPastDueNotWorse>=80 时，NumberOfTime60-89DaysPastDueNotWorse 和 NumberOfTimes90DaysLate 的取值

In [None]:
# Rows whose 30-59 day counter is at or above 80 ...
extreme_rows = dev_train[dev_train['NumberOfTime30-59DaysPastDueNotWorse'] >= 80]

# ... and the distinct values the other two counters take on those rows.
for other_column in ('NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfTimes90DaysLate'):
    print(np.unique(extreme_rows[other_column]))

发现数据依旧相同

再查看80以下的数据

In [None]:
# For each past-due counter, print how often every value below 80 occurs.
for past_due_column in (
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
):
    regular_values = dev_train.loc[dev_train[past_due_column] < 80, past_due_column]
    print(regular_values.value_counts())

查看结果：这三者的最大值分别为13，11，17

相对于整个数据样本的数量，这部分[96,98]异常值可以去除。但考虑到test数据也会发生类似结果，我们也可以用三者的最大值替换掉现在的异常结果，而且，三者最大值的记录都是1，也表明极大值也是极稀少的。

更正异常值

In [None]:
# Replace every count at or above 80 with a fixed cap per column
# (13 / 11 / 17 are the per-column maxima reported for the values below 80).
outlier_caps = {
    'NumberOfTime30-59DaysPastDueNotWorse': 13,
    'NumberOfTime60-89DaysPastDueNotWorse': 11,
    'NumberOfTimes90DaysLate': 17,
}
for column, cap in outlier_caps.items():
    dev_train.loc[dev_train[column] >= 80, column] = cap

In [None]:
# Correct the outliers in the test frame: every count at or above 80 is
# replaced with a fixed cap per column (13 / 11 / 17).
outlier_caps = {
    'NumberOfTime30-59DaysPastDueNotWorse': 13,
    'NumberOfTime60-89DaysPastDueNotWorse': 11,
    'NumberOfTimes90DaysLate': 17,
}
for column, cap in outlier_caps.items():
    dev_test.loc[dev_test[column] >= 80, column] = cap

负相关性分析：

结合上面的分析，我们觉得DebtRatio可能与其他负相关因素也有某种关联。所以，我们先对DebtRatio进行进一步分析，绘制DebtRatio的箱线图

In [None]:
# Draw a box plot of the DebtRatio column.
debt_ratio_columns = ['DebtRatio']
dev_train.boxplot(column=debt_ratio_columns, figsize=(5, 5))

可以发现，比率值异常

In [None]:
# Print DebtRatio at every third percentile from the 75th up to the 99th.
quantiles = list(range(75, 100, 3))
for pct in quantiles:
    debt_ratio_at_pct = dev_train['DebtRatio'].quantile(pct / 100)
    print(pct, '% quantile of debt ratio is: ', debt_ratio_at_pct)

通过对75%位点以上的数据进行分析，可以看到在81%时，DebtRatio明显增大。

结合DebtRatio，再观察MonthlyIncome，age以及其他几项指标：

In [None]:
# Threshold: the 95th percentile of DebtRatio.
debt_ratio_p95 = dev_train['DebtRatio'].quantile(0.95)

# Columns to summarise for the rows at or above that threshold.
inspected_columns = [
    'age',
    'MonthlyIncome',
    'RevolvingUtilizationOfUnsecuredLines',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberRealEstateLoansOrLines',
    'NumberOfDependents',
]
high_debt_rows = dev_train.loc[dev_train['DebtRatio'] >= debt_ratio_p95, inspected_columns]
print(high_debt_rows.describe())

* 我们把分位数换成DebtRatio的值，依次下探，在DebtRatio值为500时，总记录20614，MonthlyIncome有 1305个不为空，986为0，319为1，比率近3：1；  
* NumberOfDependents 有 18803个不为空， 14444个为0；
* 我们设立一个空值填充规则，在DebtRatio>=500时，将MonthlyIncome的空值设为0， NumberOfDependents 设为0
* 当数值小于500时，MonthlyIncome的空值则设为中位数，NumberOfDependents设为中位值。


In [None]:
# Impute missing MonthlyIncome / NumberOfDependents with the rule stated above:
#   * DebtRatio >= 500 -> fill with 0
#   * otherwise        -> fill with the column median
#
# The medians are taken from the training frame *before* anything is written,
# so the fill value is not pulled down by the zeros imputed for the
# high-DebtRatio rows (the previous code used the mean, computed after those
# zeros had already been inserted).
fill_values = {
    'MonthlyIncome': dev_train['MonthlyIncome'].median(),
    'NumberOfDependents': dev_train['NumberOfDependents'].median(),
}

# Apply the same train-derived rule to the test frame so that both frames reach
# the model with identically prepared features, mirroring the outlier capping.
for frame in (dev_train, dev_test):
    high_debt = frame['DebtRatio'] >= 500
    for column, median_fill in fill_values.items():
        missing = frame[column].isnull()
        # .loc assignment instead of fillna(inplace=True) on a column selection:
        # the latter is a chained assignment that stops updating the frame
        # under pandas Copy-on-Write.
        frame.loc[missing & high_debt, column] = 0.0
        frame.loc[missing & ~high_debt, column] = median_fill

再观察age的异常值

In [None]:
# Show the rows with an age under 18, then overwrite age == 0 with the
# column median.
underage_rows = dev_train.loc[dev_train['age'] < 18]
print(underage_rows)

age_median = dev_train['age'].median()
dev_train.loc[dev_train['age'] == 0, 'age'] = age_median

在各列之间做相关性分析：

In [None]:
# Heat map of the pairwise correlations between all columns.
fig = plt.figure(figsize=[15, 10])
# Compute the correlation matrix once and reuse it for the mask and the plot.
corr_matrix = dev_train.corr()
# Boolean mask hiding the upper triangle (diagonal included), since the matrix
# is symmetric. The builtin `bool` replaces `np.bool`, which was removed from
# NumPy in 1.24.
masked = np.zeros_like(corr_matrix, dtype=bool)
masked[np.triu_indices_from(masked)] = True
sns.heatmap(corr_matrix,
            cmap=sns.diverging_palette(150, 275, s=80, l=55, n=9),
            mask=masked, annot=True, center=0)
plt.title("Correlation Matrix (HeatMap)", fontsize = 15)

通过上图可以看到，与标签SeriousDlqin2yrs相关性最高的是：NumberOfTime30-59DaysPastDueNotWorse , NumberOfTime60-89DaysPastDueNotWorse 和 NumberOfTimes90DaysLate。

# 三：基准分析

切分数据集：

In [None]:
from sklearn import preprocessing,metrics,model_selection,ensemble,tree,linear_model

# Separate the label column from the feature columns.
dev_y = dev_train['SeriousDlqin2yrs']
dev_x = dev_train.drop(columns=['SeriousDlqin2yrs'])

# Hold out 30% of the rows for validation; the fixed seed keeps the split
# repeatable.
X_train, X_val, y_train, y_val = model_selection.train_test_split(
    dev_x, dev_y, test_size=0.3, random_state=2020)

构建模型：

In [None]:
import lightgbm as lgb

# Base binary classifier (log-loss objective); feature importance is reported
# as gain.
lgb_classifer = lgb.LGBMClassifier(objective='binary',
                                   n_jobs=-1, random_state=2020,
                                   importance_type='gain')

# Candidate values sampled by the randomized search.
lgbParameters = {
    'max_depth': [2, 3, 4, 5],
    'learning_rate': [0.05, 0.1, 0.125, 0.15],
    'colsample_bytree': [0.2, 0.4, 0.6, 0.8, 1],
    'n_estimators': [400, 500, 600, 700, 800, 900],
    'min_split_gain': [0.15, 0.20, 0.25, 0.3, 0.35],  # equivalent to gamma in XGBoost
    'subsample': [0.6, 0.7, 0.8, 0.9, 1],
    'min_child_weight': [6, 7, 8, 9, 10],
    'scale_pos_weight': [10, 15, 20],
    'min_data_in_leaf': [100, 200, 300, 400, 500, 600, 700, 800, 900],
    'num_leaves': [20, 30, 40, 50, 60, 70, 80, 90, 100],
}

# Randomized search with 5-fold cross-validation.
# Candidates are ranked by ROC AUC: without an explicit `scoring` the search
# falls back to the estimator's accuracy, which is a poor selection criterion
# for a target this imbalanced and is not the metric evaluated further down.
lgbModel = model_selection.RandomizedSearchCV(lgb_classifer,
                                              param_distributions=lgbParameters,
                                              scoring='roc_auc',
                                              cv=5,
                                              random_state=2020
                                              )
# Run the search on the training split.
lgbModel.fit(X_train, y_train, feature_name=X_train.columns.to_list())

获取模型的最好参数：

In [None]:
# Take the best estimator found by the randomized search and display it.
bestEstimatorLGB=lgbModel.best_estimator_
bestEstimatorLGB

用最优参数，构建模型

In [None]:
# Train the final model with a fixed set of hyper-parameters.
best_lgb_params = {
    'colsample_bytree': 0.2,
    'importance_type': 'gain',
    'max_depth': 4,
    'min_child_weight': 9,     # sample weight required in a child node
    'min_data_in_leaf': 500,
    'min_split_gain': 0.15,    # minimum gain needed to perform a split
    'n_estimators': 500,
    'num_leaves': 80,
    'objective': 'binary',
    'random_state': 2020,
    'scale_pos_weight': 10,    # weight of the positive class
    'subsample': 0.9,          # randomly select part of the data without resampling
}
bestEstimatorLGB = lgb.LGBMClassifier(**best_lgb_params)
bestEstimatorLGB = bestEstimatorLGB.fit(X_train, y_train,
                                        feature_name=X_train.columns.to_list())

验证集上的预测结果:

In [None]:
# Class predictions on the held-out validation split, followed by the
# classification report.
val_test_pred_lgb = bestEstimatorLGB.predict(X_val)
validation_report = metrics.classification_report(y_val, val_test_pred_lgb)
print(validation_report)

预测结果在各种指标下的评测:

In [None]:
# Confusion matrix on the validation split. It is printed explicitly: as a bare
# expression that is not the last line of the cell it was computed but never shown.
print(metrics.confusion_matrix(y_val, val_test_pred_lgb))

# Compute the mean squared error once and derive both MSE and RMSE from it.
val_mse = metrics.mean_squared_error(y_val, val_test_pred_lgb)

# One-row summary table; every metric is expressed in percent.
LGBMMetrics = pd.DataFrame({
    'Model': 'LightGBM',
    'MSE': round(val_mse * 100, 2),
    # Scale after taking the square root. Previously the factor of 100 sat
    # inside the sqrt, so RMSE was not on the same percent scale as the rest.
    'RMSE': round(np.sqrt(val_mse) * 100, 2),
    'MAE': round(metrics.mean_absolute_error(y_val, val_test_pred_lgb) * 100, 2),
    'Accuracy Train': round(bestEstimatorLGB.score(X_train, y_train) * 100, 2),
    # "Test" here is the held-out validation split (X_val / y_val).
    'Accuracy Test': round(bestEstimatorLGB.score(X_val, y_val) * 100, 2),
    'F-Beta Score (B=2)': round(metrics.fbeta_score(y_val,
                                                    val_test_pred_lgb,
                                                    beta=2) * 100, 2),
}, index=[1])

print(LGBMMetrics)

注：MSE：均方误差；RMSE：均方根误差；MAE：平均绝对误差； F-Beta Score：F2分数（召回率的权重高于精确率）

模型效果估计：在分类模型评估中，最常用的两种评估标准是K-S值和AUC值，AUC值可以在样本不均衡的情况下准确评估模型的好坏，而K-S值不仅能够评估预测的准确与否，还能度量模型对好坏客户是否有足够的区分度。

AUC指标：在二分类问题的模型评估阶段，AUC（ROC曲线下的面积）常被用作最重要的评估指标之一，用来衡量模型区分正负样本的能力。ROC曲线由下面两个指标绘制而成：

1. 真正例率，True Positive Rate：TPR = TP/ (TP+FN)

1. 假正例率， False Positive Rate：FPR = FP/(TN+FP)

绘制ROC曲线并计算AUC值：

In [None]:
# Predicted probability of the positive class (column 1) on the validation split.
val_pred_lgb = bestEstimatorLGB.predict_proba(X_val)
val_pred_lgb = val_pred_lgb[:, 1]
# roc_curve takes the true labels and the predicted scores and returns the
# false positive rate and the true positive rate.
fpr, tpr, _ = metrics.roc_curve(y_val, val_pred_lgb)
rocAuc = metrics.auc(fpr, tpr)  # area under the ROC curve
plt.figure(figsize=(12, 6))
plt.title("ROC Curve")
# Draw the curve with plt.plot instead of sns.lineplot(fpr, tpr, ...):
# seaborn >= 0.12 rejects positional x/y, and lineplot's default sorting and
# aggregation of repeated x values would distort the vertical steps of a ROC curve.
plt.plot(fpr, tpr, label="AUC for LightGBM Model = %0.2f" % rocAuc)

plt.legend(loc='lower right')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'r--')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
print(rocAuc)

可以得到AUC值为0.86，发现预测效果良好

查看gbdt对各个特征的重视程度

In [None]:
# Plot the fitted model's feature importances, measured by gain.
lgb.plot_importance(bestEstimatorLGB,importance_type='gain')

# 四：提交

In [None]:
# Remove the target column from the test features. errors='ignore' keeps the
# cell re-runnable: on a second run the column is already gone and a plain
# drop would raise KeyError.
dev_test = dev_test.drop(columns=['SeriousDlqin2yrs'], errors='ignore')
# Ids run from 1 to the number of test rows. The length is derived from the
# frame instead of being hard-coded, so the id column always matches the
# number of predictions.
ids = np.arange(1, len(dev_test) + 1)
# Probability of the positive class is column 1 of predict_proba.
lgb_probs = bestEstimatorLGB.predict_proba(dev_test)
lgb_df = pd.DataFrame({'ID': ids, 'Probability': lgb_probs[:, 1]})
lgb_df.to_csv('./submission.csv', index=False)