In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
from matplotlib import cm

import warnings
warnings.filterwarnings('ignore')

from scipy.stats import norm

## 介绍
本文将通过银行用户的收支借贷等信息来对用户的信用进行评估，并最终给出其在两年内逾期不还款的概率。  
模型开发整体包含以下流程：  
1.数据读取  
2.数据预处理  
包括缺失值及异常值的处理  
3.关联性评估及变量选择  
通过统计学方法（热力图）评估各变量对目标值SeriousDlqin2yrs的影响  
4.模型开发  
基于XGBoost及随机森林模型  
5.信用评分  
ROC曲线可视化及AUC值评估



通过数据文件Data Dictionary.xls文件可知变量描述如下：

| 序号 | 变量名 | 变量描述 |
|-|-|-|
|1|SeriousDlqin2yrs|近两年内是否出现逾期90天及以上的情况(目标值)|
|2|RevolvingUtilizationOfUnsecuredLines|无担保循环信贷（信用卡、个人信用额度等，不含房贷及车贷等分期贷款）的余额占总信用额度的比率|
|3|age|借款人年龄|
|4|NumberOfTime30-59DaysPastDueNotWorse|近两年内逾期30-59天的次数|
|5|DebtRatio|负债比|
|6|MonthlyIncome|月收入|
|7|NumberOfOpenCreditLinesAndLoans|未结清贷款（如车贷、房贷等分期贷款）及信用额度（如信用卡）的数量|
|8|NumberOfTimes90DaysLate|过往逾期90天及以上的次数|
|9|NumberRealEstateLoansOrLines|不动产抵押贷款的数量|
|10|NumberOfTime60-89DaysPastDueNotWorse|近两年内逾期60-89天的次数|
|11|NumberOfDependents|家属数量|


## 1.数据读取

In [None]:
# Load the three competition files in one pass: labelled training data,
# the test data to score, and the sample submission used as a template.
df_train, df_test, df_s = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/GiveMeSomeCredit/cs-training.csv',
        '../input/GiveMeSomeCredit/cs-test.csv',
        '../input/GiveMeSomeCredit/sampleEntry.csv',
    )
)

## 2.数据预处理

In [None]:
# Preview the first rows of the training frame to check that it loaded as expected.
df_train.head()

In [None]:
# Preview the first rows of the test frame to check that it loaded as expected.
df_test.head()

In [None]:
# Report (rows, columns) for the training frame, then the test frame.
for frame in (df_train, df_test):
    print(frame.shape)

### 2.1删除异常值

In [None]:
# Move the unnamed CSV index column to a trailing column called 'Id'.
df_train['Id'] = df_train.pop('Unnamed: 0')
df_train.head()

In [None]:
# Move the unnamed CSV index column to a trailing column called 'Id'.
df_test['Id'] = df_test.pop('Unnamed: 0')
df_test.head()

In [None]:
# Summary statistics of the training columns; used here to spot implausible ranges.
df_train.describe()

### 2.2空值筛选

In [None]:
# Number of missing values in each training column.
df_train.isna().sum()

In [None]:
# Number of distinct values in each training column.
df_train.nunique()

In [None]:
# Number of missing values in each test column.
df_test.isna().sum()

从上述样本描述来看，  
变量MonthlyIncome缺失较多，无法通过删除有缺失值的样本进行处理，同时相对于十万数量级的样本来说，3万左右的缺失又没有大到足以删除整个变量，所以采取填充处理。  
而变量NumberOfDependents缺失很少，如何处理对整体影响不大。

### 2.3空值填充
“二八定律”告诉我们，往往20%的人掌握了80%的社会财富，所以显然平均数无法用来衡量样本整体收入。  
这里我们用中位数填充MonthlyIncome。  
而NumberOfDependents变量用平均数或众数其实都可以，加之缺失较少，所以影响很小。这里选用了众数填充。

In [None]:
# Impute missing values.
# The imputation statistics are taken from the training data only and reused
# for the test data, so both frames go through the same transformation.
income_median = df_train['MonthlyIncome'].median()
dependents_mode = df_train['NumberOfDependents'].mode()[0]

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained form operates on a temporary object and
# leaves the frame unchanged when pandas copy-on-write is active.
for frame in (df_train, df_test):
    frame['MonthlyIncome'] = frame['MonthlyIncome'].fillna(income_median)
    frame['NumberOfDependents'] = frame['NumberOfDependents'].fillna(dependents_mode)

In [None]:
# Re-count missing values per training column after imputation.
df_train.isna().sum()

In [None]:
# Re-count missing values per test column after imputation.
df_test.isna().sum()

In [None]:
# Start by looking at how the target classes are distributed in the training set.
target_column = 'SeriousDlqin2yrs'
sns.countplot(data=df_train, x=target_column)

plt.show()

## 3.关联性评估及变量选择

In [None]:
# Heatmap of the pairwise correlations between all training columns.
corr_matrix = df_train.corr()
f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    corr_matrix,
    ax=ax,
    annot=True,
    fmt='.4f',
    cmap='BrBG',
    linewidths=.9,
)

plt.show()

显然ID变量对目标值基本没啥影响。

In [None]:
# Keep the test identifiers, then remove the 'Id' column from both frames
# so it is not used as a model feature.
Id = df_test['Id']
for frame in (df_train, df_test):
    frame.drop(columns='Id', inplace=True)

## 4.模型开发

In [None]:
# Separate the target column (y) from the feature columns (x).
y = df_train['SeriousDlqin2yrs']
x = df_train.drop(columns='SeriousDlqin2yrs')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from xgboost import XGBClassifier

In [None]:
# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the split repeatable.
features, labels = x.values, y.values
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=116214,
)

### 4.1随机森林分类模型

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Base random forest for the hyper-parameter search.
# The forest is seeded so that repeated runs grow the same trees and the
# reported scores are reproducible.
RF = RandomForestClassifier(random_state=116214)

In [None]:
# Candidate hyper-parameter values for the random forest search.
param_grid = dict(
    n_estimators=[9, 18, 27, 36, 100, 150],
    max_depth=[2, 3, 5, 7, 9],
    min_samples_leaf=[2, 4, 6, 8],
)

In [None]:
# Randomised search over param_grid with 5-fold cross-validation.
# random_state fixes which parameter combinations are sampled, so the search
# picks the same candidates on every run.
RF_random = RandomizedSearchCV(
    RF,
    param_distributions=param_grid,
    cv=5,
    random_state=116214,
)

In [None]:
# Run the random forest search on the training split.
RF_random.fit(X=x_train, y=y_train)

In [None]:
# Keep a handle on the best estimator found by the search.
best_est_RF = RF_random.best_estimator_

In [None]:
# Score of the tuned random forest on the training and held-out splits, in percent.
train_accuracy = RF_random.score(x_train, y_train) * 100
test_accuracy = RF_random.score(x_test, y_test) * 100
print('训练集分类准确率:{:.2f}'.format(train_accuracy))
print('测试集分类准确率:{:.2f}'.format(test_accuracy))

In [None]:
# Second column of predict_proba, computed on the training split.
# NOTE(review): these are training-set scores, so any metric derived from them is likely optimistic.
y_pred = best_est_RF.predict_proba(x_train)[:, 1]

In [None]:
## ROC curve visualisation
from sklearn.metrics import roc_curve,auc

# ROC points and area under the curve for the scores held in y_pred.
fpr, tpr, _ = roc_curve(y_train, y_pred)
roc_auc = auc(fpr, tpr)
plt.figure(figsize = (10,8))
plt.title('Receiver Operating Characteristic')
# x and y must be passed by keyword: seaborn >= 0.12 rejects them as
# positional arguments. estimator=None draws the raw ROC points instead of
# averaging tpr over repeated fpr values.
sns.lineplot(x = fpr, y = tpr, estimator = None, label = 'AUC = %0.4f' % roc_auc)

plt.legend(loc = 'lower right')
# Diagonal reference line for a random classifier.
plt.plot([0,1],[0,1],'r--')
plt.xlabel('假阳率')
plt.ylabel('真阳率')

plt.show()

In [None]:
# Remove the target column from the test frame before scoring.
# errors='ignore' with a rebind (instead of an in-place drop) keeps this cell
# re-runnable: a second execution no longer fails because the column is gone.
df_test = df_test.drop(columns='SeriousDlqin2yrs', errors='ignore')
# The model was fitted on plain arrays (x.values), so score plain arrays too.
y_pred = best_est_RF.predict_proba(df_test.values)
# Keep only the second probability column.
y_pred = y_pred[:,1]

In [None]:
# Write the predicted probabilities into the submission template and preview it.
df_s = df_s.assign(Probability=y_pred)
df_s.head()

In [None]:
# Save the random forest submission without the index column.
df_s.to_csv(path_or_buf="submission1.csv", index=False)

### 4.2加入XGB分类器

In [None]:
# Base XGBoost classifier; n_jobs=-1 is passed through to the estimator.
XGB = XGBClassifier(n_jobs=-1)

# Candidate hyper-parameter values for the XGBoost search.
param_grid = dict(
    n_estimators=[100, 150, 200, 250, 300],
    learning_rate=[0.001, 0.01, 0.0001, 0.05, 0.10],
    gamma=[0.0, 0.1, 0.2, 0.3],
    colsample_bytree=[0.5, 0.7],
    max_depth=[3, 4, 6, 8],
)

In [None]:
# Randomised search over param_grid with 5-fold cross-validation.
# random_state fixes which parameter combinations are sampled, so the search
# picks the same candidates on every run.
XGB_random = RandomizedSearchCV(
    XGB,
    param_distributions=param_grid,
    cv=5,
    random_state=116214,
)

In [None]:
# Run the XGBoost search on the training split.
XGB_random.fit(X=x_train, y=y_train)

In [None]:
# Keep a handle on the best estimator found by the XGBoost search.
best_est_XGB = XGB_random.best_estimator_

In [None]:
# Score of the tuned XGBoost model on the training and held-out splits, in percent.
train_accuracy = XGB_random.score(x_train, y_train) * 100
test_accuracy = XGB_random.score(x_test, y_test) * 100
print('训练集分类准确率:{:.2f}'.format(train_accuracy))
print('测试集分类准确率:{:.2f}'.format(test_accuracy))

In [None]:
from imblearn.over_sampling import SMOTE

# Resample the training split with SMOTE (seeded for repeatability).
# The held-out split is left untouched.
smote = SMOTE(random_state=0)
resampled = smote.fit_resample(X=x_train, y=y_train)
s_x, s_y = resampled

In [None]:
# Re-run the random forest search, this time on the SMOTE-resampled training data.
# NOTE(review): resampling happened before cross-validation, so validation folds
# presumably contain synthetic samples and CV scores may be optimistic.
RF_random.fit(X=s_x, y=s_y)

In [None]:
# Keep a handle on the best estimator found by the search on the resampled data.
best_est_RF1 = RF_random.best_estimator_

In [None]:
# Training accuracy is measured on the resampled data the model was fitted on.
print('训练集分类准确率:{:.2f}'.format(RF_random.score(s_x,s_y)*100))
# Test accuracy must use the untouched hold-out split; scoring s_x/s_y again
# would only repeat the training figure.
print('测试集分类准确率:{:.2f}'.format(RF_random.score(x_test,y_test)*100))

In [None]:
# Second column of predict_proba from the SMOTE-trained forest, computed on the
# original (non-resampled) training split.
y_pred_new_RF = best_est_RF1.predict_proba(x_train)[:, 1]

In [None]:
## ROC curve visualisation
# ROC points and area under the curve for the scores held in y_pred_new_RF.
fpr, tpr, _ = roc_curve(y_train, y_pred_new_RF)
roc_auc = auc(fpr, tpr)
plt.figure(figsize = (10,8))
plt.title('Receiver Operating Characteristic')
# x and y must be passed by keyword: seaborn >= 0.12 rejects them as
# positional arguments. estimator=None draws the raw ROC points instead of
# averaging tpr over repeated fpr values.
sns.lineplot(x = fpr, y = tpr, estimator = None, label = 'AUC = %0.4f' % roc_auc)

plt.legend(loc = 'lower right')
# Diagonal reference line for a random classifier.
plt.plot([0,1],[0,1],'r--')
plt.xlabel('假阳率')
plt.ylabel('真阳率')

plt.show()

In [None]:
# Score the test frame with the SMOTE-trained forest and keep only the second
# probability column. The result is stored under prediction_RF itself: the
# original assigned it to a misspelled name, leaving prediction_RF as the
# two-column array that the submission cell then tried to use.
# The model was fitted on plain arrays, so score plain arrays too.
prediction_RF = best_est_RF1.predict_proba(df_test.values)[:, 1]

In [None]:
# Write the new predictions into the submission template and preview it.
df_s = df_s.assign(Probability=prediction_RF)
df_s.head()

In [None]:
# Save the second submission without the index column.
df_s.to_csv(path_or_buf="submission2.csv", index=False)