In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white',context='notebook',palette='muted')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the competition's input directory.
for folder, _subdirs, files in os.walk('/kaggle/input/GiveMeSomeCredit'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 定义问题，导入数据

1.定义问题：
* 通过训练集中给定的某人的收入、家庭、经济状况等数据和未来两年是否出现财政危机的标记，训练一个模型来判断测试集中某人未来两年出现财政危机的概率。

2.导入数据：

In [None]:
# Both CSVs share the same layout; their first column is used as the row index.
csv_options = dict(encoding='UTF-8', index_col=0)
train_data = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-training.csv', **csv_options)
test_data = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-test.csv', **csv_options)

# 数据探索与预处理

1. 数据探索

首先查看训练集和测试集的规模和数值范围

In [None]:
# (rows, columns) of the training frame.
train_data.shape

In [None]:
# (rows, columns) of the test frame.
test_data.shape

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the training columns.
train_data.describe()

可以看到，训练集总共150000条数据，测试集共101503条数据。
在数据集的特征中：
* 分类型特征：SeriousDlqin2yrs
* 数值型特征：RevolvingUtilizationOfUnsecuredLines、DebtRatio
* 连续型特征：age、MonthlyIncome
* 离散型特征：NumberOfOpenCreditLinesAndLoans、NumberOfTime30-59DaysPastDueNotWorse、NumberOfTime60-89DaysPastDueNotWorse、NumberOfTimes90DaysLate、NumberOfDependents

同时发现训练集中MonthlyIncome和NumberOfDependents存在空值。那么我们接下来检查数据集中的空值情况

In [None]:
# Missing values per training column: absolute count and percentage of rows.
train_missing = train_data.isnull()
pd.DataFrame({'count': train_missing.sum().values, 'ratio': train_missing.mean() * 100})

In [None]:
# Missing values per test column: absolute count and percentage of rows.
test_missing = test_data.isnull()
pd.DataFrame({'count': test_missing.sum().values, 'ratio': test_missing.mean() * 100})

* 训练集和测试集中MonthlyIncome均缺失大约19%，NumberOfDependents均缺失大约2.6%
* 由于缺失比例较大，我们不能直接删除这两个特征。
* 要填充缺失值，需要先探索这两个特征与其他特征是否存在具体联系。注意到DebtRatio是由MonthlyIncome计算得出的，从而我们可以探索存在缺失值的特征与DebtRatio的关系。

In [None]:
# Rows without a MonthlyIncome: how do NumberOfDependents and DebtRatio look?
income_missing = train_data['MonthlyIncome'].isnull()
train_data.loc[income_missing, ['NumberOfDependents', 'DebtRatio']].describe()

In [None]:
# Rows without a NumberOfDependents: how do MonthlyIncome and DebtRatio look?
dependents_missing = train_data['NumberOfDependents'].isnull()
train_data.loc[dependents_missing, ['MonthlyIncome', 'DebtRatio']].describe()

* 首先注意到在第二个表格中，所有NumberOfDependents为空的数据它们的MonthlyIncome也为空，也就是说所有没有填NumberOfDependents的受访者都没有填他们的MonthlyIncome。
* 其次在第一个表格中，可以看到在MonthlyIncome为空的受访者中他们大多数的NumberOfDependents均为0（前75%都为0，同时在最大值为9的情况下平均值只有0.3）
* 所以我们可以得出结论：没有填MonthlyIncome的人的NumberOfDependents大概率为0。
* 因此在填充缺失值时可以将NumberOfDependents中的空值填0

接下来查看MonthlyIncome和DebtRatio之间的关系

In [None]:
# Overall distribution of DebtRatio in the training set.
train_data['DebtRatio'].to_frame().describe()

* 上表是DebtRatio整体的情况，可以看到前75%的值都小于1，而最大值达到了329664
* 另外看MonthlyIncome非空时DebtRatio的情况

In [None]:
# DebtRatio restricted to rows where MonthlyIncome is present.
income_known = train_data['MonthlyIncome'].notnull()
train_data.loc[income_known, ['DebtRatio']].describe()

* 对比MonthlyIncome为空时DebtRatio的情况、MonthlyIncome非空时DebtRatio的情况以及DebtRatio的整体情况来看，可以发现MonthlyIncome为空时DebtRatio明显更大
* 从而我们得出结论：将MonthlyIncome留空的受访者通常有较大的DebtRatio

接下来查看DebtRatio大于1时MonthlyIncome为空的情况
* 输出DebtRatio大于1时MonthlyIncome为空的数量占总数的比例

In [None]:
# Percentage of ALL rows (train, then test) that have DebtRatio > 1 and no MonthlyIncome.
[((frame['DebtRatio'] > 1) & frame['MonthlyIncome'].isnull()).sum() / len(frame) * 100
 for frame in (train_data, test_data)]

* 训练集和测试集的比例分别为18.6和18.56，也就是说绝大部分MonthlyIncome为空的数据DebtRatio均大于1
* 从而我们可以使用DebtRatio大于1而MonthlyIncome不为空的数据的中位数来填充MonthlyIncome的空值

DebtRatio大于1而MonthlyIncome不为空的数据情况：

In [None]:
# Training set: MonthlyIncome of rows with DebtRatio > 1 and a known income.
high_debt_known_income = (train_data['DebtRatio'] > 1) & train_data['MonthlyIncome'].notnull()
train_data.loc[high_debt_known_income, ['MonthlyIncome']].describe()

从而我们将MonthlyIncome的所有空值填充为1577.

接下来查看异常值的情况

首先查看训练集中作为标签的分类特征SeriousDlqin2yrs的分布

In [None]:
# Class balance of the target label.
# The column is passed as `x=` because seaborn >= 0.12 makes the plotting
# variables keyword-only; a positional string would be bound to `data` and
# clash with the explicit `data=train_data`.
sns.countplot(x='SeriousDlqin2yrs', data=train_data)

* 发现标签中0的值占绝大多数，0值和1值的数量比大致为14：1
* 说明数据集中存在正负样例极不均衡的情况
* 这种情况下，我们优先考虑基于Bagging和Boosting的算法
* 另外在评估模型所使用的指标上，我们着重看ROC曲线和AUC值，因为正负样例不均衡时准确率召回率可能会失效

然后依次检查作为特征的变量

**RevolvingUtilizationOfUnsecuredLines**

In [None]:
# Summary statistics of RevolvingUtilizationOfUnsecuredLines.
train_data['RevolvingUtilizationOfUnsecuredLines'].to_frame().describe()

In [None]:
# Left panel: distribution plot; right panel: box plot of the same column.
revolving_utilization = train_data['RevolvingUtilizationOfUnsecuredLines']
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
sns.distplot(x=revolving_utilization.to_numpy(), ax=axes[0])
sns.boxplot(x=revolving_utilization, ax=axes[1])

* 在统计表中我们发现前75%的数据均小于1，而最大值为50708，所以此特征的值中存在离群点
* 在分布图（直方图）和箱型图中也可以发现离群点的存在

* 通常情况下变量RevolvingUtilizationOfUnsecuredLines取值应该在0到1之间，但是考虑到某些人可能会借超过信用卡额度的贷款，所以大于1的取值也是合理的。
* 我们在这里查看RevolvingUtilizationOfUnsecuredLines值小于1和大于10的分布情况

In [None]:
# Percentage of training rows with utilization below 1, and with utilization above 10.
revolving_utilization = train_data['RevolvingUtilizationOfUnsecuredLines']
[(revolving_utilization < 1).sum() / len(train_data) * 100,
 (revolving_utilization > 10).sum() / len(train_data) * 100]

* 我们发现，RevolvingUtilizationOfUnsecuredLines小于1的数据占到97.77%，而大于10的数据仅占0.16%
* 但是这部分数据极大的拉高了整体数据的平均值
* 所以我们应将RevolvingUtilizationOfUnsecuredLines大于10的部分作为离群点删去

**DebtRatio**

In [None]:
# Summary statistics of DebtRatio, shown again ahead of its outlier check.
train_data.loc[:, ['DebtRatio']].describe()

In [None]:
# Left panel: distribution plot; right panel: box plot of DebtRatio.
debt_ratio = train_data['DebtRatio']
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
sns.distplot(x=debt_ratio.to_numpy(), ax=axes[0])
sns.boxplot(x=debt_ratio, ax=axes[1])

发现与RevolvingUtilizationOfUnsecuredLines情况相似，DebtRatio同样存在离群点

查看值小于1、大于1小于10、大于10的分布情况

In [None]:
# Share (in percent) of training rows per DebtRatio bucket: <= 1, (1, 10], > 10.
debt_ratio = train_data['DebtRatio']
row_count = len(train_data)
bucket_masks = {
    'below 1': debt_ratio <= 1,
    'between 1 - 10': (debt_ratio > 1) & (debt_ratio <= 10),
    'beyond 10': debt_ratio > 10,
}
pd.DataFrame({bucket: mask.sum() * 100 / row_count for bucket, mask in bucket_masks.items()},
             index=[1])

* 可以看到其中值大于10的数据占到接近20%
* 同时我们在填充缺失值时使用了DebtRatio作为判断依据
* 所以我们不应该像处理RevolvingUtilizationOfUnsecuredLines一样把大于10的数据删去，而是将其当作受访者的特殊情况

**age**

In [None]:
# Age box plots: training set on the left, test set on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
sns.boxplot(data=train_data, x='age', ax=axes[0])
sns.boxplot(data=test_data, x='age', ax=axes[1])

* 从箱型图中看到，训练集和测试集中的年龄分布较为正常
* 但是存在一个值为0的数据点，而这是不合理的。（婴儿无法拥有收入，并且无法借贷）
* 所以我们将这个数据点的值改为最小可能值，也就是18.

**NumberOfOpenCreditLinesAndLoans**

In [None]:
# Histograms (bin width 1): training set on the left, test set on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
sns.histplot(data=train_data, x='NumberOfOpenCreditLinesAndLoans', binwidth=1, ax=axes[0])
sns.histplot(data=test_data, x='NumberOfOpenCreditLinesAndLoans', binwidth=1, ax=axes[1])

本特征分布正常，不需要额外操作

**NumberRealEstateLoansOrLines**

In [None]:
# Histograms (bin width 1): training set on the left, test set on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
sns.histplot(data=train_data, x='NumberRealEstateLoansOrLines', binwidth=1, ax=axes[0])
sns.histplot(data=test_data, x='NumberRealEstateLoansOrLines', binwidth=1, ax=axes[1])

同上，不需要额外操作

**NumberOfDependents**

In [None]:
# Summary statistics of NumberOfDependents.
train_data['NumberOfDependents'].to_frame().describe()

In [None]:
# Box plot of NumberOfDependents in the training set.
sns.boxplot(data=train_data, x='NumberOfDependents')

In [None]:
# Histograms (bin width 1): training set on the left, test set on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
sns.histplot(data=train_data, x='NumberOfDependents', binwidth=1, ax=axes[0])
sns.histplot(data=test_data, x='NumberOfDependents', binwidth=1, ax=axes[1])

发现NumberOfDependents同样存在离群点，但我们在填充缺失值时参考了这些值，所以这里不做处理

**Number of Days Past Due**

In [None]:
# Side-by-side box plots of the three past-due counters.
past_due_columns = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
]
plt.figure(figsize=(10, 5))
train_data[past_due_columns].boxplot()

* 发现在30-59天、60-89天、90天的三个特征的数据中均存在离群点
* 通常在90天内很难做到超过90次逾期的情况，所以这里将他们看作不合理的离群点，需要剔除掉

2. 数据预处理

根据以上数据探索的结论，我们对训练集和测试集进行空值填充，并且处理训练集中的异常值

In [None]:
# Fill missing values in both the training and the test frame:
#   * NumberOfDependents -> 0
#   * MonthlyIncome      -> 1577 (the fill value chosen in the exploration section)
#
# The filled column is assigned back to the frame. The previous form,
# `frame[col].replace(np.nan, value, inplace=True)`, mutates a Series obtained
# by chained selection: pandas 2.x emits a FutureWarning for it, and with
# copy-on-write enabled the frame is left unchanged.
for frame in (train_data, test_data):
    frame['NumberOfDependents'] = frame['NumberOfDependents'].fillna(0)
    frame['MonthlyIncome'] = frame['MonthlyIncome'].fillna(1577)

In [None]:
# Keep only training rows whose RevolvingUtilizationOfUnsecuredLines is at most 10.
utilization_within_limit = train_data['RevolvingUtilizationOfUnsecuredLines'] <= 10
train_data = train_data[utilization_within_limit]

In [None]:
# Replace an age of 0 with 18 in the training set.
# `assign` returns a new frame with the corrected column. The previous
# `train_data['age'].replace(0, 18, inplace=True)` mutated a Series selected
# from an already-filtered frame, which triggers SettingWithCopyWarning and
# does not propagate to the frame when copy-on-write is enabled.
train_data = train_data.assign(age=train_data['age'].replace(0, 18))

In [None]:
# Keep only training rows where all three past-due counters are below 90.
past_due_columns = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
    'NumberOfTime60-89DaysPastDueNotWorse',
]
all_below_90 = (train_data[past_due_columns] < 90).all(axis=1)
train_data = train_data[all_below_90]

# 选择模型

* 首先，本问题是一个分类问题，同时数据量较为庞大，因而主要采取适用于处理大规模数据的分类方法
* 其次，由于训练集存在正负样本比例不均衡的问题，在选择训练方法时应着重于选择基于Boosting和Bagging的相关算法
* 另外，在评估模型时应使用ROC曲线和AUC值，并且在训练时应使用K折交叉验证法

In [None]:
#导入机器学习模型包
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, auc, f1_score
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold

In [None]:
#将训练数据集划分为训练集train_x, train_y, 和测试集test_x test_y
from sklearn.model_selection import train_test_split
# Features are every column except the label; 20% of the rows are held out,
# stratified on the label so both parts keep the same class ratio.
x = train_data.drop(columns=['SeriousDlqin2yrs'])
y = train_data['SeriousDlqin2yrs']
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Define the ROC-curve plotting helper
def plot_roc(test_y, predict_y, label = None):
    """Plot the ROC curve of a set of scores against the true labels.

    Parameters
    ----------
    test_y : array-like
        True binary labels, passed to ``roc_curve``.
    predict_y : array-like
        Predicted scores for the positive class, passed to ``roc_curve``.
    label : str, optional
        Legend entry for the curve. A legend is drawn only when a label is
        given; previously the label was attached to the line but never shown.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    fpr, tpr, _thresholds = roc_curve(test_y, predict_y)
    plt.plot(fpr, tpr, linewidth = 2, label = label)
    # Dashed diagonal as the reference line (y = x over the FPR range).
    plt.plot(fpr, fpr, linestyle = '--', color = 'k')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    if label is not None:
        plt.legend(loc = 'lower right')
    plt.show()

In [None]:
# Stratified 10-fold cross-validation, shared by every model comparison and
# grid search in this notebook.
kfold = StratifiedKFold(n_splits=10)

# Candidate models keyed by display name. Keeping each name next to its model
# means the labels in the result table cannot drift out of step with the
# models. Estimators that accept `random_state` are seeded (42, the same seed
# as the train/hold-out split) so the comparison is reproducible.
candidate_models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'LDA': LinearDiscriminantAnalysis(),
    'XGBoost': XGBClassifier(eval_metric='auc', random_state=42),
    'GBDT': GradientBoostingClassifier(random_state=42),
    'LGBM': LGBMClassifier(random_state=42),
}
classifiers = list(candidate_models.values())

# Cross-validate each candidate, scored by ROC AUC.
cv_results = [
    cross_val_score(classifier, train_x, train_y,
                    scoring='roc_auc', cv=kfold, n_jobs=-1)
    for classifier in classifiers
]

# Mean and standard deviation of the fold scores, one entry per model.
cv_means = [fold_scores.mean() for fold_scores in cv_results]
cv_std = [fold_scores.std() for fold_scores in cv_results]
cvResDf = pd.DataFrame({'cv_mean': cv_means,
                        'cv_std': cv_std,
                        'algorithm': list(candidate_models)})

In [None]:
# Display the cross-validation summary (mean and std of ROC AUC per algorithm).
cvResDf

In [None]:
# Visualise the cross-validation results, best model first.
# The error bars are taken from the SAME sorted frame as the bars. The previous
# version passed the unsorted `cv_std` list as `xerr`, so after sorting by
# `cv_mean` each error bar could end up on a different algorithm's bar.
cv_sorted = cvResDf.sort_values(by='cv_mean', ascending=False)
cvResFacet = sns.FacetGrid(cv_sorted, sharex=False, sharey=False, aspect=2)
cvResFacet.map(sns.barplot, 'cv_mean', 'algorithm',
               order=cv_sorted['algorithm'].tolist(),  # pin bar order to the sorted rows
               xerr=cv_sorted['cv_std'].to_numpy(),
               palette='muted')
cvResFacet.set(xlim=(0.7, 0.9))
cvResFacet.add_legend()

* 在初步结果中我们看到,GBDT和LightGBM的效果最好
* 然而本问题的训练集总共包含15万条数据,而GBDT的计算复杂度较高,所以在本问题中使用GBDT会需要很长的计算时间
* 在模型鲁棒性方面,GBDT效果也不如LightGBM
* 因此我们选择LightGBM作为最终的分类模型

# 参数优化

在优化参数部分,我们使用基于10折交叉验证法的网格搜索法来进行优化

待优化的参数有:
* n_estimators: 拟合的树的棵数，即训练轮数
* learning_rate: 学习率
* max_depth: 每个弱学习器也就是决策树的最大深度
* feature_fraction:子特征处理列采样，用来控制过拟合
* num_leaves: 树的最大叶子数，用于控制模型复杂性

n_estimators和learning_rate为模型的外参数,并且二者之间相互联系,所以在优化时将这两个参数一起优化.

In [None]:
from warnings import simplefilter
# Silence RuntimeWarnings raised during the search.
simplefilter("ignore", category=RuntimeWarning)

# Step 1: tune n_estimators and learning_rate together.
LGBM_param_test1 = dict(n_estimators=[250, 275, 300],
                        learning_rate=[0.01, 0.02, 0.05])
gsearch1 = GridSearchCV(
    estimator=LGBMClassifier(),
    param_grid=LGBM_param_test1,
    scoring='roc_auc',
    cv=kfold,
    n_jobs=-1,
    verbose=1,
)
gsearch1.fit(train_x, train_y)
print(gsearch1.best_score_, gsearch1.best_params_, sep='\n')

我们将learning_rate设置为0.02, n_estimators设置为300,然后搜索最优max_depth

In [None]:
# Step 2: tune max_depth with n_estimators and learning_rate fixed.
LGBM_param_test2 = dict(max_depth=range(15, 22))
gsearch2 = GridSearchCV(
    estimator=LGBMClassifier(n_estimators=300, learning_rate=0.02),
    param_grid=LGBM_param_test2,
    scoring='roc_auc',
    cv=kfold,
    n_jobs=-1,
    verbose=1,
)
gsearch2.fit(train_x, train_y)
print(gsearch2.best_score_, gsearch2.best_params_, sep='\n')

max_depth的最优值是18,然后搜索最优feature_fraction和num_leaves

In [None]:
from warnings import simplefilter
# Silence RuntimeWarnings raised during the search.
simplefilter("ignore", category=RuntimeWarning)

# Step 3: tune feature_fraction and num_leaves with the earlier choices fixed.
LGBM_param_test3 = dict(feature_fraction=[0.5, 0.6, 0.7],
                        num_leaves=[25, 30, 35])
gsearch3 = GridSearchCV(
    estimator=LGBMClassifier(n_estimators=300, learning_rate=0.02, max_depth=18),
    param_grid=LGBM_param_test3,
    scoring='roc_auc',
    cv=kfold,
    n_jobs=-1,
    verbose=1,
)
gsearch3.fit(train_x, train_y)
print(gsearch3.best_score_, gsearch3.best_params_, sep='\n')

得到最优feature_fraction为 0.5, num_leaves为 30

从而最后得出的参数为:
* n_estimators: 300
* learning_rate: 0.02
* max_depth: 18
* feature_fraction: 0.5
* num_leaves: 30

# 训练模型

我们将优化好的参数设置在模型中,将训练集带入模型进行训练

In [None]:
# Train LightGBM with the tuned hyper-parameters, then score the hold-out split.
tuned_params = dict(
    n_estimators=300,
    learning_rate=0.02,
    max_depth=18,
    feature_fraction=0.5,
    num_leaves=30,
)
lgbm = LGBMClassifier(**tuned_params)
lgbm.fit(train_x, train_y)
# Column 1 of predict_proba is the probability of the positive class.
predict_y = lgbm.predict_proba(test_x)[:, 1]

查看ROC曲线和AUC值:

In [None]:
# ROC curve of the tuned model's predictions on the hold-out split.
plot_roc(test_y, predict_y)

In [None]:
# ROC AUC of the tuned model's predictions on the hold-out split.
roc_auc_score(test_y, predict_y)

最终在测试集上得到AUC得分为0.8694

# 提交结果

In [None]:
# Build the submission file from the competition test set.
# The label column is dropped so the features match what the model was trained on.
test_df = test_data.drop(columns=['SeriousDlqin2yrs'])
lgbm_clf_proba = lgbm.predict_proba(test_df)[:, 1]  # probability of the positive class

# Ids run 1..N, with N taken from the data. The previous hard-coded
# `np.arange(1, 101504)` made the DataFrame constructor fail with a length
# mismatch whenever the test set did not contain exactly 101503 rows.
ids = np.arange(1, len(test_df) + 1)
lgbm_submission = pd.DataFrame({'Id': ids, 'Probability': lgbm_clf_proba})
lgbm_submission.to_csv("lgbm_submission.csv", index=False)

In [None]:
# Display the submission table (Id, Probability).
lgbm_submission