In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

> ## **整体流程**

1. 问题背景 <br>
    1.1 问题描述<br>
    1.2 特征描述<br>
2. 导入数据<br>
3. 数据观察与分析<br>
    3.1 数据概览<br>
    3.2 缺失值情况<br>
    3.3 异常值情况与特征分析<br>
    3.4 数据相关性<br>
    3.5 数据类别分布情况<br>
4. 数据预处理<br>
    4.1 处理缺失值<br>
    4.2 处理异常值<br>
5. 模型对比<br>
6. 优化调参<br>
7. 提交<br>

> ## **导包**

In [None]:
import math
import csv
import os
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.model_selection import cross_validate, train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder, StandardScaler
from sklearn import metrics
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
pd.set_option('display.float_format',lambda x : '%.4f' % x)

import warnings
warnings.filterwarnings('ignore')

## 1. **问题背景** 

> ### 1.1 **问题描述**

Banks play a crucial role in market economies. They decide who can get finance and on what terms and can make or break investment decisions. For markets and society to function, individuals and companies need access to credit. 

Credit scoring algorithms, which make a guess at the probability of default, are the method banks use to determine whether or not a loan should be granted. This competition requires participants to improve on the state of the art in credit scoring, by predicting the probability that somebody will experience financial distress in the next two years.

The goal of this competition is to build a model that borrowers can use to help make the best financial decisions.

银行在市场经济中发挥着至关重要的作用，他们可以决定谁可以获得资金以及以什么条件获得资金。为了让市场和社会发挥作用，个人和公司需要获得信用贷款来满足自身或企业需求。<br>
这个问题的目的在于通过某人月收入、负债情况、逾期记录、家庭情况等一系列相关数据，预测某人在未来两年内遇到财务困境的可能性。

> ### 1.2 **特征描述**

* **SeriousDlqin2yrs**：借款人是否出现逾期90天或更严重的情况（目标变量），**分类型特征**
* **RevolvingUtilizationOfUnsecuredLines**：信用卡和个人信用额度的总余额（房地产除外）和无分期付款债务（如汽车贷款）除以信用额度总和，**数值型特征**
* **age**：借款人年龄（年），**连续型特征**
* **NumberOfTime30-59DaysPastDueNotWorse**：借款人逾期 30-59 天但在过去 2 年内没有更糟的次数，**离散型特征**
* **DebtRatio**：每月债务支付、赡养费、生活费用除以每月总收入，**数值型特征**
* **MonthlyIncome**：月收入，**连续型特征**
* **NumberOfOpenCreditLinesAndLoans**：未结贷款（如汽车贷款或抵押贷款等分期付款）和信用额度（例如信用卡）的数量，**离散型特征**
* **NumberOfTimes90DaysLate**：借款人逾期 90 天或更长时间的次数，**离散型特征**
* **NumberRealEstateLoansOrLines**：抵押贷款和房地产贷款的数量，包括房屋净值信贷额度，**离散型特征**
* **NumberOfTime60-89DaysPastDueNotWorse**：借款人逾期 60-89 天但在过去 2 年内没有更糟的次数，**离散型特征**
* **NumberOfDependents**：家庭中不包括他们自己（配偶、子女等）的家属人数，**离散型特征**

## 2. **导入数据**

In [None]:
# Load the training and test sets, using the first CSV column as the index.
DATA_DIR = '/kaggle/input/GiveMeSomeCredit'
train = pd.read_csv(DATA_DIR + '/cs-training.csv', index_col=0)
test = pd.read_csv(DATA_DIR + '/cs-test.csv', index_col=0)
train

## 3. **数据观察与分析** 

>### 3.1 **数据概览** 

In [None]:
# (rows, columns) of the training set and of the test set.
tuple(frame.shape for frame in (train, test))

* 通过对数据的观察，我们可以得到一共11个特征，训练集包含了150000条数据，测试集则包含了101503条数据，接下来我们观察特征的异常值：

In [None]:
# Summary statistics of the training-set columns, used to spot extreme values.
train.describe()

* 通过describe函数可以看到所有特征均为数值数据，不需要对标签进行转换，但是其中存在极端值以及缺失值，对此需要进一步分析和解决。

In [None]:
# Class balance of the target. The column is passed as `x=` because from
# seaborn 0.12 the only positional argument is `data`; the positional form
# collides with `data=train` and raises a TypeError.
sns.countplot(x='SeriousDlqin2yrs', data=train)

* 对数据类别标签进行绘图后发现0,1样本之间分布不平衡，比例超过了10:1
* 因此我们需要采用roc_auc作为评分指标，因为常规的准确率和召回率可能会因为类别比例不均衡导致出现偏差
* 对于这种情况我们应该优先使用bagging和boosting算法

>### 3.2 **缺失值情况**

In [None]:
# Per-column missing-value summary for the training set.
train_nulls = train.isnull()
pd.DataFrame({
    'has NULL': train_nulls.any(axis=0),
    'count': train_nulls.sum().values,
    'ratio': train_nulls.mean().round(4),
})

In [None]:
# Per-column missing-value summary for the test set.
test_nulls = test.isnull()
pd.DataFrame({
    'has NULL': test_nulls.any(axis=0),
    'count': test_nulls.sum().values,
    'ratio': test_nulls.mean().round(4),
})

* 进一步观察发现，在11个特征中，有两个特征存在空值，且在训练集和测试集中都分别占比19.82%和2.62%，占比较大，不能直接剔除，同时，150000条数据对应到自变量中的10个特征，不能轻易将该特征忽略，因此，我们接下来首先要分别对MonthlyIncome和NumberOfDependents的空缺值进行填补。

#### MonthlyIncome

In [None]:
# Profile of the training rows whose MonthlyIncome is missing.
train.loc[train['MonthlyIncome'].isnull(), ['age', 'DebtRatio', 'NumberOfDependents']].describe()

In [None]:
# Profile of the training rows whose MonthlyIncome is known.
train.loc[train['MonthlyIncome'].notnull(), ['age', 'MonthlyIncome', 'DebtRatio', 'NumberOfDependents']].describe()

In [None]:
# Test rows with a known MonthlyIncome and DebtRatio above 1.
high_debt_known_income = test['MonthlyIncome'].notnull() & (test['DebtRatio'] > 1)
test.loc[high_debt_known_income, ['age', 'MonthlyIncome', 'DebtRatio', 'NumberOfDependents']].describe()

* 通过对比MonthlyIncome为空和非空时DebtRatio的数据分布不难看出，当MonthlyIncome为空时DebtRatio前25%的值就已经达到了123，而非空时前75%依旧为0.4826，差距非常明显，所以说明MonthlyIncome为空的人大概率DebtRatio的取值也很高，说明收入相对较低且贷款金额大于了自身收入水平。那么我们针对DebtRatio>1的人群进行进一步分析。

In [None]:
# For each DebtRatio threshold, count the rows above it whose MonthlyIncome is
# missing, and express that count as a share of all missing MonthlyIncome rows.
debt_ratio = np.arange(1, 121, 10)
income_missing = train['MonthlyIncome'].isnull()
total_missing = income_missing.sum()
missing_above = [income_missing[train['DebtRatio'] > threshold].sum() for threshold in debt_ratio]
res = [[n_missing, n_missing / total_missing] for n_missing in missing_above]
res

In [None]:
# Same subset as before (known income, DebtRatio > 1), reported at quintile cut points.
high_debt_known_income = test['MonthlyIncome'].notnull() & (test['DebtRatio'] > 1)
test.loc[high_debt_known_income, ['age', 'MonthlyIncome', 'DebtRatio', 'NumberOfDependents']].describe(
    percentiles=[0.2, 0.4, 0.6, 0.8]
)

In [None]:
# MonthlyIncome statistics of the training rows with a known income, split by DebtRatio range.
train_known = train[train['MonthlyIncome'].notnull()]
train_known_ratio = train_known['DebtRatio']
train_bin_masks = {
    'DebtRatio min-1': train_known_ratio <= 1,
    'DebtRatio 1-25': (train_known_ratio > 1) & (train_known_ratio < 26),
    'DebtRatio 26-123': (train_known_ratio > 25) & (train_known_ratio < 124),
    'DebtRatio 124-1159': (train_known_ratio > 123) & (train_known_ratio < 1160),
    'DebtRatio 1160-2382': (train_known_ratio > 1159) & (train_known_ratio < 2383),
    'DebtRatio 2383-max': train_known_ratio > 2382,
}
pd.DataFrame({
    label: train_known.loc[mask, 'MonthlyIncome'].describe()
    for label, mask in train_bin_masks.items()
})

In [None]:
# MonthlyIncome statistics of the test rows with a known income, split by DebtRatio range.
test_known = test[test['MonthlyIncome'].notnull()]
test_known_ratio = test_known['DebtRatio']
test_bin_masks = {
    'DebtRatio min-1': test_known_ratio <= 1,
    'DebtRatio 1-25': (test_known_ratio > 1) & (test_known_ratio < 26),
    'DebtRatio 26-123': (test_known_ratio > 25) & (test_known_ratio < 124),
    'DebtRatio 124-1159': (test_known_ratio > 123) & (test_known_ratio < 1160),
    'DebtRatio 1160-2382': (test_known_ratio > 1159) & (test_known_ratio < 2383),
    'DebtRatio 2383-max': test_known_ratio > 2382,
}
pd.DataFrame({
    label: test_known.loc[mask, 'MonthlyIncome'].describe()
    for label, mask in test_bin_masks.items()
})

* 分别对不同DebtRatio进行分析发现，不同部分的DebtRatio对应的MonthlyIncome的取值差异非常大，这在训练集和测试集中都存在相同的现象，因此单取整体的平均数作为填充不够合理，我们采用DebtRatio在不同区间内的MonthlyIncome分别用对应区间内的均值进行填充。

#### NumberOfDependents

In [None]:
# Profile of the training rows whose NumberOfDependents is missing.
train.loc[train['NumberOfDependents'].isnull(), ['age', 'MonthlyIncome', 'DebtRatio', 'NumberOfDependents']].describe()

* 通过该表不难看出当NumberOfDependents为空值时，MonthlyIncome也同样都为空值，而看MonthlyIncome的数据分布可以得出，当MonthlyIncome为空值时前75%的NumberOfDependents数据为0，而且平均值仅为0.3，说明空缺的NumberOfDependents值绝大多数都应该为0，那么我们就决定将NumberOfDependents的空值全部填0。

>### 3.3 **异常值情况** 

#### DebtRatio 

In [None]:
# DebtRatio summary statistics, train and test side by side.
pd.DataFrame({
    split: frame['DebtRatio'].describe()
    for split, frame in (('train', train), ('test', test))
})

In [None]:
# DebtRatio distribution: train on the top row, test on the bottom row;
# box plot on the left, jittered strip plot on the right.
fig, axes = plt.subplots(2, 2, figsize=(30, 20))
for row, (split, frame) in enumerate((('train', train), ('test', test))):
    # Pass the series as `x=`: seaborn 0.11 treated the first positional
    # argument as x, 0.12+ treats it as `data`, so the positional form does
    # not plot the same way across versions.
    ax = sns.boxplot(x=frame['DebtRatio'], ax=axes[row, 0])
    ax.set_title(f'DebtRatio box plot ({split})')
    # jitter sets how far the points are spread apart
    ax = sns.stripplot(x=frame['DebtRatio'], color="orange", jitter=0.2, size=5, ax=axes[row, 1])
    ax.set_title(f'DebtRatio strip plot ({split})')

In [None]:
# Number of training rows with DebtRatio above 50000 and their share of the
# training set. The denominator is the actual row count, not a hard-coded 150000.
count = (train['DebtRatio'] > 50000).sum()
[count, count / len(train)]

* 通过箱型图，散点图以及计数统计不难看出DebtRatio是存在个别数值过大的离群点的，整体数据集中在[0,1]之间，那么大于50000的数据点只有12个，相对于150000条数据来说可忽略不计，因此将这12个点剔除。

####  RevolvingUtilizationOfUnsecuredLines

In [None]:
# RevolvingUtilizationOfUnsecuredLines summary statistics, train vs. test.
# The 'test' column previously described the training set a second time.
pd.DataFrame({
    'train': train['RevolvingUtilizationOfUnsecuredLines'].describe(),
    'test': test['RevolvingUtilizationOfUnsecuredLines'].describe(),
})

In [None]:
# RevolvingUtilizationOfUnsecuredLines distribution: train on the top row,
# test on the bottom row; box plot on the left, jittered strip plot on the right.
fig, axes = plt.subplots(2, 2, figsize=(30, 20))
for row, (split, frame) in enumerate((('train', train), ('test', test))):
    values = frame['RevolvingUtilizationOfUnsecuredLines']
    # Pass the series as `x=`: seaborn 0.11 treated the first positional
    # argument as x, 0.12+ treats it as `data`, so the positional form does
    # not plot the same way across versions.
    ax = sns.boxplot(x=values, ax=axes[row, 0])
    ax.set_title(f'RevolvingUtilizationOfUnsecuredLines box plot ({split})')
    # jitter sets how far the points are spread apart
    ax = sns.stripplot(x=values, color="orange", jitter=0.2, size=5, ax=axes[row, 1])
    ax.set_title(f'RevolvingUtilizationOfUnsecuredLines strip plot ({split})')

In [None]:
# Number of training rows with RevolvingUtilizationOfUnsecuredLines above 5 and
# their share of the training set (actual row count, not a hard-coded 150000).
count = (train['RevolvingUtilizationOfUnsecuredLines'] > 5).sum()
[count, count / len(train)]

In [None]:
# Number of test rows with RevolvingUtilizationOfUnsecuredLines above 5 and
# their share of the test set (actual row count, not a hard-coded 101503).
count = (test['RevolvingUtilizationOfUnsecuredLines'] > 5).sum()
[count, count / len(test)]

* 通过箱型图以及散点分布图，不难看出RevolvingUtilizationOfUnsecuredLines特征的分布集中在[0,1]的区间内，而最大值则为50708，因此我们需要剔除部分离群点来避免极端值带来的影响。这里我们将阈值设定为5，通过count结果得到，在训练集和测试集中，大于5的点均只占比总数据的0.16%左右，占比很小，那么我们就剔除大于5的这部分数据点。

#### age 

In [None]:
# Summary statistics of age in the training set, as a one-column table.
train['age'].describe().to_frame()

In [None]:
# age distribution: train on the top row, test on the bottom row;
# box plot on the left, jittered strip plot on the right.
fig, axes = plt.subplots(2, 2, figsize=(30, 20))
# The original marker sizes are kept: 3 for train, 5 for test.
for row, (split, frame, marker_size) in enumerate((('train', train, 3), ('test', test, 5))):
    # Pass the series as `x=`: seaborn 0.11 treated the first positional
    # argument as x, 0.12+ treats it as `data`, so the positional form does
    # not plot the same way across versions.
    ax = sns.boxplot(x=frame['age'], ax=axes[row, 0])
    ax.set_title(f'age box plot ({split})')
    # jitter sets how far the points are spread apart
    ax = sns.stripplot(x=frame['age'], color="orange", jitter=0.2, size=marker_size, ax=axes[row, 1])
    ax.set_title(f'age strip plot ({split})')

In [None]:
# Number of training rows with age == 0 and their share of the training set.
age_is_zero = train['age'] == 0
count = train.loc[age_is_zero, 'age'].count()
[count, count / train.shape[0]]

In [None]:
# Number of test rows with age == 0 and their share of the test set.
age_is_zero = test['age'] == 0
count = test.loc[age_is_zero, 'age'].count()
[count, count / test.shape[0]]

* 通过散点图和箱型图可以看出age特征的数据数值合理，只有训练集中age=0的点是有问题的，违背现实常理，即0岁的婴儿无法贷款，通过count计数后发现仅有1个为0的点，故直接删除。测试集无相关的情况。

#### NumberOfTime30-59DaysPastDueNotWorse, NumberOfTime60-89DaysPastDueNotWorse, NumberOfTimes90DaysLate

In [None]:
# Summary statistics of the three past-due counters in the training set.
past_due_labels = {
    '30-59days': 'NumberOfTime30-59DaysPastDueNotWorse',
    '60-89days': 'NumberOfTime60-89DaysPastDueNotWorse',
    '90+days': 'NumberOfTimes90DaysLate',
}
pd.DataFrame({label: train[column].describe() for label, column in past_due_labels.items()})

In [None]:
# Summary statistics of the three past-due counters in the test set.
past_due_labels = {
    '30-59days': 'NumberOfTime30-59DaysPastDueNotWorse',
    '60-89days': 'NumberOfTime60-89DaysPastDueNotWorse',
    '90+days': 'NumberOfTimes90DaysLate',
}
pd.DataFrame({label: test[column].describe() for label, column in past_due_labels.items()})

In [None]:
# Histograms of the three past-due counters: one row per feature,
# train in the left column and test in the right column.
fig, axes = plt.subplots(3, 2, figsize=(30, 30))
past_due_columns = (
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
)
for row, column in enumerate(past_due_columns):
    for col, frame in enumerate((train, test)):
        ax = sns.histplot(frame[column], binwidth=1, ax=axes[row, col])

In [None]:
# Distinct values of the 30-59 days past-due counter in train and in test.
tuple(np.unique(frame['NumberOfTime30-59DaysPastDueNotWorse']) for frame in (train, test))

In [None]:
# Distinct values of the 60-89 days past-due counter in train and in test.
tuple(np.unique(frame['NumberOfTime60-89DaysPastDueNotWorse']) for frame in (train, test))

In [None]:
# Distinct values of the 90+ days late counter in train and in test.
tuple(np.unique(frame['NumberOfTimes90DaysLate']) for frame in (train, test))

* 通过直方图观察以及对这三个特征取值的统计，不难发现这三个特征在训练集和测试集中都存在着96和98两个离群点的取值，在过去两年内逾期超过90次同样不符合现实常理，这一部分离群点占比非常小，因此我们去掉对应的离群点，即删除特征值大于95的点。

####  NumberOfTimeInTotal

In [None]:
# Total number of past-due events per row: the sum of the three past-due counters.
past_due_columns = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
]
# skipna=False keeps the semantics of `+`: a missing counter gives a missing total.
NumberOfTimeInTotal_train = train[past_due_columns].sum(axis=1, skipna=False)
NumberOfTimeInTotal_test = test[past_due_columns].sum(axis=1, skipna=False)

pd.DataFrame({
    'train': NumberOfTimeInTotal_train.describe(),
    'test': NumberOfTimeInTotal_test.describe(),
})

* 通过对上述NumberOfTime30-59DaysPastDueNotWorse，NumberOfTime60-89DaysPastDueNotWorse，NumberOfTimes90DaysLate三个特征的观察与分析，我认为它们三个特征相似性和相关性很强，特征有些冗余，需要一个额外特征来帮助表示其意义，因此增加额外特征NumberOfTimeInTotal=NumberOfTime30-59DaysPastDueNotWorse+NumberOfTime60-89DaysPastDueNotWorse+NumberOfTimes90DaysLate。

####  NumberOfOpenCreditLinesAndLoans

In [None]:
# NumberOfOpenCreditLinesAndLoans summary statistics, train and test side by side.
pd.DataFrame({
    split: frame['NumberOfOpenCreditLinesAndLoans'].describe()
    for split, frame in (('train', train), ('test', test))
})

In [None]:
# Histograms of NumberOfOpenCreditLinesAndLoans: train on the left, test on the right.
fig, axes = plt.subplots(1, 2, figsize=(30, 10))
for panel, frame in zip(axes, (train, test)):
    ax = sns.histplot(frame['NumberOfOpenCreditLinesAndLoans'], binwidth=1, ax=panel)

In [None]:
# Distinct values of NumberOfOpenCreditLinesAndLoans in train and in test.
tuple(np.unique(frame['NumberOfOpenCreditLinesAndLoans']) for frame in (train, test))

* 通过对直方图和特征数据值的观察，NumberOfOpenCreditLinesAndLoans特征无离群点，数据分布合理，不做其他处理。

####  NumberRealEstateLoansOrLines

In [None]:
# NumberRealEstateLoansOrLines summary statistics, train and test side by side.
pd.DataFrame({
    split: frame['NumberRealEstateLoansOrLines'].describe()
    for split, frame in (('train', train), ('test', test))
})

In [None]:
# Distinct values of NumberRealEstateLoansOrLines in train and in test.
tuple(np.unique(frame['NumberRealEstateLoansOrLines']) for frame in (train, test))

In [None]:
# Histograms of NumberRealEstateLoansOrLines: train on the left, test on the right.
fig, axes = plt.subplots(1, 2, figsize=(30, 10))
for panel, frame in zip(axes, (train, test)):
    ax = sns.histplot(frame['NumberRealEstateLoansOrLines'], binwidth=1, ax=panel)

* 通过对直方图和特征数据值的观察，NumberRealEstateLoansOrLines特征无离群点，数据分布合理，不做其他处理。

####  NumberOfDependents

In [None]:
# NumberOfDependents summary statistics, train and test side by side.
pd.DataFrame({
    split: frame['NumberOfDependents'].describe()
    for split, frame in (('train', train), ('test', test))
})

In [None]:
# NumberOfDependents distribution: train on the top row, test on the bottom row;
# box plot on the left, jittered strip plot on the right.
fig, axes = plt.subplots(2, 2, figsize=(30, 20))
# The original marker sizes are kept: 3 for train, 5 for test.
for row, (split, frame, marker_size) in enumerate((('train', train, 3), ('test', test, 5))):
    # Pass the series as `x=`: seaborn 0.11 treated the first positional
    # argument as x, 0.12+ treats it as `data`, so the positional form does
    # not plot the same way across versions.
    ax = sns.boxplot(x=frame['NumberOfDependents'], ax=axes[row, 0])
    ax.set_title(f'NumberOfDependents box plot ({split})')
    # jitter sets how far the points are spread apart
    ax = sns.stripplot(x=frame['NumberOfDependents'], color="orange", jitter=0.2, size=marker_size, ax=axes[row, 1])
    ax.set_title(f'NumberOfDependents strip plot ({split})')

* 在训练集中，NumberOfDependents的最大取值为20，家属人数为20人在现实中还算合理，故不对其做额外处理。

>### 3.4 **数据相关性**

In [None]:
# Heat map of the pairwise correlations between the training-set columns.
correlation = train.corr()
f, ax = plt.subplots(figsize=(10, 10))
ax.set_title('Feature Correlation', y=1, size=12)
sns.heatmap(correlation, annot=True, vmax=0.8, ax=ax)

* 通过热力图可以看出NumberOfTime30-59DaysPastDueNotWorse、NumberOfTime60-89DaysPastDueNotWorse、NumberOfTimes90DaysLate三个特征之间相关性很强，因此证明需要添加额外的特征NumberOfTimeInTotal。

## 4. **数据预处理** 

In [None]:
# Shapes of both frames before preprocessing.
tuple(frame.shape for frame in (train, test))

>### 4.1 **处理缺失值**

In [None]:
# Replace missing NumberOfDependents values in the training set with 0.
dependents_missing = train['NumberOfDependents'].isnull()
train.loc[dependents_missing, 'NumberOfDependents'] = 0

In [None]:
# Impute the missing MonthlyIncome values of the training set.

# Method 1 (not used): fill every missing value with the mean of the non-null
# MonthlyIncome among rows with DebtRatio > 1.
# train['MonthlyIncome'] = train['MonthlyIncome'].fillna(1577)
# test['MonthlyIncome'] = test['MonthlyIncome'].fillna(1176)

# Method 2 (used): for each DebtRatio range, fill with the mean of the non-null
# MonthlyIncome in that range.
# Each entry is (exclusive lower bound, inclusive upper bound, fill value).
TRAIN_INCOME_FILL_BY_DEBT_RATIO = [
    (-np.inf, 1, 6952.4914),
    (1, 25, 3066.7276),
    (25, 123, 326.1439),
    # The upper bound used to be mistyped as 11659, which swallowed the next
    # bin and part of the last one, so 0.2975 was never applied.
    (123, 1159, 1.0041),
    (1159, 2382, 0.2975),
    (2382, np.inf, 0.0877),
]
# Take the mask once, before any fill, so every row is assigned by exactly one bin.
income_missing = train['MonthlyIncome'].isnull()
for lower, upper, fill_value in TRAIN_INCOME_FILL_BY_DEBT_RATIO:
    in_bin = (train['DebtRatio'] > lower) & (train['DebtRatio'] <= upper)
    train.loc[income_missing & in_bin, 'MonthlyIncome'] = fill_value

# # Method 3 (not used): fit a random forest on the other 10 features to
# # predict the missing MonthlyIncome values.
# columns = [*train.columns]
# columns.remove('SeriousDlqin2yrs')
# columns.remove('MonthlyIncome')
# X = train.loc[:, columns]
# y = train.loc[:, 'MonthlyIncome']
# X_train = X.loc[train['MonthlyIncome'].notnull()]
# y_train = y.loc[train['MonthlyIncome'].notnull()]
# X_pred = X.loc[train['MonthlyIncome'].isnull()]
# rfr = RandomForestRegressor(random_state=2021, n_estimators=600,max_depth=9, n_jobs=-1)
# rfr.fit(X_train, y_train)
# y_pred = rfr.predict(X_pred).round()
# train.loc[train['MonthlyIncome'].isnull(), 'MonthlyIncome'] = y_pred

* 在处理MonthlyIncome时，选取了三种方法，分别是：
* 方法1：将全部缺失值填为DebtRatio>1 的非空MonthlyIncome的均值
* 方法2：针对不同区间的DebtRatio填入不同区间内对应非空MonthlyIncome的均值
* 方法3：应用随机森林模型，以其他10个特征为自变量，MonthlyIncome为因变量，拟合预测空缺的MonthlyIncome

通过对比在Lightgbm中的roc_auc评分，最终确定方法2最优。

>### 4.2 **处理异常值**

In [None]:
# Add the NumberOfTimeInTotal feature (computed earlier) to both frames.
for frame, total in ((train, NumberOfTimeInTotal_train), (test, NumberOfTimeInTotal_test)):
    frame['NumberOfTimeInTotal'] = total

In [None]:
# Keep only the training rows that pass every outlier rule.
keep_rows = (
    (train['DebtRatio'] < 50000)                               # DebtRatio below 50000
    & (train['RevolvingUtilizationOfUnsecuredLines'] <= 5)     # utilisation of at most 5
    & (train['age'] > 0)                                       # age must be positive
    & (train['NumberOfTime30-59DaysPastDueNotWorse'] < 95)     # past-due counters below 95
    & (train['NumberOfTimes90DaysLate'] < 95)
    & (train['NumberOfTime60-89DaysPastDueNotWorse'] < 95)
)
train = train[keep_rows]

# Split the target column off from the features.
y = train[['SeriousDlqin2yrs']]
train = train.drop(columns=['SeriousDlqin2yrs'])

## 5. **模型对比**

In [None]:
# Shapes of the feature matrix and of the target after preprocessing.
tuple(part.shape for part in (train, y))

In [None]:
# Hold out 20% of the rows for validation, with a fixed seed.
split_parts = train_test_split(train, y, test_size=0.2, random_state=2021)
x_train, x_val, y_train, y_val = split_parts
tuple(part.shape for part in (x_train, y_train, x_val, y_val))

In [None]:
# Baseline comparison: mean 15-fold cross-validated ROC AUC of several
# classifiers, with default hyper-parameters apart from the seed.
SEED = 2021  # same seed as the train/validation split; fixed so the table is reproducible

# The display name and the estimator are kept together so that the result table
# cannot drift out of sync when a model is added or commented out.
named_classifiers = {
    'KNN': KNeighborsClassifier(),
    # 'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=SEED),
    'Random Forest': RandomForestClassifier(random_state=SEED),
    'GBDT': GradientBoostingClassifier(random_state=SEED),
    'LightGBM': LGBMClassifier(random_state=SEED),
    'XGBoost': XGBClassifier(random_state=SEED),
}
model_classifiers = list(named_classifiers.values())

# The estimators expect a 1-D target; y_train is a single-column DataFrame.
y_train_1d = np.ravel(y_train)

# Per-fold scores. Named cv_fold_scores so it is not confused with
# sklearn.metrics.roc_auc_score.
cv_fold_scores = [
    cross_validate(classifier, x_train, y_train_1d, cv=15, scoring='roc_auc', n_jobs=-1)['test_score']
    for classifier in model_classifiers
]
cv_score_mean = [fold_scores.mean() for fold_scores in cv_fold_scores]

pd.DataFrame({'model': list(named_classifiers), 'cv_mean': cv_score_mean})

* 首先对一般分类常见模型采用默认参数进行训练分类，查看不同模型对应评分，采取最优模型进行进一步优化。

## 6. **模型优化与调参** 

* 根据默认参数跑分结果，我们采用效果最好的Lightgbm进行深入优化，其中每一步都采用KFold=15进行交叉验证

#### n_estimators, learning_rate

In [None]:
# param_grid = {'n_estimators': range(200, 800, 100),
#               'learning_rate': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1]
#              }

# estimator = lgb.LGBMClassifier()

# gbm = GridSearchCV(estimator, param_grid, scoring='roc_auc', cv=15)
# gbm.fit(x_train, y_train)
# print('Best parameters found by grid search are:', gbm.best_params_, gbm.best_score_)

* 对学习器数量（n_estimators）和学习率（learning_rate）一起粗调优

In [None]:
# param_grid = {'n_estimators': range(500, 700, 25)
#              }

# estimator = lgb.LGBMClassifier(objective='binary',
#                         learning_rate=0.01,)

# gbm = GridSearchCV(estimator, param_grid, scoring='roc_auc', cv=15)
# gbm.fit(x_train, y_train)
# print('Best parameters found by grid search are:', gbm.best_params_, gbm.best_score_)

* 在固定学习率（learning_rate=0.01）后，对学习器数量（n_estimators）单独进行细调优

#### max_depth, num_leaves

In [None]:
# param_grid = {'max_depth': range(4, 21, 2),
#               'num_leaves': range(20, 80, 20)
#              }

# estimator = lgb.LGBMClassifier(objective='binary',
#                         learning_rate=0.01,
#                         n_estimators=575,
#                         )

# gbm = GridSearchCV(estimator, param_grid, scoring='roc_auc', cv=15)
# gbm.fit(x_train, y_train)
# print('Best parameters found by grid search are:', gbm.best_params_, gbm.best_score_)

* 对最大深度和叶子数量一起粗调优
* 在得到最优值{'max_depth': 6, 'num_leaves': 20}后，进行更细粒度的调优

In [None]:
# param_grid = {'max_depth': [5,6,7],
#               'num_leaves': range(10, 21, 2)
#              }

# estimator = lgb.LGBMClassifier(learning_rate=0.01,
#                         n_estimators=575,
#                         )

# gbm = GridSearchCV(estimator, param_grid, scoring='roc_auc', cv=15)
# gbm.fit(x_train, y_train)
# print('Best parameters found by grid search are:', gbm.best_params_, gbm.best_score_)

#### min_child_samples, min_child_weight

* 随后分别对min_child_samples和min_child_weight两个参数进行调优，这两个参数都是为了降低过拟合
* min_data_in_leaf：是一个很重要的参数, 也叫min_child_samples，它的值取决于训练数据的样本个数和num_leaves。将其设置的较大可以避免生成一个过深的树, 但有可能导致欠拟合。
* min_sum_hessian_in_leaf：也叫min_child_weight，使一个结点分裂的最小海森值之和（Minimum sum of hessians in one leaf to allow a split. Higher values potentially decrease overfitting）。

In [None]:
# # 降低过拟合
# param_grid = {'min_child_samples': [20,22,24],
#               'min_child_weight':[0.0001, 0.00025, 0.0005, 0.00025,0.0001]
#                  }

# estimator = lgb.LGBMClassifier(objective='binary',
#                         learning_rate=0.01,
#                         n_estimators=600,
#                         max_depth=6,
#                         num_leaves=16,
#                         subsample=0.8,
#                         colsample_bytree=0.8,
#                         )

# gbm = GridSearchCV(estimator, param_grid, scoring='roc_auc', cv=15)
# gbm.fit(x_train, y_train)

# print('Best parameters found by grid search are:', gbm.best_params_, gbm.best_score_)

#### feature_fraction, bagging_fraction, bagging_freq

* 之后我们再对抽样参数进行调优

* feature_fraction参数来进行特征的子抽样。这个参数可以用来防止过拟合及提高训练速度。
* bagging_fraction+bagging_freq参数必须同时设置，bagging_fraction相当于subsample样本采样，可以使bagging更快地运行，同时也可以降低过拟合。
* bagging_freq默认0，表示bagging的频率，0意味着没有使用bagging，k意味着每k轮迭代进行一次bagging。

In [None]:
# 降低过拟合
# param_grid={
#     'feature_fraction': [0.5, 0.6, 0.7, 0.8, 0.9],
#     'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
#     'bagging_freq': [3,4,5,6]
# }

# estimator = lgb.LGBMClassifier(objective='binary',
#                             learning_rate=0.01,
#                             n_estimators=575,
#                             max_depth=6,
#                             num_leaves=16,
#                             min_child_samples=22,
#                             min_child_weight=0.0001,
#                             )

# gbm = GridSearchCV(estimator, param_grid, scoring='roc_auc', cv=15)
# gbm.fit(x_train, y_train)
# print('Best parameters found by grid search are:', gbm.best_params_, gbm.best_score_)

#### reg_alpha, reg_lambda

* 最后我们加入正则化参数进一步降低过拟合

In [None]:
# 加入正则化参数
# param_grid = {'reg_alpha': [0, 0.01, 0.5],
#               'reg_lambda': [0, 0.01, 0.5]
#               }

# estimator = lgb.LGBMClassifier(objective='binary',
#                             learning_rate=0.01,
#                             n_estimators=575,
#                             max_depth=6,
#                             num_leaves=16,
#                             bagging_fraction=0.6, 
#                             feature_fraction=0.5,
#                             bagging_freq=5,
#                             )

# gbm = GridSearchCV(estimator, param_grid, scoring='roc_auc', cv=15)
# gbm.fit(x_train, y_train)

# print('Best parameters found by grid search are:', gbm.best_params_, gbm.best_score_)

调优完成后最终模型参数为：
* learning_rate=0.01,
* n_estimators=575,
* max_depth=6,
* num_leaves=16,
* min_child_samples=22,
* min_child_weight=0.0001,
* bagging_fraction=0.6, 
* feature_fraction=0.5,
* bagging_freq=5,
* reg_alpha=0.5,
* reg_lambda=0.01

In [None]:
# Final LightGBM model with the tuned hyper-parameters.
lgbm_params = {
    'learning_rate': 0.01,
    'n_estimators': 575,
    'max_depth': 6,
    'num_leaves': 16,
    'bagging_fraction': 0.6,
    'feature_fraction': 0.5,
    'bagging_freq': 5,
    'min_child_samples': 22,
    'min_child_weight': 0.0001,
    'reg_alpha': 0.5,
    'reg_lambda': 0.01,
}
lgbm = LGBMClassifier(**lgbm_params)

lgbm.fit(x_train, y_train)
# Second column of predict_proba, used as the score for the ROC curve and AUC.
y_pred = lgbm.predict_proba(x_val)[:, 1]

* 绘制模型ROC曲线

In [None]:
# ROC curve on the validation set, with the diagonal as a dashed reference line.
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_pred)
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr)
roc_ax.plot(fpr, fpr, linestyle='--')
roc_ax.set_xlabel('FPR')
roc_ax.set_ylabel('TPR')
roc_ax.set_title('ROC curve')
plt.show()

In [None]:
# Area under the ROC curve on the validation set.
metrics.roc_auc_score(y_true=y_val, y_score=y_pred)

## 7. **提交** 

* 对测试集数据进行预处理

In [None]:
# Apply the same preprocessing steps to the test set.
test['NumberOfDependents'] = test['NumberOfDependents'].fillna(0)

# Fill missing MonthlyIncome with one constant per DebtRatio range.
# Each entry is (exclusive lower bound, inclusive upper bound, fill value).
TEST_INCOME_FILL_BY_DEBT_RATIO = [
    (-np.inf, 1, 7140.7728),
    (1, 25, 2999.2432),
    (25, 123, 70.0508),
    # The upper bound used to be mistyped as 11659, which swallowed the next
    # bin and part of the last one, so 0.2819 was never applied.
    (123, 1159, 5.6083),
    (1159, 2382, 0.2819),
    (2382, np.inf, 0.0927),
]
# Take the mask once, before any fill, so every row is assigned by exactly one bin.
test_income_missing = test['MonthlyIncome'].isnull()
for lower, upper, fill_value in TEST_INCOME_FILL_BY_DEBT_RATIO:
    in_bin = (test['DebtRatio'] > lower) & (test['DebtRatio'] <= upper)
    test.loc[test_income_missing & in_bin, 'MonthlyIncome'] = fill_value

test['NumberOfTimeInTotal'] = NumberOfTimeInTotal_test
# errors='ignore' lets the cell be re-run after the column has already been dropped.
test = test.drop(columns=['SeriousDlqin2yrs'], errors='ignore')

* 预测结果并输出csv文件

In [None]:
# Predicted probabilities for the test set; the second column is kept.
test_class_probabilities = lgbm.predict_proba(test)
y_test = test_class_probabilities[:, 1]

In [None]:
# Build the submission file with ids 1..N. N is taken from the predictions, not
# hard-coded as 101503, so the cell still works if the test set size changes.
index = np.arange(1, len(y_test) + 1)
submission = pd.DataFrame({'Id': index, 'Probability': y_test})
submission.to_csv("y_submission.csv", index=False)