In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. 导包

In [None]:
from  sklearn.ensemble import RandomForestRegressor
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import re 

# 2. 导入数据

In [None]:
# Load the training and test sets; the first CSV column is used as the row index
# (index_col=0) so it does not become a feature column.
train_data = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-training.csv',index_col=0)
test_data = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-test.csv',index_col=0)

# 3. 整理、准备、清洗数据

## 3.1 查看数据集包含哪些特征

In [None]:
# DataFrame.info() writes its summary to stdout and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
train_data.info()

In [None]:
# DataFrame.info() writes its summary to stdout and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
test_data.info()

可以看到测试集与训练集中数据的特征属性完全一致

| 字段名 | 定义                                 | 值                                            |
|----------|--------------------------------------------|------------------------------------------------|
| SeriousDlqin2yrs | 个人经历了超过90天的拖欠或者更糟的情况            | Y/N                                |
| RevolvingUtilizationOfUnsecuredLines   | 信用卡和个人信贷余额的总余额，减去房地产和没有分期付款的债务（如汽车贷款），除以信用总和                               | percentage                      |
| age | 年龄 | integer |
| NumberOfTime30-59DaysPastDueNotWorse      | 借款人逾期30-59天的次数，但在过去的两年内没有更差的信用记录     |    integer        | 
| DebtRatio      | 每月债务支付、赡养费和生活费费用之和除以月总收入 | percentage |
| MonthlyIncome    | 月收入  |   real  |
| NumberOfOpenCreditLinesAndLoans    | 开放贷款的数量和信用额度 | integer  |
| NumberOfTimes90DaysLate   | 借款人逾期90天及以上的次数 | integer |
| NumberRealEstateLoansOrLines     | 抵押贷款和房地产贷款的数量 | integer |
| NumberOfTime60-89DaysPastDueNotWorse    |借款人逾期60-89天的次数，但在过去的两年内没有更差的信用记录     |    integer        | 
| NumberOfDependents | 不包括自己在内的家属人数 | integer |


## 3.2 检查数据

### 3.2.1 检查训练集中是否有重复值

In [None]:
# Tally duplicated rows: True marks a row that repeats an earlier one, False marks the rest.
train_data.duplicated().value_counts()

训练集中存在609个重复样本

### 3.2.2 检查是否有离群值（异常值，噪声等）、缺失值等

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every training column.
train_data.describe()

使用describe()函数，查看训练集的缺失值、均值和中位数等。观察可以发现：
1. 训练集train_data中年龄age的最小值min为0，0岁的婴儿不可能申请到贷款。
2. 变量MonthlyIncome和NumberOfDependents存在缺失值，因为样本总数是150000，而变量MonthlyIncome数量为120269，变量NumberOfDependents数量为146076。所以变量MonthlyIncome共有缺失值29731个，NumberOfDependents共有缺失值3924个。
3. 逾期次数有几个值很离谱，NumberOfTime30-59DaysPastDueNotWorse表示逾期30-59天，NumberOfTimes90DaysLate表示逾期90天以上， NumberOfTime60-89DaysPastDueNotWorse表示逾期60-89天，它们的最大值均为98。因为样本记录的是过去两年内借款人的逾期情况，所以两年内逾期超过90天次数达到98次基本不合理。

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every test column.
test_data.describe()

用同样的方法，查看测试集的缺失值、均值和中位数等。

根据以上分析，检查逾期次数的分布

In [None]:
# Training set: box plots of the three past-due counters to expose outliers.
overdue_columns = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
]
plt.figure(figsize=(19, 12))
overdue_frame = train_data[overdue_columns]
overdue_frame.boxplot()
plt.show()

可见超过90的点确实是异常值，需要进一步处理

## 3.3 处理数据

### 3.3.1 删去训练集中的重复值

In [None]:
# Remove exact duplicate rows from the training set.
train_data = train_data.drop_duplicates()
# Show the per-column non-null counts that remain after de-duplication.
train_data.count()

可以看到，删去重复值之后还剩149391条样本数据

### 3.3.2 处理异常值

一般成年人才有资格贷款，删去训练集中年龄为未成年人的值

In [None]:
# Keep adult applicants only. The bound is inclusive (>= 18): an 18-year-old is an adult,
# so a strict "> 18" would wrongly discard them. `.copy()` detaches the filtered frame so the
# column assignments in later cells write to an independent DataFrame instead of a slice
# (avoids SettingWithCopyWarning).
train_data = train_data[train_data['age'] >= 18].copy()

用中位数替换数据集中逾期次数的离群值

In [None]:
def replace90to100(column):
    """Return a list copy of `column` with values in [90, 100] replaced by the column median.

    The median is computed once over the whole input column (placeholder values included).
    Values outside the closed interval [90, 100] are passed through unchanged.

    Parameters
    ----------
    column : pandas.Series
        Numeric column to clean.

    Returns
    -------
    list
        One entry per input value, in the original order.
    """
    median_value = column.median()
    return [median_value if 90 <= value <= 100 else value for value in column]

# Replace the implausible 90-100 past-due counts in each of the three overdue counters
# with that column's median.
# NOTE(review): only train_data is cleaned here; test_data keeps its raw values — confirm this is intended.
for overdue_col in (
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
):
    train_data[overdue_col] = replace90to100(train_data[overdue_col])

### 3.3.3 处理缺失值

用平均值填充月收入MonthlyIncome中的缺失值

In [None]:
# Fill missing MonthlyIncome values with the mean of the same dataset
# (training mean for train_data, test mean for test_data).
train_income_mean = train_data['MonthlyIncome'].mean()
test_income_mean = test_data['MonthlyIncome'].mean()
train_data['MonthlyIncome'] = train_data['MonthlyIncome'].fillna(train_income_mean)
test_data['MonthlyIncome'] = test_data['MonthlyIncome'].fillna(test_income_mean)

用中位数填充家属人数NumberOfDependents中的缺失值

In [None]:
# Fill missing NumberOfDependents values with the median of the same dataset.
# The result is assigned back to the column explicitly: calling fillna(inplace=True) on
# `df[col]` is chained assignment, which raises a FutureWarning in pandas 2.x and does not
# modify the parent DataFrame under Copy-on-Write.
train_data['NumberOfDependents'] = train_data['NumberOfDependents'].fillna(
    train_data['NumberOfDependents'].median()
)
test_data['NumberOfDependents'] = test_data['NumberOfDependents'].fillna(
    test_data['NumberOfDependents'].median()
)

检查处理结果，查看数据集中样本各字段属性的个数

In [None]:
# Re-inspect the training set's per-column non-null counts after imputation.
train_data.info()

In [None]:
# Re-inspect the test set's per-column non-null counts after imputation.
test_data.info()

可以看到目前不存在空值

## 3.4 分析相关性

In [None]:
# Heat map of the pairwise correlations between all training columns (target included).
feature_corr = train_data.corr()
sns.heatmap(feature_corr, cmap="coolwarm", annot=False)

#### 分析结果：
1. 观察上面图表可以发现，训练集各字段属性之间目前不存在较强的相关性。
2. 再分析第一行与第一列，即SeriousDlqin2yrs和其它属性的相关性，可以发现SeriousDlqin2yrs和几个逾期的天数（30-59,60-89,90以上）有较大的关系（颜色在0.4-0.6之间），而与其它属性关系不大。

# 4. 建模、预测、求解问题

## 4.1 分离数据

### 4.1.1  在训练集中分离出X_train和y_train, 用测试集定义X_test

In [None]:
# Positional split: column 0 becomes the target, every later column becomes a feature.
train_features = train_data.iloc[:, 1:]
train_target = train_data.iloc[:, 0]
test_features = test_data.iloc[:, 1:]

X_train = train_features.values
y_train = train_target.values
X_test = test_features.values

X_train.shape, y_train.shape, X_test.shape

### 4.1.2 标准化（StandardScaler：零均值、单位方差）

In [None]:
from sklearn import preprocessing

# Fit one StandardScaler per feature matrix, then report the learned statistics.
train_scaler = preprocessing.StandardScaler().fit(X_train)
test_scaler = preprocessing.StandardScaler().fit(X_test)


def _report_scaler(title, scaler):
    """Print the per-feature mean and scale learned by a fitted StandardScaler."""
    print(title)
    print('mean_:', '\n', scaler.mean_)
    print('scale_:', '\n', scaler.scale_)


_report_scaler('X_train:', train_scaler)

print('\n', '='*50, '\n')

_report_scaler('X_test:', test_scaler)

In [None]:
X_train_scaled = train_scaler.transform(X_train)
# Transform the test features with the scaler fitted on the TRAINING data. A scaler fitted
# on the test set would map the two matrices into different coordinate systems, so a model
# trained on X_train_scaled would see inconsistently scaled inputs at prediction time.
X_test_scaled = train_scaler.transform(X_test)

# Sanity check: the training columns have mean 0 / std 1 by construction; the test columns are
# only approximately standardized because they reuse the training statistics.
X_train_scaled.mean(axis=0), X_train_scaled.std(axis=0), X_test_scaled.mean(axis=0), X_test_scaled.std(axis=0)

## 4.2 划分训练集

从X_train, y_train划分出训练集X_learn, y_learn; 验证集 X_valid, y_valid

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the scaled training data for validation (fixed seed for a repeatable split).
split_parts = train_test_split(X_train_scaled, y_train, random_state=0)
X_learn, X_valid, y_learn, y_valid = split_parts
tuple(part.shape for part in split_parts)

## 4.3 建立模型

考虑三种常见模型，朴素贝叶斯、随机森林和LGBM

### 4.3.0 定义绘制ROC曲线的函数，导入必要的包

In [None]:
def draw_roc(FPR, TPR, label=None):
    """Plot a ROC curve on a new figure, with the chance diagonal for reference.

    Parameters
    ----------
    FPR : array-like
        False positive rates (x values).
    TPR : array-like
        True positive rates (y values).
    label : str, optional
        Legend entry for the curve. A legend is drawn only when a label is supplied;
        previously the label was attached to the line but never shown.
    """
    plt.figure(figsize=(8,6))
    plt.plot(FPR, TPR,'b', linewidth=2, label=label)
    # Dashed red diagonal = performance of a random classifier.
    plt.plot([0,1],[0,1], "r--")
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive rate")
    if label is not None:
        plt.legend()

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV

### 4.3.1 朴素贝叶斯

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian naive Bayes on the learning split and score it by ROC AUC on the validation split.
gaussian = GaussianNB().fit(X_learn, y_learn)
# Column 1 of predict_proba is the probability of the positive class.
y_pred = gaussian.predict_proba(X_valid)[:, 1]
score = roc_auc_score(y_valid, y_pred)
print(score)

In [None]:
# Plot the ROC curve for the naive Bayes validation predictions.
FPR_gaussian, TPR_gaussian, THRESH_gaussian = roc_curve(y_valid, y_pred)
draw_roc(FPR_gaussian, TPR_gaussian)

### 4.3.2 随机森林

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Cross-validated search over the number of trees, scored by ROC AUC.
# random_state is fixed so the forests (and therefore the selected n_estimators) are
# reproducible across Restart & Run All; without it the winner can change between runs.
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid={
        'n_estimators':[30,50,80,100,200]
    },
    scoring='roc_auc',
    verbose=3
)

grid.fit(X_learn, y_learn)
# Dump every cross-validation result array for inspection.
for result in grid.cv_results_:
    print(result, grid.cv_results_[result])
grid.best_params_['n_estimators']

In [None]:
# Refit a random forest with the best tree count and score it on the validation split.
# random_state is fixed so the reported AUC is reproducible from run to run.
rfc = RandomForestClassifier(n_estimators=grid.best_params_['n_estimators'], random_state=0)
rfc.fit(X_learn, y_learn)
# Column 1 of predict_proba is the probability of the positive class.
y_pred = rfc.predict_proba(X_valid)[:,1]
score = roc_auc_score(y_valid, y_pred)
print(score)

In [None]:
# Plot the ROC curve for the random forest validation predictions.
FPR_rf, TPR_rf, THRESH_rf = roc_curve(y_valid, y_pred)
draw_roc(FPR_rf, TPR_rf)

### 4.3.3 LGBM

In [None]:
import lightgbm as lgb

# Cross-validated search over the number of boosting rounds (40..59), scored by ROC AUC.
lgbm_param_grid = {'n_estimators': range(40,60)}
grid = GridSearchCV(
    estimator=lgb.LGBMClassifier(),
    param_grid=lgbm_param_grid,
    scoring='roc_auc',
    verbose=1,
)
grid.fit(X_learn, y_learn)

# Dump every cross-validation result array for inspection.
for key, values in grid.cv_results_.items():
    print(key, values)

In [None]:
# Refit LightGBM with the best round count and score it by ROC AUC on the validation split.
best_n_estimators = grid.best_params_['n_estimators']
clf = lgb.LGBMClassifier(n_estimators=best_n_estimators).fit(X_learn, y_learn)
# Column 1 of predict_proba is the probability of the positive class.
y_pred = clf.predict_proba(X_valid)[:, 1]
score = roc_auc_score(y_valid, y_pred)
print(score)

In [None]:
# Plot the ROC curve for the LightGBM validation predictions.
FPR_lgbm, TPR_lgbm, THRESH_lgbm = roc_curve(y_valid, y_pred)
draw_roc(FPR_lgbm, TPR_lgbm)

## 4.4 优化最优模型

对前面三个模型中表现最好的LGBM进行优化

In [None]:
# Joint cross-validated search over boosting rounds and learning rate, scored by ROC AUC.
tuning_grid = {
    'n_estimators': range(45,55),
    'learning_rate': [0.05,0.1,0.15,0.2,0.25,0.3],
}
grid = GridSearchCV(
    estimator=lgb.LGBMClassifier(),
    param_grid=tuning_grid,
    scoring='roc_auc',
    verbose=1,
)
grid.fit(X_learn, y_learn)

best_params = grid.best_params_
best_params['n_estimators'], best_params['learning_rate']

# 5. 提交

In [None]:
# Refit LightGBM with the tuned hyper-parameters on the full training set and predict the
# positive-class probability for every test row.
# NOTE(review): the grid search ran on standardized features while this final fit uses the
# raw matrices; train and test are at least treated the same way here — confirm intended.
clf = lgb.LGBMClassifier(n_estimators=grid.best_params_['n_estimators'],
                         learning_rate=grid.best_params_['learning_rate'])
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)[:,1]

# Use the same absolute input root as the data-loading cell so this cell does not depend on
# the current working directory.
sample = pd.read_csv('/kaggle/input/GiveMeSomeCredit/sampleEntry.csv')
# Fail with a clear message if the template and the predictions disagree in length.
assert len(sample) == len(y_pred), (
    f"sampleEntry has {len(sample)} rows but {len(y_pred)} predictions were produced"
)
sample['Probability'] = y_pred
sample.to_csv('./submit.csv',index=False)
# Read the file back to confirm what was actually written.
result = pd.read_csv('./submit.csv')
result