In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 六步流程

1. 定义问题
2. 获取训练数据和测试数据
3. 整理、准备、清洗数据
4. 建模、预测、求解问题
5. 优化
6. 提交

# 1. 定义问题

Give Me Some Credit 是一个关于信用评分的项目，用于预测未来两年内借款人是否会遇到财务困境。
在现代社会中，为了市场和社会的运作，个人和公司有时需要获得信贷，而银行是发放贷款的正规机构。银行在发放贷款前会对借款人进行综合评估，此时即需要一个信用评分算法来估计借款人的违约概率。本项目即对借款人的各项信息进行综合分析，预测未来两年内借款人是否会遇到财务困境，从而帮助银行决定是否发放贷款。

# 2. 获取训练数据和测试数据

## 2.1 导包

In [None]:
from  sklearn.ensemble import RandomForestRegressor
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import re 

## 2.2 导入训练集和测试集

In [None]:
# Load the training and test sets. index_col=0 makes the first CSV column the
# row index, so it is not kept as a feature column.
train_data = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-training.csv',index_col=0)
test_data = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-test.csv',index_col=0)

In [None]:
# Inspect the training set: column names, dtypes and non-null counts.
train_data.info()

In [None]:
# Inspect the test set: column names, dtypes and non-null counts.
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
test_data.info()

# 3. 整理、准备、清洗数据

## 3.1 分析数据

查看数据集包含哪些特征

In [None]:
# List the column names of the training set.
print(train_data.columns.values)

In [None]:
# List the column names of the test set, for comparison with the training set.
print(test_data.columns.values)

可以看到测试集与训练集中数据的特征属性完全一致

| 字段名 | 定义                                 | 值                                            |
|----------|--------------------------------------------|------------------------------------------------|
| SeriousDlqin2yrs | 个人经历了超过90天的拖欠或者更糟的情况            | Y/N                                |
| RevolvingUtilizationOfUnsecuredLines   | 信用卡和个人信贷余额的总余额，减去房地产和没有分期付款的债务（如汽车贷款），除以信用总和                               | percentage                      |
| age | 年龄 | integer |
| NumberOfTime30-59DaysPastDueNotWorse      | 借款人逾期30-59天的次数，但在过去的两年内没有更差的信用记录     |    integer        | 
| DebtRatio      | 每月债务支付、赡养费和生活费费用之和除以月总收入 | percentage |
| MonthlyIncome    | 月收入  |   real  |
| NumberOfOpenCreditLinesAndLoans    | 开放贷款的数量和信用额度 | integer  |
| NumberOfTimes90DaysLate   | 借款人逾期90天及以上的次数 | integer |
| NumberRealEstateLoansOrLines     | 抵押贷款和房地产贷款的数量 | integer |
| NumberOfTime60-89DaysPastDueNotWorse    |借款人逾期60-89天的次数，但在过去的两年内没有更差的信用记录     |    integer        | 
| NumberOfDependents | 不包括自己在内的家属人数 | integer |

## 3.2 检查数据

### 3.2.1 检查训练集中是否有重复值

In [None]:
# Count how many training rows are flagged as duplicates (True) versus not (False).
train_data.duplicated().value_counts()

False表示某样本第一次出现，而True表示某样本之前已经出现，说明训练集中存在重复样本

### 3.2.2 检查是否有离群值（异常值，噪声等）、缺失值等

In [None]:
# Summary statistics (count, mean, min, quartiles, max) for every training column;
# used to spot implausible values and columns whose count is below the row total.
train_data.describe()

使用describe()函数，查看训练集的缺失值、均值和中位数等。观察可以发现：
1. 训练集train_data中年龄age的最小值min为0，是异常值，后续将删去该条样本；
2. 变量MonthlyIncome和NumberOfDependents存在缺失值，因为样本总数是150000，而变量MonthlyIncome数量为120269，变量NumberOfDependents数量为146076。所以变量MonthlyIncome共有缺失值29731个，NumberOfDependents共有缺失值3924个。

## 3.3 处理数据

### 3.3.1 删去训练集中的重复值

In [None]:
# Remove duplicated rows from the training set (drop_duplicates with its default settings).
train_data = train_data.drop_duplicates()
# Show the per-column non-null counts of the de-duplicated frame.
train_data.count()

可以看到，删去重复值之后还剩149391条样本数据

### 3.3.2 处理异常值

In [None]:
# An age of 0 appears in the training set, which is not plausible; drop those rows.
# .copy() makes the filtered result an independent frame, so the column
# assignments performed on train_data afterwards do not raise
# SettingWithCopyWarning or act on a temporary slice.
train_data = train_data[train_data['age'] > 0].copy()

### 3.3.3 处理缺失值

用平均值填充月收入MonthlyIncome中的缺失值

In [None]:
# Fill the missing MonthlyIncome entries with the mean of the observed values.
mean_income = train_data['MonthlyIncome'].mean()
train_data['MonthlyIncome'] = train_data['MonthlyIncome'].replace(np.nan, mean_income)

用中位数填充家属人数NumberOfDependents中的缺失值

In [None]:
# Fill the missing NumberOfDependents entries with the column median.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which pandas deprecates and which has
# no effect on the parent frame when Copy-on-Write is enabled.
train_data['NumberOfDependents'] = train_data['NumberOfDependents'].fillna(
    train_data['NumberOfDependents'].median()
)

检查处理结果，查看数据集中样本各字段属性的个数

In [None]:
# Re-check the non-null counts after the missing-value handling.
train_data.info()

可以看到目前不存在空值

## 3.4 分析相关性

In [None]:
# Pairwise correlation of all training columns, drawn as an annotated heatmap.
plt.figure(figsize=(19, 15))
corr_train = train_data.corr()
sns.heatmap(data=corr_train, annot=True, fmt='.2g')

观察图表可以发现，训练集的'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfTimes90DaysLate'(逾期30-59天，60-89天，90天以上) 三个字段存在极大的相关性

In [None]:
# Box plots of the three past-due count columns (30-59 days, 60-89 days,
# 90+ days) of the training set.
plt.figure(figsize=(19, 12))
train_data.boxplot(column=[
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
])
plt.show()

观察可以发现，训练集的三个字段都分别存在两个离群点，大约在90至100之间，下面用各字段的中位数替换这两个离群点。

In [None]:
# Replace the outlying values (between 90 and 100) found in the three past-due
# count columns with the column median.
def replace90to100(column, low=90, high=100):
    """Return the values of ``column`` with entries in [low, high] replaced by the median.

    Parameters
    ----------
    column : pandas.Series
        Numeric column to clean.
    low, high : number, optional
        Inclusive bounds of the range treated as outliers. The defaults
        (90 and 100) reproduce the original hard-coded behaviour.

    Returns
    -------
    list
        One value per element of ``column``, in the same order. Entries that
        fall inside the range are replaced by ``column.median()``; all other
        entries (including NaN) are left as they are.
    """
    # The median is taken over the whole column, outliers included.
    newval = column.median()
    # Vectorised range test instead of a Python-level loop over every row;
    # NaN never satisfies the test, so missing values pass through untouched.
    in_range = column.between(low, high)
    return column.mask(in_range, newval).tolist()

# Apply the outlier replacement to each of the three past-due count columns.
for past_due_col in ('NumberOfTime30-59DaysPastDueNotWorse',
                     'NumberOfTime60-89DaysPastDueNotWorse',
                     'NumberOfTimes90DaysLate'):
    train_data[past_due_col] = replace90to100(train_data[past_due_col])

In [None]:
# Recompute the pairwise correlations of the training columns and redraw the
# annotated heatmap.
plt.figure(figsize=(19, 15))
corr_train = train_data.corr()
sns.heatmap(data=corr_train, annot=True, fmt='.2g')

观察上面图表可以发现，训练集的'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfTimes90DaysLate'(逾期30-59天，60-89天，90天以上) 三个字段之间由离群点造成的异常高的相关性已经基本消除

# 4. 建模、预测、求解问题

## 4.1 分离数据

### 4.1.1  在训练集中分离出X_train和y_train, 用测试集定义X_test

In [None]:
# Split the cleaned training frame by position: column 0 is taken as the target,
# all remaining columns as the feature matrix.
X_train = train_data.iloc[:,1:].values
y_train = train_data.iloc[:,0].values
# The test frame is sliced the same way (its first column is dropped).
# NOTE(review): test_data did not go through the missing-value handling applied
# to train_data, so X_test may still contain NaN -- confirm that every
# downstream consumer tolerates that.
X_test = test_data.iloc[:,1:].values

# Display the shapes of the three arrays as the cell output.
X_train.shape, y_train.shape, X_test.shape 

### 4.1.2 标准化

In [None]:
from sklearn import preprocessing

# Fit one StandardScaler on each feature matrix.
# NOTE(review): a scaler fitted on the test matrix learns different statistics
# from the training one; if it is used to transform X_test, train and test
# features end up on different scales. The usual practice is to reuse the
# scaler fitted on the training data.
train_scaler = preprocessing.StandardScaler().fit(X_train)
test_scaler = preprocessing.StandardScaler().fit(X_test)

# Report the per-feature mean and scale learned by each scaler.
print('X_train:')
print('mean_:', '\n', train_scaler.mean_)
print('scale_:', '\n', train_scaler.scale_)

print('\n', '='*50, '\n')

print('X_test:')
print('mean_:', '\n', test_scaler.mean_)
print('scale_:', '\n', test_scaler.scale_)

In [None]:
# Standardise both matrices with the statistics learned from the training data.
# The test matrix must go through the same transformation as the training
# matrix; scaling it with a scaler fitted on the test set would map identical
# raw values to different scaled values in train and test.
X_train_scaled = train_scaler.transform(X_train)
X_test_scaled = train_scaler.transform(X_test)

# Sanity check: the training columns have mean 0 and std 1 by construction; the
# test columns are only approximately so, because they use the training statistics.
X_train_scaled.mean(axis=0), X_train_scaled.std(axis=0), X_test_scaled.mean(axis=0), X_test_scaled.std(axis=0)

## 4.2 划分训练集
从X_train_scaled, y_train划分出子训练集X_learn, y_learn; 子验证集 X_valid, y_valid

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the scaled training data as a validation subset;
# random_state=0 is passed so the split is repeatable.
# NOTE(review): no stratify= argument is passed, so the class ratio of the two
# subsets is not enforced -- confirm this is acceptable if the target is imbalanced.
X_learn, X_valid, y_learn, y_valid = train_test_split(X_train_scaled, y_train, random_state=0)
# Display the shapes of the four resulting arrays.
X_learn.shape, X_valid.shape, y_learn.shape, y_valid.shape

In [None]:
from sklearn.metrics import roc_auc_score #使用roc_auc 作为 metric
from sklearn.model_selection import GridSearchCV

## 4.3 建立模型

### 4.3.1 高斯（朴素贝叶斯）

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier on the learning subset.
gaussian = GaussianNB().fit(X_learn, y_learn)

# Evaluate on the validation subset: ROC AUC of the column-1 probabilities.
y_pred = gaussian.predict_proba(X_valid)[:, 1]
score = roc_auc_score(y_valid, y_pred)
print(score)

### 4.3.2 逻辑回归

In [None]:
# Logistic regression: try several solvers and keep the one with the best
# validation ROC AUC.
from sklearn.linear_model import LogisticRegressionCV

arg, maxauc = 'none', 0
for s in ('newton-cg', 'lbfgs', 'liblinear'):
    model = LogisticRegressionCV(solver=s, scoring='roc_auc').fit(X_learn, y_learn)

    # Evaluate on the validation subset.
    y_pred = model.predict_proba(X_valid)[:, 1]
    score = roc_auc_score(y_valid, y_pred)
    print(s, score)

    # Remember the best solver seen so far (strict improvement only).
    if maxauc < score:
        arg, maxauc = s, score
print()
print(arg, maxauc)

### 4.3.3 决策树

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Grid-search the split criterion of a decision tree, scoring candidates by ROC AUC.
grid = GridSearchCV(
    DecisionTreeClassifier(),
    {'criterion': ['gini', 'entropy']},
    scoring='roc_auc',
    verbose=3,
)
grid.fit(X_learn, y_learn)

# Print every entry of the cross-validation report.
for key, values in grid.cv_results_.items():
    print(key, values)

In [None]:
# Train a decision tree with the criterion selected by the grid search.
best_criterion = grid.best_params_['criterion']
tree = DecisionTreeClassifier(criterion=best_criterion).fit(X_learn, y_learn)

# Evaluate on the validation subset.
y_pred = tree.predict_proba(X_valid)[:, 1]
score = roc_auc_score(y_valid, y_pred)
print(score)

决策树的得分比较低

### 4.3.4 随机森林

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Grid-search the number of trees of a random forest, scoring candidates by ROC AUC.
grid = GridSearchCV(
    RandomForestClassifier(),
    {'n_estimators': [30, 50, 80, 100, 200]},
    scoring='roc_auc',
    verbose=3,
)
grid.fit(X_learn, y_learn)

# Print every entry of the cross-validation report.
for key, values in grid.cv_results_.items():
    print(key, values)

In [None]:
# Train a random forest with the tree count selected by the grid search.
best_n_trees = grid.best_params_['n_estimators']
rfc = RandomForestClassifier(n_estimators=best_n_trees).fit(X_learn, y_learn)

# Evaluate on the validation subset.
y_pred = rfc.predict_proba(X_valid)[:, 1]
score = roc_auc_score(y_valid, y_pred)
print(score)

### 4.3.5 GBDT之LightGBM

In [None]:
import lightgbm as lgb

# Grid-search the number of boosting rounds (40..59) of a LightGBM classifier,
# scoring candidates by ROC AUC.
grid = GridSearchCV(
    lgb.LGBMClassifier(),
    {'n_estimators': range(40, 60)},
    scoring='roc_auc',
    verbose=1,
)
grid.fit(X_learn, y_learn)

# Print every entry of the cross-validation report.
for key, values in grid.cv_results_.items():
    print(key, values)

In [None]:
# Train a LightGBM classifier with the round count selected by the grid search.
best_n_rounds = grid.best_params_['n_estimators']
clf = lgb.LGBMClassifier(n_estimators=best_n_rounds).fit(X_learn, y_learn)

# Evaluate on the validation subset.
y_pred = clf.predict_proba(X_valid)[:, 1]
score = roc_auc_score(y_valid, y_pred)
print(score)

# 5. 优化

In [None]:
# Joint grid search over the number of boosting rounds (45..54) and the learning
# rate of a LightGBM classifier, scored by ROC AUC.
lgb_grid = GridSearchCV(
    lgb.LGBMClassifier(),
    {
        'n_estimators': range(45, 55),
        'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
    },
    scoring='roc_auc',
    verbose=1,
)
lgb_grid.fit(X_learn, y_learn)

# Show the selected (n_estimators, learning_rate) pair as the cell output.
lgb_grid.best_params_['n_estimators'], lgb_grid.best_params_['learning_rate']

In [None]:
# Hyper-parameter tuning: refine n_estimators with 10-fold stratified
# cross-validation, keeping the learning rate found by lgb_grid.

from sklearn.model_selection import GridSearchCV, StratifiedKFold
LGB = lgb.LGBMClassifier()

## Search grid for optimal parameters
# The candidates run from 1 up to AND INCLUDING the best value found by
# lgb_grid. The previous range(best) started at 0 (a model with no boosting
# rounds) and stopped one short of the value it was meant to refine.
lgb_param_grid = {
    'n_estimators':range(1, lgb_grid.best_params_['n_estimators'] + 1),
    'learning_rate':[lgb_grid.best_params_['learning_rate']]
}

# Cross validate model with Kfold stratified cross val
kfold = StratifiedKFold(n_splits=10)

# Candidates are ranked by ROC AUC, the metric used by every other search in
# this notebook and by the validation check below. Accuracy is a poor selection
# criterion when one class dominates, because near-constant predictors tie at
# the majority-class rate.
gsLGB = GridSearchCV(LGB,param_grid = lgb_param_grid, cv=kfold, scoring="roc_auc", verbose=1)

gsLGB.fit(X_learn, y_learn)

# Evaluate the refitted best model on the validation subset.
y_pred_gslgb = gsLGB.predict_proba(X_valid)[:,1]
score = roc_auc_score(y_valid, y_pred_gslgb)
print(score)

# 6. 提交

In [None]:
# Re-run the grid search on the full training arrays (X_train, y_train) and
# predict on the test array X_test.
# NOTE(review): X_train / X_test are the unscaled arrays, whereas the earlier fit
# used X_learn, which was split from X_train_scaled -- confirm the switch is intentional.
gsLGB.fit(X_train, y_train)
# Column 1 of predict_proba is kept as the submitted probability
# (presumably the positive class -- verify against gsLGB.classes_).
y_pred_gsLGB = gsLGB.predict_proba(X_test)[:,1]

In [None]:
# Write the submission: load the sample entry file, overwrite its Probability
# column with the predictions, and save it to the working directory.
# The sample file is read through the same absolute /kaggle/input/ path as the
# training and test CSVs, so the cell does not depend on the current working
# directory the way the relative '../input/...' path did.
sample = pd.read_csv('/kaggle/input/GiveMeSomeCredit/sampleEntry.csv')
sample['Probability'] = y_pred_gsLGB
sample.to_csv('./my_submit.csv', index = False)