In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, filename) for filename in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 主要分为4个步骤
## 1.题目理解
## 2.数据预处理及特征工程
## 3.模型构建及验证
## 4.结果提交

# 1.题目理解
本次数据集提供了25万名借款人的历史数据，训练集15万，测试集10万。
通过训练集训练，根据个人特征，预测某人在未来两年内遇到财务困境的可能性，从而决定是否授予贷款。
求每个信贷人违约的概率，评价指标为AUC。

### 导入常用包

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
# Suppresses every warning for the rest of the session, including any
# deprecation notices raised by the libraries imported above
warnings.filterwarnings("ignore")

# 2.数据预处理及特征工程

#### 数据读取

In [None]:
# Load the training and test splits from the Kaggle input directory
cs_training = pd.read_csv("/kaggle/input/GiveMeSomeCredit/cs-training.csv")
cs_test = pd.read_csv("/kaggle/input/GiveMeSomeCredit/cs-test.csv")

### 查看变量的数据结构

In [None]:
# Column names, non-null counts and dtypes of the training frame
cs_training.info()

### 查看变量的分位数等信息

In [None]:
# Summary statistics with extra tail percentiles, transposed so that each
# feature is a row; later cells read the '99%' column as an outlier cap
summary_percentiles = [0.01, 0.10, 0.25, 0.50, 0.75, 0.90, 0.99]
fea_fw = cs_training.describe(percentiles=summary_percentiles).transpose()
fea_fw

### 查看缺失值

In [None]:
# Fraction of missing values in every column
cs_training.isna().mean()

### 这是一个样本不平衡的问题，未来两年内出现信贷问题的用户占绝少数。

In [None]:
# Class balance of the target. The column is passed as the `x` keyword:
# newer seaborn releases treat the first positional argument as `data`,
# so a positional column name fails there.
sns.countplot(x='SeriousDlqin2yrs', data=cs_training)
plt.show()

In [None]:
# Shared implementation behind the two feature/label association plots below
def _plot_default_rate_by_group(fea_name, fea_cate):
    """Print the feature's null rate and plot the default rate of each group.

    Reads the module-level ``cs_training`` frame and draws one horizontal bar
    per group plus an extra ``'random'`` bar holding the overall default rate.

    Args:
        fea_name: Column of ``cs_training`` being analysed; also used as the
            axis label and in the title.
        fea_cate: Series named ``fea_name`` and aligned with ``cs_training``
            that gives the group (bucket or raw value) of every row. Rows
            whose group is NaN are left out of the per-group rates.
    """
    label = cs_training['SeriousDlqin2yrs']
    isnullrate = cs_training[fea_name].isnull().mean()
    print(fea_name, "null rate:", round(isnullrate, 3))

    # observed=True keeps only the buckets that actually contain rows, so an
    # empty categorical bucket can never produce a 0/0 rate
    df = (
        pd.concat([fea_cate, label], axis=1)
        .groupby(fea_name, observed=True)["SeriousDlqin2yrs"]
        .agg(['sum', 'count'])
        .reset_index()
    )
    new_fea_name = '%s_stas' % (fea_name)
    df[new_fea_name] = df['sum'] / df['count']

    # Overall default rate, shown as a reference bar next to the groups
    random_death = label.mean()

    x = [str(i) for i in df[fea_name].values]
    y = df[new_fea_name].tolist()
    x.append('random')
    y.append(random_death)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.barh(x, y)
    # Write the rounded rate just to the right of each bar
    for i, v in enumerate(y):
        ax.text(v + 0.01, i - 0.1, np.round(v, 3))
    ax.set_xlim(0, max(y) + 0.04)
    ax.set_xlabel('rate')
    ax.set_ylabel(fea_name)
    ax.set_title('the assocation between %s and SeriousDlqin2yrs' % (fea_name))
    plt.show()


# Association between the bucketed feature and the label
def fea_label_association(fea_name):
    """Plot the default rate per quantile bucket of ``cs_training[fea_name]``.

    The feature is cut into up to 10 quantile buckets with ``pd.qcut``;
    duplicate bucket edges are dropped, so heavily tied features end up with
    fewer buckets.

    Args:
        fea_name: Name of a numeric column of ``cs_training``.
    """
    fea_cate = pd.qcut(cs_training[fea_name], 10, duplicates='drop')
    _plot_default_rate_by_group(fea_name, fea_cate)


# Association between the raw (un-bucketed) feature and the label
def fea_label_association_null_bin(fea_name):
    """Plot the default rate for every distinct value of ``cs_training[fea_name]``.

    Args:
        fea_name: Name of a column of ``cs_training``; each distinct value
            becomes its own bar.
    """
    _plot_default_rate_by_group(fea_name, cs_training[fea_name])

### RevolvingUtilizationOfUnsecuredLines float：（信用卡余额 + 个人信用额度的已用余额）/ 信用总额度。从下图中可以看出，该值越大，未来两年出现经济困难的可能性也越大；该特征与标签成正比的关系，是一个比较好的特征。但也可以看出部分用户存在异常值，最大的值已经达到了50708。此时有两种处理方法：一种是分桶；另一种是去除异常值的用户，或者将异常值填充为99%的分位值（1.092956），这样就比较合理。

In [None]:
# Default rate per quantile bucket of the revolving-utilization feature;
# fea_name stays set for the outlier-capping cell that follows
fea_name = 'RevolvingUtilizationOfUnsecuredLines'
fea_label_association(fea_name)

In [None]:
# Cap outliers of the current feature (fea_name, set in the previous cell) at
# the 99th percentile taken from the training summary table `fea_fw`.
# The same training-derived cap is applied to the test split.
abnormal_num = fea_fw.loc[fea_name, '99%']
# Alternative (not used): drop the outlier rows instead of capping them
# cs_training = cs_training[cs_training[fea_name] <= abnormal_num]
# Series.clip is the vectorised equivalent of the former element-wise helper:
# values above the cap become the cap, everything else (NaN included) is kept
cs_training[fea_name] = cs_training[fea_name].clip(upper=abnormal_num)
cs_test[fea_name] = cs_test[fea_name].clip(upper=abnormal_num)

### Age int 该特征为借贷人年龄。年龄越小，承担风险的能力越小；年龄越大，承担风险的能力越大，即年龄与违约率成反比的关系，是一个比较好的特征

In [None]:
# Default rate per quantile bucket of the borrower's age
fea_name = 'age'
fea_label_association(fea_name)

In [None]:
# Box plot (left) and distribution (right) of age, in the top row of a 2x2 grid
fig = plt.figure(figsize=[10, 8])

ax_box = fig.add_subplot(221)
sns.boxplot(data=cs_training['age'], ax=ax_box)
ax_box.set_ylabel('age')

ax_dist = fig.add_subplot(222)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider sns.histplot(..., kde=True) when upgrading
sns.distplot(cs_training['age'], ax=ax_dist)
ax_dist.set_xlabel('age')

plt.show()

### NumberOfTime30-59DaysPastDueNotWorse int 近两年内借款人逾期30-59天的次数。该特征与标签成正比的关系：近两年内借款人逾期30-59天的次数越少，承担风险的能力就越大

In [None]:
# Default rate per quantile bucket of the 30-59-days-past-due count
fea_name = 'NumberOfTime30-59DaysPastDueNotWorse'
fea_label_association(fea_name)

### DebtRatio float 负债率=（每月偿还债务+赡养费+生活费用）/ 每月总收入
### 该特征与标签是整体成正比的关系，当负债率越高，接下来两年承担风险的能力就越小，也存在一些异常值，最大值高达了329664

In [None]:
# Default rate per quantile bucket of the debt ratio;
# fea_name stays set for the outlier-capping cell that follows
fea_name = 'DebtRatio'
fea_label_association(fea_name)

In [None]:
# Cap outliers of the current feature (fea_name, set in the previous cell) at
# the 99th percentile taken from the training summary table `fea_fw`.
# The same training-derived cap is applied to the test split.
abnormal_num = fea_fw.loc[fea_name, '99%']
# Alternative (not used): drop the outlier rows instead of capping them
# cs_training = cs_training[cs_training[fea_name] <= abnormal_num]
# Series.clip is the vectorised equivalent of the former element-wise helper:
# values above the cap become the cap, everything else (NaN included) is kept
cs_training[fea_name] = cs_training[fea_name].clip(upper=abnormal_num)
cs_test[fea_name] = cs_test[fea_name].clip(upper=abnormal_num)

### MonthlyIncome float 月收入
### 该特征与标签成反比的关系，收入越低，承担风险能力越低，收入越高，承担风险的能力越高
### 该特征最大值3008750，高收入人群也存在这个可能性，可以分别尝试是否需要去除离群值的效果比对

In [None]:
# Default rate per quantile bucket of monthly income
fea_name = 'MonthlyIncome'
fea_label_association(fea_name)

# MonthlyIncome has missing values; median imputation is one option.
# It is left disabled, so the column keeps its NaNs in the cells that follow.
#cs_training.MonthlyIncome.fillna(value=cs_training.MonthlyIncome.median(), inplace=True)

### NumberOfOpenCreditLinesAndLoans int 未偿还贷款笔数（诸如车贷或抵押贷款等分期贷款）和个人信用贷（如信用卡）
### 该特征与标签整体成正比的关系，未偿还贷款笔数越少，承担风险能力越高，也存在当贷款笔数为0-3的时候，是因为信贷次数比较少，还缺乏经验，不了解自身承担风险的能力

In [None]:
# Default rate per quantile bucket of the number of open credit lines and loans
fea_name = 'NumberOfOpenCreditLinesAndLoans'
fea_label_association(fea_name)

In [None]:
# Default rate of borrowers with exactly 0, 1 and 2 open credit lines/loans
for n_open_lines in (0, 1, 2):
    has_n_lines = cs_training['NumberOfOpenCreditLinesAndLoans'] == n_open_lines
    print(cs_training.loc[has_n_lines, 'SeriousDlqin2yrs'].mean())

### NumberOfTimes90DaysLate int 借款人逾期90天以上的次数
### 该特征与标签整体成正比的关系，借款人逾期90天以上的次数少，承担风险能力越高

In [None]:
# Default rate for every distinct value of the 90-days-late count (no bucketing)
fea_name = 'NumberOfTimes90DaysLate'
fea_label_association_null_bin(fea_name)

### NumberRealEstateLoansOrLines int 抵押贷款和房地产贷款数量，包括房屋净值信贷额度等
### 该特征与标签整体成正比的关系，抵押贷款和房地产贷款数量少，承担风险能力越高

In [None]:
# Default rate per quantile bucket of the number of real-estate loans or lines
fea_name = 'NumberRealEstateLoansOrLines'
fea_label_association(fea_name)

### NumberOfTime60-89DaysPastDueNotWorse int 借款人逾期 60-89天的次数
### 该特征与标签整体成正比的关系，借款人逾期 60-89天的次数少，承担风险能力越高

In [None]:
# Default rate for every distinct value of the 60-89-days-past-due count (no bucketing)
fea_name = 'NumberOfTime60-89DaysPastDueNotWorse'
fea_label_association_null_bin(fea_name)

### NumberOfDependents int 借贷人家庭人数
### 该特征与标签整体成正比的关系，借贷人家庭人数少，承担风险能力越高

In [None]:
# Default rate for every distinct value of the number of dependents (no bucketing)
fea_name = 'NumberOfDependents'
fea_label_association_null_bin(fea_name)

# NumberOfDependents has missing values; median imputation is one option.
# It is left disabled, so the column keeps its NaNs in the cells that follow.
# cs_training.NumberOfDependents.fillna(value=cs_training.NumberOfDependents.median(), inplace=True)

### 衍生变量

In [None]:
# Add the same two derived features to both splits
for frame in (cs_training, cs_test):
    # Total number of past-due events across the three delinquency columns
    frame['AllNumlate'] = (
        frame['NumberOfTime30-59DaysPastDueNotWorse']
        + frame['NumberOfTime60-89DaysPastDueNotWorse']
        + frame['NumberOfTimes90DaysLate']
    )
    # Debt ratio multiplied by income; NaN wherever MonthlyIncome is missing
    frame['Monthlypayment'] = frame['DebtRatio'] * frame['MonthlyIncome']

# 3.模型构建及验证

In [None]:
# Show every column together with its dtype
for col_name, col_dtype in cs_training.dtypes.items():
    print(col_name, col_dtype)

# Model inputs: every column except the row-id column and the target
excluded_columns = ('Unnamed: 0', 'SeriousDlqin2yrs')
columns = [col for col in cs_training.columns if col not in excluded_columns]
print(columns)

### 选择LGBMClassifier，速度快，准确率高，可解释性好

In [None]:
# Feature matrix and binary target used for cross-validation and the final fit
X = cs_training[columns]
Y = cs_training['SeriousDlqin2yrs']

# n_estimators is only an upper bound during cross-validation, where early
# stopping picks the actual number of boosting rounds.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` > 0 -- confirm whether row bagging was intended here.
clf = lgb.LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    colsample_bytree=0.95,
    subsample=0.95,
)
print(clf)

### 模型验证，选择最好的迭代次数

In [None]:
# 5-fold cross-validation with early stopping. Collects the train/validation
# AUC and the best iteration of every fold; the best iterations are used later
# to size the final model.
train_auc_list = []
test_auc_list = []
best_iter_list = []
kf = KFold(n_splits=5, shuffle=True, random_state=2021)
for fold_index, (train_index, test_index) in enumerate(kf.split(X)):
    print("*"*10, fold_index, "*"*10)
    # KFold yields row positions, not index labels, so slice with iloc; this
    # stays correct even if rows were filtered out and the index has gaps
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    # Early stopping and periodic logging are passed as callbacks; the
    # `early_stopping_rounds` / `verbose` keyword arguments of fit() were
    # removed in LightGBM 4. (lgb.log_evaluation needs LightGBM >= 3.3.)
    clf.fit(
        X_train, Y_train,
        eval_set=[(X_train, Y_train), (X_test, Y_test)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=35),
            lgb.log_evaluation(period=100),
        ],
    )
    print("best_iter:", clf.best_iteration_)
    train_pre = clf.predict_proba(X_train)[:, 1]
    test_pre = clf.predict_proba(X_test)[:, 1]
    train_auc = roc_auc_score(Y_train, train_pre)
    test_auc = roc_auc_score(Y_test, test_pre)
    print("train_auc:%.4f test_auc:%.4f" % (train_auc, test_auc))
    best_iter_list.append(clf.best_iteration_)
    # Keep the train AUC too so the train/validation gap can be inspected
    train_auc_list.append(train_auc)
    test_auc_list.append(test_auc)
    # Feature importances of this fold's model, most important first
    col_importance = pd.DataFrame(
        {'columns': columns, 'importance': clf.feature_importances_}
    ).sort_values(by='importance', ascending=False)
    print(col_importance)

### 模型训练好之后，还能看到特征重要性排名，排名靠前的特征主要是age,RevolvingUtilizationOfUnsecuredLines,MonthlyIncome,DebtRatio等，与前文数据分析，特征与标签的关系非常符合

In [None]:
# Per-fold validation AUCs followed by their mean
print(np.round(test_auc_list,4), np.round(np.mean(test_auc_list),4))
# Best iteration of every fold followed by their mean
print(best_iter_list, np.mean(best_iter_list)) # 0.8657 -- NOTE(review): presumably the mean validation AUC of an earlier run; confirm

# 4.结果提交

In [None]:
# Refit on the full training set. The hyper-parameters are copied from the
# cross-validated classifier rather than typed out again, so the final model
# cannot drift from the configuration that was validated. Only n_estimators
# changes: it becomes the mean best iteration found during cross-validation.
final_params = clf.get_params()
final_params['n_estimators'] = max(1, int(np.mean(best_iter_list)))
clf = lgb.LGBMClassifier(**final_params)
clf.fit(X, Y)
# Predicted probability of the positive class for every test row
test_pre = clf.predict_proba(cs_test[columns])[:, 1]

In [None]:
# Submission file: the row id column of the test set plus the predicted probability
res = (
    cs_test[['Unnamed: 0']]
    .rename(columns={'Unnamed: 0': 'Id'})
    .assign(Probability=test_pre)
)
res.to_csv("submission.csv", index=False)