# 风控比赛

In [5]:
import pandas as pd
import catboost as cat
from sklearn.metrics import *
from sklearn.preprocessing import *

# Standard library
import os, sys

# Bundled datasets: put the directory two levels above the current working
# directory on sys.path before importing `dataset` (presumably where that
# module lives; note this depends on where the kernel was started).
datalib_path = os.path.join(os.path.abspath('.'), '../../')
sys.path.append(datalib_path)
import dataset

## Submission评估

In [6]:
# Answer scoring function, intended to mirror the competition's evaluation.
def score_answer(y_pred: "pd.DataFrame | pd.Series", public_size: int = 3000) -> None:
    """Print the public and private ROC-AUC of a submission.

    The ground truth is read from ``answer.csv`` under
    ``dataset.creditrisk_path``. The first ``public_size`` rows form the
    public split and the remaining rows form the private split.

    Args:
        y_pred: Predicted scores indexed exactly like ``answer.csv``. Either a
            DataFrame with a ``target`` column (what the notebook passes) or a
            plain Series of scores.
        public_size: Number of leading rows that belong to the public split.

    Raises:
        AssertionError: If the submission index does not match the answer
            index element-for-element and in the same order.
    """
    y_true = pd.read_csv(os.path.join(dataset.creditrisk_path,'answer.csv'), index_col=0)
    # The original annotation said Series, but the body needed a `target`
    # column; accept both shapes explicitly.
    scores = y_pred['target'] if isinstance(y_pred, pd.DataFrame) else y_pred
    # Index alignment. `Index.equals` returns False on a length mismatch
    # instead of raising an unrelated "Lengths must match" error.
    assert y_true.index.equals(scores.index), \
        'submission index must match answer.csv row-for-row'
    labels = y_true['target']
    # The indexes are identical and identically ordered, so positional slicing
    # selects the same rows on both sides.
    public_score = roc_auc_score(
        y_true=labels.iloc[:public_size],
        y_score=scores.iloc[:public_size]
    )
    private_score = roc_auc_score(
        y_true=labels.iloc[public_size:],
        y_score=scores.iloc[public_size:]
    )
    print('Public Score:', public_score, 'Private Score:', private_score)

# Sanity check: feed the ground truth back in as the "prediction".
answer_csv = os.path.join(dataset.creditrisk_path, 'answer.csv')
y_true = pd.read_csv(answer_csv, index_col=0)
score_answer(y_true)

Public Score: 1.0 Private Score: 1.0


## 加载数据

In [7]:
def _read_split(file_name):
    """Read one CSV from the credit-risk data directory, indexed by its ID column."""
    csv_path = os.path.join(dataset.creditrisk_path, file_name)
    return pd.read_csv(csv_path).set_index('ID')


train = _read_split('train.csv')
test = _read_split('test.csv')
print(train.shape, test.shape)
# Preview the first rows of both splits.
for frame in (train, test):
    print(frame.head())

(15000, 24) (6000, 23)
    target  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_1  PAY_2  PAY_3  \
ID                                                                          
0        0      50000    1          2         1   34      0      0      0   
1        0      10000    1          2         2   32      2      0      0   
2        0     110000    2          2         2   33      0      0      0   
3        0      80000    1          1         2   24      0      0      0   
4        0     450000    2          2         1   36     -1     -1     -1   

    PAY_4  ...  BILL_AMT3  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  \
ID         ...                                                         
0       0  ...      30609      30432      30254      30088      2008   
1       0  ...       9028       9644       9790       9990      1132   
2       0  ...      41240      42380        380      96736      4000   
3       0  ...      78589      78806      50663      50214      4400   
4    

## Baseline模型

In [8]:
# Quick baseline: let CatBoost work on the raw columns without any feature engineering.
excluded_columns = {'ID', 'target'}
features = [name for name in train.columns if name not in excluded_columns]

# A fixed random_state keeps every submission reproducible.
model = cat.CatBoostClassifier(random_state=123)
model.fit(train[features], train['target'], verbose=False)
positive_class_proba = model.predict_proba(test[features])
y_pred = positive_class_proba[:, 1]

# Build a fresh frame that carries only the test index plus the predicted score.
submission = pd.DataFrame({'target': y_pred}, index=test.index)
submission.head()

Unnamed: 0_level_0,target
ID,Unnamed: 1_level_1
15000,0.040305
15001,0.279795
15002,0.132787
15003,0.276708
15004,0.206277


In [9]:
# Evaluate the baseline submission.
score_answer(submission)

Public Score: 0.7741884705191268 Private Score: 0.781436480165474


## 特征工程

### 合并训练集/测试集

In [10]:
print(train.columns)
# Tag each split so the rows can be separated again after joint feature engineering.
for frame, flag in ((train, 1), (test, 0)):
    frame['is_train'] = flag
data = pd.concat([train, test], axis=0, sort=False)
# The merged frame must keep every row and introduce no extra columns.
n_rows, n_cols = data.shape
assert n_rows == (len(train) + len(test))
assert n_cols == train.shape[1]
data.head()

Index(['target', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_1',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'],
      dtype='object')


Unnamed: 0_level_0,target,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,is_train
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,50000,1,2,1,34,0,0,0,0,...,30432,30254,30088,2008,2000,1500,1501,2105,2219,1
1,0.0,10000,1,2,2,32,2,0,0,0,...,9644,9790,9990,1132,1384,1000,196,200,0,1
2,0.0,110000,2,2,2,33,0,0,0,0,...,42380,380,96736,4000,2760,3000,380,96736,3600,1
3,0.0,80000,1,1,2,24,0,0,0,0,...,78806,50663,50214,4400,50000,3034,1800,1817,1750,1
4,0.0,450000,2,2,1,36,-1,-1,-1,-1,...,9532,3420,130,2081,198,9532,3420,130,878,1


### 数值特征float化

In [11]:
# The downstream model is a GBDT, so no scaling/normalisation is applied.
bill_cols = [f'BILL_AMT{i}' for i in range(1, 7)]
pay_cols = [f'PAY_AMT{i}' for i in range(1, 7)]
# Same column order as before: limit, bills 1-6, payments 1-6, age.
num_features = ['LIMIT_BAL'] + bill_cols + pay_cols + ['AGE']
data[num_features] = data[num_features].astype(float)

# Bill and payment amounts relative to the credit limit (LIMIT_BAL).
limit_plus_one = data['LIMIT_BAL'] + 1.0
for col in bill_cols + pay_cols:
    data[col+'_RATIO'] = data[col] / limit_plus_one
    num_features.append(col+'_RATIO')

# Payment relative to the bill amount carrying the same month suffix.
for i in range(1, 7):
    denominator = data[f'BILL_AMT{i}'] + 1.0
    # `+ 1.0` only protects against a bill of exactly 0. A bill of -1 would
    # still divide by zero and silently produce +/-inf (or NaN for 0/0).
    # Mask those rows to NaN so they are handled as ordinary missing values.
    # NOTE(review): assumes negative bill amounts can occur - confirm on data.
    ratio = data[f'PAY_AMT{i}'] / denominator
    data[f'BILL_PAY_AMT_{i}_RATIO'] = ratio.where(denominator != 0)
    num_features.append(f'BILL_PAY_AMT_{i}_RATIO')

### 类型特征Count Encode

In [12]:
# Categorical features: cast to str so the codes are treated as labels.
base_cat_features = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_1','PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
cat_features = list(base_cat_features)
data[cat_features] = data[cat_features].astype(str)

from category_encoders import CountEncoder
# Count-encode each categorical column over the merged train+test frame.
# NOTE(review): the *_cnt columns are registered as categorical features, so
# the model receives them as labels rather than magnitudes; consider moving
# them to num_features if the counts themselves should be used.
for col in base_cat_features:
    data[col+'_cnt'] = CountEncoder().fit_transform(data[col])
    cat_features.append(col+'_cnt')

# On-time repayment flag: True when the status code is one of '0', '-1', '-2'
# (presumably the non-delinquent codes; verify against the data dictionary).
# The previous version negated this test, so the flag was True for the
# *other* codes, contradicting the column name.
for col in ['PAY_1','PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']:
    flag_name = col+'_ontime'
    data[flag_name] = data[col].isin(['0','-1','-2'])
    # num_features comes from an earlier cell; guard the append so re-running
    # this cell does not register the same feature twice.
    if flag_name not in num_features:
        num_features.append(flag_name)

## 训练模型

### 简单CatBoost

In [13]:
features = cat_features + num_features

# Split the merged frame back into its train and test parts.
train_rows = data[data.is_train == 1]
X_train = train_rows[features]
y_train = train_rows['target']
X_test = data[data.is_train == 0][features]

# A fixed random_state keeps every submission reproducible.
model = cat.CatBoostClassifier(random_state=123, cat_features=cat_features)
model.fit(X_train, y_train, verbose=False)
y_pred = model.predict_proba(X_test)[:, 1]

# Build a fresh frame that carries only the test index plus the predicted score.
submission = pd.DataFrame({'target': y_pred}, index=test.index)
submission.head()

Unnamed: 0_level_0,target
ID,Unnamed: 1_level_1
15000,0.036207
15001,0.374969
15002,0.128684
15003,0.310621
15004,0.167347


#### 评估模型

In [14]:
# Score the current `submission` frame with the evaluation helper.
score_answer(submission)

Public Score: 0.7738070937361423 Private Score: 0.7789971025017056


### CatBoost through KFold

In [15]:
features = cat_features + num_features

# Split the merged frame back into its train and test parts.
train_rows = data[data.is_train == 1]
X_train, y_train = train_rows[features], train_rows['target']
X_test = data[data.is_train == 0][features]

from sklearn.model_selection import KFold

# One prediction column per fold, indexed like the test set.
predictions = test[[]].copy()
splitter = KFold(6, shuffle=True, random_state=10)
# A fixed random_state keeps every submission reproducible.
model_params = dict(
    random_state=123,
    cat_features=cat_features,
    eval_metric='AUC',
)
i = 0
for fit_rows, val_rows in splitter.split(X_train):
    X_train1, y_train1 = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
    X_val1, y_val1 = X_train.iloc[val_rows], y_train.iloc[val_rows]

    model = cat.CatBoostClassifier(**model_params)
    # The held-out fold serves as the evaluation set for early stopping.
    model.fit(
        X_train1, y_train1,
        verbose=False,
        early_stopping_rounds=10,
        eval_set=[(X_val1, y_val1)],
    )
    y_pred = model.predict_proba(X_test)[:, 1]
    predictions[f'target_{i}'] = y_pred
    i += 1

predictions.head()

Unnamed: 0_level_0,target_0,target_1,target_2,target_3,target_4,target_5
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
15000,0.059558,0.051062,0.16324,0.072042,0.077706,0.072344
15001,0.362264,0.331987,0.389154,0.31197,0.342273,0.324608
15002,0.160803,0.157264,0.230637,0.170284,0.163638,0.19123
15003,0.388852,0.375552,0.340432,0.347471,0.419194,0.326624
15004,0.142489,0.144916,0.205683,0.132265,0.142552,0.171967


#### 准备submission

In [16]:
# Average the per-fold scores, then turn them into ranks divided by the row count.
fold_mean = predictions.mean(axis=1)
scaled_rank = fold_mean.rank() / len(predictions)
submission = test[[]].copy()
submission['target'] = scaled_rank
submission.head()

Unnamed: 0_level_0,target
ID,Unnamed: 1_level_1
15000,0.060167
15001,0.8295
15002,0.578667
15003,0.846
15004,0.478167


#### 评估模型

In [17]:
# Score the current `submission` frame with the evaluation helper.
score_answer(submission)

Public Score: 0.7824574998448257 Private Score: 0.7843634712042674
