In [1]:
import os, sys
from pathlib import Path

BASE_DIR = Path(Path.home(), "workspace", "services", "credit_model")
DATA_DIR = Path(Path.home(), "workspace", "data")
if BASE_DIR not in sys.path:
    sys.path.insert(0, f"{BASE_DIR}")

In [2]:
from pprint import pprint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import toad

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from util import woe_helper
from util import woe_mono
from util import stable_selection

In [5]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [6]:
fp_data = Path(DATA_DIR, 'tutorial', 'data.csv')
df_data = pd.read_csv(fp_data, index_col=None)

In [7]:
num_cols = ['Collateral_valuation', 'Age', 'Properties_Total', 'Amount', 'Term', 'Historic_Loans', 'Current_Loans', 'Max_Arrears']
cat_cols = ['Region', 'Area', 'Activity', 'Guarantor', 'Collateral', 'Properties_Status']
features = num_cols + cat_cols
label = 'Defaulter'

In [8]:
for c in cat_cols:
    df_data.loc[:, c] = df_data.loc[:, c].apply(lambda x: str(f"cat_{x}"))

In [9]:
woe = woe_helper.WOE()
woe.fit(df_data, label, exclude=['AppNo', label], method='dt')

100%|██████████| 14/14 [00:00<00:00, 54.57it/s]


finish combiner fit
finish combiner transform


100%|██████████| 14/14 [00:00<00:00, 55.59it/s]

finish WOE fit





In [10]:
df_woe = woe.transform(df_data)

100%|██████████| 14/14 [00:00<00:00, 110.92it/s]
100%|██████████| 14/14 [00:00<00:00, 111.38it/s]


In [11]:
np.logspace(-5, 2, 10)

array([1.00000000e-05, 5.99484250e-05, 3.59381366e-04, 2.15443469e-03,
       1.29154967e-02, 7.74263683e-02, 4.64158883e-01, 2.78255940e+00,
       1.66810054e+01, 1.00000000e+02])

# 稳定性筛选

In [12]:
selector = stable_selection.select(df_woe[features], df_woe[label])

In [13]:
df_fs_stable = pd.DataFrame({
    'feature': features,
    'stable_select': selector.get_support(),
    'stable_score': selector.stability_scores_.max(axis=1)})
df_fs_stable

Unnamed: 0,feature,stable_select,stable_score
0,Collateral_valuation,True,1.0
1,Age,True,1.0
2,Properties_Total,True,1.0
3,Amount,True,1.0
4,Term,True,1.0
5,Historic_Loans,True,1.0
6,Current_Loans,True,1.0
7,Max_Arrears,True,1.0
8,Region,True,1.0
9,Area,True,1.0


# VIF筛选

In [14]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [15]:
lst_item = list()
for k, c in enumerate(features):
    vif = variance_inflation_factor(df_woe[features], k)
    item = {'feature': c, 'vif_score': vif}
    lst_item.append(item)

In [16]:
df_fs_vif = pd.DataFrame(lst_item)
df_fs_vif

Unnamed: 0,feature,vif_score
0,Collateral_valuation,2.398628
1,Age,1.159514
2,Properties_Total,1.059383
3,Amount,1.707269
4,Term,1.263871
5,Historic_Loans,1.169254
6,Current_Loans,1.066556
7,Max_Arrears,1.020378
8,Region,1.690797
9,Area,1.229582


# 前后向筛选

In [17]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

In [18]:
estimator = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
selector = RFE(estimator)
selector = selector.fit(df_woe[features], df_woe[label])

In [19]:
df_fs_rfe = pd.DataFrame({'feature' : features, "rfe_select" : selector.support_})
df_fs_rfe

Unnamed: 0,feature,rfe_select
0,Collateral_valuation,False
1,Age,True
2,Properties_Total,True
3,Amount,True
4,Term,True
5,Historic_Loans,True
6,Current_Loans,False
7,Max_Arrears,False
8,Region,True
9,Area,False


In [20]:
from boruta import BorutaPy

In [21]:
estimator = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
selector = BorutaPy(estimator, n_estimators='auto', random_state=1, max_iter=10)
selector = selector.fit(df_woe[features].values, df_woe[label].values)

In [22]:
df_fs_bruta = pd.DataFrame({'feature' : features, "bruta_select" : selector.support_})
df_fs_bruta

Unnamed: 0,feature,bruta_select
0,Collateral_valuation,True
1,Age,True
2,Properties_Total,True
3,Amount,True
4,Term,True
5,Historic_Loans,True
6,Current_Loans,True
7,Max_Arrears,True
8,Region,True
9,Area,True


# 最大相关性最小冗余筛选

In [23]:
from skfeature.function.information_theoretical_based import MRMR

In [24]:
rank = MRMR.mrmr(df_woe[features].values, df_woe[label].values)

In [25]:
df_fs_mrmr = pd.DataFrame({'feature' : features, "mrmr_select" : rank})
df_fs_mrmr.sort_values(by='mrmr_select')

Unnamed: 0,feature,mrmr_select
9,Area,0
13,Properties_Status,1
4,Term,2
8,Region,3
11,Guarantor,4
0,Collateral_valuation,5
5,Historic_Loans,6
7,Max_Arrears,7
10,Activity,8
6,Current_Loans,9


# 结合所有方法

In [26]:
df_fs = None
for df in [df_fs_stable, df_fs_vif, df_fs_rfe, df_fs_bruta, df_fs_mrmr]:
    if df_fs is None:
        df_fs = df
    else:
        df_fs = df_fs.merge(df, on='feature', how='left')
df_fs

Unnamed: 0,feature,stable_select,stable_score,vif_score,rfe_select,bruta_select,mrmr_select
0,Collateral_valuation,True,1.0,2.398628,False,True,5
1,Age,True,1.0,1.159514,True,True,11
2,Properties_Total,True,1.0,1.059383,True,True,12
3,Amount,True,1.0,1.707269,True,True,10
4,Term,True,1.0,1.263871,True,True,2
5,Historic_Loans,True,1.0,1.169254,True,True,6
6,Current_Loans,True,1.0,1.066556,False,True,9
7,Max_Arrears,True,1.0,1.020378,False,True,7
8,Region,True,1.0,1.690797,True,True,3
9,Area,True,1.0,1.229582,False,True,0
