In [1]:
import os, sys
from pathlib import Path

# Root of the local credit_model checkout; it is put on sys.path below so that
# project modules can be imported (the notebook later imports a `util` package,
# presumably from this directory).
BASE_DIR = Path(Path.home(), "workspace", "services", "credit_model")
# Directory holding the input data files read by this notebook.
DATA_DIR = Path(Path.home(), "workspace", "data")
# sys.path entries are plain strings and a Path never compares equal to a str,
# so test the string form; otherwise the guard is always true and every re-run
# of this cell inserts another duplicate entry.
if str(BASE_DIR) not in sys.path:
    sys.path.insert(0, str(BASE_DIR))

In [2]:
from pprint import pprint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import toad

In [3]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from util import woe_helper
from util import woe_mono
from util import scorebin_helper
from util import report_helper

In [5]:
# Relax pandas' display limits so wide report tables are shown in full:
# no cap on the number of columns, up to 1000 rows, and a 1000-character width.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 1000),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# 加载数据

In [6]:
# Location of the tutorial dataset underneath the shared data directory.
fp_data = DATA_DIR / 'tutorial' / 'data.csv'
# Read the CSV without promoting any column to the index (index_col=None),
# so the frame gets a default positional index.
df_data = pd.read_csv(fp_data, index_col=None)

In [7]:
# Columns treated as numeric predictors.
num_cols = [
    'Collateral_valuation',
    'Age',
    'Properties_Total',
    'Amount',
    'Term',
    'Historic_Loans',
    'Current_Loans',
    'Max_Arrears',
]
# Columns treated as categorical predictors.
cat_cols = [
    'Region',
    'Area',
    'Activity',
    'Guarantor',
    'Collateral',
    'Properties_Status',
]
# Full predictor list: numeric columns first, then categorical ones.
features = [*num_cols, *cat_cols]
# Name of the target column.
label = 'Defaulter'

In [8]:
# Turn every categorical value into a string tagged with a "cat_" prefix.
# Missing values are stringified as well (NaN becomes the literal "cat_nan").
# NOTE: this mutates df_data and is not idempotent -- re-running the cell on an
# already converted frame yields "cat_cat_..." values; reload df_data first.
for c in cat_cols:
    # Replace the whole column rather than writing through .loc[:, c]: .loc
    # sets values into the existing array, which warns (and fails on newer
    # pandas) when the column's original dtype, e.g. int or float, cannot hold
    # strings. Plain column assignment swaps in the new object column.
    df_data[c] = df_data[c].apply(lambda x: f"cat_{x}")

# 获得分箱和编码

In [9]:
# Create the project's WOE helper and fit it on the full frame against the
# label column. 'AppNo' and the label itself are passed as `exclude`.
# NOTE(review): method='dt' presumably selects decision-tree binning and
# 'AppNo' looks like an application identifier -- confirm in util.woe_helper.
woe = woe_helper.WOE()
woe.fit(df_data, label, exclude=['AppNo', label], method='dt')

100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 54.80it/s]


finish combiner fit
finish combiner transform


100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 59.20it/s]

finish WOE fit





In [10]:
# Run the fitted helper over the same frame twice: once with default settings
# and once with bin_only=True.
# NOTE(review): judging by the names and by how the results are used below,
# df_woe holds WOE-encoded values and df_bin holds bin labels such as
# "00.[-inf ~ 27.5)" -- confirm in util.woe_helper.
df_woe = woe.transform(df_data)
df_bin = woe.transform(df_data, bin_only=True)

100%|█████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 111.14it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 124.95it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 118.02it/s]


# 特征统计报表和区分度报表

In [11]:
# Build the feature statistics report from the binned frame.
# NOTE(review): the merged table shown further down suggests one row per
# (feature, bin) with total / bad / bad_rate columns -- confirm in report_helper.
df_report = report_helper.FTReport.get_report(df_bin, features, label)

In [12]:
# Compute evaluation metrics for the features from the WOE-transformed frame.
# NOTE(review): the merged table shown further down carries iv / auc / ks
# columns that presumably originate here -- confirm in report_helper.
df_eval = report_helper.FTReport.eval_metrics(df_woe, features, label)

In [13]:
# Attach the evaluation metrics to the statistics report by joining on the
# 'feature' column. A left join keeps every row of df_report, including rows
# that have no match in df_eval.
df_combined_report = pd.merge(df_report, df_eval, how='left', on='feature')
# Preview at most the first 100 rows of the combined report.
df_combined_report.head(100)

Unnamed: 0,feature,bin,total_pct,total,bad,bad_rate,iv,auc,ks
0,Collateral_valuation,00.[-inf ~ 2057.0),0.15534,7767.0,657.0,0.084589,0.157705,0.602955,0.1611
1,Collateral_valuation,01.[2057.0 ~ 4571.0),0.1719,8595.0,988.0,0.114951,0.157705,0.602955,0.1611
2,Collateral_valuation,02.[4571.0 ~ 8324.5),0.11508,5754.0,884.0,0.153632,0.157705,0.602955,0.1611
3,Collateral_valuation,03.[8324.5 ~ 13823.5),0.06604,3302.0,604.0,0.182919,0.157705,0.602955,0.1611
4,Collateral_valuation,04.[13823.5 ~ inf),0.0609,3045.0,891.0,0.292611,0.157705,0.602955,0.1611
5,Collateral_valuation,05.nan,0.43074,21537.0,4195.0,0.194781,0.157705,0.602955,0.1611
6,,,,,,,,,
7,Age,00.[-inf ~ 27.5),0.1046,5230.0,1316.0,0.251625,0.096882,0.58667,0.133332
8,Age,01.[27.5 ~ 34.5),0.18576,9288.0,1940.0,0.208872,0.096882,0.58667,0.133332
9,Age,02.[34.5 ~ 41.5),0.2174,10870.0,1833.0,0.168629,0.096882,0.58667,0.133332
