### Imputation Methods
---

- [ ] WOE Imputer

In [3]:
# Optional one-time setup: uncomment to install sidetable (imported below as `stb`).
# In a notebook, `%pip install sidetable` is the safer form because it targets the running kernel's environment.
# !pip install sidetable

In [13]:
import sys, os, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import sidetable as stb
import lightgbm as lgb

# Insert the directory two levels above the working directory at position 1 of the
# module search path so the import below can resolve. Presumably that directory is
# the repository root containing the `ml4risk` package — TODO confirm. The path is
# relative, so it depends on the directory the kernel was started from.
sys.path.insert(1, "../..")

# Imputer module under demonstration; referenced below through the `ip` alias.
import ml4risk.data_preparation.imputer as ip

# IPython autoreload, mode 2: re-import modified modules before each cell runs,
# so edits to the library are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Example Data

In [3]:
# Project configuration file and a debug setting (not referenced by the cells shown here).
config_path = "../../data/pl-gen-4/config.json"
debug = 1000

# Read the JSON config; the path is already complete, so it is opened directly.
with open(config_path, mode="r") as f:
    config = json.load(f)

# Resolve the train/valid/test locations stored under the "transformed" data prefix,
# plus the first configured target column.
prefix_in = "transformed"
transformed_paths = config["data"][prefix_in]
train_path, valid_path, test_path = (
    transformed_paths[key] for key in ("xf_train_df", "xf_valid_df", "xf_test_df")
)
target_col = config["data_columns"]["target_cols"][0]

# Feature columns used throughout this notebook.
features = [
    'p13_alj0316',
    'p13_alj0300',
    'p13_aua2814',
    'p13_iln2176',
    'p13_iln5047',
    'p13_iln5747',
    'trended3d_tamp2701',
    'trended3d_tamp3701',
    'trended3d_taut0901',
    'trended3d_taut0905',
    'trended3d_tbca2607',
    'trended3d_tbca2608',
    'trended3d_tbca2609',
    'trended3d_tbca2610',
    'trended3d_tbca2612',
]

In [4]:
# Read only the selected feature columns plus the target from the training parquet file,
# then show the resulting (rows, columns) shape.
columns_to_load = [*features, target_col]
train_df = pd.read_parquet(train_path, columns=columns_to_load)
train_df.shape

(116290, 16)

In [5]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,p13_alj0316,p13_alj0300,p13_aua2814,p13_iln2176,p13_iln5047,p13_iln5747,trended3d_tamp2701,trended3d_tamp3701,trended3d_taut0901,trended3d_taut0905,trended3d_tbca2607,trended3d_tbca2608,trended3d_tbca2609,trended3d_tbca2610,trended3d_tbca2612,target_dev
1084,1.0,5.0,0.0,0.0,0.0,0.0,2284.0,2316.0,0.0,0.0,12393.0,22.0,11.0,69.0,12383.0,0.0
1091,4.0,17.0,0.0,0.0,0.0,0.0,895.0,902.0,0.0,0.0,11298.0,350.0,23.0,99.0,11112.0,0.0
1094,2.0,5.0,0.0,0.0,0.0,0.0,396.0,396.0,0.0,0.0,6487.0,86.0,54.0,53.0,6521.0,0.0
1109,0.0,5.0,0.0,0.0,0.0,0.0,1483.0,1512.0,0.0,0.0,22466.0,251.0,98.0,82.0,22344.0,0.0
1119,1.0,4.0,0.0,0.0,0.0,0.0,1152.0,1176.0,0.0,0.0,10942.0,209.0,72.0,86.0,11045.0,0.0


### Impute Missing

In [6]:
# Per-column missing-value summary from the sidetable `stb` accessor:
# number of missing values, total rows, and percent missing.
train_df.stb.missing()

Unnamed: 0,missing,total,percent
p13_aua2814,22117,116290,19.018832
trended3d_taut0901,15326,116290,13.179121
trended3d_taut0905,15326,116290,13.179121
trended3d_tamp2701,10456,116290,8.991315
trended3d_tamp3701,10428,116290,8.967237
p13_iln5047,8668,116290,7.453779
p13_iln5747,8668,116290,7.453779
p13_iln2176,8211,116290,7.060796
trended3d_tbca2607,2660,116290,2.287385
trended3d_tbca2608,2660,116290,2.287385


### Basic Usage

In [24]:
def fit_and_impute(impute_method):
    """Fit a WOEImputer with the given method on the training data and impute it.

    A fresh `train_df[features]` selection is passed to both `fit` and
    `transform`, so neither call sees an object the other has touched.

    Returns:
        (fitted imputer, imputed copy of the training features)
    """
    imputer = ip.WOEImputer(impute_method=impute_method)
    imputer.fit(train_df[features], train_df[target_col])
    return imputer, imputer.transform(train_df[features])


# closest_boundary
cb_woe_imputer, transformed_df_cb = fit_and_impute("closest_boundary")

# midpoint
mp_woe_imputer, transformed_df_mp = fit_and_impute("midpoint")


processed  15  num attributes



100%|██████████| 15/15 [00:00<00:00, 694.31it/s]
100%|██████████| 15/15 [00:00<00:00, 2598.06it/s]


processed  15  num attributes



100%|██████████| 15/15 [00:00<00:00, 604.29it/s]
100%|██████████| 15/15 [00:00<00:00, 2774.13it/s]


In [26]:
# Shapes of the raw feature frame and of both imputed frames, in that order.
tuple(
    frame.shape
    for frame in (train_df[features], transformed_df_cb, transformed_df_mp)
)

((116290, 15), (116290, 15), (116290, 15))

### Saving and Loading

In [30]:
# Persist the fitted closest-boundary imputer state so it can be reused later.
artifacts_dir = "artifacts"
os.makedirs(artifacts_dir, exist_ok=True)  # no error if the directory already exists
cb_woe_imputer.save_state_dict("artifacts/impute_values.pkl")

In [31]:
# Load: build a second imputer from the saved state file instead of refitting,
# then impute the same training features with it.
# The class is referenced through the `ip` module alias because that is the only
# name the notebook imports; a bare `WOEImputer` raises NameError on a fresh kernel.
cb_woe_imputer2 = ip.WOEImputer(
    impute_method="closest_boundary",
    state_dict_path="artifacts/impute_values.pkl",
)
transformed_df_cb2 = cb_woe_imputer2.transform(train_df[features])

100%|██████████| 15/15 [00:00<00:00, 2268.01it/s]


In [32]:
# Make sure the reloaded imputer reproduces the original imputer's output.
# DataFrame.equals compares shape, dtypes and element values, and treats NaNs
# in the same location as equal.
transformed_df_cb2.equals(transformed_df_cb)

True

### Verify the results
---

#### Missing percentages

In [33]:
# Missing-value summary of the raw training frame (before imputation),
# shown again here as the baseline for the check on the imputed frame.
train_df.stb.missing()

Unnamed: 0,missing,total,percent
p13_aua2814,22117,116290,19.018832
trended3d_taut0901,15326,116290,13.179121
trended3d_taut0905,15326,116290,13.179121
trended3d_tamp2701,10456,116290,8.991315
trended3d_tamp3701,10428,116290,8.967237
p13_iln5047,8668,116290,7.453779
p13_iln5747,8668,116290,7.453779
p13_iln2176,8211,116290,7.060796
trended3d_tbca2607,2660,116290,2.287385
trended3d_tbca2608,2660,116290,2.287385


In [34]:
# Same summary on the frame produced by the reloaded closest-boundary imputer;
# every column is expected to report 0 missing values.
transformed_df_cb2.stb.missing()

Unnamed: 0,missing,total,percent
p13_alj0316,0,116290,0.0
p13_alj0300,0,116290,0.0
p13_aua2814,0,116290,0.0
p13_iln2176,0,116290,0.0
p13_iln5047,0,116290,0.0
p13_iln5747,0,116290,0.0
trended3d_tamp2701,0,116290,0.0
trended3d_tamp3701,0,116290,0.0
trended3d_taut0901,0,116290,0.0
trended3d_taut0905,0,116290,0.0


#### Eyeball a few examples

In [38]:
ft = "trended3d_tbca2609"

# Show the WOE table for this feature, then compare each imputer's stored impute
# value with the value expected by hand from that table.
# The hard-coded expectations use bin 3 (min 976.0, max 98771.0), whose WOE is the
# one nearest the "missing" row's WOE in the displayed table. Presumably that is how
# WOEImputer selects the bin — TODO confirm against the implementation.
display(cb_woe_imputer.woe_dict[ft])
display(f"Imputed value should be - closest_boundary(bin_3): {cb_woe_imputer.impute_values_[ft]} = {98771.0}")
display(f"Imputed value should be - mid-point(bin_3): {mp_woe_imputer.impute_values_[ft]} = {(976.0+98771.0)/2}")

Unnamed: 0,%accts,min,max,woe
0,10.61%,0.0,30.0,-0.1941
1,38.97%,31.0,338.0,-0.0107
2,23.32%,339.0,975.0,0.0164
3,24.81%,976.0,98771.0,0.0241
missing,2.29%,,,0.4706


'Imputed value should be - closet_boundary(bin_0): 98771.0 = 98771.0'

'Imputed value should be - mid-point(bin_0): 49873.5 = 49873.5'

In [39]:
ft = "trended3d_tbca2612"

# Show the WOE table for this feature, then compare each imputer's stored impute
# value with the value expected by hand from that table.
# The hard-coded expectations use bin 0 (min 0.0, max 4094.0), whose WOE is the one
# nearest the "missing" row's WOE in the displayed table. Presumably that is how
# WOEImputer selects the bin — TODO confirm against the implementation.
display(cb_woe_imputer.woe_dict[ft])
display(f"Imputed value should be - closest_boundary(bin_0): {cb_woe_imputer.impute_values_[ft]} = {0}")
display(f"Imputed value should be - mid-point(bin_0): {mp_woe_imputer.impute_values_[ft]} = {(0+4094.0)/2}")

Unnamed: 0,%accts,min,max,woe
0,20.74%,0.0,4094.0,0.1968
1,9.54%,4095.0,5887.0,0.1724
2,6.35%,5888.0,7049.0,0.1219
3,8.32%,7050.0,8655.0,0.0919
4,7.11%,8656.0,10024.0,-0.0133
5,6.21%,10025.0,11525.0,-0.1283
6,21.47%,11526.0,18760.0,-0.1766
7,3.55%,18761.0,20491.0,-0.2485
8,14.43%,20492.0,261926.0,-0.3165
missing,2.29%,,,0.4706


'Imputed value should be - closet_boundary(bin_0): 0.0 = 0'

'Imputed value should be - mid-point(bin_0): 2047.0 = 2047.0'