In [91]:
import sys, json, os, ast
import copy
import numpy as np
import pandas as pd
from smart_open import open
from tqdm import tqdm
import pickle as pkl

sys.path.insert(1, "../..")
from src.logger import make_logger
from src.dataloader import TabularDataloader
from src.Trainer import LGBMTrainer, TFTrainer
from src.preprocess import Preprocess

from rdsutils.feature_selection import mrmr
from rdsutils.woe import WOE_Transform
from _utils.feature_selection import feature_selection as fs
from _utils.performance_eval import performance_eval_v3 as p_eval
from rdsutils.feature_selection import FeatureSelector as general_purpose_fsel
from src.feature_selection import FeatureSelector  # to be moved to rdsutils

# new modules
from _utils.sample_weights import get_sample_weight

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Column names and random seed shared by the rest of the notebook.
target = 'target_v1'
target_indeterminate = 'indeterminate_v1'
weight = "weight"
seed = 42

# Project configuration (top-level keys are displayed right below).
with open("config.json", "r") as config_file:
    config = json.load(config_file)

display(config.keys())

# Gen3 feature list and model params; scale_pos_weight is removed from the
# params when it is present (this edits the dict held inside `config`).
gen3_features = config["data_columns"]["gen3_features"]
gen3_params = config["model_params"]["gen3_params"]
gen3_params.pop("scale_pos_weight", None)

# Prescreen candidates = bureau features followed by the categorical features.
# The categorical list is hard-coded here rather than read from
# config["data_columns"]["cat_cols"].
cat_fts = ['t11_t3d_segid', 't11_t3d_segid_supp']
bureau_fts = config["data_columns"]["bureau_features_cols"]
prescreen_fts = bureau_fts + cat_fts

dict_keys(['data', 'meta', 'data_columns', 'model_params', 'model_features', 'impute_vals', 'monotone'])

In [4]:
# List the cleaned datasets that the config has an entry for.
clean_datasets = config["data"]["clean"]
display(clean_datasets.keys())

dict_keys(['all_features_dev1', 'all_features_dev2', 'all_features_oot1', 'all_features_oot2', 'subset_dev1', 'subset_dev2'])

#### load data and features

In [22]:
%%time
# data dict
exp_dict = pd.read_csv(config["meta"]["exp_dict_path"])

# fsel data - sampled
dl = TabularDataloader(train_path=config["data"]["clean"]["subset_dev1"])
dl.load_data(debug_size=10000, random_state=seed)

debug_df, _, _ = dl.get_data(debug=True)
train_df, _, _ = dl.get_data(debug=False)
train_df.shape, debug_df.shape

CPU times: user 34.4 s, sys: 1min 19s, total: 1min 53s
Wall time: 40 s


((228188, 5131), (10000, 5131))

#### get sample weights

In [24]:
col = "ri_source"
weights = {"booked": 1,
           "proxy": 1,
           "others": 0.25}

assert sorted(train_df[col].unique().tolist()) == sorted(list(weights.keys()))

pp = Preprocess(exp_dict)

train_df["weight_eval"] = train_df["weight_cob"] * train_df["weight_ri_v1"]
%time train_df = pp.transform(train_df, prescreen_fts, weights, drop_indeterminate=target_indeterminate, existing_weights_col="weight_eval")

100%|██████████| 4205/4205 [00:10<00:00, 414.87it/s]



        added columns:
            weight: training sample_weight scaled using provided weights by ri_source
                weight_cob * weight_ri_v1 * weight_sample
            weight_eval: weight_cob * weight_ri_v1
                used for evaluation purposes
        
dropping indeterminate col: indeterminate_v1
CPU times: user 14.9 s, sys: 4.94 s, total: 19.8 s
Wall time: 19 s


In [25]:
# look at weights
display(train_df[["weight", "ri_source"]].groupby("ri_source")["weight"].sum())
display(train_df[["weight_eval", "ri_source"]].groupby("ri_source")["weight_eval"].sum())

ri_source
booked    15610.0
others    24218.0
proxy     18196.0
Name: weight, dtype: float64

ri_source
booked    15610.0
others    96872.0
proxy     18196.0
Name: weight_eval, dtype: float64

#### feature selection

In [None]:
%%time
# %%capture record

nr_to_consider = 200
nr_to_select = 50
fsel_dir = "./artifacts/dev1_fsel_1"

fsel = FeatureSelector(train_df, data_dict=exp_dict)
rankings = fsel.run(prescreen_fts, target, weight, nr_to_consider, nr_to_select,
                    output_dir=fsel_dir, filter_by_logic_expn=True)

# with open("./artifacts/dev1_fsel_1/log.txt", "w") as f:
#     f.write(record.stdout)

target_col: target_v1
weight_col: weight
Preprocessing... generating iv and shaps
prepping woe...
[32mAttrs removed--missing pct>99%:  [0m ['p13_all8162', 'p13_all8163', 'p13_all8380', 'p13_all8723', 'p13_all9222', 'p13_all9223', 'p13_all9230', 'p13_all9239', 'p13_all9240', 'p13_all9249', 'p13_all9260', 'p13_all9280', 'p13_aua8162', 'p13_aua8163', 'p13_bca0401', 'p13_bca5021', 'p13_bca6201', 'p13_col8194', 'p13_hlc5021', 'p13_hlc7117', 'p13_iln0403', 'p13_mtf8169', 'p13_mtf8656', 'p13_mts8151', 'p13_rpm5020', 'p13_rpm5320', 'p13_rpm5820', 'p13_rpm6160', 'p13_rpm7110', 'p13_rti5020', 'p13_rti5320', 'p13_rti5820', 'p13_uti5030', 'p13_uti5530', 'p13_uti8151', 't11_tall1412', 't11_tall1413', 't11_tall2412', 't11_tcol2556', 't11_tcol2567', 't11_tcol3581', 't11_tmti0451', 't11_tmti0452', 't11_tmti0453', 't11_tmti0454', 't11_tmti0455', 't11_tmti0456', 't11_tmti0457', 't11_tmti0458', 't11_tstu0909']
processed  4155  num attributes



100%|██████████| 7/7 [00:01<00:00,  5.07it/s]


prepping lgbm shap


100%|██████████| 7/7 [00:01<00:00,  4.72it/s]


prepping lgbm mc shap
p13_all9130 1 no monotonic direction - probably should filter out
p13_all9134 1 no monotonic direction - probably should filter out
p13_all9135 1 no monotonic direction - probably should filter out
p13_all9138 1 no monotonic direction - probably should filter out
p13_all9139 1 no monotonic direction - probably should filter out
p13_all9140 1 no monotonic direction - probably should filter out
p13_all9141 1 no monotonic direction - probably should filter out
p13_all9144 1 no monotonic direction - probably should filter out
p13_all9145 1 no monotonic direction - probably should filter out
p13_all9148 1 no monotonic direction - probably should filter out
p13_all9149 1 no monotonic direction - probably should filter out
p13_all9171 1 no monotonic direction - probably should filter out
p13_all9177 1 no monotonic direction - probably should filter out
p13_all9178 1 no monotonic direction - probably should filter out
p13_all9180 1 no monotonic direction - probably should

100%|██████████| 7/7 [00:01<00:00,  4.94it/s]


filtering features by logic - experian
dropping 530 features : kept 3675 features
    reason:  not AA
160 features with greater than                 0.95 missing values
dropping 160 features : kept 3515 features
    reason:  too many missing
dropping 647 features : kept 2868 features
    reason:  low_iv
running many to few


100%|██████████| 200/200 [09:27<00:00,  2.84s/it]
100%|██████████| 200/200 [11:20<00:00,  3.40s/it]
100%|██████████| 200/200 [22:44<00:00,  6.82s/it]
100%|██████████| 7/7 [00:01<00:00,  4.93it/s]


saving ranking.csv
running fsel on few
p13_all9130 1 no monotonic direction - probably should filter out
p13_all9134 1 no monotonic direction - probably should filter out
p13_all9135 1 no monotonic direction - probably should filter out
p13_all9138 1 no monotonic direction - probably should filter out
p13_all9139 1 no monotonic direction - probably should filter out
p13_all9140 1 no monotonic direction - probably should filter out
p13_all9141 1 no monotonic direction - probably should filter out
p13_all9144 1 no monotonic direction - probably should filter out
p13_all9145 1 no monotonic direction - probably should filter out
p13_all9148 1 no monotonic direction - probably should filter out
p13_all9149 1 no monotonic direction - probably should filter out
p13_all9171 1 no monotonic direction - probably should filter out
p13_all9177 1 no monotonic direction - probably should filter out
p13_all9178 1 no monotonic direction - probably should filter out
p13_all9180 1 no monotonic direction 

100%|██████████| 7/7 [00:00<00:00,  8.98it/s]

saving ranking.csv
CPU times: user 1h 42min 43s, sys: 19min 58s, total: 2h 2min 41s
Wall time: 1h 3min 1s





##### get fsel results

* run `fsel.get_rankings(True)` to get the ranking_df of the features that were ever selected.


##### if computation already made, we can just load it

```python
# initialize the selector and load the saved state back
fsel = FeatureSelector(train_df, data_dict=exp_dict)
fsel.load_state_dict(fsel_dir)
```

##### this is the logic underneath fsel.run

```python
# setup
features = prescreen_fts
target_col = target
weight_col = weight
output_dir = fsel_dir
corr_threshold = 0.8
filter_by_logic_expn = True

# first preprocessing
fsel.preprocess(features, target_col, weight_col, output_dir=output_dir)

if filter_by_logic_expn:
    print("filtering features by logic - experian")
    features = fsel.filter_by_logic_expn(features, target_col, weight_col)

fsel.many_to_few(features, target_col, weight_col, nr_to_consider)
if output_dir: fsel.save_state_dict(output_dir)

# get top <nr_to_select> features by mean just as a rule of thumb
rankings_imp = fsel.get_rankings(True)
rankings_imp["<mean>"] = rankings_imp.mean(axis=1)
rankings_imp.sort_values("<mean>", inplace=True)
top_features = rankings_imp.index.to_list()
rankings_imp.drop("<mean>", axis=1, inplace=True)

# approximate the number of features to consider so that
# we end up with nr_to_select features when using the less efficient
# methods

approx_nr_to_select = int(nr_to_select / (corr_threshold+0.001))

fsel.fsel_on_few(top_features[:approx_nr_to_select], target_col, 
                 weight_col, corr_threshold=corr_threshold)
if output_dir: fsel.save_state_dict(output_dir)

rankings = fsel.get_rankings(False)
```

In [92]:
# Reload the saved feature-selection state from disk instead of re-running
# the selection. `fsel_dir` is the directory that was passed as `output_dir`
# to fsel.run; the name `output_dir` itself is never assigned in an executed
# cell, so using it here fails with a NameError on a fresh kernel.
fsel2 = FeatureSelector(train_df, data_dict=exp_dict)
fsel2.load_state_dict(fsel_dir)
fts = fsel2.get_rankings(True)

#### build base model, set on features

# issue: the feature selector did not consider categorical variables; since we only have < 5 of them, treat them manually

#### hyperparam tuning

#### model eval
---
* evaluation segments
    * `weight`
    * around score cut
    * booked, proxy, others