In [2]:
import sys, json, os, ast
import copy
import numpy as np
import pandas as pd
from smart_open import open
from tqdm import tqdm
import pickle as pkl

sys.path.insert(1, "../..")
from src.logger import make_logger
from src.dataloader import TabularDataloader
from src.Trainer import LGBMTrainer, TFTrainer
from src.preprocess import Preprocess

from rdsutils.feature_selection import mrmr
from rdsutils.woe import WOE_Transform
from _utils.feature_selection import feature_selection as fs
from _utils.performance_eval import performance_eval_v3 as p_eval
from rdsutils.feature_selection import FeatureSelector as general_purpose_fsel
from src.feature_selection import FeatureSelector  # to be moved to rdsutils

# new modules
from _utils.sample_weights import get_sample_weight

%load_ext autoreload
%autoreload 2

In [3]:
# Column names and the random seed shared by the cells below.
target, target_indeterminate = "target_v2", "indeterminate_v2"
weight = "weight"
seed = 42

# Project configuration. `open` is whatever the import cell bound it to
# (smart_open.open when that cell has been run).
with open("config.json", "r") as config_file:
    config = json.load(config_file)

display(config.keys())

gen3_features = config["data_columns"]["gen3_features"]
gen3_params = config["model_params"]["gen3_params"]
# Remove scale_pos_weight if present. gen3_params is the same dict object
# that lives inside `config`, so the key disappears there as well.
gen3_params.pop("scale_pos_weight", None)

bureau_fts = config["data_columns"]["bureau_features_cols"]
# Categorical features are hard-coded here instead of being read from
# config["data_columns"]["cat_cols"].
cat_fts = ["t11_t3d_segid", "t11_t3d_segid_supp"]
prescreen_fts = bureau_fts + cat_fts

dict_keys(['data', 'meta', 'data_columns', 'model_params', 'model_features', 'impute_vals', 'monotone'])

In [4]:
# Show which dataset entries are registered under config["data"]["clean"].
display(config["data"]["clean"].keys())

dict_keys(['all_features_dev1', 'all_features_dev2', 'all_features_oot1', 'all_features_oot2', 'subset_dev1', 'subset_dev2'])

#### load data and features

In [4]:
%%time
# Data dictionary, read from the path stored in config["meta"]["exp_dict_path"].
# It is passed to Preprocess and FeatureSelector in later cells.
exp_dict = pd.read_csv(config["meta"]["exp_dict_path"])

# Feature-selection data: the "subset_dev2" entry of the clean datasets.
dl = TabularDataloader(train_path=config["data"]["clean"]["subset_dev2"])
dl.load_data(debug_size=10000, random_state=seed)

# get_data returns a 3-tuple; only the first element is used in this notebook.
# Per the recorded output, debug=True yields a 10,000-row frame and
# debug=False the full 215,815-row frame.
debug_df, _, _ = dl.get_data(debug=True)
train_df, _, _ = dl.get_data(debug=False)
train_df.shape, debug_df.shape

CPU times: user 31.6 s, sys: 1min 48s, total: 2min 20s
Wall time: 44.6 s


((215815, 5131), (10000, 5131))

#### get sample weights

In [5]:
# Per-ri_source multipliers handed to Preprocess.transform below.
col = "ri_source"
weights = {"booked": 1,
           "proxy": 1,
           "others": 0.25}

# Guard: the ri_source values present in the data must match the keys of
# `weights` exactly (no missing and no extra categories).
assert sorted(train_df[col].unique().tolist()) == sorted(list(weights.keys()))

pp = Preprocess(exp_dict)
# Base (evaluation) weight: product of the two existing weight columns.
train_df["weight_eval"] = train_df["weight_cob"] * train_df["weight_ri_v2"]
# Per the recorded output, transform adds a `weight` column
# (weight_eval * weight_sample) and drops the indeterminate column.
# NOTE(review): train_df is reassigned from itself, so re-running this cell
# operates on the already-transformed frame — confirm that is safe.
%time train_df = pp.transform(train_df, prescreen_fts, weights, 
                              drop_indeterminate=target_indeterminate, 
                              existing_weights_col="weight_eval")


100%|██████████| 4205/4205 [00:09<00:00, 427.92it/s]



        added columns:
            weight: training sample_weight scaled using provided weights by ri_source
                weight_eval * weight_sample
        
dropping indeterminate col: indeterminate_v2
CPU times: user 14.1 s, sys: 4.74 s, total: 18.9 s
Wall time: 18.1 s


In [6]:
# Total training weight vs. evaluation weight per ri_source.
for _wcol in ("weight", "weight_eval"):
    display(train_df.groupby("ri_source")[_wcol].sum())

ri_source
booked    10581.500
others    20378.625
proxy     20044.500
Name: weight, dtype: float64

ri_source
booked    10581.5
others    81514.5
proxy     20044.5
Name: weight_eval, dtype: float64

#### feature selection

In [7]:
%%time
# %%capture record
# Runs the full feature-selection pipeline on prescreen_fts and writes its
# artifacts to fsel_dir. The recorded run took about 1h 4min wall time.

# Both counts are forwarded to fsel.run. In the step-by-step logic shown in
# the markdown further down, nr_to_consider is passed to many_to_few and
# nr_to_select sizes the final fsel_on_few pass.
nr_to_consider = 200
nr_to_select = 50
fsel_dir = "./artifacts/dev2_fsel_2"

fsel = FeatureSelector(train_df, data_dict=exp_dict)
rankings = fsel.run(prescreen_fts, target, weight, nr_to_consider, nr_to_select,
                    output_dir=fsel_dir, filter_by_logic_expn=True)

# Disabled log capture; it only applies when `%%capture record` above is enabled.
# NOTE(review): the path below points at dev1_fsel_1, not fsel_dir — confirm before re-enabling.
# with open("./artifacts/dev1_fsel_1/log.txt", "w") as f:
#     f.write(record.stdout)

target_col: target_v2
weight_col: weight
Preprocessing... generating iv and shaps
prepping woe...
[32mAttrs removed--missing pct>99%:  [0m ['p13_all8162', 'p13_all8163', 'p13_all8380', 'p13_all8723', 'p13_all9222', 'p13_all9223', 'p13_all9230', 'p13_all9239', 'p13_all9240', 'p13_all9249', 'p13_all9260', 'p13_all9280', 'p13_aua8162', 'p13_aua8163', 'p13_bca0401', 'p13_bca5021', 'p13_bca6201', 'p13_col8194', 'p13_hlc5021', 'p13_iln0403', 'p13_mtf8169', 'p13_mtf8656', 'p13_mts8151', 'p13_rpm5020', 'p13_rpm5320', 'p13_rpm5820', 'p13_rpm6160', 'p13_rpm7110', 'p13_rti5020', 'p13_rti5320', 'p13_rti5820', 'p13_uti5030', 'p13_uti5530', 'p13_uti8151', 't11_tall1412', 't11_tall1413', 't11_tall2412', 't11_tcol2556', 't11_tcol2567', 't11_tcol3567', 't11_tmti0451', 't11_tmti0452', 't11_tmti0453', 't11_tmti0454', 't11_tmti0455', 't11_tmti0456', 't11_tmti0457', 't11_tmti0458', 't11_tstu0909']
processed  4156  num attributes



100%|██████████| 7/7 [00:01<00:00,  6.27it/s]


prepping lgbm shap


100%|██████████| 7/7 [00:00<00:00,  7.08it/s]


prepping lgbm mc shap
p13_all9123 1 no monotonic direction - probably should filter out
p13_all9130 1 no monotonic direction - probably should filter out
p13_all9134 1 no monotonic direction - probably should filter out
p13_all9135 1 no monotonic direction - probably should filter out
p13_all9138 1 no monotonic direction - probably should filter out
p13_all9139 1 no monotonic direction - probably should filter out
p13_all9140 1 no monotonic direction - probably should filter out
p13_all9141 1 no monotonic direction - probably should filter out
p13_all9144 1 no monotonic direction - probably should filter out
p13_all9145 1 no monotonic direction - probably should filter out
p13_all9148 1 no monotonic direction - probably should filter out
p13_all9149 1 no monotonic direction - probably should filter out
p13_all9171 1 no monotonic direction - probably should filter out
p13_all9177 1 no monotonic direction - probably should filter out
p13_all9178 1 no monotonic direction - probably should

100%|██████████| 7/7 [00:00<00:00,  7.28it/s]


filtering features by logic - experian
dropping 530 features : kept 3675 features
    reason:  not AA
162 features with greater than                 0.95 missing values
dropping 162 features : kept 3513 features
    reason:  too many missing
dropping 565 features : kept 2948 features
    reason:  low_iv
running many to few


100%|██████████| 200/200 [09:47<00:00,  2.94s/it]
100%|██████████| 200/200 [12:01<00:00,  3.61s/it]
100%|██████████| 200/200 [23:49<00:00,  7.15s/it]
100%|██████████| 7/7 [00:01<00:00,  4.50it/s]


saving ranking.csv
running fsel on few
p13_all9123 1 no monotonic direction - probably should filter out
p13_all9130 1 no monotonic direction - probably should filter out
p13_all9134 1 no monotonic direction - probably should filter out
p13_all9135 1 no monotonic direction - probably should filter out
p13_all9138 1 no monotonic direction - probably should filter out
p13_all9139 1 no monotonic direction - probably should filter out
p13_all9140 1 no monotonic direction - probably should filter out
p13_all9141 1 no monotonic direction - probably should filter out
p13_all9144 1 no monotonic direction - probably should filter out
p13_all9145 1 no monotonic direction - probably should filter out
p13_all9148 1 no monotonic direction - probably should filter out
p13_all9149 1 no monotonic direction - probably should filter out
p13_all9171 1 no monotonic direction - probably should filter out
p13_all9177 1 no monotonic direction - probably should filter out
p13_all9178 1 no monotonic direction 

100%|██████████| 7/7 [00:01<00:00,  6.41it/s]

saving ranking.csv
CPU times: user 1h 41min 37s, sys: 22min 48s, total: 2h 4min 25s
Wall time: 1h 4min 10s





##### get fsel results

* Run `fsel.get_rankings(True)` to get the `ranking_df` of all features that were ever selected.


##### if computation already made, we can just load it

```python
# initialize the selector and load the saved state back
fsel = FeatureSelector(train_df, data_dict=exp_dict)
fsel.load_state_dict(fsel_dir)
```

##### this is the logic underneath fsel.run

```python
# setup
features = prescreen_fts
target_col = target
weight_col = weight
output_dir = fsel_dir
corr_threshold = 0.8
filter_by_logic_expn = True

# first preprocessing
fsel.preprocess(features, target_col, weight_col, output_dir=output_dir)

if filter_by_logic_expn:
    print("filtering features by logic - experian")
    features = fsel.filter_by_logic_expn(features, target_col, weight_col)

fsel.many_to_few(features, target_col, weight_col, nr_to_consider)
if output_dir: fsel.save_state_dict(output_dir)

# get top <nr_to_select> features by mean, just as a rule of thumb
rankings_imp = fsel.get_rankings(True)
rankings_imp["<mean>"] = rankings_imp.mean(axis=1)
rankings_imp.sort_values("<mean>", inplace=True)
top_features = rankings_imp.index.to_list()
rankings_imp.drop("<mean>", axis=1, inplace=True)

# approximate how many features to consider so that we end up with
# about nr_to_select features after running the less efficient
# methods

approx_nr_to_select = int(nr_to_select / (corr_threshold+0.001))

fsel.fsel_on_few(top_features[:approx_nr_to_select], target_col, 
                 weight_col, corr_threshold=corr_threshold)
if output_dir: fsel.save_state_dict(output_dir)

rankings = fsel.get_rankings(False)
```

In [8]:
# Build a fresh selector and restore the state saved under fsel_dir, instead
# of reusing the in-memory `fsel` from the long-running cell.
fsel2 = FeatureSelector(train_df, data_dict=exp_dict)
fsel2.load_state_dict(fsel_dir)
# Per the markdown note on get_rankings(True): the rankings of the features
# that were ever selected.
fts = fsel2.get_rankings(True)

#### build base model, set on features

**Issue:** the feature selector did not consider categorical variables. Since we have fewer than 5 of them, treat them manually.

#### hyperparam tuning

#### model eval
---
* evaluation segments
    * `weight`
    * around score cut
    * booked, proxy, others

In [None]:
# hand made shap
# Fit a LightGBM benchmark on the features ranked by the selector, then rank
# those features by mean absolute SHAP value.

import shap
import lightgbm as lgb

default_params = {
    "objective": "binary",
    "metric": "auc",
    "boosting": "gbdt",
    "max_depth": 6,
    "learning_rate": 0.05,
    # Scalar value; this was previously the one-element list [300]
    # (presumably a leftover from a search-grid definition).
    "min_data_in_leaf": 300,
    "verbosity": -1,
    "seed": 157,
    "n_jobs": 30,
    "n_estimators": 1000,
}

lgbm = lgb.LGBMClassifier(**default_params)

list_features = rankings.index.to_list()
trainer = LGBMTrainer()
trainer.train(
    lgbm,
    train_df,
    features=list_features,
    target_col=target,
    sample_weight=train_df[weight],
)
explainer = shap.TreeExplainer(lgbm)
shap_values = explainer.shap_values(train_df[list_features])

# shap_values[1]: second element of what shap_values returns — presumably the
# positive-class attributions; TODO confirm against the installed shap version.
shap_features = (
    pd.DataFrame(shap_values[1], columns=list(train_df[list_features]))
    .apply(lambda x: np.abs(x).mean(), axis=0)
    .sort_values(ascending=False)
)

# Keep the 32 features with the largest mean |SHAP|.
benchmark_fts = shap_features.index.to_list()[:32]

In [30]:

# Mean absolute SHAP value per feature, largest first.
shap_features = (
    pd.DataFrame(shap_values[1], columns=list(train_df[list_features]))
    .apply(lambda col_values: np.abs(col_values).mean(), axis=0)
    .sort_values(ascending=False)
)

# Keep the 40 highest-ranked features as benchmark candidates.
benchmark_fts = shap_features.index[:40].to_list()

In [31]:
from rdsutils.feature_selection.WeightedCorr import WeightedCorr

# Drop benchmark features that are highly correlated (weighted Pearson) with
# a feature that ranks ahead of them in benchmark_fts.
features = benchmark_fts
corr_cutoff = 0.8

corr_matrix = WeightedCorr(df=train_df[benchmark_fts + [weight]], wcol=weight)("pearson").abs()

# Keep only the strict upper triangle so each pair is looked at once.
# Builtin `bool` is used because the `np.bool` alias no longer exists in
# current NumPy releases.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

# A column is dropped when any entry above the diagonal exceeds the cutoff.
to_drop = [column for column in upper.columns if (upper[column] > corr_cutoff).any()]
reduced_shap_features = [f for f in features if f not in to_drop]
print(len(reduced_shap_features))

36


In [32]:
# Number of features left after the correlation filter (same value the
# previous cell prints).
len(reduced_shap_features)

36

In [1]:
# Load the stored model specs; this dict is extended and written back to the
# same path further down.
# NOTE(review): the recorded run of this cell (execution count 1) failed with
# a NameError for `json`, i.e. it was executed on a fresh kernel before the
# import cell. Run the import cell first.
with open("./artifacts/models/model_params.json", "r") as f:
    model_params = json.load(f)

NameError: name 'json' is not defined

In [51]:
# Benchmark feature list: the first 32 correlation-filtered SHAP features,
# with a private copy of the dev1 benchmark hyper-parameters.
fts_ = reduced_shap_features[:32]
params_ = copy.deepcopy(model_params["dev1_v1_benchmark"]["params"])

# Monotone-constrained variant. fts_mc is the same list object as fts_ (no copy).
fts_mc = fts_
params_mc = copy.deepcopy(params_)
# NOTE(review): `mc` is only defined by the get_monotone_dir cell further
# down, which in turn reads fts_mc from this cell. On a fresh kernel this line
# raises NameError until that cell has been run — confirm the intended order.
params_mc["monotone_constraints"] = mc

In [53]:
# Build the plain and the monotone-constrained dev2 benchmark specs and
# register both in model_params under their own names.
dev2_v2_benchmark = dict(features=fts_, params=params_, model_type="lightgbm")
dev2_v2_benchmark_mc = dict(features=fts_mc, params=params_mc, model_type="lightgbm")

model_params.update(
    dev2_v2_benchmark=dev2_v2_benchmark,
    dev2_v2_benchmark_mc=dev2_v2_benchmark_mc,
)

In [55]:
# Serialize before opening the file: opening with "w" truncates it, so a
# serialization error raised after that point would leave an empty or partial
# model_params.json behind.
serialized_model_params = json.dumps(model_params, indent=4)
with open("./artifacts/models/model_params.json", "w") as f:
    f.write(serialized_model_params)

In [48]:
def get_monotone_dir(woe_dict):
    """Derive a monotone-constraint direction for each feature from its WOE table.

    Parameters
    ----------
    woe_dict : mapping
        Feature name -> table supporting ``len(tbl)`` and
        ``tbl.iloc[i]["woe"]`` (e.g. a pandas DataFrame with a "woe" column).

    Returns
    -------
    dict
        Feature name -> direction:
        ``1`` if the first row's WOE is lower than the second row's,
        ``-1`` otherwise (including ties),
        ``0`` if the table has fewer than two rows, so no direction can be
        derived; the feature name and row count are printed in that case.
    """
    result = {}
    for k in woe_dict:
        tbl = woe_dict[k]
        if len(tbl) < 2:
            print(k, len(tbl))
            # Assign explicitly. Without this the previous feature's direction
            # would be reused, or `direction` would be unbound on the first key.
            direction = 0
        elif tbl.iloc[0]["woe"] < tbl.iloc[1]["woe"]:
            direction = 1
        else:
            direction = -1

        result[k] = direction
    return result

# Load the pickled per-feature WOE tables.
# NOTE(review): this reads from "dev2_fsel_v2", while the feature-selection
# cell writes to fsel_dir = "./artifacts/dev2_fsel_2" — confirm this is the
# intended directory.
# pickle can execute arbitrary code on load; only use trusted artifacts here.
with open("./artifacts/dev2_fsel_v2/woe_dict.pkl", "rb") as f:
    woe_dict = pkl.load(f)

monotone_dict = get_monotone_dir(woe_dict)
# One direction per feature, in the same order as fts_mc. fts_mc is defined in
# the benchmark setup cell above, so that cell's first lines must have run.
mc = [monotone_dict[ft] for ft in fts_mc]


p13_all9123 1
p13_all9130 1
p13_all9134 1
p13_all9135 1
p13_all9138 1
p13_all9139 1
p13_all9140 1
p13_all9141 1
p13_all9144 1
p13_all9145 1
p13_all9148 1
p13_all9149 1
p13_all9171 1
p13_all9177 1
p13_all9178 1
p13_all9180 1
p13_all9187 1
p13_all9188 1
p13_all9189 1
p13_all9330 1
p13_all9340 1
p13_all9380 1


In [50]:
mc

[-1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 -1]