In [2]:
import sys, json, os, ast
import copy
import numpy as np
import pandas as pd
from smart_open import open
from tqdm import tqdm
import pickle as pkl
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use("seaborn")

sys.path.insert(1, "../..")
from src.logger import make_logger
from src.dataloader import TabularDataloader
from src.Trainer import LGBMTrainer, TFTrainer

from rdsutils.feature_selection import mrmr
from rdsutils.woe import WOE_Transform
from _utils.feature_selection import feature_selection as fs
from rdsutils import performance_eval as p_eval
from rdsutils.feature_selection import FeatureSelector


# new modules
from _utils.sample_weights import get_sample_weight

%load_ext autoreload
%autoreload 2

### Encode missing
---

In [61]:
target = 'target_v1'
target_indeterminate = 'indeterminate_v1'
weight = "weight"

with open("config.json", "r") as f:
    config = json.load(f)
    
gen3_features = config["data_columns"]["gen3_features"]
display(config.keys()) 
seed = 42

dict_keys(['data', 'meta', 'data_columns', 'model_params', 'model_features', 'impute_vals', 'monotone'])

In [4]:
no_dir = []
def get_monotone_dir(woe_dict):
    result = {}
    for k in woe_dict:
        tbl = woe_dict[k]
        if len(tbl) < 2:
            no_dir.append(k)
        elif tbl.iloc[0]["woe"] < tbl.iloc[1]["woe"]:
            direction = 1
        else:
            direction = -1
        
        result[k] = direction
    return result
print("no directions from woe: ", len(no_dir))

with open("./artifacts/woe_dict.pkl", "rb") as f:
    woe_dict = pkl.load(f)

monotone_dict = get_monotone_dir(woe_dict)
exp_dict = pd.read_csv(config["meta"]["exp_dict_path"])
ranking = pd.read_csv("./artifacts/fsel-ranking.csv", index_col=0)
ranking.head()

no directions from woe:  0


Unnamed: 0,mrmr_fcq,mrmr_fcxq,mrmr_ivcq,mrmr_ivcxq,rank_mean,rank_min,shap_ranking,mrmr_shapcq,mrmr_shapcxq,shap_mc_ranking,mrmr_shapcq_mc,mrmr_shapcxq_mc
p13_alj0300,2868,2868,2868,2868,2868.0,2868,1575,2868,2868,1948,2868,2868
p13_alj0313,2868,2868,2868,2868,2868.0,2868,2529,2868,2868,1386,2868,2868
p13_alj0316,2868,2868,2868,2868,2868.0,2868,1104,2868,2868,1049,2868,2868
p13_alj0416,2868,2868,2868,2868,2868.0,2868,25,36,42,21,34,37
p13_alj5030,2868,2868,2868,2868,2868.0,2868,346,2868,2868,258,2868,2868


In [5]:
def get_top_k_features(feature_ranking, rank_col, k):
    s = feature_ranking[rank_col].sort_values().head(k)
    return s.index.to_list()

top_200_fts = get_top_k_features(ranking, "rank_mean", 200)

In [6]:
%%time
benchmarks = ["gen3_score", "applicant_fico_score"]
meta_cols = [target, target_indeterminate, "ri_source",
                      "weight_ri_v1", "weight_cob"]

top_features = sorted(list(set(top_200_fts)))
feature_cols = top_features
cols = feature_cols + meta_cols + benchmarks
cols = list(set(cols))

train_full = pd.read_parquet(config["data"]["clean"]["all_features_dev1"], columns=cols)
test_full = pd.read_parquet(config["data"]["clean"]["all_features_oot1"], columns=cols)

CPU times: user 21.7 s, sys: 50.1 s, total: 1min 11s
Wall time: 1min 30s


In [7]:
col = "ri_source"
from src.preprocess import Preprocess

weights = {"booked": 1,
           "proxy": 1,
           "others": 0.25}

In [12]:
%%time  # 2:23 m

df = train_full.copy()

pp = Preprocess(exp_dict)
df = pp.transform(df, feature_cols, weights)
df[["weight", "ri_source"]].groupby("ri_source")["weight"].sum()

100%|██████████| 200/200 [02:18<00:00,  1.44it/s]


CPU times: user 1min 15s, sys: 1min 8s, total: 2min 23s
Wall time: 2min 23s


ri_source
booked    157521.00
others    242180.75
proxy     186924.00
Name: weight, dtype: float64

In [8]:
from joblib import Parallel, delayed
from multiprocessing import cpu_count


def encode_cat_to_missing(df: pd.DataFrame, 
                           features: list,
                           data_dict: pd.DataFrame):
    no_special_cols = []
    manual_treat_ment_required = []
    for col in tqdm(features):
        try:
            cats = data_dict[data_dict.field_name == col].categorical.iloc[0]
            if isinstance(cats, float):  # nan
                no_special_cols.append(col)
                continue
            special_val = ast.literal_eval(cats)
            special_val = [int(i) for i in special_val]
            df[col] = cat_2_nan_series(df[col], special_val) 
        except:
            manual_treatment_required.append(col)
            
    print(len(no_special_cols), len(manual_treatment_required))
    return df


def parallel_df(func, df, series, weights=None):
    n_jobs = min(cpu_count(), len(df.columns))
    col_chunks = np.array_split(range(len(df.columns)), n_jobs)
    lst = Parallel(n_jobs=n_jobs)(
        delayed(func)(df.iloc[:, col_chunk], series, weights) for col_chunk in col_chunks
    )
    return pd.concat(lst)


In [24]:
nr_cpu = cpu_count() - 2

df_ = dd.from_pandas(train_full, npartitions=nr_cpu)

In [9]:
def cat_2_nan_series(series, categorical_list):
    """ given series and a list of catergorical values
    
    replace the categorical occurances to nan
    """
    if len(categorical_list) == 0:
        return series
    mapper = dict((k, np.nan) for k in categorical_list)
    return series.replace(mapper)

In [31]:
col = "t11_tmti05q2"
data_dict = exp_dict
cats = data_dict[data_dict.field_name == col].categorical.iloc[0]
special_val = ast.literal_eval(cats)
special_val = [int(i) for i in special_val]
df_[col] = cat_2_nan_series(df[col], special_val) 

In [32]:
df_.head()

Unnamed: 0,t11_tmti05q2,t11_tmti4216,t11_tall1410,p13_rev5725,p13_col8168,t11_tmti04q4,indeterminate_v1,t11_tstu05q3,t11_tmti0729,p13_mtf0155,...,p13_iqt9535,t11_taxm4203,t11_tbca1505,t11_tall4205,p13_all6971,p13_mtf0153,t11_tbcc1307,t11_tall3205,t11_tall03q3,t11_tbca3526
0,,999999998,997,0,9998,999999998,False,999999996,9998,98,...,994,5360,0,14,1,98,0,5,875,4075
1,,999999998,997,0,9998,999999998,False,999999996,9998,98,...,994,5360,0,14,1,98,0,5,875,4075
2,,999999998,997,0,9998,999999998,False,999999996,9998,98,...,994,5360,0,14,1,98,0,5,875,4075
3,,999999998,997,0,9998,999999998,False,999999996,9998,98,...,994,5360,0,14,1,98,0,5,875,4075
4,,999999997,997,0,9998,999999997,False,999999998,9996,0,...,994,-13995,0,23,30,0,0,12,2530,197


In [21]:
import dask.dataframe as dd

In [60]:
# %%time

# # df_ = dd.read_parquet(config["data"]["clean"]["all_features_dev1"], columns=cols)
# df_ = dd.from_pandas(train_full, chunksize=int(1e7))
# data_dict = exp_dict

# no_special = []
# for col in feature_cols:
#     try:
#         cats = data_dict[data_dict.field_name == col].categorical.iloc[0]
#         special_val = ast.literal_eval(cats)
#         special_val = [int(i) for i in special_val]
#         mapper = dict((k, np.nan) for k in special_val)
#         fn = lambda x: mapper[x] if x in mapper else x
#         df_[col] = df_[col].apply(fn)
#     except Exception as e:
#         print(e)
#         pass
    
#         no_special.append(col)
    
# df_ = df_.compute()

In [None]:
%%time

df_ = dd.read_parquet(config["data"]["clean"]["all_features_dev1"], 
                      columns=cols)

data_dict = exp_dict

for col in feature_cols:
    try:
        cats = data_dict[data_dict.field_name == col].categorical.iloc[0]
        special_val = ast.literal_eval(cats)
        special_val = [int(i) for i in special_val]
        df_[col] = cat_2_nan_series(df_[col], special_val) 
    except:
        pass
    
df_.head()

In [54]:
df_ = train_full.head()
df_

Unnamed: 0,p13_als5400,t11_tcol3553,p13_all7938,t11_tmti3420,t11_tmti05q6,t11_tstu05q8,t11_tall02q1,p13_all5825,t11_tall01q5,t11_tall01q6,...,p13_all7360,p13_all7348,t11_tbca3264,t11_tbca3803,p13_bcc7517,p13_all5840,p13_all7347,t11_tbcc1313,p13_bcc7200,p13_rev7434
0,19000,98,100,98,999999998,999999996,65712,791,43731,44711,...,0,0,9,8615,0,791,0,0,100,100
1,19000,98,100,98,999999998,999999996,65712,791,43731,44711,...,0,0,9,8615,0,791,0,0,100,100
2,19000,98,100,98,999999998,999999996,65712,791,43731,44711,...,0,0,9,8615,0,791,0,0,100,100
3,19000,98,100,98,999999998,999999996,65712,791,43731,44711,...,0,0,9,8615,0,791,0,0,100,100
4,103200,98,78,97,999999997,999999998,182387,2537,163282,177375,...,0,22,2,6500,0,2537,14,0,100,67


In [57]:
for col in feature_cols:
    try:
        cats = data_dict[data_dict.field_name == col].categorical.iloc[0]
        special_val = ast.literal_eval(cats)
        special_val = [int(i) for i in special_val]
        
        mapper = dict((k, np.nan) for k in special_val)
        fn = lambda x: mapper[x] if x in mapper else x
        df_[col] = df_[col].apply(fn)
        
    except Exception as e:
        print(e)
        pass
    

malformed node or string: nan
malformed node or string: nan
malformed node or string: nan


In [58]:
df_

Unnamed: 0,p13_als5400,t11_tcol3553,p13_all7938,t11_tmti3420,t11_tmti05q6,t11_tstu05q8,t11_tall02q1,p13_all5825,t11_tall01q5,t11_tall01q6,...,p13_all7360,p13_all7348,t11_tbca3264,t11_tbca3803,p13_bcc7517,p13_all5840,p13_all7347,t11_tbcc1313,p13_bcc7200,p13_rev7434
0,19000,,100,,,,65712,791,43731,44711,...,0,0,9,8615,0,791,0,0,100,100
1,19000,,100,,,,65712,791,43731,44711,...,0,0,9,8615,0,791,0,0,100,100
2,19000,,100,,,,65712,791,43731,44711,...,0,0,9,8615,0,791,0,0,100,100
3,19000,,100,,,,65712,791,43731,44711,...,0,0,9,8615,0,791,0,0,100,100
4,103200,,78,,,,182387,2537,163282,177375,...,0,22,2,6500,0,2537,14,0,100,67


#### FSEL

In [24]:
import sys, json, os, ast
import copy
import numpy as np
import pandas as pd
from smart_open import open
from tqdm import tqdm
import pickle as pkl

sys.path.insert(1, "../..")
from src.logger import make_logger
from src.dataloader import TabularDataloader
from src.Trainer import LGBMTrainer, TFTrainer
from src.preprocess import Preprocess

from rdsutils.feature_selection import mrmr
from rdsutils.woe import WOE_Transform
from _utils.feature_selection import feature_selection as fs
from _utils.performance_eval import performance_eval_v3 as p_eval
from rdsutils.feature_selection import FeatureSelector


# new modules
from _utils.sample_weights import get_sample_weight

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
target = 'target_v1'
target_indeterminate = 'indeterminate_v1'
weight = "weight"

In [6]:
with open("config.json", "r") as f:
    config = json.load(f)
    
display(config.keys()) 
seed = 42

dict_keys(['data', 'meta', 'data_columns', 'model_params', 'model_features', 'impute_vals', 'monotone'])

In [7]:
display(config["data"]["clean"].keys())

dict_keys(['all_features_dev1', 'all_features_dev2', 'all_features_oot1', 'all_features_oot2', 'subset_dev1', 'subset_dev2'])

In [8]:
# data dict
exp_dict = pd.read_csv(config["meta"]["exp_dict_path"])

In [9]:
dl = TabularDataloader(train_path=config["data"]["clean"]["subset_dev1"])
dl.load_data(debug_size=10000, random_state=seed)

In [10]:
debug_df, _, _ = dl.get_data(debug=True)
train_df, _, _ = dl.get_data(debug=False)
train_df.shape, debug_df.shape

((228188, 5131), (10000, 5131))

In [11]:
gen3_features = config["data_columns"]["gen3_features"]
gen3_params = config["model_params"]["gen3_params"]
if "scale_pos_weight" in gen3_params:
    del gen3_params["scale_pos_weight"]

bureau_fts = config["data_columns"]["bureau_features_cols"] 
cat_fts = ['t11_t3d_segid', 't11_t3d_segid_supp'] # config["data_columns"]["cat_cols"] 
prescreen_fts = bureau_fts + cat_fts

### Preprocessing
---
* [x] create sample weight

#### sample weight

In [12]:
df = train_df.copy()

In [13]:
col = "ri_source"
weights = {"booked": 1,
           "proxy": 1,
           "others": 0.25}

assert sorted(df[col].unique().tolist()) == sorted(list(weights.keys()))

pp = Preprocess(exp_dict)
%time df = pp.transform(df, prescreen_fts, weights)

100%|██████████| 4205/4205 [00:09<00:00, 437.27it/s]


CPU times: user 9.74 s, sys: 0 ns, total: 9.74 s
Wall time: 9.72 s


In [18]:
# lets restrict features to only 200 of them
def get_top_k_features(feature_ranking, rank_col, k):
    s = feature_ranking[rank_col].sort_values().head(k)
    return s.index.to_list()

ranking = pd.read_csv("./artifacts/fsel-ranking.csv", index_col=0)
top_300_fts = get_top_k_features(ranking, "rank_mean", 300)

In [36]:
# from _utils.WeightedCorr import WeightedCorr
from rdsutils.feature_selection import mrmr, WeightedCorr as wc

list_features = top_300_fts 
n_features = 20
weight_col = "weight"

In [45]:
params

{'objective': 'binary',
 'metric': 'auc',
 'boosting': 'gbdt',
 'max_depth': 6,
 'learning_rate': 0.05,
 'min_data_in_leaf': [300],
 'verbosity': 0,
 'seed': 157,
 'n_jobs': 30,
 'n_estimators': 800}

In [None]:
%%time
import shap
import lightgbm as lgb

# # woe dict
woe = WOE_Transform(method = 'tree',min_iv = 0.01)
woe.fit(df[list_features], df[target].astype(int), Y_weight=df[weight], display=-1)
df_iv = woe.get_iv()
woe_dict = woe.woe_dict()

iv_tbl = df_iv[df_iv.attr.isin(list_features)][['attr','iv']].set_index('attr')
iv_series = iv_tbl.iv

In [49]:
%%time
# shap
params = {'objective': 'binary',
 'metric': 'auc',
 'boosting': 'gbdt',
 'max_depth': 6,
 'learning_rate': 0.05,
 'min_data_in_leaf': [300],
 'verbosity': -1,
 'seed': 157,
 'n_jobs': 30,
 'n_estimators': 300}

lgbm = lgb.LGBMClassifier(**params)

trainer = LGBMTrainer()
trainer.train(lgbm, 
              df,
              features = list_features,
              target_col = target,
              sample_weight = df[weight]
             )
explainer = shap.TreeExplainer(lgbm)
shap_values = explainer.shap_values(df[list_features])



In [50]:
# shap with mc
def get_monotone_dir(woe_dict):
    result = {}
    for k in woe_dict:
        tbl = woe_dict[k]
        if len(tbl) < 2:
            print(k, len(tbl))
        elif tbl.iloc[0]["woe"] < tbl.iloc[1]["woe"]:
            direction = 1
        else:
            direction = -1
        
        result[k] = direction
    return result

monotone_dict = get_monotone_dir(woe_dict)
mc = [monotone_dict[ft] for ft in list_features]

params_mc = {'objective': 'binary',
 'metric': 'auc',
 'boosting': 'gbdt',
 'max_depth': 6,
 'learning_rate': 0.05,
 'min_data_in_leaf': [300],
 'verbosity': -1,
 'seed': 157,
 'n_jobs': 30,
 'n_estimators': 300,
 'monotone_constraints': mc}

lgbm_mc = lgb.LGBMClassifier(**params_mc)

trainer_mc = LGBMTrainer()
trainer_mc.train(lgbm_mc, 
              df,
              features = list_features,
              target_col = target,
              sample_weight = df[weight]
             )
explainer_mc = shap.TreeExplainer(lgbm_mc)
shap_values_mc = explainer_mc.shap_values(df[list_features])



In [65]:
def get_feature_shap_abs(shap_values, columns):
    result = pd.DataFrame(shap_values, columns=columns)\
                    .apply(lambda x: np.abs(x).mean(), axis=0)\
                    .sort_values(ascending=False)
    return result

shap_features = get_feature_shap_abs(shap_values[1], list_features)
shap_features_mc = get_feature_shap_abs(shap_values_mc[1], list_features)

In [102]:
fts_ = list_features
shap_fn = lambda X,y,w: shap_features.loc[fts_]
shap_mc_fn = lambda X,y,w: shap_features_mc.loc[fts_]
iv_fn = lambda X,y,w: iv_series.loc[fts_]

In [83]:
ranking = pd.DataFrame(index = list_features)

In [104]:
%%time
# weighted
mrmr_features_ = mrmr.mrmr_classif(df[fts_], 
                                      df[target],
                                      weights=df["weight"],
                                      relevance=shap_mc_fn,
                                      denominator="mean",
                                      K=n_features)
ranking["mrmr_shapcq_mc_w"] = fs.get_mrmr_rankings(mrmr_features_, list_features, ranking)

mrmr_features_ = mrmr.mrmr_classif(df[fts_], 
                                      df[target],
                                      weights=df["weight"],
                                      relevance=shap_fn,
                                      denominator="mean",
                                      K=n_features)
ranking["mrmr_shapcq_w"] = fs.get_mrmr_rankings(mrmr_features_, list_features, ranking)

mrmr_features_ = mrmr.mrmr_classif(df[fts_], 
                                      df[target],
                                      weights=df["weight"],
                                      relevance=iv_fn,
                                      redundancy="c",
                                      denominator="mean",
                                      K=n_features)
ranking["mrmr_ivcq_w"] = fs.get_mrmr_rankings(mrmr_features_, list_features, ranking)

100%|██████████| 20/20 [00:26<00:00,  1.31s/it]

CPU times: user 14.8 s, sys: 10.5 s, total: 25.3 s
Wall time: 26.7 s





In [105]:
%%time
# weighted max
mrmr_features_ = mrmr.mrmr_classif(df[fts_], 
                                      df[target],
                                      weights=df["weight"],
                                      relevance=shap_mc_fn,
                                      denominator="max",
                                      K=n_features)
ranking["mrmr_shapcxq_mc_w"] = fs.get_mrmr_rankings(mrmr_features_, list_features, ranking)

mrmr_features_ = mrmr.mrmr_classif(df[fts_], 
                                      df[target],
                                      weights=df["weight"],
                                      relevance=shap_fn,
                                      denominator="max",
                                      K=n_features)
ranking["mrmr_shapcxq_w"] = fs.get_mrmr_rankings(mrmr_features_, list_features, ranking)

mrmr_features_ = mrmr.mrmr_classif(df[fts_], 
                                      df[target],
                                      weights=df["weight"],
                                      relevance=iv_fn,
                                      redundancy="c",
                                      denominator="max",
                                      K=n_features)
ranking["mrmr_ivcxq_w"] = fs.get_mrmr_rankings(mrmr_features_, list_features, ranking)

100%|██████████| 20/20 [00:26<00:00,  1.31s/it]

CPU times: user 14.5 s, sys: 10.7 s, total: 25.2 s
Wall time: 26.6 s





In [106]:
%%time
# un-weighted
mrmr_features_ = mrmr.mrmr_classif(df[fts_], 
                                      df[target],
                                      relevance=shap_mc_fn,
                                      denominator="mean",
                                      K=n_features)
ranking["mrmr_shapcq_mc"] = fs.get_mrmr_rankings(mrmr_features_, list_features, ranking)

mrmr_features_ = mrmr.mrmr_classif(df[fts_], 
                                      df[target],
                                      relevance=shap_fn,
                                      denominator="mean",
                                      K=n_features)
ranking["mrmr_shapcq"] = fs.get_mrmr_rankings(mrmr_features_, list_features, ranking)

mrmr_features_ = mrmr.mrmr_classif(df[fts_], 
                                      df[target],
                                      relevance=iv_fn,
                                      redundancy="c",
                                      denominator="mean",
                                      K=n_features)
ranking["mrmr_ivcq"] = fs.get_mrmr_rankings(mrmr_features_, list_features, ranking)

100%|██████████| 20/20 [00:22<00:00,  1.10s/it]

CPU times: user 12.3 s, sys: 9.05 s, total: 21.3 s
Wall time: 22.5 s





In [107]:
ranking.shape

(300, 10)

In [108]:
ranking["mean_ranking"] = ranking.mean(axis=1)
ranking.head()

Unnamed: 0,mrmr_shapcq_mc_w,mrmr_shapcq_w,mean_ranking,mrmr_shapcxq_mc_w,mrmr_shapcxq_w,mrmr_shapcq_mc,mrmr_shapcq,mrmr_ivcq_w,mrmr_ivcxq_w,mrmr_ivcq
p13_bcc5520,1,1,1.0,1,1,1,1,1,1,1
t11_tmti04q4,300,300,300.0,300,300,300,300,300,300,300
p13_bcc5421,300,300,300.0,300,300,300,300,300,300,300
p13_bcc7518,300,300,300.0,300,300,300,300,300,300,300
p13_reh5420,300,300,300.0,300,300,300,300,300,300,300


In [109]:
ranking_ = ranking[ranking.mean_ranking < 300]

In [112]:
ranking_[sorted(ranking_.columns)].sort_values("mean_ranking").head(10)

Unnamed: 0,mean_ranking,mrmr_ivcq,mrmr_ivcq_w,mrmr_ivcxq_w,mrmr_shapcq,mrmr_shapcq_mc,mrmr_shapcq_mc_w,mrmr_shapcq_w,mrmr_shapcxq_mc_w,mrmr_shapcxq_w
p13_bcc5520,1.0,1,1,1,1,1,1,1,1,1
t11_taua4216,36.114286,20,300,12,5,4,4,4,4,4
t11_tiln4755,39.442857,300,16,16,6,12,5,5,20,6
t11_trev0722,66.821429,300,18,300,7,6,7,8,7,8
p13_mta5400,76.907143,10,13,300,17,17,19,18,300,17
t11_tiln0722,91.742857,300,300,300,4,3,2,2,2,2
t11_taua0726,94.778571,300,300,300,9,7,6,7,5,7
t11_tall3205,95.414286,300,300,300,19,16,3,3,3,3
t11_tstu05q2,95.842857,300,300,300,10,5,8,10,6,11
p13_iqf9510,97.485714,300,300,300,12,9,13,11,9,10


In [230]:
nr_to_select = 10
nr_to_consider = 5
fts_ = list_features[:nr_to_select]

fsel = FeatureSelector(df, data_dict=exp_dict)
rankings = fsel.run(fts_, target, weight, nr_to_consider, nr_to_select,
                    output_dir="./artifacts/dev1_fsel_1", filter_by_logic_expn=True)

target_col: target_v1
weight_col: weight
Preprocessing... generating iv and shaps
prepping woe...
processed  10  num attributes

prepping lgbm shap
prepping lgbm mc shap


100%|██████████| 9/9 [00:01<00:00,  8.52it/s]


filtering features by logic - experian
dropping 0 features : kept 10 features
    reason:  not AA
0 features with greater than                 0.95 missing values
dropping 0 features : kept 10 features
    reason:  too many missing
dropping 0 features : kept 10 features
    reason:  low_iv
running many to few


100%|██████████| 5/5 [00:13<00:00,  2.80s/it]
100%|██████████| 5/5 [00:13<00:00,  2.79s/it]
100%|██████████| 5/5 [00:13<00:00,  2.78s/it]
100%|██████████| 9/9 [00:00<00:00, 11.55it/s]


saving ranking.csv
running fsel on few


100%|██████████| 9/9 [00:01<00:00,  8.28it/s]

saving ranking.csv





In [166]:
from _utils.feature_selection.feature_selection import FeatureSelector

In [196]:
%%time

nr_to_select = 10
fts_ = list_features[:nr_to_select]
fsel = FeatureSelector(df)
fsel.preprocess(fts_, target, weight)
fsel.save_state_dict("./artifacts/dev1_fsel_1")

Preprocessing... generating iv and shaps
prepping woe...
processed  10  num attributes

prepping lgbm shap
prepping lgbm mc shap


100%|██████████| 9/9 [00:01<00:00,  7.00it/s]

CPU times: user 9min 48s, sys: 143 ms, total: 9min 48s
Wall time: 26.1 s





In [197]:
rankings = fsel.many_to_few(fts_, target, weight, nr_to_select)
fsel.save_state_dict("./artifacts/dev1_fsel_1")

100%|██████████| 10/10 [00:24<00:00,  2.49s/it]
100%|██████████| 10/10 [00:24<00:00,  2.49s/it]
100%|██████████| 10/10 [00:25<00:00,  2.52s/it]
100%|██████████| 9/9 [00:01<00:00,  7.07it/s]

saving ranking.csv





In [198]:
rankings_imp = fsel.get_rankings(True)
rankings_imp["mean"] = rankings_imp.mean(axis=1)
rankings_imp.sort_values("mean", inplace=True)
display(rankings_imp.head())

top_features = rankings_imp.index.to_list()# a subset of features

Unnamed: 0,mrmr_shapcq_mc,mrmr_shapcq,mrmr_ivcq,mean
p13_bcc5520,1,1,1,1.0
t11_tmti04q4,2,2,2,2.0
t11_tbca2322,3,3,3,3.0
p13_all7938,4,4,4,4.0
p13_bcc7518,5,5,5,5.0


In [199]:
%%time
rankings = fsel.fsel_on_few(top_features, target, weight)

CPU times: user 56.7 s, sys: 69.2 ms, total: 56.8 s
Wall time: 3.17 s


In [200]:
fsel.save_state_dict("./artifacts/dev1_fsel_1")

100%|██████████| 9/9 [00:01<00:00,  7.12it/s]

saving ranking.csv





In [201]:
rankings.head()

Unnamed: 0,mrmr_shapcq_mc,mrmr_shapcq,mrmr_ivcq,lgbm_shap_10,lgbm_shap_mc_10
p13_bcc5520,1,1,1,2,6
t11_tmti04q4,2,2,2,4,4
p13_bcc5421,10,8,9,5,5
p13_bcc7518,5,5,5,8,9
p13_reh5420,9,10,7,6,7


In [202]:
rankings

Unnamed: 0,mrmr_shapcq_mc,mrmr_shapcq,mrmr_ivcq,lgbm_shap_10,lgbm_shap_mc_10
p13_bcc5520,1,1,1,2,6
t11_tmti04q4,2,2,2,4,4
p13_bcc5421,10,8,9,5,5
p13_bcc7518,5,5,5,8,9
p13_reh5420,9,10,7,6,7
t11_tbca2322,3,3,3,9,10
p13_all7938,4,4,4,10,8
p13_all4520,7,7,10,7,1
p13_all5321,6,6,6,1,3
t11_tbca2526,8,9,8,3,2


In [212]:
from _utils.feature_selection.feature_selection import get_feature_by_lgbm_importance

lgbm = lgb.LGBMClassifier(**params)
trainer = LGBMTrainer()
trainer.train(lgbm, 
              df,
              features=fts_,
              target_col=target,
              sample_weight=df[weight]
             )
corr_matrix = wc.WeightedCorr(df=df[fts_+[weight]], 
                           wcol=weight)(method='pearson')
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
features = get_feature_by_lgbm_importance(lgbm)

In [148]:
# select top features and then 

# run lgbm and then remove corr?
len(fts_)

300