## Feature Selection
---

* use Boruta to select top 100 features overall
* select top 10 features from each attrbute group

In [1]:
import sys, os, json
import pandas as pd
import numpy as np
import lightgbm as lgb
import pickle as pkl
from tqdm import tqdm
from smart_open import open
import rdsutils
from rdsutils.feature_selection import FeatureSelector 

sys.path.insert(1, "..")
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 500)

import src
from src.utils.data_dictionary import ExperianDataDict

%load_ext autoreload
%autoreload 2

In [2]:
with open(os.path.join("../config.json"), "r") as f:
    config = json.load(f)

In [3]:
df_train = pd.read_parquet(config["data"]["df_train"])
df_valid = pd.read_parquet(config["data"]["df_valid"])
df_test = pd.read_parquet(config["data"]["df_test"])

In [4]:
dd = pd.read_csv(config["meta"]["data_dict_path"])

bureau_features = config["data_columns"]["bureau_features"]
meta_cols = config["data_columns"]["meta_cols"]
target_col = config["data_columns"]["target_cols"][0]

### Currently temporarily get data for Tabular Pipelines. 
---

For now, only select features used in PLGen3, and save them separately to cleaned data fields

Do the following processing later.

In [5]:
gen3_features = config["data_columns"]["gen3_features"]
meta_cols = config["data_columns"]["meta_cols"]
target_cols = config["data_columns"]["target_cols"]
cols = gen3_features + meta_cols + target_cols

In [31]:
# train_data = df_train[cols]
# valid_data = df_valid[cols]
# test_data = df_test[cols]

# # drop indeterminates
# train_data = train_data[train_data[target_col].between(0,1)]
# valid_data = valid_data[valid_data[target_col].between(0,1)]
# test_data = test_data[test_data[target_col].between(0,1)]

In [32]:
# s3_path = config["meta"]["data_dir"]
# if "data_processed" not in config:
#     config["data_processed"] = {}

# dpath = os.path.join(s3_path, "cleaned", "train_df.parquet")
# train_data.to_parquet(dpath)
# config["data_processed"]["train_df"] = dpath

# dpath = os.path.join(s3_path, "cleaned", "valid_df.parquet")
# valid_data.to_parquet(dpath)
# config["data_processed"]["valid_df"] = dpath

# dpath = os.path.join(s3_path, "cleaned", "test_df.parquet")
# test_data.to_parquet(dpath)
# config["data_processed"]["test_df"] = dpath

In [33]:
# with open(os.path.join("../config.json"), "w") as f:
#     json.dump(config, f, indent=4)
    
# with open(os.path.join(config["meta"]["data_dir"], "config.json"), "w") as f:
#     json.dump(config, f, indent=4)

### Initial Processing
---

Drop
* single unique value
* significant missing  > 50%? - majority as special value
* not AA able

In [6]:
fsel = FeatureSelector(df_train, 
                       label_cols=target_col,
                       feature_cols=bureau_features)

In [7]:
ft_one_unique = fsel.get_single_unique()

missing_rate_threshold = 0.8
ft_missing = fsel.get_missing(missing_rate_threshold)

ft_not_aaable = dd[dd["adverse actionable"] != "Y"][["field_name", "adverse actionable"]]
ft_not_aaable = ft_not_aaable[ft_not_aaable.isin(bureau_features)]
ft_not_aaable.columns = ["feature", "adverse_actionable"]
print(f"{len(ft_not_aaable)} features are not aa able")

82 features with a single unique value.
484 features with greater than                 0.8 missing values
703 features are not aa able


In [8]:
# remove those features

def remove_features(candidates, to_remove, reason=None):
    result = sorted(list(set(candidates) - set(to_remove)))
    ndroped = len(candidates) - len(result)
    print(f"dropping {ndroped} features : kept {len(result)} features")
    print(f"    reason:  {reason}")
    return result
    
candidates = remove_features(bureau_features, 
                             ft_one_unique.feature.values,
                             "single unique value")
candidates = remove_features(candidates, 
                             ft_missing.feature.values, 
                             f"missing > {missing_rate_threshold}")
candidates = remove_features(candidates, 
                             ft_not_aaable.feature.values,
                             "not aa able")

dropping 82 features : kept 2645 features
    reason:  single unique value
dropping 478 features : kept 2167 features
    reason:  missing > 0.8
dropping 134 features : kept 2033 features
    reason:  not aa able


In [9]:
len(candidates)

2033

### Over all
---

In [None]:
%%time
model = fsel.get_default_lgb_estimator('classification')
bimp = fsel.get_boruta_importance(model, features=candidates, 
                                  verbose=1, random_state=42)
fsel.record_boruta_importance.head()

In [None]:
os.makedirs("../artifact", exist_ok=True)
fsel.record_boruta_importance.to_csv("../artifact/boruta_importance.csv")

In [None]:
fsel.get_woe(method='equal', num_bin_start=10, min_iv=0.02, min_samples_leaf=100, display=0) 
fsel.get_iv(method='equal', num_bin_start=10, min_iv=0.02, min_samples_leaf=100, display=0)

In [None]:
len(fsel.woe.woe_dict())

In [None]:
fsel.save('../artifact/fsel.pkl') 

### By Attrbute types
---

In [12]:
dd.groupby('table_name').attr_grp.value_counts()

table_name       attr_grp                         
premier_1_3      Delinquent Trade Count               496
                 Balance Amount Payment               413
                 Age/Recency                          253
                 Trade Count                          219
                 Ratios/Percentage                    196
                 Public Records/Inquiry               149
                 Other Criteria Counts                132
                 Rank                                 105
                 Occurrence                            99
                 Satisfactory Trade Count              58
trended_3d       trended_3d                           609
trended_3d_v1_1  Quarterly average                    546
                 Payment variance                     318
                 Payment magnitude                    264
                 Balance variance                     245
                 Balance migration activity           104
                 Bala

In [22]:
dd_cand = dd[dd.field_name.isin(candidates)]
groups = dd_cand[["table_name", "attr_grp"]].copy().drop_duplicates()# .unique()

In [28]:
dd_cand.groupby('table_name').attr_grp.value_counts()

table_name   attr_grp                
premier_1_3  Delinquent Trade Count      437
             Balance Amount Payment      311
             Trade Count                 195
             Ratios/Percentage           179
             Age/Recency                 131
             Other Criteria Counts       122
             Public Records/Inquiry       98
             Rank                         84
             Occurrence                   77
             Satisfactory Trade Count     51
trended_3d   trended_3d                  348
Name: attr_grp, dtype: int64

In [49]:
from tqdm import tqdm

fsels = {}
os.makedirs("../artifact", exist_ok=True)

for _, tname, grp in tqdm(groups.to_records()):
    features_ = dd_cand[(dd_cand.table_name==tname) 
                        & (dd_cand.attr_grp==grp)].field_name.unique()
    
    fsel_ = FeatureSelector(df_train, 
                            label_cols=target_col,
                            feature_cols=features_)
    
    model = fsel_.get_default_lgb_estimator('classification')
    bimp = fsel_.get_boruta_importance(model, max_iter=20,
                                       verbose=1, random_state=42)
    
    tname = tname.replace("/", "_")
    grp = grp.replace("/", "_")
    fsel_.record_boruta_importance.to_csv(f"../artifact/boruta_importance_{tname}_{grp}.csv")
    
    fsels[(tname, grp)] = fsel_

  0%|          | 0/11 [00:00<?, ?it/s]

Running iteration number 20.9.

  9%|▉         | 1/11 [07:12<1:12:03, 432.38s/it]

Running iteration number 20.9.

 18%|█▊        | 2/11 [25:38<2:04:18, 828.72s/it]

Running iteration number 20.9.

 27%|██▋       | 3/11 [36:19<1:39:02, 742.76s/it]

Running iteration number 8.7..

 36%|███▋      | 4/11 [38:52<59:31, 510.23s/it]  

Running iteration number 20.9.

 45%|████▌     | 5/11 [43:10<41:55, 419.25s/it]

Running iteration number 20.9.

 55%|█████▍    | 6/11 [47:11<29:52, 358.52s/it]

Running iteration number 4.3..

 64%|██████▎   | 7/11 [48:18<17:33, 263.35s/it]

Running iteration number 20.9.

 73%|███████▎  | 8/11 [51:25<11:56, 238.81s/it]

Running iteration number 20.9.

 82%|████████▏ | 9/11 [1:01:25<11:43, 351.87s/it]

Running iteration number 20.9.

 91%|█████████ | 10/11 [1:05:25<05:17, 317.23s/it]

Running iteration number 20.9.

100%|██████████| 11/11 [1:26:12<00:00, 470.22s/it]

Completed iteration number 20.


