## Feature Analysis
---

In [1]:
import sys, os, json
sys.path.insert(1, "../..")

import pickle as pkl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.utils.governance import get_feature_by_importance

#### rank features by importance

In [2]:
# Notebook configuration; the "meta" section holds the data-dictionary path.
with open("config.json", "r") as f:
    config = json.load(f)

# Data dictionary, later passed to get_feature_descriptions as `data_dict`.
exp_dict = pd.read_csv(config["meta"]["exp_dict_path"])

# Pickled benchmark models, keyed by the variable name later cells refer to.
MODEL_PATHS = {
    "lgbm_v1": "./artifacts/models/lightgbm-combined_v1_benchmark_mc.pkl",
    "lgbm_v2": "./artifacts/models/lightgbm-combined_v2_benchmark_mc.pkl",
    "cs_v1": "./artifacts/models/uw_combined_target1_lgbm_mc_benchmark.pkl",
    "cs_v2": "./artifacts/models/uw_combined_target2_lgbm_mc_benchmark.pkl",
}


def load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # SECURITY: unpickling executes code embedded in the file; only load
    # artifacts produced by a trusted pipeline.
    # NOTE(review): the recorded cell output shows a scikit-learn
    # version-mismatch warning on unpickling; confirm the artifacts are
    # compatible with the installed version.
    with open(path, "rb") as fh:
        return pkl.load(fh)


models = {name: load_pickle(path) for name, path in MODEL_PATHS.items()}
lgbm_v1 = models["lgbm_v1"]
lgbm_v2 = models["lgbm_v2"]
cs_v1 = models["cs_v1"]
cs_v2 = models["cs_v2"]

# Feature names per model as returned by the project helper
# (presumably ordered most-important first; verify in src.utils.governance).
feature_by_imp_v1 = get_feature_by_importance(lgbm_v1)
feature_by_imp_v2 = get_feature_by_importance(lgbm_v2)
feature_by_imp_cs_v1 = get_feature_by_importance(cs_v1)
feature_by_imp_cs_v2 = get_feature_by_importance(cs_v2)

Trying to unpickle estimator LabelEncoder from version 0.24.1 when using version 1.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


#### get feature dictionary - ordered by importance rank

In [3]:
def get_feature_descriptions(features, data_dict, lgbm):
    """Look up data-dictionary metadata for ``features``, preserving their order.

    Parameters
    ----------
    features : list of str
        Feature names to describe, in the order the result should follow.
    data_dict : pandas.DataFrame
        Data dictionary with at least the columns ``field_name``,
        ``description``, ``long description``, ``attr_grp``, ``categorical``
        and ``type``.
    lgbm : object
        Model exposing ``feature_name_``; only data-dictionary rows for these
        names are considered.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``features``, indexed by feature name. Features
        missing from ``data_dict`` are kept as rows of NaN.

    Raises
    ------
    KeyError
        If a required column is missing from ``data_dict``, or a feature is
        listed in ``data_dict`` but not in ``lgbm.feature_name_``.
    """
    dd_cols = ["field_name", "description", "long description", "attr_grp", "categorical", "type"]

    # Filter and index in one non-mutating chain instead of an in-place
    # set_index on a filtered slice.
    in_model = data_dict["field_name"].isin(lgbm.feature_name_)
    dd = data_dict.loc[in_model, dd_cols].set_index("field_name")

    # Split the requested features into those the data dictionary knows about
    # and the remaining (custom) ones.
    dd_fields = set(data_dict["field_name"].unique())
    bureau_fts = [f for f in features if f in dd_fields]
    custom_ft = [f for f in features if f not in dd_fields]

    bureau_dd = dd.loc[bureau_fts]

    # Decide on the requested features, not the model's total feature count:
    # when nothing is custom there is nothing to append.
    if not custom_ft:
        return bureau_dd

    # Custom features get all-NaN rows; re-select to restore the caller's order.
    custom_dd = pd.DataFrame(index=custom_ft)
    return pd.concat([bureau_dd, custom_dd]).loc[features]

In [4]:
def build_ranked_feature_dict(features, data_dict, model):
    """Describe ``features`` and annotate them with constraint direction and rank.

    Parameters
    ----------
    features : list of str
        Feature names in ranking order (position 0 receives rank 1).
    data_dict : pandas.DataFrame
        Data dictionary forwarded to ``get_feature_descriptions``.
    model : object
        Model exposing ``feature_name_`` and a ``"monotone_constraints"``
        entry in ``get_params()``, aligned position by position.

    Returns
    -------
    pandas.DataFrame
        The output of ``get_feature_descriptions`` with two extra columns:
        ``dir`` (the feature's monotone constraint) and ``rank`` (1-based
        position).
    """
    feature_dict = get_feature_descriptions(features, data_dict, model)
    # Pair each model feature with its monotone constraint by position.
    dir_map = dict(zip(model.feature_name_, model.get_params()["monotone_constraints"]))
    return feature_dict.assign(
        dir=[dir_map[name] for name in feature_dict.index.values],
        rank=range(1, len(feature_dict) + 1),
    )


feature_dict_v1 = build_ranked_feature_dict(feature_by_imp_v1, exp_dict, lgbm_v1)
feature_dict_v2 = build_ranked_feature_dict(feature_by_imp_v2, exp_dict, lgbm_v2)
cs_feature_dict_v1 = build_ranked_feature_dict(feature_by_imp_cs_v1, exp_dict, cs_v1)
cs_feature_dict_v2 = build_ranked_feature_dict(feature_by_imp_cs_v2, exp_dict, cs_v2)

In [5]:
# Preview the first rows of the v1 feature table: data-dictionary metadata
# plus the `dir` and `rank` columns added in the previous cell.
feature_dict_v1.head()

Unnamed: 0_level_0,description,long description,attr_grp,categorical,type,dir,rank
field_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
p13_iqf9540,IQF9540,Number of days since the most recent personal ...,Public Records/Inquiry,[9999],int,-1,1
p13_alj8120,ALJ8120,Number of months since the most recently opene...,Age/Recency,"[9999, 9998, 9996, 9994]",int,1,2
t11_tmti2752,,Total actual payment over the last 6 months on...,Payment magnitude,"[999999999, 999999998, 999999997, 999999996]",int,-1,3
p13_pil8132,PIL8132,Number of days since the most recently opened ...,Age/Recency,"[99999, 99998, 99996, 99994]",int,-1,4
t11_trev0722,,Difference between Q1 (m00-m03) total average ...,Payment variance,"[999999999, 999999998, 999999997, 999999996]",int,1,5


#### build aggregated ranking

In [6]:
# Stack the four per-model feature tables row-wise; a feature used by several
# models therefore appears once per model in `fd`.
fd = pd.concat(
    (feature_dict_v1, feature_dict_v2, cs_feature_dict_v1, cs_feature_dict_v2),
    axis="index",
)
fd.shape

(128, 7)

In [7]:
# Distinct feature names across the stacked per-model tables.
fts = fd.index.unique()

# Data-dictionary columns to attach to the aggregated ranking.
data_dict = exp_dict.loc[
    :, ["field_name", "long description", "attr_grp", "categorical", "type"]
]

In [8]:
# (output column name, per-model feature table) pairs, in display order.
rank_sources = (
    ("prescreen_combined_v1", feature_dict_v1),
    ("prescreen_combined_v2", feature_dict_v2),
    ("credit_scoring_combined_v1", cs_feature_dict_v1),
    ("credit_scoring_combined_v2", cs_feature_dict_v2),
)

# One row per distinct feature; each left merge adds that model's rank and
# leaves NaN where the model does not have the feature.
fd_present = pd.DataFrame(index=fts)
for rank_name, ranked_dict in rank_sources:
    fd_present = pd.merge(
        fd_present,
        ranked_dict["rank"].rename(rank_name),
        left_index=True,
        right_index=True,
        how="left",
    )

# Attach descriptive metadata by feature name.
# NOTE(review): this is a default (inner) merge, so features absent from
# `data_dict` are dropped from the aggregated table; confirm that is intended.
fd_present = pd.merge(fd_present, data_dict, left_index=True, right_on="field_name")

In [9]:
# Per-model rank columns; NaN means the model does not have the feature.
rank_cols = ["prescreen_combined_v1", "prescreen_combined_v2",
             "credit_scoring_combined_v1", "credit_scoring_combined_v2"]

# Summaries are computed over the rank columns only. Counting non-null values
# across every column would let missing metadata (e.g. a blank description)
# distort the frequency, and would also count the derived columns on re-run.
fd_present = (
    fd_present
    .assign(**{
        # Negated so that an ascending sort puts the most widely used features first.
        "-freq": -fd_present[rank_cols].notna().sum(axis=1),
        "mean": fd_present[rank_cols].mean(axis=1),
        "min": fd_present[rank_cols].min(axis=1),
    })
    # Most models first; ties broken by the best (lowest) rank in any model.
    .sort_values(["-freq", "min"])
)

cols = ["field_name", *rank_cols, "long description", "attr_grp",
        "categorical", "type"]
fd_present[cols].head()

Unnamed: 0,field_name,prescreen_combined_v1,prescreen_combined_v2,credit_scoring_combined_v1,credit_scoring_combined_v2,long description,attr_grp,categorical,type
13,p13_alj8120,2.0,3.0,4.0,3.0,Number of months since the most recently opene...,Age/Recency,"[9999, 9998, 9996, 9994]",int
3684,t11_tmti2752,3.0,10.0,3.0,5.0,Total actual payment over the last 6 months on...,Payment magnitude,"[999999999, 999999998, 999999997, 999999996]",int
4065,t11_trev0722,5.0,7.0,11.0,9.0,Difference between Q1 (m00-m03) total average ...,Payment variance,"[999999999, 999999998, 999999997, 999999996]",int
4005,t11_tiln2755,13.0,6.0,10.0,6.0,Ratio between total scheduled payment and tota...,Payment variance,"[9999, 9998, 9997, 9996]",int
3764,t11_tstu2752,11.0,13.0,19.0,7.0,Total actual payment over the last 6 months on...,Payment magnitude,"[999999999, 999999998, 999999997, 999999996]",int
