In [1]:
import sys
sys.path.append("../code")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint
from pathlib import Path

from tqdm.notebook import tqdm

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, log_loss, mean_squared_error
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from IPython.display import display


# Plot styling: seaborn "darkgrid" style with the "paper" context, fonts scaled 1.5x.
sns.set_style("darkgrid")
sns.set_context("paper", font_scale=1.5)

# Let pandas show up to 100 columns and 100 rows when displaying a frame.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

### target encoding


In [None]:
n_split = 10
target_col = "municipalities_name_category"
fold_seed = 42  # fixed so the fold assignment (and thus the encoding) is reproducible

kfold = KFold(n_splits=n_split, shuffle=True, random_state=fold_seed)

mean_col = f"target_enc_of_{target_col}_{n_split}folds"
train[mean_col] = np.nan
mean_col_pos = train.columns.get_loc(mean_col)

# Out-of-fold target encoding: each row is encoded with the mean of `label`
# computed on the other folds only, so its own target value is never used.
for desc_idx, target_idx in kfold.split(train):
    # Per-category mean of the label on the "describing" folds. The label column
    # is selected before aggregating so unrelated (possibly non-numeric) columns
    # are never averaged.
    encoder_map = train.iloc[desc_idx].groupby(target_col)[label].mean().to_dict()

    # kfold.split yields positional indices, so read and write positionally;
    # this also works when train's index is not unique.
    # Categories absent from the describing folds stay NaN.
    fold_encoded = train[target_col].iloc[target_idx].map(encoder_map)
    train.iloc[target_idx, mean_col_pos] = fold_encoded.astype(float).to_numpy()

### null importance


In [2]:
def get_null_importace(x:pd.DataFrame, y:pd.DataFrame
                       , tree_model, params:dict, shuffule:bool=False):
    """Train a throw-away model and return its feature importance.

    With ``shuffule=True`` the target is randomly permuted before training,
    so the returned importance is one sample of the "null" importance
    distribution. With ``shuffule=False`` the real target is used.

    Parameters
    ----------
    x : pd.DataFrame
        Feature matrix, passed to the model as ``tr_x``.
    y : pd.DataFrame
        Target values, passed to the model as ``tr_y``. The caller's object
        is never modified.
    tree_model : callable
        Model factory called as ``tree_model(name=..., params=...)``. The
        returned object must provide ``train(tr_x=..., tr_y=...)`` and
        ``feature_importance()``.
    params : dict
        Model parameters. A shallow copy is handed to the model so that the
        caller's dict is not altered by anything the model adds or removes.
    shuffule : bool, default False
        Whether to permute the target before training.

    Returns
    -------
    Whatever ``tree_model(...).feature_importance()`` returns.
    """
    if shuffule:
        # np.random.permutation returns a shuffled copy (as an ndarray) and
        # leaves y untouched; it draws from NumPy's global RNG.
        target = np.random.permutation(y)
    else:
        target = y.copy()

    # Give the model its own dict so repeated calls with a shared params
    # object cannot influence each other.
    learning_param = params.copy()

    # Build the model
    tree_clf = tree_model(
        name="temp"
        , params=learning_param
    )

    tree_clf.train(
        tr_x=x
        , tr_y=target
    )

    return tree_clf.feature_importance()

In [None]:
n_trials = 1000
null_imp_seed = 42  # fixed so the shuffled-target runs are reproducible

# get_null_importace shuffles the target with np.random.permutation, which
# draws from NumPy's global RNG; seed it so a re-run gives the same shuffles.
np.random.seed(null_imp_seed)

# Reference importance: model trained on the real (unshuffled) target.
base_fi = get_null_importace(train[feature_columns], y=train[label]
                             , tree_model=ModelXgb, params=XGB_PARAMS, shuffule=False)

base_imp = base_fi.copy()
base_imp.columns = ["feature_name", "base"]

# One single-column frame per run, keyed by feature name. They are joined once
# after the loop instead of merging into a growing frame on every iteration.
importance_frames = [base_imp.set_index("feature_name")]

for idx in tqdm(range(n_trials)):
    null_fi = get_null_importace(train[feature_columns], y=train[label]
                             , tree_model=ModelXgb, params=XGB_PARAMS, shuffule=True)

    null_fi.columns = ["feature_name", f"null_imp_{idx}"]

    importance_frames.append(null_fi.set_index("feature_name"))

# Inner join on feature name: a feature is kept only if every run reported it,
# which matches the default behaviour of the repeated merge this replaces.
# Assumes feature names are unique within each importance table.
null_imp_result = pd.concat(importance_frames, axis=1, join="inner").reset_index()

In [None]:
th = 0.8
n_features = len(null_imp_result)
null_importance_columns = [f"null_imp_{idx}" for idx in range(n_trials)]

# Threshold per feature = th-quantile of its null-importance samples.
# The null columns are selected by name rather than by position so that
# re-running this cell cannot pull an already present "th_importance"
# column into the quantile.
null_imp_result["th_importance"] = null_imp_result[null_importance_columns].quantile(th, axis=1)

# Most important features (on the real target) first.
null_imp_result = null_imp_result.sort_values("base", ascending=False)

In [None]:
# Split features by whether their real importance reaches the null threshold.
reaches_threshold = null_imp_result["base"] >= null_imp_result["th_importance"]
below_threshold = null_imp_result["base"] < null_imp_result["th_importance"]
effective_columns = null_imp_result.loc[reaches_threshold, "feature_name"].values
not_effective_columns = null_imp_result.loc[below_threshold, "feature_name"].values

# Grid layout: at most 4 panels per row, one 6x6 inch panel per feature.
max_cols = 4
n_cols = max(1, min(n_features, max_cols))
n_rows = max(1, (n_features + max_cols - 1) // max_cols)  # ceiling division

# squeeze=False always returns a 2-D array of axes, so a single panel needs
# no special handling before flattening.
fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols
                        , figsize=(n_cols * 6, n_rows * 6), squeeze=False)
axs = axs.flatten()

for idx, ax in zip(range(n_features), axs):
    target_row = null_imp_result.iloc[idx]

    # A row taken from a mixed-type frame is object dtype; cast the
    # null-importance samples to float before binning.
    null_values = target_row[null_importance_columns].astype(float)
    hist_val = ax.hist(null_values, color="darkorange", label="null importance")[0]
    line_top = hist_val.max()

    # threshold (quantile of the null distribution)
    ax.vlines(target_row["th_importance"], 0, line_top, color="red", linestyle="--", label="threthold")
    # baseline (importance on the real target)
    ax.vlines(target_row["base"], 0, line_top, color="blue", label="base line")

    # labels
    ax.set_xlabel("Feature importance")
    ax.set_ylabel("Frequency")
    ax.set_title(target_row["feature_name"])

# Hide the panels of the last row that have no feature to show.
for unused_ax in axs[n_features:]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()
plt.close(fig)