In [5]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif, chi2, f_classif
from sklearn.preprocessing import LabelEncoder
from feature_engine.selection import SelectByInformationValue
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
# import eli5

class FeatureSelector:
    # For Classification problem
    def __init__(self):
        self.mi = None
        self.fisher = None
        self.chi = None
        self.iv = None
        self.shap = None
        # self.eli5 = None

    def mutual_information(self, X, y):
        mi = mutual_info_classif(X, y)
        mi_series = pd.Series(mi, index=X.columns, name="Mutual Information")
        mi_series = mi_series.sort_values(ascending=False)
        return mi_series

    def chi_square(self, X, y):
        chi, _ = chi2(X, y)
        chi_series = pd.Series(chi, index=X.columns, name="Chi-Square")
        chi_series = chi_series.sort_values(ascending=False)
        return chi_series

    def fisher_score(self, X, y):
        F, _ = f_classif(X, y)
        fisher_score_series = pd.Series(F, index=X.columns, name="Fisher Score")
        fisher_score_series = fisher_score_series.sort_values(ascending=False)
        return fisher_score_series

    def information_value(self, X, y):
        iv = SelectByInformationValue()
        iv.fit(X, y)
        iv_series = pd.Series(iv.information_values_, index=X.columns, name="Information Value")
        iv_series = iv_series.sort_values(ascending=False)
        return iv_series

    def FI_with_shap(self, X, y):
        categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()
        train_pool = Pool(X, y, cat_features = categorical_features)
        if isinstance(y, np.ndarray) and np.issubdtype(y.dtype, np.number):
            estimator = CatBoostRegressor(iterations=500, max_depth=5, learning_rate=0.05, random_seed=1066, logging_level='Silent')
        else:
            estimator = CatBoostClassifier(iterations=500, max_depth=5, learning_rate=0.05, random_seed=1066, logging_level='Silent')
        model = estimator.fit(train_pool)   
        shap_series = pd.Series(model.get_feature_importance(train_pool,), X.columns)
        return shap_series
    
    
    # def FI_with_eli5(self, X, y):
    #     categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()
    #     train_pool = Pool(X, y, cat_features = categorical_features)
    #     if isinstance(y, np.ndarray) and np.issubdtype(y.dtype, np.number):
    #         estimator = CatBoostRegressor(iterations=500, max_depth=5, learning_rate=0.05, random_seed=1066, logging_level='Silent')
    #     else:
    #         estimator = CatBoostClassifier(iterations=500, max_depth=5, learning_rate=0.05, random_seed=1066, logging_level='Silent')
    #     model = estimator.fit(train_pool)
    #     Series = eli5.explain_weights_catboost(catb = model,
    #                  pool = train_pool,
    #                  )
    #     Series_ = eli5.formatters.as_dataframe.format_as_dataframe(Series)
    #     eli5_series = pd.Series(data=Series_['weight'].values, index=Series_['feature'])
    #     eli5_series.name = None
    #     eli5_series.index.name = None
    #     return eli5_series

    def fit(self, X, y):
        print("Calculating Fisher Score for numerical features...")
        # Separate numerical and categorical features
        numerical_features = X.select_dtypes(include=[np.number])
        categorical_features = X.select_dtypes(exclude=[np.number])

        if not numerical_features.empty:
            self.fisher = self.fisher_score(numerical_features, y)
        else:
            self.fisher = pd.Series([], name="Fisher Score")

        if not categorical_features.empty:
            print("Calculating Mutual Information and Chi-Square for categorical features...")
            # Encode categorical variables
            X_encoded = categorical_features.apply(LabelEncoder().fit_transform)
            self.mi = self.mutual_information(X_encoded, y)
            self.chi = self.chi_square(X_encoded, y)
        else:
            self.mi = pd.Series([], name="Mutual Information")
            self.chi = pd.Series([], name="Chi-Square")

        # Calculate Information Value
        self.iv = self.information_value(X, y)
        # Calculate Shap Value
        self.shap = self.FI_with_shap(X, y)
        # Calculate Shap Value
        # self.eli5 = self.FI_with_eli5(X, y)

        # Creating a DataFrame to compile all results
        results = pd.DataFrame(index=X.columns)
        results['Fisher Score'] = self.fisher
        results['Mutual Information'] = self.mi
        results['Chi-Square'] = self.chi
        results['Information Value'] = self.iv
        results['Shap Value'] = self.shap
        results['Eli5 Value'] = self.eli5


        # Ranking features based on each method
        results['Fisher Rank'] = results['Fisher Score'].rank(ascending=False, method='min')
        results['MI Rank'] = results['Mutual Information'].rank(ascending=False, method='min')
        results['Chi Rank'] = results['Chi-Square'].rank(ascending=False, method='min')
        results['IV Rank'] = results['Information Value'].rank(ascending=False, method='min')
        results['Shap Rank'] = results['Shap Value'].rank(ascending=False, method='min')
        # results['Eli5 Rank'] = results['Eli5 Value'].rank(ascending=False, method='min')

        # Aggregating the ranks to get a combined rank
        results['Average Rank'] = results[['Fisher Rank', 'MI Rank', 'Chi Rank', 'IV Rank', 'Shap Rank', 
                                           # 'Eli5 Rank'
                                          ]].mean(axis=1)
        results = results.sort_values('Average Rank')
        return results

In [7]:
import numpy as np
import pandas as pd

# Example dataset
data = pd.DataFrame({
    'feature1': np.random.choice(['A', 'B', 'C'], 100),
    'feature2': np.random.choice(['X', 'Y', 'Z'], 100),
    'feature3': np.random.rand(100),
    'feature4': np.random.rand(100),
    'target': np.random.randint(0, 2, 100)
})

# Convert categorical features to category dtype
data['feature1'] = data['feature1'].astype('category')
data['feature2'] = data['feature2'].astype('category')

X = data.drop(columns=['target'])
y = data['target']

# Initialize and fit the feature selector
selector = FeatureSelector()
ranking_df = selector.fit(X, y)
ranking_df

Calculating Fisher Score for numerical features...
Calculating Mutual Information and Chi-Square for categorical features...
Unexpected exception formatting exception. Falling back to standard exception


  pos = y.groupby(X[variable]).sum() / total_pos
  neg = inverse_y.groupby(X[variable]).sum() / total_neg
Traceback (most recent call last):
  File "C:\Users\pasul\.conda\envs\myenv2\lib\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\pasul\AppData\Local\Temp\ipykernel_12024\383751245.py", line 22, in <module>
    ranking_df = selector.fit(X, y)
  File "C:\Users\pasul\AppData\Local\Temp\ipykernel_12024\3038067178.py", line 97, in fit
    self.shap = self.FI_with_shap(X, y)
  File "C:\Users\pasul\AppData\Local\Temp\ipykernel_12024\3038067178.py", line 46, in FI_with_shap
    train_pool = Pool(X, y, cat_features = categorical_features)
  File "C:\Users\pasul\.conda\envs\myenv2\lib\site-packages\catboost\core.py", line 790, in __init__
    self._init(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  File "C:\Users\pasul\.conda\envs\myenv2\lib

In [None]:
# eli5.formatters.as_dataframe.format_as_dataframe()
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
import catboost
estimator = CatBoostClassifier(iterations=500, max_depth=5, learning_rate=0.05, random_seed=1066, logging_level='Silent')
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()
train_pool = Pool(X, y, cat_features = categorical_features)

estimator.fit(train_pool)

shap_values = estimator.get_feature_importance(
                        data=train_pool,
                       reference_data=None,
                       type=catboost.EFstrType.ShapValues,
                       prettified=True,
                       thread_count=-1,
                       verbose=False,
                       )

shap_values,  np.abs(shap_values).mean(0)

In [None]:
X

In [None]:
from sklearn.inspection import permutation_importance
perm_importance = permutation_importance(estimator, X, y, n_repeats=10, random_state=1066)
sorted_idx = perm_importance.importances_mean.argsort()
sorted_idx

In [None]:
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12, 6))
plt.barh(range(len(sorted_idx)), perm_importance.importances_mean[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), np.array(X.columns)[sorted_idx])
plt.title('Permutation Importance')

In [6]:
pip uninstall shap

Note: you may need to restart the kernel to use updated packages.




In [None]:
import shap
estimator = CatBoostClassifier(iterations=500, max_depth=5, learning_rate=0.05, random_seed=1066, logging_level='Silent')
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()
train_pool = Pool(X, y, cat_features = categorical_features)
estimator.fit(train_pool)
explainer = shap.Explainer(estimator)
shap_values = explainer(X)
shap_importance = shap_values.abs.mean(0).values
sorted_idx = shap_importance.argsort()
fig = plt.figure(figsize=(12, 6))
plt.barh(range(len(sorted_idx)), shap_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), np.array(X.columns)[sorted_idx])
plt.title('SHAP Importance')

ModuleNotFoundError: No module named 'shap'

In [None]:
import shap
import numpy as np
from sklearn.ensemble import RandomForestRegressor
shap.TreeExplainer(RandomForestRegressor(max_depth=4, n_estimators=10).fit(
    np.random.normal(size=(30, 6)), np.random.normal(size=(30,))))

In [4]:
pip install numpy==1.19.3 shap==0.36

Collecting numpy==1.19.3
  Downloading numpy-1.19.3-cp38-cp38-win_amd64.whl (13.3 MB)
Collecting shap==0.36
  Downloading shap-0.36.0-cp38-cp38-win_amd64.whl (370 kB)
Collecting numba
  Using cached numba-0.58.1-cp38-cp38-win_amd64.whl (2.6 MB)
Collecting slicer
  Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Collecting numba
  Downloading numba-0.58.0-cp38-cp38-win_amd64.whl (2.6 MB)
  Downloading numba-0.57.1-cp38-cp38-win_amd64.whl (2.6 MB)
Collecting importlib-metadata
  Using cached importlib_metadata-8.0.0-py3-none-any.whl (24 kB)
Collecting numba
  Downloading numba-0.57.0-cp38-cp38-win_amd64.whl (2.6 MB)
  Downloading numba-0.56.4-cp38-cp38-win_amd64.whl (2.5 MB)
Collecting llvmlite<0.40,>=0.39.0dev0
Note: you may need to restart the kernel to use updated packages.  Downloading llvmlite-0.39.1-cp38-cp38-win_amd64.whl (23.2 MB)



ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'c:\\programdata\\miniconda3\\envs\\myenv\\lib\\site-packages\\numpy-1.21.5.dist-info\\direct_url.json'
Consider using the `--user` option or check the permissions.



Installing collected packages: numpy, llvmlite, importlib-metadata, slicer, numba, shap
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.5
    Uninstalling numpy-1.21.5:


In [None]:
pip uninstall pyzmq

In [None]:
pip install pyzmq==19.0.2