In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.feature_selection import (
    SequentialFeatureSelector,
)
from mlxtend.evaluate import feature_importance_permutation
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.utils.estimator_checks import check_estimator
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
import matplotlib.ticker as ticker
import os
from pathlib import Path
import json

import sys

# Put the local "scripts" directory on the import path
# (presumably where the project's `utils` module lives — confirm).
sys.path.append("scripts")

import utils
import distclassipy as dcpy

# Distance object from distclassipy.
# NOTE(review): `cd` is not referenced in the cells visible here — confirm it is still needed.
cd = dcpy.Distance()

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


TypeError: unsupported operand type(s) for |: 'type' and '_CallableType'

In [None]:
# Read the shared JSON settings once; the rest of the notebook is configured from it.
with open("settings.txt") as settings_file:
    settings_dict = json.loads(settings_file.read())

# Seed NumPy's global RNG with the configured seed.
np.random.seed(settings_dict["seed_choice"])

# Select the classification problem handled by this notebook and look up
# its name and the classes it keeps.
classification_letter = "c"
classification_problem, classes_to_keep = (
    settings_dict[section][classification_letter]
    for section in ("classification_problem", "classes_to_keep")
)
results_subfolder = f"{classification_letter}. {classification_problem}"

# Apply the seaborn theme options stored in the settings.
sns_dict = settings_dict["sns_dict"]
sns.set_theme(**sns_dict)

In [None]:
#check_estimator(dcpy.DistanceMetricClassifier())  # passes

In [None]:
# Load the full feature table and label table; the first CSV column becomes the index.
X_df_FULL, y_df_FULL = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/X_df.csv", "data/y_df.csv")
)

In [None]:
# Drop the manually selected 'bad' features listed for this classification problem.
with open(Path("results", results_subfolder, "drop_features.txt")) as f:
    bad_features = json.load(f)

X_df_FULL = X_df_FULL.drop(columns=bad_features)

# Number of feature columns left after the drop.
print(X_df_FULL.shape[1])

In [None]:
# Display the list of features that was loaded from drop_features.txt and dropped.
bad_features

In [None]:
# Restrict labels and features to the classes of the current problem.
cl_keep_str = "_".join(classes_to_keep)

# Rows whose "class" label is one of the kept classes ...
y_df = y_df_FULL.loc[y_df_FULL["class"].isin(classes_to_keep)]
# ... and the feature rows sharing their index.
X_df = X_df_FULL.loc[y_df.index]

# Plain NumPy views for the estimators: feature matrix and flattened labels.
X, y = X_df.to_numpy(), y_df.to_numpy().ravel()

In [None]:
# Plotting only: how many features to show in the relative-importance plot.
feats_top_plot = 15

In [None]:
# Distance metric name passed to dcpy.DistanceMetricClassifier in the cells below.
metric = "canberra"

In [None]:
# Name for the chosen metric, as returned by the project's utils helper.
metric_str = utils.get_metric_name(metric)

# Feature importance via feature permutation:
# hold out a stratified 33% test split, seeded from the settings so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=settings_dict["seed_choice"],
)

---
## Option 1: Pass permutation feature importance

In [None]:
from sklearn.feature_selection import RFE

def custom_importance_getter(lcdc, X_eval=None, y_eval=None):
    """Permutation feature importance of a fitted classifier.

    Intended for use as the ``importance_getter`` callable of sklearn's
    ``RFE``, which calls it with the fitted estimator only.

    Parameters
    ----------
    lcdc : fitted estimator
        Must expose ``predict``. If it also exposes ``n_features_in_``,
        that width is checked against the evaluation data.
    X_eval, y_eval : array-like, optional
        Data the permutation importance is scored on. When omitted, the
        notebook-level ``X_test`` / ``y_test`` are used (original behaviour).

    Returns
    -------
    numpy.ndarray
        First value returned by mlxtend's ``feature_importance_permutation``
        (per its documentation, the mean importance of each feature over
        the permutation rounds).

    Raises
    ------
    ValueError
        If the estimator was fitted on a different number of features than
        the evaluation data has. RFE refits a clone of the estimator on the
        currently selected columns every round, so a fixed full-width
        evaluation set stops matching as soon as one feature is eliminated.
    """
    X_eval = X_test if X_eval is None else X_eval
    y_eval = y_test if y_eval is None else y_eval

    # Fail early with a readable diagnosis instead of an opaque error
    # raised from inside ``predict``.
    n_fitted = getattr(lcdc, "n_features_in_", None)
    n_eval = np.shape(X_eval)[1]
    if n_fitted is not None and n_fitted != n_eval:
        raise ValueError(
            f"custom_importance_getter: the estimator was fitted on {n_fitted} "
            f"features but the evaluation data has {n_eval} columns. RFE refits "
            "the estimator on the currently selected columns each round, so the "
            "evaluation data must be restricted to those same columns."
        )

    imp_vals, _ = feature_importance_permutation(
        predict_method=lcdc.predict,
        X=X_eval,
        y=y_eval,
        metric="accuracy",
        num_rounds=10,
        seed=settings_dict["seed_choice"],
    )

    return imp_vals

In [None]:
# Build the distance-metric classifier for the chosen metric.
# `scale` is left at its default here (the `scale=False` override is disabled).
lcdc = dcpy.DistanceMetricClassifier(metric=metric)
lcdc.fit(X_train, y_train)

# Last expression of the cell: show the fitted classifier's feature_importances_.
lcdc.feature_importances_

In [None]:


# Recursive feature elimination down to 15 features, removing one per round.
# RFE's default importance lookup is used (the custom importance_getter is disabled here).
rfe = RFE(
    estimator=lcdc,
    n_features_to_select=15,
    step=1,
    verbose=1,
)
rfe.fit(X_train, y_train)

# Rank assigned to every feature by the elimination.
ranking = rfe.ranking_
print("Ranking of features:", ranking)

In [None]:
# Option 1 attempt: rank all features with RFE driven by permutation importance.
lcdc = dcpy.DistanceMetricClassifier(metric=metric, scale=True)
lcdc.fit(X_train, y_train)

# Eliminate one feature per round until a single feature is left.
# NOTE(review): custom_importance_getter predicts on the full-width X_test, while
# RFE refits the estimator on fewer columns each round — likely the cause of the
# failure recorded in the note below this cell; confirm.
rfe = RFE(
    estimator=lcdc,
    n_features_to_select=1,
    step=1,
    importance_getter=custom_importance_getter,
    verbose=1,
)
rfe.fit(X_train, y_train)

ranking = rfe.ranking_
print("Ranking of features:", ranking)

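###  ❌❌❌ : Did not work - the first round, with all 31 features, runs fine; sklearn then automatically drops one feature, but the model doesn't know that this feature has been dropped, which leads to an error. Fixing this will require some deep diving into the distclassipy source code. (A likely culprit worth checking first: `custom_importance_getter` always predicts on the full-width `X_test`, whereas RFE refits the estimator on the reduced feature set.)
Note that `check_estimator(lcdc)` still passes, so as far as sklearn is concerned, `lcdc` is still a valid classifier.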

---
## Option 2: Pass an equal feature importance for all 31

In [None]:
def constant_importance_getter(lcdc):
    """Return an equal importance of 1.0 for every feature of the fitted estimator.

    Intended as an ``importance_getter`` callable for sklearn's ``RFE``.
    RFE refits the estimator on the currently selected columns each round
    and expects one importance value per *fitted* feature, so the length
    follows ``lcdc.n_features_in_`` rather than the full feature matrix.

    Parameters
    ----------
    lcdc : fitted estimator
        Estimator handed over by RFE.

    Returns
    -------
    numpy.ndarray
        1-D array of ones with one entry per fitted feature.
    """
    n_features = getattr(lcdc, "n_features_in_", None)
    if n_features is None:
        # Estimator does not report its fitted width: keep the original
        # behaviour and use the width of the notebook-level matrix X.
        n_features = X.shape[1]
    return np.ones(n_features)

In [None]:
# Option 2: rank features with RFE while giving every feature the same importance.
lcdc = dcpy.DistanceMetricClassifier(
    metric=metric,
    scale=False,
)

lcdc.fit(X_train, y_train)

# Use the constant getter this section is about. Passing custom_importance_getter
# here would just repeat the Option 1 experiment.
rfe = RFE(
    estimator=lcdc,
    n_features_to_select=1,
    step=1,
    importance_getter=constant_importance_getter,
    verbose=1,
)

rfe.fit(X_train, y_train)

ranking = rfe.ranking_
print("Ranking of features:", ranking)

###  ❌❌❌ : Same problem