In [2]:
#!pip install threadpoolctl==3.1.0

In [3]:
#!pip install mlxtend

In [4]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.feature_selection import (
    SequentialFeatureSelector,
)
from mlxtend.evaluate import feature_importance_permutation
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.utils.estimator_checks import check_estimator
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
import matplotlib.ticker as ticker
import os
from pathlib import Path
import json

import sys

# Make the modules under ./scripts importable. The guard keeps re-runs of this
# cell from appending the same entry to sys.path over and over.
if "scripts" not in sys.path:
    sys.path.append("scripts")

import utils  # presumably resolved through the "scripts" path entry above
import distclassipy as dcpy

# Shared Distance instance; its bound methods (e.g. cd.clark) are passed as
# callable metrics further down in the notebook.
cd = dcpy.Distance()

In [5]:
import warnings

warnings.filterwarnings('ignore')
# Ignore the specific RuntimeWarning

#warnings.filterwarnings(action='ignore', category=FutureWarning)
#warnings.filterwarnings(action='ignore', category=UserWarning)

In [6]:
# Load the full feature table and the full label table. In both files the
# first CSV column is used as the frame index.
X_df_FULL, y_df_FULL = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/X_df.csv", "data/y_df.csv")
)

In [7]:
# Number of rows per value of the "class" column in the full label table.
y_df_FULL.loc[:, "class"].value_counts()

class
SR       558
DSCT     558
CEP      558
EW       558
RR       558
RSCVN    558
BYDra    558
RRc      558
Mira     558
EA       558
Name: count, dtype: int64

In [12]:
# Remove manually selected 'bad' features.
# NOTE(review): `results_subfolder` is assigned in the settings cell that sits
# further down in this notebook, so on a fresh kernel that cell has to be run
# before this one (otherwise the next line raises NameError).
with open(os.path.join("results", results_subfolder, "drop_features.txt")) as f:
    bad_features = json.load(f)  # manually selected column labels

# Drop only the columns that are still present. The frame is reassigned under
# the same name, so an unconditional drop raises KeyError when this cell is
# run a second time.
features_to_drop = [feat for feat in bad_features if feat in X_df_FULL.columns]
X_df_FULL = X_df_FULL.drop(columns=features_to_drop)

# Number of feature columns left after the drop.
print(X_df_FULL.shape[1])

31


In [9]:
# Plotting only: how many features to show in the relative-importance plots.
feats_top_plot = 15

### Distance Based

In [10]:
# Candidate distance metrics. Plain strings are passed by name; the remaining
# entries are bound methods of the shared `cd` Distance instance. The resulting
# list has the same elements in the same order as before.
all_metrics = (
    ["euclidean", "braycurtis", "canberra", "cityblock", "chebyshev"]
    + [cd.clark]
    + ["correlation", "cosine"]
    + [
        getattr(cd, metric_name)
        for metric_name in (
            "hellinger",
            "jaccard",
            "lorentzian",
            "marylandbridge",
            "meehl",
            "motyka",
            "soergel",
            "wave_hedges",
            "kulczynski",
            "add_chisq",
        )
    ]
)

# Scorer name and the feature columns currently retained in the full table.
scoring = "f1_macro"
feats_to_keep = X_df_FULL.columns

# Change classification in the next cell

In [11]:
# Read the JSON settings file and seed NumPy's global random state from it.
settings_dict = json.loads(Path("settings.txt").read_text())
np.random.seed(settings_dict["seed_choice"])

# Letter key selecting which classification problem this run works on.
classification_letter = "c"
classification_problem, classes_to_keep = (
    settings_dict[settings_key][classification_letter]
    for settings_key in ("classification_problem", "classes_to_keep")
)
# Sub-folder name built from the letter and the problem entry.
results_subfolder = f"{classification_letter}. {classification_problem}"

# Seaborn theme keyword arguments come straight from the settings file.
sns_dict = settings_dict["sns_dict"]
sns.set_theme(**sns_dict)

In [13]:
# Keep only current classes
cl_keep_str = "_".join(classes_to_keep)

y_df = y_df_FULL[y_df_FULL["class"].isin(classes_to_keep)]
X_df = X_df_FULL.loc[y_df.index]
X = X_df.to_numpy()
y = y_df.to_numpy().ravel()

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Parameter grid for DistanceMetricClassifier: every candidate metric combined
# with both settings of each of the three boolean switches.
param_grid = {
    'metric': all_metrics,
    'scale_std': [True, False],
    'calculate_kde': [True, False],
    'calculate_1d_dist': [True, False]
}

# Estimator with default settings; GridSearchCV sets the grid values on it.
distance_metric_classifier = dcpy.DistanceMetricClassifier()

# Use the notebook-wide `scoring` setting instead of a second hard-coded copy
# of the same scorer name, so the two cannot drift apart.
grid_search = GridSearchCV(
    estimator=distance_metric_classifier,
    param_grid=param_grid,
    scoring=scoring,
    cv=5,
)

# Run the 5-fold search on the class-filtered data.
grid_search.fit(X, y)

# Report the best configuration and its mean cross-validated score.
print("Best parameters found: ", grid_search.best_params_)
print("Best F1 score found: ", grid_search.best_score_)


Best parameters found:  {'calculate_1d_dist': True, 'calculate_kde': True, 'metric': 'canberra', 'scale_std': True}
Best F1 score found:  0.7850870990672079


In [16]:
# Display the classifier class object itself (no call, so no parentheses).
dcpy.DistanceMetricClassifier

distclassipy.classifier.DistanceMetricClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

# Confidence calculation
# NOTE(review): this overrides the letter chosen in the settings cell without
# recomputing classes_to_keep / results_subfolder, and the same variable names
# the CSV written by the following cell. Confirm that "b" is intended.
classification_letter = "b"


# Stratified 67/33 split, seeded from the settings file.
# NOTE(review): X_sfs is not defined in the part of the notebook visible here;
# presumably the feature-selected matrix from an earlier step. Confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X_sfs, y, test_size=0.33, stratify=y, random_state=settings_dict["seed_choice"]
)

lcdc = dcpy.DistanceMetricClassifier()

# The three switches are real booleans, matching the grid used in the earlier
# search cell. The strings "True" and "False" are both truthy in Python, so a
# string grid would not actually toggle the options.
parameters = {
    "metric": all_metrics,
    "scale_std": [True, False],
    "calculate_kde": [True, False],
    "calculate_1d_dist": [True, False],
}

# Grid search on the training split only, scored by macro-averaged F1.
clf = GridSearchCV(lcdc, parameters, scoring='f1_macro')
clf.fit(X_train, y_train)



In [34]:
# Tabulate every evaluated configuration, order by mean test score (best
# first), and write the table to a CSV named after the classification letter.
cv_results_df = pd.DataFrame(clf.cv_results_)
cv_results_ranked = cv_results_df.sort_values("mean_test_score", ascending=False)
cv_results_ranked.to_csv(f"gridsearch_{classification_letter}.csv")