In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
import distclassipy as dcpy

# Move the working directory up one level so that the relative paths used
# afterwards ("scripts", "settings.txt", "data/...") resolve.
# NOTE(review): the path is relative to the *current* directory, so re-running
# this cell without restarting the kernel moves up one more level each time.
os.chdir("../")
from pathlib import Path
import matplotlib.gridspec as gridspec
import sys
from tqdm.auto import tqdm
# Put the "scripts" directory on the import path so the local `utils` module
# can be imported; this must run after the chdir above.
sys.path.append("scripts")
import utils

In [2]:
# Read the notebook settings; the file is parsed as JSON despite its .txt name.
with open("settings.txt") as f:
    settings_dict = json.load(f)
# Seed NumPy's global random state with the configured value.
seed_val = settings_dict["seed_choice"]
np.random.seed(seed_val)
# Apply the seaborn theme using the keyword arguments stored in the settings.
sns_dict = settings_dict["sns_dict"]
sns.set_theme(**sns_dict)

In [3]:
# Earlier four-colour palette, kept for reference:
# custom_hues = ["#3B4CC0", "#1FA187", "#FBAE17", "#D21F26"]
# Five-colour palette installed below as seaborn's default colour cycle.
# NOTE(review): the variable name and the link suggest this is the IBM
# colour-blind-friendly palette — confirm against the linked page.
ibm_palette_hues = ["#648FFF", "#785EF0", "#DC267F", "#FE6100", "#FFB000"]
# https://davidmathlogic.com/colorblind/
sns.set_palette(ibm_palette_hues)

In [4]:
# Load the feature table from parquet, then display how many rows each value
# of the "class" column has.
features = pd.read_parquet("data/reduced_balancedfeatures_LATEST.parquet")
features["class"].value_counts()

class
CEP     683
DSCT    683
EB      683
RRL     683
Name: count, dtype: int64

In [5]:
# Shuffle the rows of the feature table. The seed is passed explicitly so the
# permutation does not depend on how much of NumPy's global random state has
# been consumed by earlier code.
# NOTE: re-running this cell on its own still permutes the already shuffled
# frame again; restart and run all for a canonical row order.
features = features.sample(frac=1, random_state=seed_val)

In [6]:
# Names of the six columns of the feature table that are selected as the
# model inputs.
final_features = [
    "SPM_A_Y",
    "Multiband_period",
    "r-i",
    "Harmonics_phase_4_i",
    "Harmonics_phase_2_r",
    "Power_rate_4",
]

In [7]:
# Split the table into the selected feature columns (X_df) and the class
# label column (y_df); both keep the row order of `features`.
X_df = features.loc[:,final_features]
y_df = features["class"]

In [8]:
# Convert the feature table and the labels to NumPy arrays for the classifier.
X = X_df.to_numpy()
y = y_df.to_numpy()

In [12]:
# Collection of distance-metric names provided by distclassipy; later cells
# iterate over it.
# NOTE(review): `_ALL_METRICS` is underscore-prefixed (private by convention),
# so it may move or change between library versions.
all_metrics = dcpy._ALL_METRICS

---

In [15]:
# Per-metric results, keyed by the display name from utils.get_metric_name:
#   all_metric_dfs   -> the classifier's centroid-distance table for X
#   all_metric_preds -> the label of the nearest centroid for every row of X
all_metric_dfs = {}
all_metric_preds = {}

# Neither the constructor nor fit() receives the metric (it is only passed at
# prediction time), so the classifier is built and fitted once rather than
# once per metric.
# NOTE(review): assumes predict_and_analyse does not alter the fitted state.
lcdc = dcpy.DistanceMetricClassifier(
    scale=True,
    central_stat="median",
    dispersion_stat="std",
)
lcdc.fit(X, y)

for metric in tqdm(all_metrics, desc="Metric", leave=True):
    metric_str = utils.get_metric_name(metric)
    # The returned predictions are discarded; the call is made for the
    # distance table it leaves on the classifier.
    _ = lcdc.predict_and_analyse(X, metric=metric)

    # Copy the table so each stored entry stays independent of the shared
    # classifier, whose attribute is replaced on the next iteration.
    dist_df = lcdc.centroid_dist_df_.copy()
    all_metric_dfs[metric_str] = dist_df

    # Column names carry a "_dist" suffix; strip it to recover the class names.
    class_names = [col.replace("_dist", "") for col in dist_df.columns]
    # The predicted class is the column holding the smallest distance.
    argmin_preds = np.argmin(dist_df.to_numpy(), axis=1)
    preds = np.asarray(class_names)[argmin_preds]
    all_metric_preds[metric_str] = preds

Metric:   0%|          | 0/43 [00:00<?, ?it/s]

In [16]:
def normalize_dataframe_columns(df):
    """Min-max scale every column of ``df`` independently.

    Each column is mapped to ``(value - min) / (max - min)``. A column whose
    minimum equals its maximum is filled with the constant 1 instead, which
    avoids a division by zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to rescale. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A rescaled copy of ``df``. An empty ``df`` is returned as-is, i.e.
        the same object rather than a copy.
    """
    if df.empty:
        return df

    scaled = df.copy()
    for col_name in scaled.columns:
        lowest = scaled[col_name].min()
        highest = scaled[col_name].max()
        spread = highest - lowest
        if spread == 0:
            # Constant column: there is no range to scale by, so use 1.
            scaled[col_name] = 1
            continue
        scaled[col_name] = (scaled[col_name] - lowest) / spread
    return scaled

# Group metrics whose centroid-distance tables become numerically identical
# once every column has been min-max normalised.
grouped_identical_normalized_dfs = []
processed_metric_names_norm = set()

metric_names_list_norm = list(all_metric_dfs.keys())

# Normalise each table exactly once up front instead of re-normalising the
# second table of every pair inside the inner loop.
# normalize_dataframe_columns works on its own copy, so no extra .copy() is
# needed here.
normalized_arrays_norm = {
    name: normalize_dataframe_columns(all_metric_dfs[name]).to_numpy(dtype=float)
    for name in metric_names_list_norm
}

for i, name1 in enumerate(metric_names_list_norm):
    # A metric already assigned to an earlier group does not start a new one.
    if name1 in processed_metric_names_norm:
        continue

    current_group_norm = [name1]
    arr1 = normalized_arrays_norm[name1]

    # Only later metrics need checking; earlier ones were compared already.
    for name2 in metric_names_list_norm[i + 1:]:
        if name2 in processed_metric_names_norm:
            continue

        arr2 = normalized_arrays_norm[name2]

        # Require equal shapes first: np.allclose alone would broadcast
        # compatible shapes or raise on incompatible ones. NaNs in the same
        # position count as equal.
        if arr1.shape == arr2.shape and np.allclose(arr1, arr2, equal_nan=True):
            current_group_norm.append(name2)
            processed_metric_names_norm.add(name2)

    processed_metric_names_norm.add(name1)
    grouped_identical_normalized_dfs.append(current_group_norm)

# Only groups with more than one member represent duplicated metrics.
duplicate_normalized_groups = [group for group in grouped_identical_normalized_dfs if len(group) > 1]

if duplicate_normalized_groups:
    print("Metrics that produced identical DataFrames AFTER column-wise Min-Max normalization:")
    for group in duplicate_normalized_groups:
        print(f"- Metrics: {', '.join(group)}")
else:
    print("All metrics produced unique DataFrames after column-wise Min-Max normalization.")

Metrics that produced identical DataFrames AFTER column-wise Min-Max normalization:
- Metrics: Euclidean, Minkowski
- Metrics: Braycurtis, Motyka, Czekanowski, Sorensen
- Metrics: Cityblock, Gower
- Metrics: Hellinger, Matusita
- Metrics: Soergel, Ruzicka, Tanimoto
- Metrics: Jensenshannon_Divergence, Jensen_Difference, Topsoe
- Metrics: Prob_Chisq, Squared_Chisq


In [13]:
# dcpy.distances._ALL_METRICS

In [21]:
# Metrics to discard. The list matches the duplicate groups printed by the
# normalisation check, leaving one metric from each group in place.
throw_metrics = [
    'minkowski',
    'motyka',
    'czekanowski',
    'sorensen',
    'gower',
    'matusita',
    'ruzicka',
    'tanimoto',
    'jensenshannon_divergence',
    'jensen_difference',
    'prob_chisq',
]
# Keep every other metric, in the order in which all_metrics lists them.
unique_metrics = [candidate for candidate in all_metrics if candidate not in throw_metrics]

In [22]:
# Display the metrics that remain after the discarded ones were filtered out.
unique_metrics

['euclidean',
 'braycurtis',
 'canberra',
 'cityblock',
 'chebyshev',
 'clark',
 'correlation',
 'cosine',
 'hellinger',
 'jaccard',
 'lorentzian',
 'marylandbridge',
 'meehl',
 'soergel',
 'wave_hedges',
 'kulczynski',
 'add_chisq',
 'acc',
 'chebyshev_min',
 'dice',
 'divergence',
 'google',
 'jeffreys',
 'kumarjohnson',
 'penroseshape',
 'squared_chisq',
 'squaredchord',
 'squared_euclidean',
 'taneja',
 'topsoe',
 'vicis_symmetric_chisq',
 'vicis_wave_hedges']