In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
import distclassipy as dcpy
from tqdm.auto import tqdm

# Move the working directory up one level so that the relative paths used by the
# following cells ("settings.txt", "data/...", "scripts") resolve from the parent
# directory of wherever the kernel was started.
# NOTE(review): the path is relative, so this is not idempotent — re-running this
# cell climbs one more directory level each time.
os.chdir("../")
from pathlib import Path
import matplotlib.gridspec as gridspec
import sys

# Put the "scripts" directory on the import path before importing `utils`
# (presumably `utils` lives there — verify against the repository layout).
sys.path.append("scripts")
import utils

In [2]:
# Read the shared JSON settings file in one shot and decode it.
settings_dict = json.loads(Path("settings.txt").read_text())

# Seed NumPy's global random generator with the configured seed.
seed_val = settings_dict["seed_choice"]
np.random.seed(seed_val)

# Apply the configured seaborn theme keyword arguments.
sns_dict = settings_dict["sns_dict"]
sns.set_theme(**sns_dict)

In [3]:
# custom_hues = ["#3B4CC0", "#1FA187", "#FBAE17", "#D21F26"]
# Five hex colours installed below as seaborn's default palette.
ibm_palette_hues = ["#648FFF", "#785EF0", "#DC267F", "#FE6100", "#FFB000"]
# Palette reference: https://davidmathlogic.com/colorblind/
sns.set_palette(ibm_palette_hues)

In [4]:
# all_metrics = settings_dict["all_metrics"]
# Use the metric collection exposed by distclassipy instead of the settings file.
# NOTE(review): `_ALL_METRICS` is underscore-prefixed (private), so it may move or
# change between distclassipy releases — confirm it exists in the pinned version.
all_metrics = dcpy._ALL_METRICS

In [5]:
# Load the feature table from disk.
reduced_features = pd.read_parquet("data/reduced_features.parquet")
# Show how many objects each class contributes (displayed as the cell output).
reduced_features.loc[:, "class"].value_counts()

class
EB      98473
RRL     45096
DSCT     8245
CEP      1662
Name: count, dtype: int64

In [6]:
# n_cols = 5
# Minimum number of complete (NaN-free) objects every class must provide.
n_objs = 1000

# Notes on previously tried feature sets (the percentages are presumably the
# scores obtained with each set — TODO confirm which metric they refer to).
# 69% Features: ['Harmonics_phase_5_Y', 'Harmonics_mse_Y', 'Harmonics_mag_7_Y', 'Harmonics_phase_3_z', 'n_forced_phot_band_after_Y', 'Harmonics_phase_4_g', 'max_brightness_after_band_g', 'Harmonics_phase_2_z', 'Harmonics_phase_2_z', 'Harmonics_phase_3_z']
# 65% Features: ['Harmonics_phase_6_r', 'Power_rate_4', 'Harmonics_mag_5_Y', 'Harmonics_phase_3_g', 'r-i']
# 80% Features: ['Power_rate_4', 'Harmonics_mag_5_Y', 'Harmonics_phase_3_r', 'r-i']

col_select = ["Power_rate_4", "Harmonics_mag_5_Y", "r-i", "Multiband_period"]
print(f"Features: {col_select}")

# Keep only the selected features plus the label, and drop rows with any missing value.
temp = reduced_features.loc[:, col_select + ["class"]].dropna()

# Guard 1: dropna() must not have removed a class entirely. A class with no
# surviving rows simply vanishes from value_counts(), so a count-only check
# would pass without noticing it.
class_counts = temp["class"].value_counts()
surviving_classes = set(class_counts[class_counts > 0].index)
missing_classes = set(reduced_features["class"].dropna().unique()) - surviving_classes
assert not missing_classes, (
    f"Classes with no complete rows for {col_select}: {sorted(missing_classes)}"
)

# Guard 2: every class needs at least n_objs rows. Exactly n_objs is enough for
# drawing n_objs rows without replacement, hence >= rather than >.
assert (class_counts >= n_objs).all(), (
    f"Some classes have fewer than {n_objs} complete rows:\n{class_counts}"
)

Features: ['Power_rate_4', 'Harmonics_mag_5_Y', 'r-i', 'Multiband_period']


In [7]:
# Draw exactly n_objs rows from every class. The seed is passed explicitly (as the
# shuffle below already does) so the draw does not depend on how much of NumPy's
# global random state earlier cells or re-runs have consumed.
temp = temp.groupby("class").sample(n=n_objs, random_state=seed_val)
# Shuffle the rows so the classes are interleaved rather than grouped.
temp = temp.sample(frac=1, random_state=seed_val)

In [8]:
# Feature matrix: the selected feature columns as a NumPy array.
X = temp.loc[:, col_select].to_numpy()
# Label vector: the class of each row, in the same row order as X.
y = temp.loc[:, "class"].to_numpy()

In [9]:
# Per-metric results, keyed by the display name returned by utils.get_metric_name:
#   all_metric_dfs   -> table of distances from every object to every class centroid
#   all_metric_preds -> predicted class label per object (nearest centroid)
all_metric_dfs = {}
all_metric_preds = {}

# Neither the constructor nor fit() receives the metric (it is only passed at
# prediction time), so the classifier is built and fitted once instead of being
# rebuilt and refitted on identical data for every metric.
lcdc = dcpy.DistanceMetricClassifier(
    scale=True,
    central_stat="median",
    dispersion_stat="std",
)
lcdc.fit(X, y)

for metric in tqdm(all_metrics, desc="Metric", leave=True):
    metric_str = utils.get_metric_name(metric)

    # Called only to populate `centroid_dist_df_`; its return value is not used.
    lcdc.predict_and_analyse(X, metric=metric)

    # Copy the table so each metric keeps its own data even though the same
    # classifier instance is reused on the next iteration.
    dist_df = lcdc.centroid_dist_df_.copy()
    all_metric_dfs[metric_str] = dist_df

    # Calculate preds for the current metric: the class whose distance column
    # holds the row-wise minimum.
    class_names = [col.replace("_dist", "") for col in dist_df.columns]
    # Use .to_numpy() for robust operation with np.argmin
    argmin_preds = np.argmin(dist_df.to_numpy(), axis=1)
    preds = np.array([class_names[idx] for idx in argmin_preds])
    all_metric_preds[metric_str] = preds

Metric:   0%|          | 0/43 [00:00<?, ?it/s]

In [10]:
# grouped_identical_dfs = []
# processed_metric_names = set()

# # Get a list of metric names to iterate over.
# metric_names_list = list(all_metric_dfs.keys())

# for i in range(len(metric_names_list)):
#     name1 = metric_names_list[i]

#     # If this metric has already been grouped with a previous one, skip it.
#     if name1 in processed_metric_names:
#         continue

#     current_group = [name1]
#     df1 = all_metric_dfs[name1]

#     # Compare with subsequent metrics in the list.
#     for j in range(i + 1, len(metric_names_list)):
#         name2 = metric_names_list[j]

#         # If this metric has already been grouped, skip it.
#         if name2 in processed_metric_names:
#             continue

#         df2 = all_metric_dfs[name2]

#         # DataFrame.equals() checks for identical content, shape, and dtypes.
#         if df1.equals(df2):
#             current_group.append(name2)
#             # Mark name2 as processed since it's now part of current_group.
#             processed_metric_names.add(name2)

#     # Add name1 to processed_metric_names as it's now the representative of 'current_group'
#     # (or forms a group of its own), preventing it from starting a new group.
#     processed_metric_names.add(name1)
#     grouped_identical_dfs.append(current_group)

# # Filter for groups that have more than one metric (i.e., actual duplicates).
# duplicate_groups = [group for group in grouped_identical_dfs if len(group) > 1]

# if duplicate_groups:
#     print("Metrics that produced identical centroid_dist_df_ DataFrames:")
#     for group in duplicate_groups:
#         print(f"- Metrics: {', '.join(group)}")
#         # As a quick check, you might want to see the shape of one of the identical DataFrames:
#         # print(f"  (Example DataFrame shape for this group: {all_metric_dfs[group[0]].shape})")
# else:
#     print("All metrics produced unique centroid_dist_df_ DataFrames.")

In [11]:
# # Disabled check: groups metrics whose predicted-label arrays are exactly identical.
# # It assumes 'all_metric_preds' is populated with metric names as keys
# # and their corresponding prediction (preds) NumPy arrays as values.

# grouped_identical_preds = []
# processed_metric_names_for_preds = set()

# # Get a list of metric names to iterate over.
# metric_names_list_for_preds = list(all_metric_preds.keys())

# for i in range(len(metric_names_list_for_preds)):
#     name1 = metric_names_list_for_preds[i]
    
#     if name1 in processed_metric_names_for_preds:
#         continue

#     current_group_preds = [name1]
#     preds1 = all_metric_preds[name1]

#     for j in range(i + 1, len(metric_names_list_for_preds)):
#         name2 = metric_names_list_for_preds[j]
        
#         if name2 in processed_metric_names_for_preds:
#             continue

#         preds2 = all_metric_preds[name2]
        
#         # Compare NumPy arrays for equality in both shape and element values.
#         if np.array_equal(preds1, preds2):
#             current_group_preds.append(name2)
#             processed_metric_names_for_preds.add(name2)
    
#     processed_metric_names_for_preds.add(name1)
#     grouped_identical_preds.append(current_group_preds)

# # Filter for groups that have more than one metric (i.e., actual identical predictions).
# duplicate_preds_groups = [group for group in grouped_identical_preds if len(group) > 1]

# if duplicate_preds_groups:
#     print("\nMetrics that produced identical 'preds' arrays:")
#     for group in duplicate_preds_groups:
#         print(f"- Metrics: {', '.join(group)}")
#         # You could also print the length of the preds array for one of them as a check
#         # if all_metric_preds[group[0]] is not None:
#         #     print(f"  (Example 'preds' array length for this group: {len(all_metric_preds[group[0]])})")
# else:
#     print("\nAll metrics produced unique 'preds' arrays.")

In [12]:
# Group metrics whose distance tables become identical after column-wise Min-Max
# normalization. This cell must run after the one that populates 'all_metric_dfs',
# and it assumes 'all_metric_dfs' contains the original (unnormalized) DataFrames.

import numpy as np # Ensure numpy is imported

def normalize_dataframe_columns(df):
    """Apply Min-Max normalization to each column of a DataFrame.

    Every column is rescaled independently to the range [0, 1] using
    ``(value - column_min) / (column_max - column_min)``. A column whose
    minimum equals its maximum (all values identical) is set to 0.0 in every
    row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with numeric columns.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with the same shape, index and columns. The input is
        never modified and never returned as-is, including when it is empty.
    """
    df_norm = df.copy()
    if df_norm.empty:
        # Return the copy so callers always get an object that is independent
        # of their input, exactly as in the non-empty case.
        return df_norm

    col_min = df_norm.min()
    col_range = df_norm.max() - col_min
    is_constant = (col_range == 0).to_numpy()

    # Divide constant columns by 1 rather than 0 to avoid 0/0; those columns are
    # overwritten with 0.0 immediately afterwards.
    safe_range = col_range.mask(is_constant, 1)
    df_norm = (df_norm - col_min) / safe_range

    if is_constant.any():
        # Constant columns become 0.0 in every row
        # (0.5 or 1 would be equally valid conventions).
        df_norm.loc[:, is_constant] = 0.0
    return df_norm

metric_names_list_norm = list(all_metric_dfs.keys())

# Normalize every metric's distance table exactly once and keep it as a float
# array, instead of re-normalizing the same table for every pairwise comparison.
normalized_arrays = {
    name: normalize_dataframe_columns(all_metric_dfs[name]).to_numpy(dtype=float)
    for name in metric_names_list_norm
}

# Each entry is a list of metric names whose normalized tables match the first
# name in that list; a metric with no match forms a group of one.
grouped_identical_normalized_dfs = []
# Names already assigned to a group, so they neither start nor join another one.
processed_metric_names_norm = set()

for i, name1 in enumerate(metric_names_list_norm):
    if name1 in processed_metric_names_norm:
        continue

    current_group_norm = [name1]
    norm_arr1 = normalized_arrays[name1]

    # Only later metrics need checking; earlier ones were already compared.
    for name2 in metric_names_list_norm[i + 1 :]:
        if name2 in processed_metric_names_norm:
            continue

        norm_arr2 = normalized_arrays[name2]

        # Compare shapes first, then values within floating-point tolerance;
        # NaNs in matching positions count as equal.
        if norm_arr1.shape == norm_arr2.shape and np.allclose(
            norm_arr1, norm_arr2, equal_nan=True
        ):
            current_group_norm.append(name2)
            processed_metric_names_norm.add(name2)

    processed_metric_names_norm.add(name1)
    grouped_identical_normalized_dfs.append(current_group_norm)

# Keep only the groups with more than one metric, i.e. actual duplicates.
duplicate_normalized_groups = [group for group in grouped_identical_normalized_dfs if len(group) > 1]

if duplicate_normalized_groups:
    print("Metrics that produced identical DataFrames AFTER column-wise Min-Max normalization:")
    for group in duplicate_normalized_groups:
        print(f"- Metrics: {', '.join(group)}")
else:
    print("All metrics produced unique DataFrames after column-wise Min-Max normalization.")

Metrics that produced identical DataFrames AFTER column-wise Min-Max normalization:
- Metrics: Euclidean, Minkowski
- Metrics: Braycurtis, Motyka, Czekanowski, Sorensen
- Metrics: Cityblock, Gower
- Metrics: Hellinger, Matusita
- Metrics: Soergel, Ruzicka, Tanimoto
- Metrics: Jensenshannon_Divergence, Jensen_Difference, Topsoe
- Metrics: Prob_Chisq, Squared_Chisq


In [13]:
# dcpy.distances._ALL_METRICS

In [14]:
# Metrics kept for further analysis. Each disabled entry produced the same
# Min-Max-normalized distance table as the retained metric named beside it
# (see the duplicate groups printed by the normalization cell).
subset_metrics = [
    "euclidean",
    "braycurtis",
    "canberra",
    "cityblock",
    "chebyshev",
    "clark",
    "correlation",
    "cosine",
    "hellinger",
    "jaccard",
    "lorentzian",
    "marylandbridge",
    "meehl",
    # "motyka",  # same as braycurtis
    "soergel",
    "wave_hedges",
    "kulczynski",
    "add_chisq",
    "acc",
    "chebyshev_min",
    # "czekanowski",  # same as braycurtis
    "dice",
    "divergence",
    "google",
    # "gower",  # same as cityblock
    "jeffreys",
    # "jensenshannon_divergence",  # same as topsoe
    # "jensen_difference",  # same as topsoe
    "kumarjohnson",
    # "matusita",  # same as hellinger
    # "minkowski",  # same as euclidean
    "penroseshape",
    # "prob_chisq",  # same as squared_chisq
    # "ruzicka",  # same as soergel
    # "sorensen",  # same as braycurtis
    "squared_chisq",
    "squaredchord",
    "squared_euclidean",
    "taneja",
    # "tanimoto",  # same as soergel
    "topsoe",
    "vicis_symmetric_chisq",
    "vicis_wave_hedges",
]
