In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from tqdm.auto import tqdm

# Work from the parent directory so the relative paths used by later cells
# ("settings.txt", "data/...", "results/...") resolve against it.
# NOTE(review): this is not idempotent — every re-run of this cell moves the
# working directory up one more level. Restart the kernel before re-running.
os.chdir("../")
import json

import sys

# Relative sys.path entry: it is resolved against the current working directory,
# so it depends on the chdir above having run exactly once.
# Presumably this is where the `utils` module imported in the next cell lives — confirm.
sys.path.append("scripts")
from pathlib import Path

In [2]:
import utils
import distclassipy as dcpy

In [3]:
# The settings file holds JSON despite its .txt extension.
with open("settings.txt") as f:
    settings_dict = json.loads(f.read())

# Seed NumPy's global RNG first so later stochastic steps are repeatable
# on a fresh "Restart & Run All".
seed_val = settings_dict["seed_choice"]
np.random.seed(seed_val)

# Apply the shared seaborn theme options stored in the settings.
sns_dict = settings_dict["sns_dict"]
sns.set_theme(**sns_dict)

In [4]:
# Scorer name passed to the feature selector further down.
scoring = "f1_macro"
# Distance metrics to loop over, as listed in the settings file.
all_metrics = settings_dict["all_metrics"]

In [5]:
# Load the feature table, then display the number of rows per class label.
features = pd.read_parquet("data/reduced_balancedfeatures_LATEST.parquet")
class_labels = features["class"]
class_labels.value_counts()

class
CEP     683
DSCT    683
EB      683
RRL     683
Name: count, dtype: int64

In [6]:
# Shuffle the rows. The seed is passed explicitly so the permutation does not
# depend on how much of the global NumPy RNG stream earlier cells (or earlier
# runs of this cell) have already consumed.
# NOTE: the frame is overwritten, so re-running this cell shuffles the already
# shuffled rows again; restart the kernel for a clean reproduction.
features = features.sample(frac=1, random_state=seed_val)

In [7]:
# Columns that must not be used as predictors: the three coordinate columns
# and the target label itself.
non_feature_cols = ["Coordinate_x", "Coordinate_y", "Coordinate_z", "class"]
X_df = features.drop(columns=non_feature_cols)
# Target labels, aligned row-for-row with X_df.
y_df = features["class"]

In [8]:
# Plain NumPy copies of the predictors and labels.
X, y = X_df.to_numpy(), y_df.to_numpy()

## 1. Superset of important features
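These are the features that appear in the "best" feature list of more than 2 metrics; the 12 most frequently selected of them are kept for the exhaustive search below.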



In [9]:
# Pool the best feature names saved for every metric into one flat list;
# a feature shows up once for each metric that selected it.
good_feats = []
for metric in all_metrics:
    metric_str = utils.get_metric_name(metric)
    locpath = os.path.join("results", metric_str)

    # Per-metric results table written by an earlier step of the project.
    sfs_csv = os.path.join(locpath, "sfs_allfeatures.csv")
    sfs_df = pd.read_csv(sfs_csv, index_col=0)

    # Only the names are needed here; the indices are kept but not used.
    feats_idx, feats = utils.load_best_features(sfs_df)
    print(f"{metric_str}: {feats}")
    good_feats.extend(feats)

Euclidean: ['g-r', 'r-i', 'Period_band_i', 'Power_rate_4', 'Harmonics_phase_2_r', 'Harmonics_phase_2_i', 'Harmonics_phase_6_z', 'SPM_A_Y', 'SPM_chi_r', 'positive_fraction_z']
Braycurtis: ['r-i', 'i-z', 'Multiband_period', 'Harmonics_mag_6_i', 'Period_band_i', 'delta_period_i', 'Power_rate_4', 'Harmonics_phase_2_i', 'Harmonics_phase_4_i', 'Harmonics_phase_2_z', 'SPM_A_g', 'SPM_A_r', 'SPM_tau_fall_r', 'SPM_A_i', 'SPM_A_Y', 'SPM_chi_i']
Canberra: ['i-z', 'Multiband_period', 'Harmonics_phase_2_r', 'Harmonics_phase_6_r', 'Harmonics_phase_2_i', 'Harmonics_phase_4_i', 'Harmonics_phase_2_z', 'SPM_A_Y']
Cityblock: ['g-r', 'i-z', 'Period_band_i', 'Power_rate_2', 'Harmonics_phase_2_r', 'Harmonics_phase_2_i', 'Harmonics_phase_2_z', 'SPM_A_Y', 'SPM_chi_z']
Chebyshev: ['g-r', 'i-z', 'Period_band_i', 'delta_period_i', 'Power_rate_4', 'Harmonics_phase_2_i', 'SPM_A_z', 'SPM_A_Y']
Clark: ['r-i', 'Multiband_period', 'Power_rate_1_4', 'Harmonics_phase_2_r', 'Harmonics_phase_4_r', 'Harmonics_phase_2_i', 'H

In [10]:
# Count how many metrics picked each feature (most frequent first).
feat_array = np.array(good_feats)
topfeats = pd.Series(feat_array)
topfeats.value_counts()

Harmonics_phase_2_i    14
SPM_A_Y                14
Multiband_period       13
r-i                    11
i-z                    10
Period_band_i          10
Harmonics_phase_2_z     9
Harmonics_phase_4_i     8
delta_period_i          8
g-r                     8
Harmonics_phase_2_r     8
Power_rate_4            6
SPM_A_g                 6
SPM_tau_fall_r          6
SPM_A_i                 6
SPM_chi_i               5
Harmonics_phase_4_z     4
Harmonics_mag_6_i       4
SPM_A_r                 4
Harmonics_phase_4_r     3
Power_rate_1_2          3
SPM_chi_z               3
SPM_chi_r               3
SPM_A_z                 2
Power_rate_1_4          2
Harmonics_phase_6_z     2
PPE                     2
SPM_gamma_i             2
SPM_tau_fall_z          2
SPM_tau_fall_i          2
Power_rate_2            1
Harmonics_phase_7_z     1
SPM_gamma_g             1
Harmonics_phase_7_i     1
SPM_t0_Y                1
SPM_tau_fall_Y          1
SPM_tau_fall_g          1
Harmonics_chi_i         1
Harmonics_ph

In [11]:
# Number of most frequently selected features kept for the exhaustive search.
# The search below tries every non-empty subset of these features for each
# metric (2**N - 1 subsets), so keep this small.
N_SUPERSET_FEATS = 12

# value_counts() is sorted by descending frequency, so head() keeps the top N.
# NOTE(review): features tied at the cut-off count are kept or dropped purely
# by value_counts' ordering (in the recorded run several features share the
# count of the last kept one) — confirm this is intended.
superset_feats = topfeats.value_counts().head(N_SUPERSET_FEATS).index
superset_feats

Index(['Harmonics_phase_2_i', 'SPM_A_Y', 'Multiband_period', 'r-i', 'i-z',
       'Period_band_i', 'Harmonics_phase_2_z', 'Harmonics_phase_4_i',
       'delta_period_i', 'g-r', 'Harmonics_phase_2_r', 'Power_rate_4'],
      dtype='object')

In [12]:
# Keep only the superset columns (in superset order) and rebuild the NumPy array.
X_df = X_df[list(superset_feats)]
X = X_df.to_numpy()

In [14]:
# Exhaustive feature selection over the superset, repeated for every distance
# metric. Each metric's full results table is written to its results folder.
for metric in tqdm(all_metrics, desc="Metric", leave=True):
    metric_str = utils.get_metric_name(metric)
    locpath = os.path.join("results", metric_str)
    print("*" * 20, metric_str, "*" * 20)
    # Make sure the output folder exists before anything is written to it.
    Path(locpath).mkdir(parents=True, exist_ok=True)

    lcdc = dcpy.DistanceMetricClassifier(
        scale=True,
        central_stat="median",
        dispersion_stat="std",
        metric=metric,
    )

    # Evaluate every feature subset from size 1 up to the whole superset.
    feat_selector = EFS(
        lcdc,
        min_features=1,
        max_features=len(superset_feats),
        scoring=scoring,
        print_progress=False,
        n_jobs=-1,
    )
    feat_selector = feat_selector.fit(X_df, y_df)

    n_best = len(feat_selector.best_idx_)
    best_names = sorted(feat_selector.best_feature_names_)
    print(f"Best F1 score: {feat_selector.best_score_:.2%}")
    print(f"Best subset ({n_best} indices): {best_names}")

    # One row per evaluated subset, best average score first.
    res_df = (
        pd.DataFrame.from_dict(feat_selector.get_metric_dict())
        .transpose()
        .astype({"avg_score": "float"})
        .sort_values("avg_score", ascending=False)
        .reset_index(drop=True)
    )
    res_df.to_csv(f"{locpath}/efs_allfeatures.csv", index=False)

Metric:   0%|          | 0/18 [00:00<?, ?it/s]

******************** Euclidean ********************
Best F1 score: 93.75%
Best subset (8 indices): ['Harmonics_phase_2_i', 'Harmonics_phase_2_r', 'Harmonics_phase_2_z', 'Multiband_period', 'Power_rate_4', 'SPM_A_Y', 'g-r', 'r-i']
******************** Braycurtis ********************
Best F1 score: 93.38%
Best subset (8 indices): ['Harmonics_phase_2_z', 'Harmonics_phase_4_i', 'Multiband_period', 'Period_band_i', 'Power_rate_4', 'SPM_A_Y', 'delta_period_i', 'i-z']
******************** Canberra ********************
Best F1 score: 94.52%
Best subset (7 indices): ['Harmonics_phase_2_i', 'Harmonics_phase_2_r', 'Harmonics_phase_2_z', 'Harmonics_phase_4_i', 'Multiband_period', 'SPM_A_Y', 'r-i']
******************** Cityblock ********************
Best F1 score: 94.70%
Best subset (9 indices): ['Harmonics_phase_2_i', 'Harmonics_phase_2_r', 'Harmonics_phase_2_z', 'Period_band_i', 'Power_rate_4', 'SPM_A_Y', 'g-r', 'i-z', 'r-i']
******************** Chebyshev ********************
Best F1 score: 90.8

  return 1 - (uv / (np.dot(u, u) + np.dot(v, v) - uv))
  return 1 - (uv / (np.dot(u, u) + np.dot(v, v) - uv))
  return 1 - (uv / (np.dot(u, u) + np.dot(v, v) - uv))
  return 1 - (uv / (np.dot(u, u) + np.dot(v, v) - uv))
  return 1 - (uv / (np.dot(u, u) + np.dot(v, v) - uv))


Best F1 score: 90.80%
Best subset (5 indices): ['Harmonics_phase_4_i', 'Multiband_period', 'Power_rate_4', 'SPM_A_Y', 'g-r']
******************** Lorentzian ********************
Best F1 score: 93.99%
Best subset (9 indices): ['Harmonics_phase_2_i', 'Harmonics_phase_2_r', 'Harmonics_phase_2_z', 'Harmonics_phase_4_i', 'Period_band_i', 'Power_rate_4', 'SPM_A_Y', 'g-r', 'r-i']
******************** Marylandbridge ********************


  return 1 - (uvdot / np.dot(u, u) + uvdot / np.dot(v, v)) / 2
  return 1 - (uvdot / np.dot(u, u) + uvdot / np.dot(v, v)) / 2
  return 1 - (uvdot / np.dot(u, u) + uvdot / np.dot(v, v)) / 2
  return 1 - (uvdot / np.dot(u, u) + uvdot / np.dot(v, v)) / 2
  return 1 - (uvdot / np.dot(u, u) + uvdot / np.dot(v, v)) / 2


Best F1 score: 41.27%
Best subset (4 indices): ['Harmonics_phase_4_i', 'SPM_A_Y', 'delta_period_i', 'g-r']
******************** Meehl ********************
Best F1 score: 92.58%
Best subset (9 indices): ['Harmonics_phase_2_i', 'Harmonics_phase_2_r', 'Harmonics_phase_2_z', 'Period_band_i', 'Power_rate_4', 'SPM_A_Y', 'delta_period_i', 'i-z', 'r-i']
******************** Motyka ********************


  return np.sum(np.maximum(u, v)) / np.sum(u + v)
  return np.sum(np.maximum(u, v)) / np.sum(u + v)
  return np.sum(np.maximum(u, v)) / np.sum(u + v)
  return np.sum(np.maximum(u, v)) / np.sum(u + v)
  return np.sum(np.maximum(u, v)) / np.sum(u + v)


Best F1 score: 93.38%
Best subset (8 indices): ['Harmonics_phase_2_z', 'Harmonics_phase_4_i', 'Multiband_period', 'Period_band_i', 'Power_rate_4', 'SPM_A_Y', 'delta_period_i', 'i-z']
******************** Soergel ********************


  return np.sum(np.abs(u - v)) / np.sum(np.maximum(u, v))
  return np.sum(np.abs(u - v)) / np.sum(np.maximum(u, v))
  return np.sum(np.abs(u - v)) / np.sum(np.maximum(u, v))
  return np.sum(np.abs(u - v)) / np.sum(np.maximum(u, v))
  return np.sum(np.abs(u - v)) / np.sum(np.maximum(u, v))


Best F1 score: 93.38%
Best subset (8 indices): ['Harmonics_phase_2_z', 'Harmonics_phase_4_i', 'Multiband_period', 'Period_band_i', 'Power_rate_4', 'SPM_A_Y', 'delta_period_i', 'i-z']
******************** Wave_Hedges ********************
Best F1 score: 93.49%
Best subset (6 indices): ['Harmonics_phase_2_i', 'Harmonics_phase_2_r', 'Harmonics_phase_4_i', 'Multiband_period', 'SPM_A_Y', 'r-i']
******************** Kulczynski ********************


  return np.sum(np.abs(u - v)) / np.sum(np.minimum(u, v))
  return np.sum(np.abs(u - v)) / np.sum(np.minimum(u, v))
  return np.sum(np.abs(u - v)) / np.sum(np.minimum(u, v))
  return np.sum(np.abs(u - v)) / np.sum(np.minimum(u, v))
  return np.sum(np.abs(u - v)) / np.sum(np.minimum(u, v))
  return np.sum(np.abs(u - v)) / np.sum(np.minimum(u, v))
  return np.sum(np.abs(u - v)) / np.sum(np.minimum(u, v))
  return np.sum(np.abs(u - v)) / np.sum(np.minimum(u, v))
  return np.sum(np.abs(u - v)) / np.sum(np.minimum(u, v))
  return np.sum(np.abs(u - v)) / np.sum(np.minimum(u, v))


Best F1 score: 93.38%
Best subset (8 indices): ['Harmonics_phase_2_z', 'Harmonics_phase_4_i', 'Multiband_period', 'Period_band_i', 'Power_rate_4', 'SPM_A_Y', 'delta_period_i', 'i-z']
******************** Add_Chisq ********************
Best F1 score: 93.30%
Best subset (9 indices): ['Harmonics_phase_2_i', 'Harmonics_phase_2_r', 'Multiband_period', 'Period_band_i', 'Power_rate_4', 'SPM_A_Y', 'g-r', 'i-z', 'r-i']
