### 0. Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os

# Work from the parent directory so that `scripts`, `settings.txt` and `data/`
# (all referenced with relative paths in this notebook) resolve.
# Guarded so that re-running this cell does not climb one more level each time.
# NOTE(review): assumes the notebook's own folder has no `scripts/` directory — confirm.
if not os.path.isdir("scripts"):
    os.chdir("../")
from scripts import utils
from pathlib import Path
import matplotlib.gridspec as gridspec
from tqdm.auto import tqdm

In [2]:
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
from mlxtend.evaluate import feature_importance_permutation
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.utils.estimator_checks import check_estimator
from mlxtend.feature_selection import (
    SequentialFeatureSelector,
)
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
import matplotlib.ticker as ticker
import distclassipy as dcpy

In [4]:
# Shared notebook settings; the file holds JSON despite its .txt suffix.
settings_dict = json.loads(Path("settings.txt").read_text())

# Seed NumPy's global random generator with the configured value.
seed_val = settings_dict["seed_choice"]
np.random.seed(seed_val)

# Apply the configured seaborn theme keyword arguments.
sns_dict = settings_dict["sns_dict"]
sns.set_theme(**sns_dict)

In [5]:
# Collection of metrics read from the settings file; iterated further down when
# calling `lcdc.predict_and_analyse(..., metric=...)`.
all_metrics = settings_dict["all_metrics"]

In [6]:
# Load the feature table (one row per object, label stored in the `class` column).
features = pd.read_parquet("data/reduced_balancedfeatures_LATEST.parquet")
# Per-class row counts; the recorded output shows 683 rows for each of the
# four classes (CEP, DSCT, EB, RRL).
features["class"].value_counts()

class
CEP     683
DSCT    683
EB      683
RRL     683
Name: count, dtype: int64

In [7]:
# Shuffle all rows. The seed is passed explicitly so the permutation depends only
# on `seed_val` and not on how many draws the global NumPy generator has already
# served before this cell runs.
# NOTE: `features` is rebound, so re-running this cell permutes the already
# shuffled frame again (deterministically).
features = features.sample(frac=1, random_state=seed_val)

In [8]:
# Labels: the `class` column.
y_df = features["class"]

# Feature matrix: every column except the label and the three coordinate columns.
non_feature_cols = ["Coordinate_x", "Coordinate_y", "Coordinate_z", "class"]
X_df = features.drop(columns=non_feature_cols)

In [9]:
# Plain NumPy views of the features and labels for the estimators below.
X, y = X_df.to_numpy(), y_df.to_numpy()

### 1. Isolation Forest

In [10]:
from sklearn.ensemble import IsolationForest
from scipy import stats

In [11]:
# Isolation Forest outlier detection on the full feature matrix.
# The expected outlier fraction is defined once: it drives both the forest's own
# inlier/outlier cut and the percentile threshold below, so the two cannot
# silently drift apart.
iforest_contamination = 0.1

# NOTE(review): random_state is pinned to 100 rather than `seed_val`; left
# unchanged so the recorded scores stay reproducible.
clf = IsolationForest(
    max_samples=100,
    contamination=iforest_contamination,
    random_state=100,
)
clf.fit(X)

# Per-sample anomaly score (scikit-learn convention: lower = more abnormal).
scores_pred_Norm = clf.decision_function(X)

# Score at the `iforest_contamination` percentile of the fitted sample.
# np.percentile replaces the legacy scipy.stats.scoreatpercentile; both use
# linear interpolation by default.
threshold_Norm = np.percentile(scores_pred_Norm, 100 * iforest_contamination)

# Hard inlier/outlier labels from the fitted forest.
y_predNorm = clf.predict(X)

In [12]:
# Largest decision-function score in the sample.
print(np.max(scores_pred_Norm))

0.13827205494824024


### 2. DCPY Mean Distance

In [14]:
# Keyword arguments for the distclassipy distance-metric classifier.
lcdc_params = {
    "scale": True,
    "central_stat": "median",
    "dispersion_stat": "std",
}
lcdc = dcpy.DistanceMetricClassifier(**lcdc_params)

# Fit on the full labelled sample (features X, class labels y).
lcdc.fit(X, y)

In [31]:
# Only the first configured metric is analysed in this section. The previous
# loop over `all_metrics` exited unconditionally after one iteration, which left
# a stalled progress bar and `dist_df` undefined when no metric was configured.
metric = next(iter(all_metrics), None)
if metric is None:
    raise ValueError("settings 'all_metrics' is empty; at least one metric is required")

metric_str = utils.get_metric_name(metric)
_ = lcdc.predict_and_analyse(X, metric=metric)

# Copy the per-class centroid distances so that later edits to `dist_df` never
# write into the classifier's own table.
dist_df = lcdc.centroid_dist_df_.copy()

Metric:   0%|          | 0/18 [00:00<?, ?it/s]

In [32]:
# Anomaly score = z-score of each object's distance to its nearest class centroid.
#
# The inputs are rebuilt from the raw per-class distance columns so the cell can
# be re-run safely: after a previous run `dist_df` already carries `index`,
# `closest_distance` and `anomaly_score`, none of which may enter the row-wise
# minimum. The result is assembled in a new frame instead of writing columns
# into the incoming one.
derived_cols = ["closest_distance", "anomaly_score"]
centroid_dists = dist_df.set_index("index") if "index" in dist_df.columns else dist_df
centroid_dists = centroid_dists.drop(columns=derived_cols, errors="ignore")

# Distance to the nearest centroid, standardised over the whole sample.
closest_distance = centroid_dists.min(axis=1)
mean = closest_distance.mean()
std = closest_distance.std()  # pandas default: sample standard deviation (ddof=1)
anomaly_score = (closest_distance - mean) / std

# Most anomalous objects first; the original row labels end up in the `index` column.
dist_df = (
    centroid_dists.assign(closest_distance=closest_distance, anomaly_score=anomaly_score)
    .sort_values(by="anomaly_score", ascending=False)
    .reset_index(drop=False)
)

In [33]:
# Show the scored table: rows are ordered by `anomaly_score`, highest first, and
# the `index` column holds each row's label from before the sort.
dist_df

Unnamed: 0,index,CEP_dist,DSCT_dist,EB_dist,RRL_dist,closest_distance,anomaly_score
0,1698,2.221283e+06,4.421163e+06,4.555621e+01,5.301515e+06,45.556213,13.357850
1,1592,3.431627e+01,8.980047e+02,5.600783e+04,1.275726e+04,34.316274,9.352996
2,1062,3.212941e+01,4.580644e+02,7.057781e+03,2.917023e+03,32.129409,8.573804
3,568,3.241179e+01,6.444508e+01,2.663454e+02,3.044971e+01,30.449705,7.975316
4,1242,3.188868e+05,4.514984e+02,5.016597e+07,2.977814e+01,29.778138,7.736033
...,...,...,...,...,...,...,...
2727,2131,7.834741e+00,2.119640e+01,7.771283e+00,4.064401e+00,4.064401,-1.425917
2728,1220,7.356471e+00,4.044531e+00,1.146505e+01,7.671534e+00,4.044531,-1.432996
2729,1762,7.573406e+00,3.821672e+00,7.020998e+00,7.472453e+00,3.821672,-1.512403
2730,176,3.691251e+00,9.670391e+01,2.460666e+01,2.232662e+01,3.691251,-1.558872
