In [1]:
import json, os, sys

from pathlib import Path

import numpy as np
import pandas as pd

from fastdist import fastdist
from numba import jit
from sklearn.metrics import pairwise_distances

from sklearn.cluster import KMeans
from sklearn.metrics import f1_score, accuracy_score

project_dir = os.path.abspath("..")
if project_dir not in sys.path:
    sys.path.insert(0, project_dir)
    
from app.algorithms import random_walk
from app.data.datasets import SyntheticDataSet
from app.data.noise import compute_corruption_ratio_per_class

import utils

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
data_results_dir = Path("../data/experiments-skadi/")


In [4]:
job_paths = list(data_results_dir.glob("**/job-info.json"))

In [5]:
len(job_paths)

268

In [6]:
job_path = job_paths[1]

In [7]:
experiment_dir = job_path.parent

In [8]:
with open(experiment_dir / "experiment-params.json", "r") as f:
    experiment_params = json.load(f)

In [9]:
experiment_params

{'n_dim': 10,
 'min_distance': 6,
 'component_size': 50,
 'variance': 1.0,
 'label_noise_proba': 0.0,
 'n_steps': 25,
 'distance_metric': 'sqeuclidean',
 'bias_factor': 2,
 'allow_self_loops': False,
 'random_seed': 8073829}

In [10]:
results_path = job_path.parent / "clustering-results.feather"
df_results = pd.read_feather(results_path)

In [11]:
df_results

Unnamed: 0,noisy_label,true_label,cluster_label
0,0,0,7
1,0,0,7
2,0,0,7
3,0,0,7
4,0,0,7
...,...,...,...
495,9,9,0
496,9,9,0
497,9,9,0
498,9,9,0


In [12]:
df_results = utils.assign_labels_based_on_clusters(
    df_results=df_results,
    new_column_name="pred_label",
)

In [13]:
df_results

Unnamed: 0,noisy_label,true_label,cluster_label,pred_label
0,0,0,7,0
1,0,0,7,0
2,0,0,7,0
3,0,0,7,0
4,0,0,7,0
...,...,...,...,...
495,9,9,0,9
496,9,9,0,9
497,9,9,0,9
498,9,9,0,9


In [14]:
accuracy_score(y_true=df_results["true_label"], y_pred=df_results["pred_label"])

1.0