In [1]:
%load_ext autoreload
%autoreload 2

import warnings
# Hide FutureWarning messages for every cell that runs after this one.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import sys
# Put the parent directory on the import path; presumably this is what lets the
# `utils` and `models` packages imported below resolve — confirm against the repo layout.
sys.path.append('../')

In [3]:
# Dataset names iterated by the training and evaluation loops in this notebook.
# The commented-out entries are currently excluded.
names = [
    'PowerCons',
    # 'RefrigerationDevices',
    # 'ScreenType',
    'Beef',
    'Coffee',
    'Ham',
    'ItalyPowerDemand',
    'Mallat',
    'Meat',
    'OliveOil',
    'Strawberry',
    'Wine'
]

In [4]:
import numpy as np
from utils.ucr_helpers import (
    evaluate_model_sklearn,
    get_kNN_accuracy_MF_UCR,
    evaluate_resampling_UCR
)
from models.embedding_models import MatrixFactorization

In [5]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning
# Silence scikit-learn's convergence and undefined-metric warnings for the rest of the session.
simplefilter("ignore", category=ConvergenceWarning)
simplefilter("ignore", category=UndefinedMetricWarning)

## Learn and save embeddings for each dataset

In [6]:
from utils.ucr_helpers import UCR_Data
import time
from sklearn.svm import SVC
from tqdm import tqdm
import json

def train_embeddings(data, save=False, verbose=False):
    """Fit a matrix-factorization embedding model for one dataset.

    A Euclidean similarity matrix is built from ``data.X`` with
    ``MatrixFactorization.get_euclidean_matrix`` and handed to
    ``MatrixFactorization.train_MF_model`` together with the fixed
    hyper-parameters in ``train_setup`` below.

    Parameters
    ----------
    data : object exposing ``name`` and ``X``
        ``data.X.shape[0]`` is passed to the trainer as its first argument.
    save : bool, default False
        When True, write the learned embeddings to
        ``../embeddings/ucr/{name}_{dim}d.npy`` and the merged similarity and
        training settings to ``../embeddings/ucr/{name}_setup_{dim}d.json``.
        The target directory is created if it is missing.
    verbose : bool, default False
        When True, print the dataset name and ``data.X.shape``. The flag is
        also forwarded to the trainer, so ``verbose=False`` keeps training
        quiet as well.

    Returns
    -------
    The model returned by ``MatrixFactorization.train_MF_model``; the
    embeddings are read from ``model.embeddings.weight``.
    """
    import os  # local import: only needed to create the output directory

    name = data.name
    if verbose:
        print(f"===== {name} =====")
        print(data.X.shape)

    # Options for the similarity matrix; stored alongside the embeddings on save.
    # (The author also experimented with similarities on np.diff(data.X, axis=1).)
    similarity_setup = {
        "scaled": "normal", "return_similarity": True, "truncate": False
    }
    similarity_matrix = MatrixFactorization.get_euclidean_matrix(
        data.X, **similarity_setup
    )

    train_setup = {
        "embedding_dim": 32,
        "learning_rate": 0.02,
        "epochs": 300,
        "regularization_loss_weight": 0.1,
        "pairwise_loss_weight": 0.1,
        "noise_mask": False,
        "early_stopping": False,
    }
    # The trainer returns three values; only the model is used here.
    model, _losses, _learning_rates = MatrixFactorization.train_MF_model(
        data.X.shape[0],
        similarity_matrix,
        verbose=verbose,
        **train_setup
    )

    if save:
        # Make sure the output directory exists before writing into it.
        os.makedirs("../embeddings/ucr", exist_ok=True)
        save_id = f"{train_setup['embedding_dim']}d"
        embeddings = model.embeddings.weight.detach().numpy()
        np.save(f"../embeddings/ucr/{name}_{save_id}.npy", embeddings)
        with open(f"../embeddings/ucr/{name}_setup_{save_id}.json", 'w', encoding='utf-8') as f:
            # Same merge as a dict union, with training keys winning on a clash.
            json.dump({**similarity_setup, **train_setup}, f, ensure_ascii=False, indent=4)

    return model


In [None]:
# Set SAVE to True to have train_embeddings write the embeddings and setup JSON to disk.
SAVE = False
# Not read in this cell; train_embeddings derives its own file suffix from embedding_dim.
save_id = "32d"

# names = sorted(list(meta_df[meta_df["learned_w"] == 0].Name))
verbose = True
results_dict = {}
for name in tqdm(names):
    # "Crop" is always skipped, and so is any dataset already present in results_dict.
    if name == "Crop" or name in results_dict:
        continue
    # Loading and training errors are left to propagate so a failing dataset
    # stops the run immediately.
    data = UCR_Data(name)
    model = train_embeddings(data, save=SAVE, verbose=False)

## Classification Evaluation

In [7]:
import pandas as pd
# Summary table read from ../Data/UCR_Summary.csv.
meta_df = pd.read_csv("../Data/UCR_Summary.csv")

# Keep the integer that sits between the last "(" and the next ")" of each
# "DTW (learned_w) " cell. The column name still has its trailing space here
# because the headers are only stripped further down.
dtw_cells = meta_df["DTW (learned_w) "]
meta_df["learned_w"] = dtw_cells.apply(
    lambda cell: int(cell.rsplit("(", 1)[-1].split(")", 1)[0])
)

# Index rows by the lower-cased dataset name, then strip whitespace from the headers.
meta_df.index = meta_df["Name"].str.lower()
meta_df.columns = meta_df.columns.str.strip()

# Results file: tab-separated, UTF-16 encoded, first column used as the index.
results_df = pd.read_csv(
    "../Data/UCR_results.csv",
    sep="\t",
    encoding="utf-16",
    index_col=0,
)

In [8]:
# Cap the number of OpenBLAS threads; the author notes this stops the kernel from crashing.
# NOTE(review): numpy is imported in an earlier cell, and OpenBLAS usually reads this
# variable when the library is first loaded — confirm the cap takes effect when set this late.
import os

default_n_threads = 4
os.environ['OPENBLAS_NUM_THREADS'] = str(default_n_threads)
# Equivalent caps for other backends, left disabled:
# os.environ['MKL_NUM_THREADS'] = f"{default_n_threads}"
# os.environ['OMP_NUM_THREADS'] = f"{default_n_threads}"

In [None]:
from utils.ucr_helpers import get_eval_df, add_meta_info, initialize_eval_df
from utils.ucr_helpers import UCR_Data
import numpy as np
from tqdm import tqdm
from utils.ucr_helpers import highlight_max_in_dataset
from collections import defaultdict
import time

# Wall-clock seconds spent in train_embeddings, one entry per dataset per pass.
embed_train_times = defaultdict(list)

# Two independent train-and-evaluate passes; pass i is checkpointed to eval_{i}.pkl.
for i in range(2):
    # Start each pass from an empty table (an earlier experiment resumed from
    # "eval_square_reg_2.pkl" instead).
    df = initialize_eval_df()
    save_string = f"eval_{i}.pkl"

    for name in tqdm(names):
        if name == "Mallat":
            continue
        print(name)
        data = UCR_Data(name)

        # Embeddings are retrained on every pass rather than loaded from ../embeddings/ucr.
        start_time = time.time()
        model = train_embeddings(data, save=False, verbose=False)
        embed_train_times[name].append(time.time() - start_time)
        embeddings = model.embeddings.weight.detach().numpy()

        # Scaled features, with and then without over-sampling. The unscaled
        # variants ("_noscaleOS" / "_noscaleNOS") are disabled.
        # The table is pickled after each variant so partial results are kept.
        for over_sampling, suffix in ((True, "_scaledOS"), (False, "_scaledNOS")):
            df = get_eval_df(
                data, embeddings, df=df, n_resamples=25, verbose=False,
                scale=True, over_sampling=over_sampling, suffix=suffix,
            )
            df.to_pickle(save_string)
    del df


##### Get the LaTeX table with the maximum in bold

In [9]:
import pandas as pd
# NOTE(review): the evaluation loop in this notebook iterates range(2) and therefore
# writes eval_0.pkl and eval_1.pkl only; eval_2.pkl is presumably left over from an
# earlier, longer run — confirm the file exists or point this at a file the loop produces.
df = pd.read_pickle("eval_2.pkl")

In [18]:
from utils.ucr_helpers import apply_styling
# print(styled_df.to_latex(convert_css=True).replace("\\\n\multirow","\\ \midrule\n\multirow"))

In [11]:
# Style the full results table. The commented-out calls are alternative views that
# first filter rows on the "method" level of the index.
# apply_styling(df[df.index.get_level_values("method").str.contains("1NN") & df.index.get_level_values("method").str.contains("noscale")])
# apply_styling(df[~df.index.get_level_values("method").str.contains("raw") & df.index.get_level_values("method").str.contains("1NN")])
# apply_styling(df[df.index.get_level_values("method").str.contains("proposed")])
apply_styling(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1-score,accuracy,classification_time,feature_time
dataset,method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Beef,c22 1NN_scaledNOS,0.568303,0.536,0.535598,0.536,0.096881,0.03782
Beef,c22 1NN_scaledOS,0.568303,0.536,0.535598,0.536,0.101216,0.03881
Beef,c22 SVC_scaledNOS,0.487849,0.453333,0.442589,0.453333,0.07817,0.03782
Beef,c22 SVC_scaledOS,0.487849,0.453333,0.442589,0.453333,0.082868,0.03881
Beef,c22 logistic_scaledNOS,0.461173,0.433333,0.42901,0.433333,0.1157,0.03782
Beef,c22 logistic_scaledOS,0.461173,0.433333,0.42901,0.433333,0.1218,0.03881
Beef,prop+rocket 1NN_scaledNOS,0.631277,0.594667,0.590585,0.594667,0.560642,
Beef,prop+rocket 1NN_scaledOS,0.628913,0.592,0.588428,0.592,0.622067,
Beef,prop+rocket SVC_scaledNOS,0.566003,0.548,0.524689,0.548,2.461872,
Beef,prop+rocket SVC_scaledOS,0.558638,0.54,0.51686,0.54,2.480631,


## Clustering Evaluation

In [None]:
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
from sklearn.metrics import fowlkes_mallows_score
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.cluster import KMeans

def clustering_evaluation(features, labels, random_state=42):
    """Cluster ``features`` with KMeans and score the resulting assignment.

    The number of clusters is the number of distinct values in ``labels``.

    Parameters
    ----------
    features : array-like accepted by ``KMeans.fit_predict``
    labels : iterable of reference labels
    random_state : int, default 42
        Forwarded to ``KMeans``.

    Returns
    -------
    dict
        Nine scores keyed by metric name. The first three are computed from
        ``features`` and the predicted clusters; the remaining six compare
        ``labels`` with the predicted clusters.
    """
    predicted = KMeans(
        n_clusters=len(set(labels)), random_state=random_state
    ).fit_predict(features)

    # Scores that only need the feature matrix and the predicted clusters.
    feature_based = (
        ('silhouette_score', silhouette_score),
        ('calinski_harabasz_score', calinski_harabasz_score),
        ('davies_bouldin_score', davies_bouldin_score),
    )
    # Scores that compare the predicted clusters against the reference labels.
    label_based = (
        ('adjusted_rand_score', adjusted_rand_score),
        ('adjusted_mutual_info_score', adjusted_mutual_info_score),
        ('homogeneity_score', homogeneity_score),
        ('completeness_score', completeness_score),
        ('v_measure_score', v_measure_score),
        ('fowlkes_mallows_score', fowlkes_mallows_score),
    )

    report = {}
    for key, metric in feature_based:
        report[key] = metric(features, predicted)
    for key, metric in label_based:
        report[key] = metric(labels, predicted)
    return report

In [None]:
# NOTE(review): `embeddings` and `data` are whatever earlier cells left in the kernel
# (in top-to-bottom order, the last dataset handled by the evaluation loop) — confirm
# this is the dataset meant to be plotted.
MatrixFactorization.plot_with_dimensionality_reduction(embeddings, data.y, method='pca')

In [None]:
# Cluster the embeddings stored on disk for one dataset and show each score rounded to 4 decimals.
# NOTE(review): the .npy file must already exist — presumably written by an earlier
# train_embeddings(..., save=True) run; confirm before a fresh Run All.
name = "Mallat"
embeddings = np.load(f"../embeddings/ucr/{name}_32d.npy")
data = UCR_Data(name)
cluster_out = clustering_evaluation(features=embeddings, labels=data.y)
{k:round(v,4) for k,v in cluster_out.items()}





{'silhouette_score': 0.4186,
 'calinski_harabasz_score': 1542.3509,
 'davies_bouldin_score': 0.9839,
 'adjusted_rand_score': 0.9493,
 'adjusted_mutual_info_score': 0.9626,
 'homogeneity_score': 0.9621,
 'completeness_score': 0.9635,
 'v_measure_score': 0.9628,
 'fowlkes_mallows_score': 0.9557}

In [None]:

# Same clustering report, computed on the features returned by
# data.get_sktime_features_unsupervised for the dataset currently held in `data`.
# NOTE(review): assumes the returned rows line up with data.y — TODO confirm.
c22_features = data.get_sktime_features_unsupervised(X_train=data.X_train, X_test=data.X_test)
cluster_out = clustering_evaluation(features=c22_features, labels=data.y)
{k:round(v,4) for k,v in cluster_out.items()}





{'silhouette_score': 0.4295,
 'calinski_harabasz_score': 10690.0986,
 'davies_bouldin_score': 0.9908,
 'adjusted_rand_score': 0.2645,
 'adjusted_mutual_info_score': 0.4126,
 'homogeneity_score': 0.4101,
 'completeness_score': 0.4211,
 'v_measure_score': 0.4155,
 'fowlkes_mallows_score': 0.3612}