# Comprehensive Analysis of Datasets with Various Metrics, Norms, and Methods

In [1]:
import sys
sys.path.append('../')

import numpy as np
from AnalysisTools import Ana
from AnalysisTools import ComputeHelpersCPU

# Build the compute backend and the analysis driver; both share one cache directory.
cachePath = '/home/diego/disks/ANACACHE'
comp = ComputeHelpersCPU(
    memory_location=cachePath,
    memory_verbosity=0,
    n_jobs=8,
)
analysis = Ana(
    showPlots=True,
    execution_mode=comp,
    cacheStoragePath=cachePath,
)

# One row per dataset: (label, data folder, trajectory file name).
# NOTE(review): label "IMR90OPT57" is registered against folder 'data/IMR90OPT56' --
# confirm which of the two names is the intended one.
dataset_table = [
    ("IMR90SIM", "data/IMR90SIM", "traj_chr_IMR90OPT_0.cndb"),
    ("IMR90OPT", "data/IMR90OPT", "traj_0.cndb"),
    ("IMR90OPT57", "data/IMR90OPT56", "traj_0.cndb"),
]

# Register every dataset first ...
for ds_label, ds_folder, _traj_name in dataset_table:
    analysis.add_dataset(label=ds_label, folder=ds_folder)

# ... then process the trajectories of each one. The folder pattern is rebuilt
# per call so that no list object is shared between calls.
# (presumably selects the sub-folders iteration_1 .. iteration_20 -- TODO confirm)
for ds_label, _ds_folder, traj_name in dataset_table:
    analysis.process_trajectories(
        label=ds_label,
        filename=traj_name,
        folder_pattern=['iteration_', [1, 20]],
    )

  from .autonotebook import tqdm as notebook_tqdm
2024-07-22 16:01:21.566496: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Processing file: data/IMR90SIM/iteration_1/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_2/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_3/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_4/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_5/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_6/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_7/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_8/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_9/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_10/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_11/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_12/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_13/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_14/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_15

## Define norm x method x metric

In [2]:
# Values swept as the `norm=` keyword of the analysis calls.
norms = [
    'ice',
    'kr',
    'log_transform',
    'vc',
]

# Values swept as the `metric=` keyword of the analysis calls.
metrics = [
    'euclidean',
    'pearsons',
    'spearman',
    'contact',
    'log2_contact',
]

# Values swept as the `method=` keyword of the analysis calls.
# (presumably linkage methods -- TODO confirm against AnalysisTools)
methods = [
    'single',
    'complete',
    'average',
    'weighted',
]

## Generate and Cache Distance Matrices

In [None]:
#! NOTE: single/spearman/ice leads to a division by zero; all data gets set to 1 to avoid the division-by-zero error
#! NOTE: single/log2_contact/ice leads to a division by zero; all data divided by zero will be set to 0
# Run calc_XZ once for every norm x metric x method combination of the three datasets.
xz_labels = ("IMR90OPT", "IMR90OPT57", "IMR90SIM")
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.calc_XZ(
                *xz_labels,
                metric=metric,
                norm=norm,
                method=method,
            )

## Dimensionality Reduction Techniques

### PCA Analysis

In [None]:
# One-component PCA with two clusters for every norm x metric combination.
# No `method=` is passed here, so this sweep does not iterate over `methods`.
pca_labels = ("IMR90OPT", "IMR90OPT57", "IMR90SIM")
for norm in norms:
    for metric in metrics:
        analysis.pca(
            *pca_labels,
            metric=metric,
            n_components=1,
            norm=norm,
            n_clusters=2,
        )

## Experimental Scaler with best pca norm x method x metric dataset

In [None]:
#_weighted_euclidean_log_transform.png
# (presumably the file-name suffix of the plot this cell was derived from -- TODO confirm)
from AnalysisTools.Plot_Helper import PlotHelper

# Re-declared so this cell can be run on its own.
metrics = ['euclidean', 'pearsons', 'spearman', 'contact', 'log2_contact']
norms = ['ice', 'kr', 'log_transform', 'vc']

# Loop invariants, hoisted out of the sweep.
exp_method = "weighted"
n_components = 1

for norm in norms:
    for metric in metrics:
        # Recompute X/Z for this combination, bypassing the cache.
        # NOTE(review): the keyword is spelled `expiremental` in this call; it presumably
        # matches the calc_XZ signature, so it is left untouched -- confirm before renaming.
        # Z is not used below.
        X, Z = analysis.calc_XZ(
            "IMR90SIM", "IMR90OPT", "IMR90OPT57",
            method=exp_method,
            metric=metric,
            norm=norm,
            overrideCache=True,
            expiremental=True,
        )

        # Use n_components rather than a separate literal so the reduction and the
        # plot parameters cannot drift apart.
        pca, exp, com = comp.run_reduction('pca', X, n_components)

        plot_params = {
            'outputFileName': f"test00_{norm}_{metric}",
            'cmap': 'viridis',
            'title': 'PCA of test',
            'x_label': 'PC1',
            # exp[1] is only read when a second component exists.
            'y_label': (
                f'PC2 ({exp[1]:.2%} variance)' if n_components > 1 else 'Samples'
            ),
            'z_label': 'PC3' if n_components > 2 else None,
            'n_components': n_components,
            'n_clusters': 2,
            # Describe the combination actually being plotted. These used to be
            # hard-coded to "euclidean"/"log_transform" for every iteration.
            'method': exp_method,
            'metric': metric,
            'norm': norm,
            'n_components_95': 1,
            'size': 50,
            'alpha': 0.7,
        }

        pl = PlotHelper()
        pl.plot(plot_type="pcaplot", data=(pca, exp, com), plot_params=plot_params)


### UMAP Analysis

In [None]:
# One-component UMAP with two clusters for every norm x metric x method combination.
umap_labels = ("IMR90OPT", "IMR90OPT57", "IMR90SIM")
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.umap(
                *umap_labels,
                metric=metric,
                n_clusters=2,
                norm=norm,
                method=method,
                n_components=1,
            )

### t-SNE Analysis

In [None]:
# One-component t-SNE with two clusters for every norm x metric x method combination.
tsne_labels = ("IMR90OPT", "IMR90OPT57", "IMR90SIM")
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.tsne(
                *tsne_labels,
                metric=metric,
                n_clusters=2,
                norm=norm,
                method=method,
                n_components=1,
            )

### MDS Analysis

In [None]:
# One-component MDS for every norm x metric x method combination.
# Unlike the other reductions, no n_clusters is passed here.
mds_labels = ("IMR90OPT", "IMR90OPT57", "IMR90SIM")
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.mds(
                *mds_labels,
                metric=metric,
                n_components=1,
                norm=norm,
                method=method,
            )

### SVD Analysis

In [None]:
# One-component SVD with two clusters for every norm x metric x method combination.
svd_labels = ("IMR90OPT", "IMR90OPT57", "IMR90SIM")
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.svd(
                *svd_labels,
                metric=metric,
                n_components=1,
                norm=norm,
                method=method,
                n_clusters=2,
            )

### IVIS Analysis

In [4]:
# One-component IVIS with two clusters for every norm x metric x method combination.
ivis_labels = ("IMR90OPT", "IMR90OPT57", "IMR90SIM")
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.ivis_clustering(
                *ivis_labels,
                metric=metric,
                n_components=1,
                norm=norm,
                method=method,
                n_clusters=2,
            )

## Clustering Techniques

### K-means Clustering

In [None]:
# K-means (n_clusters=5) for every norm x metric x method combination.
# The second dataset label is "IMR90OPT57", the label registered with add_dataset
# in the setup cell; "IMR90OPT56" is only that dataset's folder name and is not a
# registered label.
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.kmeans_clustering("IMR90OPT", "IMR90OPT57", "IMR90SIM", n_clusters=5, metric=metric, norm=norm, method=method)

### DBSCAN Clustering

In [None]:
# DBSCAN (eps=0.5, min_samples=5) for every norm x metric x method combination.
# The second dataset label is "IMR90OPT57", the label registered with add_dataset
# in the setup cell; "IMR90OPT56" is only that dataset's folder name and is not a
# registered label.
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.dbscan_clustering("IMR90OPT", "IMR90OPT57", "IMR90SIM", eps=0.5, min_samples=5, metric=metric, norm=norm, method=method)

### Hierarchical Clustering

In [None]:
# Hierarchical clustering (n_clusters=5) for every norm x metric x method combination.
# The second dataset label is "IMR90OPT57", the label registered with add_dataset
# in the setup cell; "IMR90OPT56" is only that dataset's folder name and is not a
# registered label.
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.hierarchical_clustering("IMR90OPT", "IMR90OPT57", "IMR90SIM", n_clusters=5, metric=metric, norm=norm, method=method)

### Spectral Clustering

In [None]:
# Spectral clustering for every norm x metric x method combination.
# NOTE(review): num_clusters=-1 presumably asks the helper to choose the number of
# clusters itself -- confirm against AnalysisTools.
# The second dataset label is "IMR90OPT57", the label registered with add_dataset
# in the setup cell; "IMR90OPT56" is only that dataset's folder name and is not a
# registered label.
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.spectral_clustering("IMR90OPT", "IMR90OPT57", "IMR90SIM", num_clusters=-1, metric=metric, norm=norm, method=method)

### OPTICS Clustering

In [None]:
# OPTICS (min_samples=5, xi=0.05, min_cluster_size=0.05) for every
# norm x metric x method combination.
# The second dataset label is "IMR90OPT57", the label registered with add_dataset
# in the setup cell; "IMR90OPT56" is only that dataset's folder name and is not a
# registered label.
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.optics_clustering("IMR90OPT", "IMR90OPT57", "IMR90SIM", min_samples=5, xi=0.05, min_cluster_size=0.05, metric=metric, norm=norm, method=method)