# Comprehensive Analysis of Datasets with Various Metrics, Norms, and Methods

In [1]:
import sys
sys.path.append('../')

import numpy as np
from AnalysisTools import Ana

# Initialize the analysis object
analysis = Ana(showPlots=True, execution_mode='cpu', cacheStoragePath='/home/diego/disks/ANACACHE')

# Add datasets
analysis.add_dataset(label="IMR90SIM", folder="data/IMR90SIM")
analysis.add_dataset(label='IMR90OPT', folder='data/IMR90OPT')
analysis.add_dataset(label="IMR90OPT57", folder='data/IMR90OPT56')

# Process trajectories for each dataset
analysis.process_trajectories(label="IMR90SIM", filename="traj_chr_IMR90OPT_0.cndb", folder_pattern=['iteration_', [1, 20]])
analysis.process_trajectories(label="IMR90OPT", filename="traj_0.cndb", folder_pattern=['iteration_', [1, 20]])
analysis.process_trajectories(label="IMR90OPT57", filename="traj_0.cndb", folder_pattern=['iteration_', [1, 20]])

  from .autonotebook import tqdm as notebook_tqdm
2024-07-21 01:37:19.147579: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-21 01:37:19.163467: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-21 01:37:19.168555: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-21 01:37:19.180367: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Processing file: data/IMR90SIM/iteration_1/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_2/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_3/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_4/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_5/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_6/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_7/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_8/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_9/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_10/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_11/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_12/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_13/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_14/traj_chr_IMR90OPT_0.cndb
Processing file: data/IMR90SIM/iteration_15

## Generate and Cache Distance Matrices

In [2]:
norms = ['ice', 'kr', 'log_transform', 'vc']
metrics = ['euclidean', 'pearsons', 'spearman', 'contact', 'log2_contact']
methods = ['single', 'complete', 'average', 'weighted']
#! note single/spearman/ice leads to division of 0 and all data gets set to 1 to avoid division of 0 error
#! note single/log2_contact/ice leads to division of 0 and all data divided by zero will be set to 0
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.calc_XZ("IMR90OPT", "IMR90OPT57", "IMR90SIM", metric=metric, norm=norm, method=method)

using cached data: /home/diego/disks/ANACACHE/cache_('IMR90OPT', 'IMR90OPT57', 'IMR90SIM', 'single', 'euclidean', 'ice').pkl.npz
using cached data: /home/diego/disks/ANACACHE/cache_('IMR90OPT', 'IMR90OPT57', 'IMR90SIM', 'complete', 'euclidean', 'ice').pkl.npz
using cached data: /home/diego/disks/ANACACHE/cache_('IMR90OPT', 'IMR90OPT57', 'IMR90SIM', 'average', 'euclidean', 'ice').pkl.npz
using cached data: /home/diego/disks/ANACACHE/cache_('IMR90OPT', 'IMR90OPT57', 'IMR90SIM', 'weighted', 'euclidean', 'ice').pkl.npz
using cached data: /home/diego/disks/ANACACHE/cache_('IMR90OPT', 'IMR90OPT57', 'IMR90SIM', 'single', 'pearsons', 'ice').pkl.npz
using cached data: /home/diego/disks/ANACACHE/cache_('IMR90OPT', 'IMR90OPT57', 'IMR90SIM', 'complete', 'pearsons', 'ice').pkl.npz
using cached data: /home/diego/disks/ANACACHE/cache_('IMR90OPT', 'IMR90OPT57', 'IMR90SIM', 'average', 'pearsons', 'ice').pkl.npz
using cached data: /home/diego/disks/ANACACHE/cache_('IMR90OPT', 'IMR90OPT57', 'IMR90SIM', '

2024-07-21 01:38:05.200843: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-21 01:38:05.223602: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-21 01:38:05.230600: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-21 01:38:05.245610: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-21 01:38:05.271714: E external/local_xla/xla/

IMR90OPT57 has dist shape (20, 1173, 1173)
Processing IMR90SIM
IMR90SIM has dist shape (100, 1183, 1183)
division of zero setting <object type:float64> to one
division of zero setting <object type:float64> to one
division of zero setting <object type:float64> to one
division of zero setting <object type:float64> to one
division of zero setting <object type:float64> to one
division of zero setting <object type:float64> to one
division of zero setting <object type:float64> to one
division of zero setting <object type:float64> to one
division of zero setting <object type:float64> to one
division of zero setting <object type:float64> to one
division of zero setting <object type:float64> to one
division of zero setting <object type:float64> to one
division of zero setting <object type:float64> to one
division of zero setting <object type:float64> to one
division of zero setting <object type:float64> to one
division of zero setting <object type:float64> to one
division of zero setting <objec

2024-07-21 01:47:39.586301: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-21 01:47:39.594657: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-21 01:47:39.601830: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-21 01:47:39.611280: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-21 01:47:39.615795: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory fo

IMR90OPT has dist shape (20, 1173, 1173)
Processing IMR90OPT57


  log2_X = np.log2(X + 1)
  log2_X = np.log2(X + 1)
  log2_X = np.log2(X + 1)
  log2_X = np.log2(X + 1)
  log2_X = np.log2(X + 1)


IMR90OPT57 has dist shape (20, 1173, 1173)
Processing IMR90SIM
IMR90SIM has dist shape (100, 1183, 1183)
Flattened distance array has shape: (140, 699153)


ValueError: The condensed distance matrix must contain only finite values.

## Dimensionality Reduction Techniques

### PCA Analysis

In [None]:
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.pca("IMR90OPT", "IMR90OPT56", "IMR90SIM", metric=metric, n_components=-1, norm=norm, method=method)

### UMAP Analysis

In [None]:
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.umap("IMR90OPT", "IMR90OPT56", "IMR90SIM", metric=metric, num_clusters=-1, norm=norm, method=method)

### t-SNE Analysis

In [None]:
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.tsne("IMR90OPT", "IMR90OPT56", "IMR90SIM", metric=metric, num_clusters=-1, norm=norm, method=method)

### MDS Analysis

In [None]:
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.mds("IMR90OPT", "IMR90OPT56", "IMR90SIM", metric=metric, n_components=-1, norm=norm, method=method)

### SVD Analysis

In [None]:
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.svd("IMR90OPT", "IMR90OPT56", "IMR90SIM", metric=metric, n_components=-1, norm=norm, method=method)

## Clustering Techniques

### K-means Clustering

In [None]:
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.kmeans_clustering("IMR90OPT", "IMR90OPT56", "IMR90SIM", n_clusters=5, metric=metric, norm=norm, method=method)

### DBSCAN Clustering

In [None]:
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.dbscan_clustering("IMR90OPT", "IMR90OPT56", "IMR90SIM", eps=0.5, min_samples=5, metric=metric, norm=norm, method=method)

### Hierarchical Clustering

In [None]:
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.hierarchical_clustering("IMR90OPT", "IMR90OPT56", "IMR90SIM", n_clusters=5, metric=metric, norm=norm, method=method)

### Spectral Clustering

In [None]:
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.spectral_clustering("IMR90OPT", "IMR90OPT56", "IMR90SIM", num_clusters=-1, metric=metric, norm=norm, method=method)

### OPTICS Clustering

In [None]:
for norm in norms:
    for metric in metrics:
        for method in methods:
            analysis.optics_clustering("IMR90OPT", "IMR90OPT56", "IMR90SIM", min_samples=5, xi=0.05, min_cluster_size=0.05, metric=metric, norm=norm, method=method)