# Import libraries

In [None]:
import numpy as np
import pandas as pd
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
import kmapper as km
import networkx as nx
import scikit_posthocs as sp
import matplotlib.pyplot as plt
import seaborn as sns
import math
import dyneusr as dyn
import matplotlib.style as style
import warnings
import tqdm

from tda.tda_graph import TDAGraph
from tda.tda_graph_visualizer import TDAVisualizer
from community_detection.community import Community
from tda.tda_mapping import MappingTDA
from harmonization.neurocombat_pyClasse import *
from stats.stats_test import ChiSquaredAnalyzer, FeatureSetStatisticsAnalyzer, ContingencyTableComparator
# from stats.pdf_estimation import PDFEstimator
from tda.tda_mapping import MappingTDA
from graph_metrics.graph_metrics import GraphMetrics
from scipy.spatial.distance import cosine 


from scipy import stats
from scipy.spatial.distance import jensenshannon, euclidean, cityblock, correlation
from statsmodels.stats.multitest import multipletests
from statsmodels.graphics.mosaicplot import mosaic
from itertools import combinations
from tuning.heatmap_creation import Heatmap

from collections import defaultdict, Counter
from sklearn.metrics import adjusted_rand_score, accuracy_score, f1_score, mutual_info_score
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

import numpy as np
from scipy import stats
from scipy.spatial.distance import cosine
from tqdm import tqdm

# Seed shared by every step that accepts an explicit random state.
seed_value = 42
# Defining the constant alone seeds nothing, so also seed NumPy's global RNG.
# NOTE(review): this only helps components that draw from the global NumPy
# RNG when no explicit random state is given — confirm which steps do.
np.random.seed(seed_value)

# Load data

In [None]:
data_path = '/path/to/your/data'  # Update this path to your data directory

# Full paths of the four input CSV files; update each file name to match
# your actual dataset / labels files.
data_file = f'{data_path}/your_data_file.csv'
labels_file = f'{data_path}/your_labels_file.csv'
data_2_file = f'{data_path}/your_data_2_file.csv'
labels_2_file = f'{data_path}/your_labels_2_file.csv'

# First dataset and its labels table.
data = pd.read_csv(data_file)
labels = pd.read_csv(labels_file)

# Second dataset and its labels table.
data_2 = pd.read_csv(data_2_file)
labels_2 = pd.read_csv(labels_2_file)

In [None]:
# Column names of the first dataset; reused as the header of both scaled frames.
features_name = data.columns

# Standardize both datasets and wrap the resulting arrays back into DataFrames.
# NOTE(review): the same scaler object is fitted again on data_2, so data_2 is
# standardized with its own statistics and `scaler` ends up fitted on data_2.
# Confirm this is intended rather than scaler.transform(data_2).
scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data), columns=features_name)
data_scaled_2 = pd.DataFrame(scaler.fit_transform(data_2), columns=features_name)

# TDA

In [None]:
# Keyword settings for the TDA graph, kept in one place so they are easy to tune.
tda_params = dict(
    filter_func='tsne',
    n_cubes=30,
    perc_overlap=0.8,
    cluster_algorithm='dbscan',
    min_samples=3,
    eps=8,
    metric='chebyshev',
)

# Build the graph on the scaled first dataset.
tdagraph = TDAGraph(data_scaled, **tda_params)

# create_graph() returns four objects that are reused further down.
graph, bins, lens, cover = tdagraph.create_graph()

In [None]:
# Directory passed to the visualizer as its output path.
save_path = '/path/to/save/visualization'  # Update this path to your desired save directory
tdagraph_visualizer = TDAVisualizer(graph,
                                    path_html='graph_disc.html',
                                    output_path=save_path)

# Render the graph using the labels table as target.
# NOTE(review): type_='dyn' presumably selects a DyNeuSR-based rendering — confirm in TDAVisualizer.
tdagraph_visualizer.visualize(type_='dyn', target=labels, template=None)

# Community detection

In [None]:
# Community-detection helper built on the TDA graph, configured with
# algorithm='louvain' and the shared seed_value.
# Note: the unscaled `data` frame (not `data_scaled`) is passed as dataset.
community = Community(graph,
                      algorithm='louvain',
                      seed_value=seed_value,
                      dataset=data,
                      labels_df=labels)

In [None]:
# Run community detection and keep the returned communities.
# NOTE(review): the meaning of k=10 is defined by Community.community_detection — confirm there.
comms = community.community_detection(k=10)
# The return value (if any) is not captured here.
community.compute_modularity()
print(f"Number of communities detected: {len(comms)}")

In [None]:
# Per-community sample frames, then the filtered version produced by the
# Community helper.
samples_df = community.get_sample_idx()
filtered_samples_df = community.communities_filtering(samples_df)

# Index values of the label rows whose 'Study Group' is HC, ROP or ROD.
in_selected_groups = labels['Study Group'].isin(['HC', 'ROP', 'ROD'])
valid_ids = labels.loc[in_selected_groups].index

# Restrict each filtered community frame to rows whose index is in valid_ids.
filtered_samples_small_df = [
    comm_df[comm_df.index.isin(valid_ids)]
    for comm_df in filtered_samples_df
]

In [None]:
# Label frequencies computed from the filtered community frames.
# NOTE(review): this uses filtered_samples_df, not the HC/ROP/ROD-restricted
# filtered_samples_small_df — confirm that is intended.
study_group_frequency_filtered = community.get_labels_frequency(filtered_samples_df)
# Bare expression: shown as the notebook cell output.
study_group_frequency_filtered

## Chi2 test

In [None]:
# Chi-squared analysis on the label-frequency table of the first dataset.
# NOTE(review): 'fdr_bh' presumably names the Benjamini-Hochberg FDR correction
# (statsmodels multipletests naming) — confirm in ChiSquaredAnalyzer.
chi_square_analyzer_filtered = ChiSquaredAnalyzer(
    study_group_frequency=study_group_frequency_filtered,
    correction_method='fdr_bh'
)

chi_square_analyzer_filtered.run_analysis(verbose=True)

# Entropy result from the analyzer; the bare expression below displays it
# as the notebook cell output.
community_entropy_filtered = chi_square_analyzer_filtered.community_entropy()
community_entropy_filtered

## Feature importance ranking and statistical analysis

In [None]:
# Analyzer configured with the labels table, the label-frequency table of the
# first dataset, 'fdr_bh' correction and alpha=0.05.
feature_set_statistics_analyzer = FeatureSetStatisticsAnalyzer(
    labels=labels,
    study_group_frequency=study_group_frequency_filtered,
    correction_method='fdr_bh',
    alpha=0.05,
)

# Correlate the scaled features with the lens (method='pearson') and keep the
# features the analyzer selects.
# NOTE(review): the meaning of ratio=0.3 is defined by compute_feature_correlations — confirm there.
results_df, selected_features = feature_set_statistics_analyzer.compute_feature_correlations(
    data_scaled,
    lens,
    feature_names=data.columns.tolist(),
    method='pearson',
    ratio=0.3,
)

# Keep only the selected feature columns in every restricted community frame.
new_disc_samples_df = [
    comm_df[selected_features] for comm_df in filtered_samples_small_df
]

print(f"Number of selected features: {len(selected_features)}")
print("Selected features:", new_disc_samples_df[0].columns.tolist())
print('')

feature_set_statistics_analyzer.run_global_analysis(new_disc_samples_df)

# TDA Mapping

In [None]:
# Mapping helper built from the lens, cover and bins returned by
# tdagraph.create_graph().
mappingtda = MappingTDA(
    lens=lens,
    cover=cover,
    bins=bins,
)

# Node indices per community, requested with merging=False.
communities = community.get_node_idx_communities(merging=False)
# Index-based access is kept on purpose: the container type returned above is
# not visible here, so only len() and integer indexing are relied upon.
for n in range(len(communities)):
    print(f'Number of nodes in community {n}: {len(communities[n])}')

print(f'Total number of nodes in the graph: {len(graph["nodes"])}')

In [None]:
# Build a contingency table for the second (scaled) dataset using the graph and
# communities obtained from the first one; the overlap is also returned
# (return_overlap=True) and plotted (plot_overlap=True).
# Note: the keyword 'contigency_table_constructed' is spelled as the
# MappingTDA API expects it; do not "fix" the spelling here.
contingency_table_small, comm_overlap_replication = mappingtda.build_contingency_table(
    data=data_scaled_2,
    labels=labels_2,
    contigency_table_constructed=study_group_frequency_filtered,
    tdagraph=tdagraph,
    communities=community,
    return_overlap=True,
    plot_overlap=True,
)

# Bare expression: shown as the notebook cell output.
contingency_table_small

## Chi2 test

In [None]:
# Drop rows of the mapped contingency table that contain only zeros.
has_nonzero_entry = (contingency_table_small != 0).any(axis=1)
df_cleaned = contingency_table_small.loc[has_nonzero_entry]

# Re-run the chi-squared analysis on the cleaned table. This rebinds the
# analyzer name used earlier for the first dataset.
chi_square_analyzer_filtered = ChiSquaredAnalyzer(
    study_group_frequency=df_cleaned, correction_method='fdr_bh'
)
chi_square_analyzer_filtered.run_analysis(verbose=True)