<a href="https://colab.research.google.com/github/EricKenjiLee/WaveMAP_Paper/blob/main/WaveMAP_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import all packages


In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import datasets
from PIL import Image
from io import BytesIO
import base64


from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper, ContinuousColorMapper
from bokeh.palettes import Turbo256
from bokeh.transform import linear_cmap
from bokeh.transform import factor_cmap

# from scipy.io import loadmat

from os.path import dirname, join as pjoin
import scipy.io as sio

from umap import umap_ as umap
import networkx as nx
import igraph as ig

import matplotlib.pyplot as plt

## ECG needs to be defined and added manually to iGraph

In [3]:
def community_ecg(self, weights=None, ens_size=16, min_weight=0.05):
    W = [0]*self.ecount()
    ## Ensemble of level-1 Louvain 
    for i in range(ens_size):
        p = np.random.permutation(self.vcount()).tolist()
        g = self.permute_vertices(p)
        l = g.community_multilevel(weights=weights, return_levels=True)[0].membership
        b = [l[p[x.tuple[0]]]==l[p[x.tuple[1]]] for x in self.es]
        W = [W[i]+b[i] for i in range(len(W))]
    W = [min_weight + (1-min_weight)*W[i]/ens_size for i in range(len(W))]
    ## Force min_weight outside 2-core
    core = self.shell_index()
    ecore = [min(core[x.tuple[0]],core[x.tuple[1]]) for x in self.es]
    w = [W[i] if ecore[i]>1 else min_weight for i in range(len(ecore))]
    part = self.community_multilevel(weights=w)
    part.W = w
    part.CSI = 1-2*np.sum([min(1-i,i) for i in w])/len(w)
    return part

ig.Graph.community_ecg = community_ecg

## Loading the waveform data

In [4]:
data_dir = 'data'

mat_fname = pjoin(data_dir, 'waveforms_mean.mat')
fmnist = sio.loadmat(mat_fname)
fmnist_subset = fmnist["waveforms_mean"]


(363, 181)

## Bootstrap UMAP to get confidence score

In [5]:
# Define the number of bootstraps
n_bootstraps = 100
confidence_scores = np.zeros((fmnist_subset.shape[0],))
cluster_counts = []

# Bootstrap clustering
for _ in range(n_bootstraps):
    # Resample 80% of the data
    subset, _ = resample(fmnist_subset, n_samples=int(0.8 * len(fmnist_subset)), replace=False)
    
    # UMAP dimensionality reduction
    reducer = umap.UMAP(random_state=42)
    embedding = reducer.fit_transform(subset)
    
    # Convert UMAP graph to NetworkX graph and then to igraph
    G = nx.from_scipy_sparse_matrix(reducer.graph_)
    umap_igraph = ig.Graph(len(G), list(zip(*list(zip(*nx.to_edgelist(G)))[:2])))
    
    # Perform ECG community detection
    umap_ECG = umap_igraph.community_ecg(ens_size=10, min_weight=0.5)
    
    # Assign clusters back to the original dataset
    full_cluster_assignment = np.zeros(fmnist_subset.shape[0], dtype=int) - 1
    full_cluster_assignment[:len(umap_ECG.membership)] = umap_ECG.membership
    
    # Update confidence scores
    for idx in range(fmnist_subset.shape[0]):
        if full_cluster_assignment[idx] != -1:
            confidence_scores[idx] += (confidence_scores[idx] == full_cluster_assignment[idx])
    
    # Track the number of clusters
    cluster_counts.append(len(set(umap_ECG.membership)))

# Calculate final confidence scores
confidence_scores /= n_bootstraps


NameError: name 'resample' is not defined

### Plot results

In [None]:
# Plot confidence scores for cells
plt.figure(figsize=(10, 6))
sns.histplot(confidence_scores, bins=30, kde=True)
plt.title('Confidence Scores for Cell Clustering')
plt.xlabel('Confidence Score')
plt.ylabel('Frequency')
plt.show()

# Plot confidence scores for the number of clusters
plt.figure(figsize=(10, 6))
sns.histplot(cluster_counts, bins=30, kde=True)
plt.title('Confidence Scores for Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Frequency')
plt.show()

### Save confidence scores

In [None]:
# Save the confidence scores to a CSV file
confidence_df = pd.DataFrame({
    'cell_index': np.arange(fmnist_subset.shape[0]),
    'confidence_score': confidence_scores
})
confidence_df.to_csv('analysis_output/confidence_scores.csv', index=False)