In [None]:
import scanpy as sc
import patpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Sample-identifier key for each dataset; the value is what gets passed as
# `sample_key` when building the per-sample representation.
sample_keys = dict(
    combat="scRNASeq_sample_ID",
    stephenson="sample_id",
    onek1k="donor_id",
    hlca="donor_id",
)

# Datasets are processed in the order in which their keys are declared above.
datasets = list(sample_keys)

In [None]:
import time

# For every dataset: load the processed AnnData, compute the GloScope
# sample-by-sample distance matrix and store it under
# `adata.uns["gloscope_representation"]`.
# NOTE: the processed .h5ad file is read and then overwritten in place.
for dataset in datasets:
    # perf_counter() is a monotonic clock, so the measured duration cannot be
    # skewed by system clock adjustments (unlike differences of time.time()).
    start_time = time.perf_counter()
    print("Working on dataset:", dataset)
    layer = "X_pca" if dataset == "hlca" else "X_scpoli"  # HLCA doesn't have scPoli cell embeddings computed

    # Single source of truth for the path that is both read and rewritten.
    processed_path = f"../data/{dataset}/{dataset}_processed.h5ad"
    adata = sc.read_h5ad(processed_path)
    print(adata)

    gloscope = patpy.tl.GloScope_py(sample_key=sample_keys[dataset], layer=layer, use_gpu=True)
    gloscope.prepare_anndata(adata)
    # NOTE(review): force=True presumably bypasses any cached result - confirm against patpy.
    prior_distance_matrix = gloscope.calculate_distance_matrix(force=True)
    # Label both axes with the sample identifiers exposed by the GloScope object.
    prior_distance_matrix = pd.DataFrame(prior_distance_matrix, index=gloscope.samples, columns=gloscope.samples)

    adata.uns["gloscope_representation"] = prior_distance_matrix
    adata.write_h5ad(processed_path)
    end_time = time.perf_counter()
    print(f"Time taken for {dataset}: {end_time - start_time} seconds")


# Compute graph statistics for smarter subsetting of samples

In [49]:
import cudf
import cugraph

In [None]:
# For every dataset: build a neighbour graph on the "X_scANVI_batch"
# representation, compute per-cell eigenvector centrality with cuGraph,
# plot its distribution and write the annotated AnnData back to disk.
# Datasets that already carry an `eigenvector_centrality` column are skipped.
for dataset in datasets:
    print("Working on dataset:", dataset)

    # Single source of truth for the path that is both read and rewritten.
    processed_path = f"../data/{dataset}/{dataset}_processed.h5ad"
    adata = sc.read_h5ad(processed_path)

    if "eigenvector_centrality" in adata.obs.columns:
        print("Eigenvector centrality already computed, skipping...")
        continue

    sc.pp.neighbors(adata=adata, use_rep="X_scANVI_batch")

    # Convert AnnData connectivities (sparse matrix) to COO
    coo = adata.obsp['connectivities'].tocoo()

    # Build cuDF edge list
    edges = cudf.DataFrame({
        'src': coo.row.astype(np.int32),
        'dst': coo.col.astype(np.int32),
        'weight': coo.data
    })

    # Create cuGraph graph (undirected for kNN)
    G = cugraph.Graph(directed=False)
    G.from_cudf_edgelist(edges, source='src', destination='dst', edge_attr='weight')

    eig = cugraph.eigenvector_centrality(G, max_iter=1000, tol=1e-6).to_pandas().set_index('vertex')
    adata.obs['vertex'] = np.arange(adata.n_obs)  # Create unique integer IDs because the `eig` has a different order
    # NOTE(review): the helper `vertex` column stays in `adata.obs` and is therefore
    # persisted by the write below - drop it if nothing downstream reads it.
    adata.obs['eigenvector_centrality'] = adata.obs['vertex'].map(eig['eigenvector_centrality']).values

    # Explicit Figure/Axes instead of the implicit pyplot state machine, so the
    # labels are guaranteed to land on the axes that holds the histogram.
    fig, ax = plt.subplots()
    sns.histplot(
        data=adata.obs,
        x="eigenvector_centrality",
        bins=20,
        kde=False,
        log_scale=True,
        ax=ax,
    )
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
    ax.set_title("Distribution of Eigenvector Centrality Values for dataset: " + dataset)

    sns.despine(ax=ax)
    plt.show()
    # One figure is created per dataset; release it explicitly so figures do
    # not accumulate across iterations on backends that keep them alive.
    plt.close(fig)

    adata.write_h5ad(processed_path)