# Data

> Helpers for data management: building demo network data (nodes and edges) from a single-cell embedding

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
# | export

## helper functions
import roux.lib.df as rd #noqa

def get_net_data(
    adata,
    n: int = 5,  # number of nodes per subset/group
) -> tuple:
    """Build demo network data (nodes and edges) from the UMAP embedding.

    Nodes are the cells of `adata`, optionally subsampled per `bulk_labels`
    group. Edges are the node pairs whose Euclidean distance in the UMAP
    coordinates is below the 10th percentile of all pairwise distances.

    Args:
        adata (_type_): Annot data of `scanpy`. Must provide `obs` with a
            `bulk_labels` column and a two-column `obsm["X_umap"]`.
        n (int, optional): rows to sample from each `bulk_labels` group.
            If None, no sampling is done and every row is kept (note that
            the number of candidate pairs grows quadratically with the
            number of nodes). Defaults to 5.

    Returns:
        tuple: nodes, edges
    """
    import logging
    from itertools import combinations

    import pandas as pd
    from scipy.spatial.distance import pdist

    # Imported for its side effect: presumably this registers the `.log`
    # dataframe accessor used below -- confirm against roux.
    import roux.lib.df as rd  # noqa: F401

    ## nodes
    logging.info("processing nodes:")
    df01 = pd.concat(
        [
            adata.obs.reset_index().rename(columns={"index": "cell id"}),
            pd.DataFrame(adata.obsm["X_umap"], columns=["x", "y"]),
        ],
        axis=1,
    )
    if n is None:
        # No subsampling requested: use every cell as a node.
        df1 = df01
    else:
        # Fixed random_state keeps the per-group sample reproducible.
        df1 = (
            df01.log()
            .groupby("bulk_labels", as_index=False, observed=False)
            .apply(lambda df: df.sample(n, random_state=0))
            .reset_index(drop=True)
            .log()
        )
    df1 = df1.sort_values("bulk_labels")

    # Pairwise Euclidean distances between the node coordinates.
    coordinates = df1[["x", "y"]].values
    distances = pdist(coordinates, metric="euclidean")

    # `pdist` returns the condensed distances in the order
    # (0, 1), (0, 2), ..., (1, 2), ... which is the same order in which
    # `combinations` yields the pairs, so the two sequences align.
    nodes = df1["cell id"].tolist()
    pair_indices = list(combinations(nodes, 2))

    ## edges
    logging.info("processing edges:")
    df02 = pd.DataFrame(
        {
            "cell id1": [t[0] for t in pair_indices],
            "cell id2": [t[1] for t in pair_indices],
            "distance": distances,
        },
    )
    # Keep only the closest pairs (below the 10th percentile of distances).
    df2 = df02.log.query(expr=f"`distance` < {df02['distance'].quantile(0.1)}")
    logging.info(f"{len(df1)} nodes and {len(df2)} edges")
    return df1, df2


In [None]:
## demo data
# Build the demo nodes/edges from scanpy's `pbmc68k_reduced` dataset,
# using the default of n=5 sampled rows per `bulk_labels` group.
import scanpy as sc
nodes,edges=get_net_data(sc.datasets.pbmc68k_reduced())

In [None]:
# Preview the first row of the nodes table.
nodes.head(1)

Unnamed: 0,cell id,bulk_labels,n_genes,percent_mito,n_counts,S_score,G2M_score,phase,louvain,x,y
0,CAGACAACAAAACG-7,CD4+/CD25 T Reg,1109,0.012702,3779.0,-0.054589,-1.267457,G1,0,-7.174749,-6.994063


In [None]:
# Preview the first row of the edges table.
edges.head(1)

Unnamed: 0,cell id1,cell id2,distance
0,CAGACAACAAAACG-7,GTGATGACTGGGAG-4,1.298669


In [None]:
#| hide
# Run nbdev's export step for this notebook (presumably writes the cells
# marked for export into the module named by `default_exp` -- see nbdev docs).
import nbdev; nbdev.nbdev_export()