In [None]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# re-imported before each cell runs (edits to project code apply without a restart).
%load_ext autoreload
%autoreload 2

In [None]:
import pyrootutils

# Locate the project root (the directory containing .gitignore) and configure the
# session around it. Because cwd=True, every relative path used later in this
# notebook is resolved against the project root, not the notebook's own folder.
base_path = pyrootutils.setup_root(
    search_from=".",
    indicator=[".gitignore"],
    project_root_env_var=True,  # set the PROJECT_ROOT environment variable to root directory
    dotenv=True,  # load environment variables from .env if exists in root directory
    pythonpath=True,  # add root directory to the PYTHONPATH (helps with imports)
    cwd=True,  # change current working directory to the root directory (helps with filepaths)
)
from itertools import combinations

import h5py
import matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
import seaborn as sns
import squidpy as sq
from tqdm import tqdm

from src.data.utils import *
from src.data.datamodules import SpatialDataModule
from src.data.graph_utils import *
from src.utils.data_paths import *

In [None]:
# load h5 file
path = "./data/spatial/anca_visium/test_sample"
adata = sc.read_visium(path)

In [None]:
# Spot coordinates of the loaded sample and their pairwise distance matrix,
# computed on the output of normalize_coords.
# NOTE(review): normalize_coords and distance_matrix come from the star imports above — confirm which module provides them.
coords = adata.obsm["spatial"]
norm_coords = normalize_coords(coords)
dist_mat = distance_matrix(norm_coords, norm_coords)

In [None]:
# Check graph construction on the raw coordinates with a fixed radius (0.0275) and one hop.
check_graph_construction(coords, num_hops=1, radius=0.0275)

In [None]:
# Same check, but parameterized with num_layers=1 instead of an explicit radius.
check_graph_construction(coords, num_hops=1, num_layers=1)

In [None]:
# Load the single-cell kidney reference.
# The path is relative to the project root: setup_root(cwd=True) already made that
# the working directory, so the former "../data" prefix pointed outside the repository.
# NOTE(review): this rebinds `adata`, replacing the Visium sample loaded earlier —
# confirm that the later cells using `adata` expect this object.
adata = sc.read_h5ad("./data/single-cell/kidney/Mature_Full_v3_harmonizedCelltypes.h5ad")

In [None]:
# Show the per-observation annotation table of the currently bound `adata`.
adata.obs

In [None]:
# Distinct values of the "broad_celltype" annotation.
adata.obs["broad_celltype"].unique()

In [None]:
# Distinct values of the "cell_type" annotation.
adata.obs["cell_type"].unique()

In [None]:
# Persist `adata` as an .h5ad file. The path is relative to the project root
# (the working directory after setup_root(cwd=True)); the former "../data" prefix
# resolved outside the repository.
# NOTE(review): when the notebook is run top to bottom, `adata` is bound to the
# single-cell kidney reference at this point, not the Visium sample — confirm this
# is the object meant to be written to the Visium test-sample path.
adata.write_h5ad("./data/spatial/anca_visium/test_sample.h5ad")

In [None]:
# Dimensions of the currently bound `adata`.
adata.shape

In [None]:
# Spatial plot of `adata` colored by "CD4"; show=False leaves figure display to the notebook.
# NOTE(review): presumably requires `adata` to carry spatial (Visium) information — confirm which object is bound here.
sc.pl.spatial(adata, color="CD4", size=1, show=False)

In [None]:
# Dataset registry: entry i of every list below refers to the same dataset.
# File names are given relative to ./data and ./ respectively.
st_data_files = [
    "spatial/simulations_kidney_slideSeq_v2/UMOD-WT.WT-2a_resolution75.h5ad",
    "spatial/simulations_kidney_slideSeq_v2/UMOD-KI.KI-4b_resolution105.h5ad",
    "spatial/simulations_heart_seqFISH/embryo1_resolution0.11.h5ad",
    "spatial/V1_Mouse_Brain_Sagittal_Anterior.h5ad",
    "spatial/lymph_node/st_lymph.h5ad",
]
experiment_dirs = [
    "experiments/experiment_kidney_slideSeq_v2_UMOD-WT.WT-2a_resolution75",
    "experiments/experiment_kidney_slideSeq_v2_105",
    "experiments/experiment_heart_seqFISH_embryo1_resolution0.11-new",
    "experiments/experiment_mouse_st",
    "experiments/experiment_lymph_node",
]
sc_data_files = [
    "spatial/kidney_slideSeq_v2/UMOD-WT.WT-2a.h5ad",
    "spatial/kidney_slideSeq_v2_105.h5ad",
    "spatial/heart_seqFISH/embryo1.h5ad",
    "single-cell/Allenbrain_forSimulation_uniquect.h5ad",
    "single-cell/lymph_node/sc_lymph.h5ad",
]

# Paths are relative to the project root, which setup_root(cwd=True) made the
# working directory; a "../" prefix would resolve outside the repository.
# (The previously hard-coded sc_paths list was identical to the one derived here
# and was overwritten before use, so it is derived once from sc_data_files.)
sc_paths = [f"./data/{rel_file}" for rel_file in sc_data_files]
st_paths = [f"./data/{rel_file}" for rel_file in st_data_files]
experiment_paths = [f"./{rel_dir}" for rel_dir in experiment_dirs]

In [None]:
# Seven datasets, each given as (spatial file, matching single-cell file)
# relative to ./data. st_paths[i] and sc_paths[i] belong together.
dataset_pairs = [
    (
        "spatial/simulations_kidney_slideSeq_v2/UMOD-KI.KI-4b_resolution105.h5ad",
        "spatial/kidney_slideSeq_v2/UMOD-KI.KI-4b.h5ad",
    ),
    (
        "spatial/simulations_kidney_slideSeq_v2/UMOD-WT.WT-2a_resolution75.h5ad",
        "spatial/kidney_slideSeq_v2/UMOD-WT.WT-2a.h5ad",
    ),
    (
        "spatial/simulations_heart_seqFISH/embryo1_resolution0.11.h5ad",
        "spatial/heart_seqFISH/embryo1.h5ad",
    ),
    (
        "spatial/simulations_heart_seqFISH/embryo2_resolution0.11.h5ad",
        "spatial/heart_seqFISH/embryo2.h5ad",
    ),
    (
        "spatial/simulations_heart_seqFISH/embryo3_resolution0.11.h5ad",
        "spatial/heart_seqFISH/embryo3.h5ad",
    ),
    (
        "spatial/lymph_node/st_lymph.h5ad",
        "single-cell/lymph_node/sc_lymph.h5ad",
    ),
    (
        "spatial/V1_Mouse_Brain_Sagittal_Anterior.h5ad",
        "single-cell/Allenbrain_forSimulation_uniquect.h5ad",
    ),
]
st_paths = ["./data/" + st_file for st_file, sc_file in dataset_pairs]
sc_paths = ["./data/" + sc_file for st_file, sc_file in dataset_pairs]

## Visualize datasets

In [None]:
# Load the last single-cell dataset in sc_paths and total the per-label counts
# of its "Celltype" annotation.
idx = -1
sc_data = sc.read_h5ad(sc_paths[idx])
celltype_counts = sc_data.obs["Celltype"].value_counts()
celltype_counts.sum()

In [None]:
# Plot one gene on one spatial dataset and save the figure.
# `idx` selects the dataset from st_paths / sc_paths; `gene_name` is the feature to color by.
idx = 5
gene_name = "CD4"
st_path = st_paths[idx]
sc_path = sc_paths[idx]
st_data = sc.read_h5ad(st_path)
sc_data = sc.read_h5ad(sc_path)

fig, ax = plt.subplots(figsize=(5, 5), layout="tight")
sc.pl.spatial(st_data, color=gene_name, show=False, ax=ax)

# Strip only the trailing extension. Several dataset files contain dots in their
# stem (e.g. "UMOD-KI.KI-4b_resolution105.h5ad"), so cutting at the first dot
# would collapse different datasets onto the same figure name.
name = st_path.split("/")[-1].rsplit(".", 1)[0]
# Explicit ".png": a stem such as "..._resolution0.11" would otherwise have its
# last dotted part interpreted as the output format.
fig.savefig(f"./figures/datasets/{name}.png", dpi=200, bbox_inches="tight")

In [None]:
# Show the summary of the spatial dataset loaded above.
st_data

In [None]:
# Show the summary of the single-cell dataset loaded above.
sc_data

In [None]:
# Select the first entry of each path list for the cells that follow.
# NOTE(review): st_paths / sc_paths were redefined (7 entries, kidney KI sample first)
# after experiment_paths was built (5 entries, kidney WT sample first), so index 0 may
# not refer to the same dataset in all three lists — confirm the intended pairing.
st_path = st_paths[0]
sc_path = sc_paths[0]
experiment_path = experiment_paths[0]

In [None]:
# Read the selected spatial file and display its summary (the result is not kept).
sc.read_h5ad(st_path)

## Number of neighbors per radius for each dataset

In [None]:
# Rebuild st_paths from st_data_files, prefixing each entry with the data directory
# (relative to the project root).
st_paths = ["./data/" + rel_file for rel_file in st_data_files]

In [None]:
# Nearest-neighbor distances for the lymph-node sample.
# st_paths was just rebuilt from the five entries of st_data_files, so the former
# hard-coded index 5 is out of range; the sample is selected by name instead.
# NOTE(review): index 5 pointed at the lymph-node file in the earlier 7-entry list — confirm this is the intended dataset.
lymph_path = next(p for p in st_paths if "lymph_node" in p)
st_data = sc.read_h5ad(lymph_path)
coords = st_data.obsm["spatial"]

norm_coords, dist_mat, adj_mat = construct_spatial_graph(coords)

# Sort each row once and reuse it for both the summary and the displayed slice.
sorted_dists = np.sort(dist_mat, axis=1)
# Column 1 is used as the nearest neighbor (column 0 is presumably each spot's distance to itself).
mean_first_dist = np.mean(sorted_dists[:, 1])
print(f"Mean distance to nearest neighbor: {mean_first_dist:.5f}")
sorted_dists[:, 0:6]


In [None]:
# Extent of the raw coordinates along each axis (max minus min per column).
coords.max(axis=0) - coords.min(axis=0)

In [None]:
# Per-axis upper and lower bounds of the raw coordinates, one per line.
upper_bound = coords.max(axis=0)
lower_bound = coords.min(axis=0)
print(upper_bound)
print(lower_bound)

In [None]:
# Mean nearest-neighbor distance (on normalized coordinates) for every dataset.
# Only means below the cutoff are collected in nn_dists for the average taken afterwards.
nn_dist_cutoff = 0.02
nn_dists = []
for st_path in st_paths:
    st_data = sc.read_h5ad(st_path)
    coords = st_data.obsm["spatial"]
    norm_coords = normalize_coords(coords)
    dist_mat = distance_matrix(norm_coords, norm_coords)
    # Second-smallest entry per row is taken as the nearest-neighbor distance.
    nearest_dists = np.sort(dist_mat, axis=1)[:, 1]
    mean_first_dist = np.mean(nearest_dists)
    print(coords.max(axis=0))
    print(coords.min(axis=0))
    print(f"Mean distance to nearest neighbor: {mean_first_dist:.5f}")
    if mean_first_dist < nn_dist_cutoff:
        nn_dists.append(mean_first_dist)

In [None]:
# Average of the collected nearest-neighbor distances (only datasets that passed the cutoff).
np.mean(nn_dists)

In [None]:
# Scale factors stored under the "V1_Human_Lymph_Node" library in st_data.uns["spatial"].
# NOTE(review): relies on st_data still being the lymph-node sample left bound by the loop above — confirm.
st_data.uns["spatial"]["V1_Human_Lymph_Node"]["scalefactors"]

In [None]:
# Raw spatial coordinates of the last dataset in st_paths (the file is re-read; nothing is kept).
sc.read_h5ad(st_paths[-1]).obsm["spatial"]

In [None]:
# Radii swept in the average-degree analysis below.
# NOTE(review): presumably expressed in normalized-coordinate units — confirm against check_graph_construction.
radii = [0.0001, 0.01, 0.02, 0.03, 0.04, 0.05]

In [None]:
# Sweep every radius over every dataset and record the average degree.
# The three lists grow in lockstep: one entry per (dataset, radius) pair.
avg_degrees, radius_list, st_path_list = [], [], []
for st_path in st_paths:
    st_data = load_spatial_data(st_path=st_path)
    print(f"Loaded {st_path}")
    # Label rows by file name only, not by the full path.
    dataset_file = st_path.split("/")[-1]
    for radius in radii:
        avg_degree, graph = check_graph_construction(
            st_data.obsm["spatial"],
            radius=radius,
            num_hops=1,
            draw=False,
            verbose=False,
        )
        print(f"Radius: {radius}, Avg degree: {avg_degree}")
        avg_degrees.append(avg_degree)
        radius_list.append(radius)
        st_path_list.append(dataset_file)

In [None]:
# combine everything into a dataframe
df = pd.DataFrame(
    {
        "radius": radius_list,
        "avg_degree": avg_degrees,
        "st_path": st_path_list,
    }
)
# plot with seaborn lineplot
# set style paper
mpl.style.use("seaborn-paper")
sns.lineplot(data=df, x="radius", y="avg_degree", hue="st_path")
# change legend title
plt.legend(title="Dataset")
plt.xlabel("Radius")
plt.ylabel("Average degree")
plt.grid(True)

plt.savefig("../figures/avg_degree_vs_radius.png", dpi=200, bbox_inches="tight")
plt.show()

In [None]:
# Load one spatial dataset together with its prepared experiment arrays and
# the matching single-cell data.
# NOTE(review): st_path, experiment_path and sc_path are whatever earlier cells
# left bound — confirm they refer to the same dataset.
st_data = load_spatial_data(st_path=st_path)
X_real, X_real_train, X_sim, y_sim = load_prepared_data(experiment_path)

# Per-spot table taken from obs columns 2 onward, kept as a DataFrame
# (a later cell calls .idxmax on it). The earlier .to_numpy() assignment was
# overwritten immediately and has been dropped.
y_real = st_data.obs[st_data.obs.columns[2:]]
sc_data = sc.read_h5ad(sc_path)

## Relationship between latent dist and cell proportions

In [None]:
# perform pca on st_data and on simulated data
for st_path, sc_path, experiment_path in zip(st_paths, sc_paths, experiment_paths):
    st_data = load_spatial_data(st_path=st_path)
    st_data.var_names_make_unique()
    X_real, X_real_train, X_sim, y_sim = load_prepared_data(experiment_path)
    # sc_data = sc.read_h5ad(sc_path)
    y_real = st_data.obs[st_data.obs.columns[2::]].to_numpy()
    y_real_df = st_data.obs[st_data.obs.columns[2::]]


## Further analysis

In [None]:
# Show the per-observation annotation table of the single-cell dataset.
sc_data.obs

In [None]:
# Cell type used to color the spatial plots below.
# ("endothelial cell" was assigned first and immediately overwritten; only the
# effective value is kept.)
celltype = "kidney proximal convoluted tubule epithelial cell"

In [None]:
# get max celltypes for spatial dat
st_data.obs["celltype"] = y_real.idxmax(axis=1)

In [None]:
# Spatial plot of st_data colored by the selected cell type, saved under <project root>/figures.
sc.pl.spatial(st_data, color=celltype, show=False)
plt.savefig(f"{base_path}/figures/grid_slideseq.png", dpi=300, bbox_inches="tight")

In [None]:
# Same plot for sc_data, saved under <project root>/figures.
# NOTE(review): sc.pl.spatial is called on sc_data — confirm that object carries spatial coordinates.
sc.pl.spatial(sc_data, color=celltype, show=False)
plt.savefig(f"{base_path}/figures/original_slideseq.png", dpi=300, bbox_inches="tight")

## Analyze relationship between gene expression similarity and spatial distance

### First, use cell-type composition as a proxy for gene expression similarity

In [None]:
# Show the summary of the spatial dataset currently bound to st_data.
st_data

In [None]:
# From here on `coords` holds the output of normalize_coords, not the raw coordinates.
coords = normalize_coords(st_data.obsm["spatial"])

In [None]:
# Per-spot cell-type matrix built from obs columns 2 onward.
# An earlier cell may have appended a string-valued "celltype" label column to obs;
# it would turn the matrix into an object array and break the distance
# computations below, so it is excluded first.
# NOTE(review): assumes the remaining columns from index 2 on are numeric — confirm.
abundance_table = st_data.obs.drop(columns=["celltype"], errors="ignore")
celltype_abundances = abundance_table[abundance_table.columns[2:]].to_numpy()

In [None]:
# Pairwise Euclidean distances between all spots (on the normalized coordinates).
dist_mat = scipy.spatial.distance_matrix(coords, coords)

In [None]:
# Pairwise Euclidean distances between the cell-type vectors of all spots.
# NOTE(review): the next cell rebinds celltype_distances with Jensen-Shannon
# distances, so this result is discarded when the notebook runs top to bottom.
celltype_distances = scipy.spatial.distance_matrix(
    celltype_abundances, celltype_abundances
)

In [None]:
# Pairwise Jensen-Shannon distance between the cell-type vectors of all spots.
# cdist returns an (n_spots, n_spots) array in the same row/column order as
# dist_mat, so the correlation cell below can flatten it. The previous nested
# Python loop built a flat list (which has no .flatten()) and needed n_spots**2
# interpreted calls.
# NOTE(review): for an all-zero row cdist yields inf where the scalar
# jensenshannon function yields nan — irrelevant if every spot has a non-zero total.
celltype_distances = scipy.spatial.distance.cdist(
    celltype_abundances, celltype_abundances, metric="jensenshannon"
)

In [None]:
# analyze correlation between celltype distance and euclidean distance
result = scipy.stats.pearsonr(dist_mat.flatten(), celltype_distances.flatten())

### Consider the full expression profile

In [None]:
# Filter genes with min_cells=10; no return value is captured, so st_data itself is what later cells use.
sc.pp.filter_genes(st_data, min_cells=10)

In [None]:
# Show the summary of st_data after gene filtering.
st_data

In [None]:
# Normalize counts to a total of 1e4 and apply log1p.
# NOTE(review): no return value is captured, so both calls appear to modify st_data
# in place; re-running this cell would then transform the data a second time — run once per load.
sc.pp.normalize_total(st_data, target_sum=1e4)
sc.pp.log1p(st_data)

In [None]:
# maybe scale the data
# sc.pp.scale(st_data, max_value=10)
sc.tl.pca(st_data, n_comps=50)

In [None]:
# Pairwise Euclidean distances between spots in PCA space.
pca_embeddings = st_data.obsm["X_pca"]
gene_expr_distances = scipy.spatial.distance_matrix(pca_embeddings, pca_embeddings)

In [None]:
# analyze correlation between celltype distance and euclidean distance
corr, pval = scipy.stats.pearsonr(dist_mat.flatten(), gene_expr_distances.flatten())

In [None]:
# Scatter of spatial distance against gene-expression distance for all spot pairs,
# annotated with the correlation computed above, then saved under <project root>/figures.

# `experiment_name` is not defined anywhere in the visible notebook; derive it from
# the last component of the experiment path.
# NOTE(review): confirm this is the intended file-name suffix.
experiment_name = str(experiment_path).rstrip("/").split("/")[-1]

fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(dist_mat.flatten(), gene_expr_distances.flatten(), s=1)
ax.set_xlabel("euclidean distance")
ax.set_ylabel("gene expression distance in PCA space")
ax.set_title("euclidean distance vs gene expression distance per spot pair")
# Place the annotation in axes-fraction coordinates (top-left corner). Without the
# transform, (0.05, 0.95) is interpreted as data values and lands among the points.
# Scientific notation keeps very small p-values from printing as 0.000000.
ax.text(
    0.05,
    0.95,
    f"corr: {corr:.2f}, pval: {pval:.2e}",
    transform=ax.transAxes,
    va="top",
)

fig.savefig(
    f"{base_path}/figures/euclidean_vs_gene_expr_distance_{experiment_name}.png",
    dpi=300,
    bbox_inches="tight",
)
plt.show()