In [1]:
import pandas as pd
import numpy as np
import os
import scanpy as sc

from umap import UMAP
import matplotlib.pyplot as plt
import seaborn as sns
import decoupler as dc

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [2]:
# see deconv.py
def lookup(model_call, sample_entry):
    """Map a model call and a sample's metadata to a regression-model label.

    Parameters
    ----------
    model_call : str
        One of ``"all"``, ``"condition"`` or ``"lesion_type"``.
    sample_entry : mapping
        Sample metadata; ``"Condition"`` is read for the ``"condition"`` call
        and ``"lesion_type"`` for the ``"lesion_type"`` call.

    Returns
    -------
    str
        ``"All"`` for the ``"all"`` call; otherwise the label matching the
        sample's metadata value (lesion type ``"Ctrl"`` maps to ``"Control"``,
        every other known value maps to itself).

    Raises
    ------
    ValueError
        If ``model_call`` is not recognised, or the metadata value is not one
        of the known conditions / lesion types.
    """
    # The pooled model does not depend on the sample at all.
    if model_call == "all":
        return "All"

    # Pick the metadata field to read, its known values and the error text.
    if model_call == "condition":
        field = "Condition"
        known_labels = (("Control", "Control"), ("MS", "MS"))
        error_message = "Unknown condition"
    elif model_call == "lesion_type":
        field = "lesion_type"
        known_labels = (("Ctrl", "Control"), ("CI", "CI"), ("CA", "CA"), ("A", "A"))
        error_message = "Unknown lesion type"
    else:
        raise ValueError("Unknown model")

    value = sample_entry[field]
    for known_value, reg_model in known_labels:
        if value == known_value:
            return reg_model
    raise ValueError(error_message)

In [3]:
# All locations are resolved relative to the first entry of IPython's
# directory history (`_dh`).
current_path = globals()["_dh"][0]
out_file = current_path.joinpath("..", "..", "data", "prc", "vis", "mofa_test.hdf5")
visium_path = current_path.joinpath("..", "..", "data", "raw", "vis")
c2l_path = current_path.joinpath("..", "..", "data", "prc", "vis", "c2l_out", "cellranger")
img_features = current_path.joinpath("..", "..", "data", "prc", "images", "squdipy_features")

# Every entry of the Visium directory is treated as a sample id; names
# starting with "." are skipped.
visium_samples = [entry for entry in os.listdir(visium_path) if not entry.startswith(".")]
print(np.array(visium_samples))

['MS377I' 'CO40' 'MS377N' 'CO85' 'MS229' 'MS377T' 'CO41' 'CO37' 'CO96'
 'MS371' 'MS197D' 'MS586' 'MS411' 'MS94' 'CO74' 'MS371N' 'MS497I' 'MS466'
 'MS549T' 'MS549H' 'MS497T' 'MS197U']


In [4]:
# Per-sample metadata from the "Visium" sheet of Metadata_all.xlsx;
# read_slide looks samples up in this table by `sample_id`.
sample_meta = pd.read_excel(current_path / ".." / ".." / "data" / "Metadata_all.xlsx", sheet_name="Visium")
sample_meta

Unnamed: 0,patient_id,sample_id,Condition,lesion_type,Age,Sex,RIN,Batch,visium,snRNA-seq
0,MS94 A1D9,MS94,MS,CA,42,F,8.7,1,True,False
1,MS197 P2D3,MS197U,MS,CA,52,F,9.0,1,True,True
2,MS197 P2D3,MS197D,MS,CA,52,F,9.0,1,True,True
3,MS229 P2C2,MS229,MS,CA,53,M,7.0,1,True,True
4,MS371 A3D3,MS371,MS,A,40,M,7.9,1,True,False
5,MS371 A3D6,MS371N,MS,A,40,M,7.6,3,True,True
6,MS377 A2D2,MS377N,MS,CA,50,F,8.9,3,True,True
7,MS377 A2D4,MS377I,MS,CA,50,F,6.5,1,True,True
8,MS377 A2D4,MS377T,MS,CA,50,F,6.5,1,True,True
9,MS411 A2A2,MS411,MS,CA,61,M,5.9,1,True,True


In [5]:
# Open the image-feature file of one sample to see which obsm entries it holds.
adata = sc.read_h5ad(img_features / "MS549H.h5ad")
adata.obsm

AxisArrays with keys: histogram, spatial, summary, texture

In [6]:
# Display one of the image-feature tables stored in obsm.
feature = "summary"
adata.obsm[feature]

Unnamed: 0,summary_ch-0_quantile-0.9,summary_ch-0_quantile-0.5,summary_ch-0_quantile-0.1,summary_ch-0_mean,summary_ch-0_std,summary_ch-1_quantile-0.9,summary_ch-1_quantile-0.5,summary_ch-1_quantile-0.1,summary_ch-1_mean,summary_ch-1_std,summary_ch-2_quantile-0.9,summary_ch-2_quantile-0.5,summary_ch-2_quantile-0.1,summary_ch-2_mean,summary_ch-2_std
AAACAAGTATCTCCCA-1,153.0,119.0,92.0,120.495734,23.030155,142.0,99.0,72.0,102.348476,27.254392,129.0,108.0,97.0,110.281994,12.911767
AAACACCAATAACTGC-1,186.0,150.0,118.0,151.131524,25.872333,181.0,143.0,109.0,143.932853,26.396003,170.0,145.0,131.0,147.991247,14.965083
AAACAGAGCGACTCCT-1,155.0,121.0,97.0,123.489307,22.276498,149.0,103.0,78.0,108.401440,27.875363,132.0,113.0,103.0,115.362105,12.096027
AAACAGCTTTCAGAAG-1,170.0,141.0,113.0,141.023269,21.879365,162.0,130.0,100.0,130.761108,23.317676,143.0,131.0,122.0,131.719224,8.022272
AAACAGGGTCTATATT-1,174.0,136.0,108.0,137.943934,24.694758,171.0,124.0,93.0,127.487424,27.962462,145.0,129.0,117.0,130.196898,11.084451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTGTTGTGTGTCAAGA-1,151.0,117.0,95.0,120.474571,22.714449,151.0,99.0,75.0,104.531302,29.129622,131.0,109.0,99.0,112.576288,13.875472
TTGTTTCACATCCAGG-1,185.0,151.0,114.0,150.381939,26.668421,179.0,146.0,109.0,145.196122,26.247610,166.0,146.0,130.0,147.347812,13.682264
TTGTTTCATTAGTCTA-1,189.0,154.0,120.0,154.669363,25.754071,185.0,150.0,113.0,149.180831,26.520920,166.0,149.0,136.0,150.239003,11.911453
TTGTTTCCATACAACT-1,165.0,135.0,110.0,135.931191,21.478493,156.0,124.0,96.0,125.442770,23.492578,141.0,128.0,118.0,128.766648,8.862661


In [7]:
def read_slide(sample_id, visium_path, c2l_path):
    """Load one Visium slide and attach deconvolution and image-feature tables.

    Reads the notebook-level ``sample_meta`` table and ``img_features``
    directory in addition to its arguments.

    Parameters
    ----------
    sample_id : str
        Sample identifier; used as the sub-directory name under both
        ``visium_path`` and ``c2l_path``, as the image-feature file stem, and
        matched against ``sample_meta.sample_id``.
    visium_path : path-like supporting ``/``
        Directory holding ``<sample_id>/outs`` for each sample.
    c2l_path : path-like supporting ``/``
        Directory holding ``<sample_id>/cell_<output>_<suffix>.csv`` files.

    Returns
    -------
    AnnData
        The filtered, total-count-normalised and log1p-transformed slide.
        The pre-normalisation state is kept in ``.raw`` and
        ``.layers["counts"]``. ``.obsm`` gains ``"<output>_<model_call>"``
        tables plus ``"summary"``, ``"histogram"`` and ``"texture"``.

    Raises
    ------
    IndexError
        If no row of ``sample_meta`` matches ``sample_id``.
    ValueError
        Propagated from ``lookup`` for unknown condition / lesion type values.
    """
    # Metadata row of this sample as a plain dict (first match only).
    is_this_sample = sample_meta.sample_id == sample_id
    sample_entry = sample_meta.loc[is_this_sample, :].to_dict(orient="records")[0]

    # Expression data
    slide = sc.read_visium(visium_path / sample_id / "outs")
    slide.var_names_make_unique()

    sc.pp.filter_genes(slide, min_cells=3)
    sc.pp.filter_cells(slide, min_genes=200)

    # Keep the counts as they are before normalisation
    slide.raw = slide
    slide.layers["counts"] = slide.X.copy()

    # Library-size normalisation followed by log1p
    sc.pp.normalize_total(slide, target_sum=1e4)
    sc.pp.log1p(slide)

    # Abundance / proportion tables, one pair per model call; only the rows
    # whose index also occurs in the slide's obs index are kept.
    for model_call in ("all", "condition", "lesion_type"):
        suffix = lookup(model_call, sample_entry)
        for output in ("abunds", "props"):
            table = pd.read_csv(c2l_path / sample_id / f"cell_{output}_{suffix}.csv", index_col=0)
            shared_spots = slide.obs.index.intersection(table.index)
            slide.obsm[f"{output}_{model_call}"] = table.loc[shared_spots]

    # Image-derived feature tables, restricted to shared rows the same way
    adata_img = sc.read_h5ad(img_features / f"{sample_id}.h5ad")
    for feature in ("summary", "histogram", "texture"):
        table = adata_img.obsm[feature]
        shared_spots = slide.obs.index.intersection(table.index)
        slide.obsm[feature] = table.loc[shared_spots]

    return slide

In [8]:
# Load every slide once and keep them in a dict keyed by sample id.
vis_dict = {s: read_slide(s, visium_path, c2l_path) for s in visium_samples}

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


In [9]:
# Spot-check one loaded slide.
vis_dict["MS549H"]

AnnData object with n_obs × n_vars = 3598 × 20205
    obs: 'in_tissue', 'array_row', 'array_col', 'n_genes'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells'
    uns: 'spatial', 'log1p'
    obsm: 'spatial', 'abunds_all', 'props_all', 'abunds_condition', 'props_condition', 'abunds_lesion_type', 'props_lesion_type', 'summary', 'histogram', 'texture'
    layers: 'counts'

In [None]:
# Histogram of the summed abundance per spot, one panel per slide.
# read_slide stores the abundance tables as "abunds_<model_call>"; there is no
# plain "abunds" entry in obsm, so one model has to be named explicitly.
# NOTE(review): "abunds_all" is assumed to be the intended table; switch to
# "abunds_condition" / "abunds_lesion_type" if another model was meant.
abund_key = "abunds_all"

# Size the grid from the number of samples instead of a fixed 4x6 layout
# (22 samples still give 4 rows and the original 12x8 figure).
n_cols = 6
n_rows = max(1, -(-len(visium_samples) // n_cols))  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(12, 2 * n_rows), squeeze=False)
axs = axs.flatten()
for ax, s in zip(axs, visium_samples):
    sns.histplot(vis_dict[s].obsm[abund_key].sum(axis=1), ax=ax)
    ax.set_title(s)
# Hide the panels left over when the grid has more cells than samples.
for ax in axs[len(visium_samples):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Fetch the MSigDB resource through decoupler; later cells subset it by `collection`.
msigdb = dc.get_resource('MSigDB')

In [None]:
# get hallmark db
# Built as a single chain that returns a new frame at every step: assigning a
# column with `.loc[...] = ...` on a filtered slice of `msigdb` is what pandas
# reports as SettingWithCopyWarning.
hallmark = (
    msigdb[msigdb["collection"] == "hallmark"]  # filter by hallmark
    .drop_duplicates(["geneset", "genesymbol"])  # remove duplicates (first occurrence kept)
    .assign(
        # rename for consistency: keep the text that follows "HALLMARK_"
        geneset=lambda df: [name.split("HALLMARK_")[1] for name in df["geneset"]]
    )
    .loc[:, ["geneset", "genesymbol"]]  # reorder columns
)
hallmark

In [None]:
# get progeny db
# Column names are aligned with the other networks (geneset / genesymbol);
# the weight column is kept.
progeny = (
    dc.get_progeny(top=300)
    .rename(columns={"source": "geneset", "target": "genesymbol"})
    .loc[:, ["geneset", "genesymbol", "weight"]]
)
progeny

In [None]:
# get reactome db
# Keep only the two identifying columns of the reactome_pathways rows, then
# drop repeated (geneset, genesymbol) pairs.
reactome = (
    msigdb.loc[msigdb["collection"] == "reactome_pathways", ["geneset", "genesymbol"]]
    .drop_duplicates(["geneset", "genesymbol"])
)
reactome

In [None]:
# Number of distinct gene sets in the reactome table.
len(reactome.geneset.unique())

In [None]:
# Run decoupler's ULM on every slide for each prior-knowledge network and
# store the results in obsm under "<network>_estimates" / "<network>_pvals".
for key, adata in vis_dict.items():
    print(key)
    for pkn_name, pkn in (("hallmark", hallmark), ("progeny", progeny), ("reactome", reactome)):
        print(pkn_name)
        dc.run_ulm(
            mat=adata,
            net=pkn,
            source="geneset",
            target="genesymbol",
            # only the progeny table is passed with its weight column
            weight=None if pkn_name != "progeny" else "weight",
            verbose=True,
            use_raw=True,
        )
        # The results are read from fixed obsm keys; move them to
        # network-specific keys so the next run does not replace them.
        for ulm_key, suffix in (("ulm_estimate", "estimates"), ("ulm_pvals", "pvals")):
            adata.obsm[f"{pkn_name}_{suffix}"] = adata.obsm[ulm_key]
            del adata.obsm[ulm_key]

In [None]:
# List the obsm entries of one slide, e.g. to check that the ULM results were stored.
vis_dict["CO37"].obsm_keys()

In [None]:

# NOTE(review): this rebinds `reactome` to the unfiltered reactome_pathways rows
# (all columns, duplicates kept), replacing the deduplicated two-column table
# built in an earlier cell. Looks like leftover scratch work; confirm and remove.
reactome = msigdb[msigdb['collection'] == 'reactome_pathways']

In [None]:
# NOTE(review): same statement as the previous cell. `reactome` ends up as the
# unfiltered reactome_pathways subset (all columns, duplicates kept) rather than
# the deduplicated two-column table built earlier. Presumably a scratch
# inspection; confirm and remove, or display it under a different name.
reactome = msigdb[msigdb['collection'] == 'reactome_pathways']
reactome