# Starter notebook to work on the Spatial Transcriptomic data from Mosaic

##### Credit goes to Quentin Bayard at Owkin

## Load relevant python libraries

In [None]:
import os
import anndata as ad
import scanpy as sc
import pandas as pd
import numpy as np
import json
from typing import Dict, List, Optional, Union
from matplotlib.pyplot import imread
import liana as li
import decoupler as dc
import omnipath

from gbmhackathon.utils.visium_functions import (
    normalize_anndata_wrapper,
    convert_obsm_to_adata
)
from gbmhackathon.viz.visium_functions import (
    plot_spatial_expression,
    plot_obsm
)
from gbmhackathon.stats.visium_functions import (
    perform_multi_clustering,
    quantify_cell_population_activity
)
from gbmhackathon import MosaicDataset

import matplotlib as mpl
# Set the default figure resolution for every figure created afterwards.
# NOTE(review): 1200 dpi is far above matplotlib's default of 100 and makes
# each rendered figure large — lower it if plotting is slow or memory-bound.
mpl.rcParams.update({"figure.dpi": 1200})

## Load Visium data and create an AnnData object

In [None]:
# Load three Visium samples with resolution="hires".
# Remove the `sample_list` argument to load all available samples; note that
# loading all the samples may take up to 4 minutes and requires more memory.
visium_dict = MosaicDataset.load_visium(
    resolution="hires",
    sample_list=[
        "HK_G_022a_vis",
        "HK_G_024a_vis",
        "HK_G_030a_vis",
    ],
)

### Access coordinates of spots (array or pixel coordinates are available)

In [None]:
# Display the `obs` table (per-spot annotations) of sample HK_G_030a_vis.
visium_dict["HK_G_030a_vis"].obs

In [None]:
# Display the "spatial" entry of `obsm` for sample HK_G_030a_vis
# (presumably the per-spot pixel coordinates — confirm against the loader).
visium_dict["HK_G_030a_vis"].obsm["spatial"]

## Normalize data
#### - Here we normalize the data using CPM normalization; see the scanpy [documentation](https://scanpy.readthedocs.io/en/stable/generated/scanpy.pp.normalize_total.html) for more information
#### - In addition to normalizing the data, the function creates the `layers` element of the AnnData object, giving easy access to the `raw` counts, `CPM`, or `log_CPM` arrays

In [None]:
# Normalize the loaded samples; target_sum=1e6 corresponds to CPM
# (counts per million) normalization.
visium_obj = normalize_anndata_wrapper(visium_dict, target_sum=1e6)

In [None]:
# List the layers available on sample HK_G_030a_vis after normalization.
visium_obj["HK_G_030a_vis"].layers

In [None]:
# Print the "raw" layer of sample HK_G_030a_vis, converted to a dense array.
print(visium_obj["HK_G_030a_vis"].layers["raw"].toarray())

In [None]:
# Print the "log_CPM" layer of sample HK_G_030a_vis, converted to a dense array.
print(visium_obj["HK_G_030a_vis"].layers["log_CPM"].toarray())

### Visualisation of genes of interest

In [None]:
# Spatial expression plots for the genes of interest, read from the
# "log_CPM" layer, for the three listed samples.
plot_spatial_expression(
    visium_obj,
    gene_list=["EGFR", "CD4", "COL1A1", "CD8A", "SPP1", "HIF1A", "VCAN"],
    layer="log_CPM",
    sample_list=["HK_G_022a_vis", "HK_G_024a_vis", "HK_G_030a_vis"],
    img_key="hires",
    save_output="gene_expression.png",
)

## Perform unsupervised clustering of spots

In [None]:
# Unsupervised clustering of the spots. `resolution` lists the clustering
# resolutions to use (presumably one clustering per value — confirm in the
# helper). The returned collection replaces `visium_obj`.
visium_obj = perform_multi_clustering(
    visium_obj,
    resolution=[0.5, 1],
    save_output="clustering.png",
)

### Access clustering results in obs dataframe of a given sample

In [None]:
# Display the `obs` table of sample HK_G_030a_vis, where the clustering
# results can be accessed.
visium_obj["HK_G_030a_vis"].obs

## Quantify cell population activity using biomarker genes

In [None]:
# Quantify the activity of each cell population from its biomarker genes.
# `biomarker_dict` has one key per cell type of interest and, as values, a
# list of marker genes; pass biomarker_dict=None instead to use the helper's
# default dictionary.
# Tip: you can look at the "GBMap extended" catalogue to define your cell
# types of interest!
# Markers are written as approved HGNC symbols (MS4A1 for CD20, ERBB2 for
# HER2), assuming the expression matrix is indexed by gene symbol as the
# other gene lists in this notebook suggest; alias names would not match.
visium_obj = quantify_cell_population_activity(
    visium_obj,
    biomarker_dict={
        "ECM_remodeling": ["COL1A1", "COL3A1", "FN1", "MMP2"],
        "Lymphocytes": ["CD3E", "CD8A", "CD4", "CD19", "MS4A1", "CD79A"],
        "TAMs": [
            "CCL4", "ADRB2", "NAV3", "ADORA3", "SIGLEC8", "SPRY1", "TAL1",
            "RHOB", "BIN1", "SALL1", "KLF2", "BHLHE41", "SLC1A3", "P2RY12",
        ],
        "Tumor_cells": ["EGFR", "ERBB2", "MKI67", "VEGFA", "CD44", "GPC3"],
    },
)

In [None]:
# Plot the "cell_population_activity_normalized" obsm entry of sample
# HK_G_030a_vis for the four cell populations.
plot_obsm(
    visium_obj["HK_G_030a_vis"],
    "cell_population_activity_normalized",
    features=["ECM_remodeling", "Lymphocytes", "TAMs", "Tumor_cells"],
    save_output="cell_population_activity.png",
)

## Perform pathway enrichment analysis and quantify TF activity using Liana+
### We will use decoupler-py with pathway gene sets from PROGENy. See this [tutorial](https://liana-py.readthedocs.io/en/latest/notebooks/misty.html) for details.


In [None]:
# Sample used for the pathway and TF activity analyses below.
sample_focus = "HK_G_030a_vis"

In [None]:
# Plain lookup of the focus sample: no copy is made here, so anything written
# to `adata_test` is also visible through `visium_obj[sample_focus]`.
adata_test = visium_obj[sample_focus]

### First, let's estimate pathway activities as a way to make the data a bit more interpretable.

In [None]:
# Obtain the PROGENy pathway gene sets for human; `top` limits how many genes
# are kept per pathway (see the decoupler documentation).
progeny = dc.get_progeny(organism='human', top=500)

In [None]:
# Estimate pathway activities with decoupler's multivariate linear model.
# `source`, `target` and `weight` name the columns of `progeny` to use, and
# use_raw=False is passed so the `.raw` attribute is not used.
dc.run_mlm(
    mat=adata_test,
    net=progeny,
    source="source",
    target="target",
    weight="weight",
    use_raw=False,
    verbose=True,
)

In [None]:
# Extract the PROGENy activities stored under obsm["mlm_estimate"] as a
# separate AnnData object, so they can be plotted like gene expression.
acts_progeny = convert_obsm_to_adata(adata_test, 'mlm_estimate')


In [None]:
# Spatial plot of three pathway activities for the focus sample, using a
# diverging colormap; the figure is shown and saved under a
# sample-specific file name.
sc.pl.spatial(
    acts_progeny,
    color=["Hypoxia", "JAK-STAT", "p53"],
    cmap="RdBu_r",
    size=1.3,
    save=f"{sample_focus}_progeny_activities.png",
    show=True,
)


### Second, use decoupler and liana to estimate Transcription Factor (TF) activities

In [None]:
# Get the TF prior knowledge network (CollecTRI) through decoupler,
# using the function's default arguments.
net = dc.get_collectri()

In [None]:
# Estimate TF activities with decoupler's univariate linear model, using the
# CollecTRI network loaded above; use_raw=False is passed so the `.raw`
# attribute is not used.
dc.run_ulm(
    mat=adata_test,
    net=net,
    use_raw=False,
    verbose=True,
)

In [None]:
# Preview the first rows of the TF activity estimates stored in
# obsm["ulm_estimate"].
adata_test.obsm["ulm_estimate"].head()

In [None]:
# Plot the estimated activities of three transcription factors for the focus
# sample.
# NOTE(review): the other `plot_obsm` call in this notebook passes
# `save_output=...`, whereas this one passes `save=`/`show=` — confirm that
# `plot_obsm` accepts these keywords (e.g. by forwarding them to scanpy).
plot_obsm(
    adata_test,
    "ulm_estimate",
    features=["HIF1A", "STAT1", "TP53"],
    save=f"{sample_focus}_TF_activities.png",
    show=True,
)