In [None]:
import scanpy as sc
import plotnine as p9

import liana as li
import cell2cell as c2c
import decoupler as dc # needed for pathway analysis

import warnings

import numpy as np
import pandas as pd

In [None]:
import os

In [None]:
# Directory holding metadata.csv and the per-sample .h5ad files.
# The trailing slash matters: paths below are built by plain string concatenation.
data_dir = "data/rico123/"

Load & Append

In [None]:
# Metadata table; the first CSV column becomes the row index.
metadata = pd.read_csv(data_dir + "metadata.csv", index_col=0)

In [None]:
# Sample ID = the text after the last "_" of each metadata row label.
metadata['sample'] = [row_label.rsplit("_", 1)[-1] for row_label in metadata.index]

In [None]:
# List the data directory; the sample ID is later extracted from the part of
# each file name before ".h5ad".
# os.listdir() returns entries in arbitrary order, so sort them to make the
# order in which samples are loaded (and concatenated) reproducible.
files = sorted(os.listdir(data_dir))

In [None]:
# Load every per-sample .h5ad file that has a metadata entry and annotate its
# cells with the sample-level metadata.
adatas = []
for f in files:
    if not f.endswith(".h5ad"):
        continue

    # Sample ID: last "_"-separated token of the file name, minus ".h5ad".
    sample = f.split("_")[-1][:-5]
    if sample not in metadata['sample'].values:
        # No metadata for this file: skip it before paying the cost of reading it.
        continue

    adata = sc.read_h5ad(data_dir + f)

    # Suffix the cell names with the sample ID (presumably so that names from
    # different samples do not collide once everything is concatenated).
    idx = adata.obs.index + sample

    adata.obs['sample'] = sample
    # merge() discards the index, so it is restored from `idx` afterwards.
    # A left join keeps every cell in its original order.
    obs = adata.obs.merge(metadata, on='sample', how='left')
    if len(obs) != adata.n_obs:
        # More than one metadata row matched this sample; the merged table would
        # no longer line up with the cells.
        raise ValueError(
            f"metadata contains multiple rows for sample {sample!r}; "
            "expected exactly one"
        )
    adata.obs = obs
    adata.obs.index = idx

    adatas.append(adata)
        
        

In [None]:
# Stack all per-sample objects into a single AnnData.
# NOTE(review): with default arguments concat presumably keeps only variables
# shared by all samples — confirm every sample has the same gene set.
adata = sc.concat(adatas)

In [None]:
# The per-sample list is no longer needed once `adata` has been built.
del adatas

In [None]:
# Run a garbage-collection pass right after dropping the per-sample objects.
import gc
gc.collect()

Normalize

In [None]:
# Keep a copy of the current X in the 'counts' layer before X is normalised below.
# assumes X still holds raw counts at this point — TODO confirm
adata.layers['counts'] = adata.X.copy()

In [None]:
# Normalise per-cell totals, then log1p-transform X (default settings for both).
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

HVGs & PCA, NN, UMAP

In [None]:
# Baseline embedding before integration, all with default parameters:
# highly variable genes -> PCA -> neighbour graph -> UMAP.
sc.pp.highly_variable_genes(adata)
sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)

In [None]:
# Display the AnnData summary.
adata

In [None]:
# UMAP of the not-yet-integrated data, coloured by sample.
sc.pl.umap(adata, color=['sample'], wspace=1)

Integrate

In [None]:
import bbknn

In [None]:
# Re-select highly variable genes (top 2000), this time using 'sample' as the
# batch key; this overwrites the HVG annotation computed earlier.
sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor="cell_ranger", batch_key='sample')

In [None]:
# Work on a copy restricted to the highly variable genes; `adata` keeps all genes.
adata_bbknn = adata[:, adata.var["highly_variable"]].copy()
adata_bbknn

In [None]:
# PCA on the HVG-restricted object (default settings).
sc.pp.pca(adata_bbknn)

In [None]:
# Use more within-batch neighbours for very large datasets (> 100,000 cells).
if adata_bbknn.n_obs > 100000:
    neighbors_within_batch = 25
else:
    neighbors_within_batch = 3

In [None]:
# Run BBKNN with 'sample' as the batch key and the neighbour count chosen above,
# then display the updated object.
bbknn.bbknn(
    adata_bbknn, batch_key='sample', neighbors_within_batch=neighbors_within_batch
)
adata_bbknn

In [None]:
# Cluster (Leiden) and embed (UMAP) the integrated object, default settings.
sc.tl.leiden(adata_bbknn)
sc.tl.umap(adata_bbknn)

In [None]:
# UMAP after integration, coloured by sample and by Leiden cluster.
sc.pl.umap(adata_bbknn, color=['sample', 'leiden'], wspace=0.5)

Remove unwanted clusters and samples (filtering currently commented out)

In [None]:
# celltype_msk = adata_bbknn.obs['leiden'].isin(['5', '6'])

In [None]:
# sample_msk = adata_bbknn.obs['sample']=='AKK002_157779'

In [None]:
# clusters = adata_bbknn.obs[['leiden']]

In [None]:
# Carry the Leiden labels from the integrated object back onto the full object.
# The assignment aligns on the obs index.
adata.obs['leiden'] = adata_bbknn.obs['leiden']

In [None]:
# adata = adata[~(celltype_msk | sample_msk), :]

In [None]:
from liana.method.sp import lr_basis

In [None]:
# We don't have the images, or info for them ;(
# sc.pl.spatial(local_lr, color=['FN1&ITGA5_ITGB1', 'TIMP1&CD63'], cmap='cividis', size=3)

In [None]:
# Compute local ligand-receptor scores separately for each sample and collect
# them as one AnnData per sample.
lrdatas = []
for sample in adata.obs['sample'].unique():
    # Subsetting yields a view of `adata`. The calls below appear to store their
    # results on the object they receive, so take an explicit copy up front
    # rather than relying on an implicit copy-on-write of the view.
    adata_sample = adata[adata.obs['sample'] == sample].copy()

    # Spatial proximity between the observations of this sample.
    li.mt.get_spatial_proximity(adata=adata_sample, parameter=100, bypass_diagonal=False, cutoff=0.1)

    # Local scores; read back just below from the 'local_scores' obsm entry.
    lr_basis(adata_sample, function_name='jaccard', use_raw=False)
    lrdatas.append(li.fun.obsm_to_adata(adata_sample, 'local_scores'))

In [None]:
# how should we merge them?
# Currently: inner join across the per-sample score objects.
# NOTE(review): fill_value presumably only takes effect for outer joins — confirm
# whether it is needed here.
lrdata = sc.concat(lrdatas, join='inner', fill_value=np.nan)

In [None]:
# Inspect the per-observation annotations of the combined score object.
lrdata.obs

In [None]:
# Neighbour graph and UMAP computed on the local scores (default settings).
sc.pp.neighbors(lrdata)
sc.tl.umap(lrdata)

In [None]:
# UMAP of the local scores, coloured by sample and by Leiden cluster.
sc.pl.umap(lrdata, color=['sample', 'leiden'])

In [None]:
# Display the summary of the combined score object.
lrdata

In [None]:
# Same summary again (duplicate of the previous cell).
lrdata

In [None]:
# Aggregate the local scores into a multi-view object: views are defined by
# 'leiden', samples by 'sample', and values are aggregated with mode='mean'.
# The min_* arguments are filters, as annotated on each line.
mdata = li.multi.adata_to_views(lrdata,
                                groupby='leiden',
                                sample_key='sample',
                                obs_keys=None, # TODO this does not work...?
                                min_prop=0.05, # min nnz values (filter features)
                                min_smpls=3, # min samples per view (filter features)
                                min_cells=25, # min cells per view (filter samples)
                                min_counts=0, # min counts per view (filter samples)
                                mode='mean', # mode of aggregation
                                verbose=True,
                                skip_checks=True
                                )

In [None]:
# Expose the obs index as a 'sample' column so it can be used as a merge key.
# presumably the index holds the sample IDs — TODO confirm
mdata.obs['sample'] = mdata.obs.index

In [None]:
# Attach each sample's `major_labl` to the multi-view obs table.
index = mdata.obs.index
sample_labels = lrdata.obs[['major_labl', 'sample']].drop_duplicates()
# merge() discards the index, so it is restored from `index` below.
# A left join keeps exactly one row per existing observation, in the original
# order, so the restored index cannot end up misaligned with the labels.
# `validate` raises immediately if a sample maps to more than one label instead
# of silently duplicating rows.
mdata.obs = mdata.obs.merge(sample_labels, how='left', validate='many_to_one')
mdata.obs.index = index

In [None]:
from mudata import MuData
import muon as mu
import mofax as mofa

In [None]:
# Fit a 5-factor MOFA model on the multi-view object and write it to
# models/lrs.hdf5.
# NOTE(review): the "models/" directory may need to exist beforehand — confirm.
model = mu.tl.mofa(mdata, 
                   use_obs='union',
                   convergence_mode='medium',
                   outfile="models/lrs.hdf5",
                   n_factors=5
                   )

In [None]:
# Re-open the saved model file with mofax; this replaces the `model` name
# assigned in the previous cell.
model = mofa.mofa_model("models/lrs.hdf5")

In [None]:
# Obtain the factor scores stored under the 'X_mofa' obsm key and preview them.
factor_scores = li.multi.get_factor_scores(mdata, obsm_key='X_mofa')
factor_scores.head()

In [None]:
# Variable loadings for the first factor; ':' is passed as the separator
# between view name and variable name.
# NOTE(review): the index 0 is presumably zero-based (factor 1) — confirm.
factor1_loadings =  li.multi.get_variable_loadings(mdata, 0, view_separator=':') # get loadings for factor 1
factor1_loadings.head()

In [None]:
# Take the first 25 distinct variables across all views and keep every row
# (i.e. every view) belonging to them.
# NOTE(review): "top" relies on factor1_loadings already being ordered by
# importance — confirm the ordering returned by get_variable_loadings.
top_genes = factor1_loadings['variable'].drop_duplicates().head(25)
top_loadings = factor1_loadings[factor1_loadings['variable'].isin(top_genes)]

In [None]:
# Plot them as a tile heatmap: view on x, variable on y, fill = loading,
# using a diverging blue / light-gray / red colour scale.
(p9.ggplot(top_loadings) + 
 p9.aes(x='view', y='variable', fill='loadings') + 
 p9.geom_tile() +
 p9.scale_fill_gradient2(low='#1f77b4', mid='lightgray', high='#c20019') + 
 p9.theme_minimal() +
 p9.theme(axis_text_x=p9.element_text(angle=90, hjust=0.5, vjust=0.5), figure_size=(5, 10))
 )

In [None]:
# Display the loaded model object.
model

In [None]:
# R2 plot from mofax, with views on the x axis.
mofa.plot_r2(model, x='View')

In [None]:
# Display the factor score table.
factor_scores

In [None]:
# Factor_1 scores grouped by major_labl: boxplot with the individual points
# overlaid as jitter.
(p9.ggplot(factor_scores) +
 p9.aes(x='major_labl', colour='major_labl', y='Factor_1') +
 p9.geom_boxplot() +
 p9.geom_jitter(size=4, width=0.2) +
 p9.theme_bw()
 )