In [1]:
import scanpy as sc
import plotnine as p9

import liana as li
import decoupler as dc # needed for pathway analysis

import numpy as np
import pandas as pd

In [2]:
import muon as mu
import mofax as mofa

## Preprocess

In [3]:
# Load the `kang_2018` example dataset shipped with liana's testing utilities.
adata = li.testing.datasets.kang_2018()

In [6]:
# Show the AnnData summary (dimensions plus the available obs/var/obsm/layers keys).
adata

AnnData object with n_obs × n_vars = 24673 × 15706
    obs: 'nCount_RNA', 'nFeature_RNA', 'tsne1', 'tsne2', 'condition', 'cluster', 'cell_type', 'patient', 'nCount_SCT', 'nFeature_SCT', 'integrated_snn_res.0.4', 'seurat_clusters', 'sample', 'cell_abbr'
    var: 'name'
    obsm: 'X_pca', 'X_umap'
    layers: 'counts'

In [5]:
# Names of the `adata.obs` columns used by the rest of the notebook:
# `sample_key` is passed wherever a per-sample split is required,
# `groupby` wherever cells are grouped (here by the abbreviated cell type).
sample_key, groupby = 'sample', 'cell_abbr'

In [7]:
# Display the AnnData summary again before the views are built.
adata

AnnData object with n_obs × n_vars = 24673 × 15706
    obs: 'nCount_RNA', 'nFeature_RNA', 'tsne1', 'tsne2', 'condition', 'cluster', 'cell_type', 'patient', 'nCount_SCT', 'nFeature_SCT', 'integrated_snn_res.0.4', 'seurat_clusters', 'sample', 'cell_abbr'
    var: 'name'
    obsm: 'X_pca', 'X_umap'
    layers: 'counts'

## MOFAcell

In [None]:
from liana.multi import adata_to_views, lrs_to_views, get_variable_loadings, get_factor_scores

In [None]:
# Filtering thresholds handed to `adata_to_views`
# (descriptions carried over from the original inline notes).
view_filters = {
    'min_prop': 0.05,   # min nnz values (filter features)
    'min_smpls': 3,     # min samples per view (filter features)
    'min_cells': 25,    # min cells per view (filter samples)
    'min_counts': 100,  # min counts per view (filter samples)
}

# Build the multi-view object: cells are grouped by `groupby` and split by
# `sample_key`; 'condition' and 'patient' are requested as obs annotations.
# NOTE(review): the result is used below as a MuData-like object (`.mod`,
# `.obsm`) - confirm against the liana docs.
mdata = adata_to_views(
    adata,
    groupby=groupby,
    sample_key=sample_key,
    obs_keys=['condition', 'patient'],
    mode='sum',  # mode of aggregation
    verbose=True,
    **view_filters,
)

In [None]:
# Inspect the multi-view object that was just built.
mdata

#### Normalize and get HVGs for each view

In [None]:
# Apply the same preprocessing to every view: normalise totals to 1e4,
# log1p-transform, then flag highly variable genes ('cell_ranger' flavour).
# These scanpy calls are applied directly to each view's object, so running
# this cell a second time would transform already-transformed values.
for view_adata in mdata.mod.values():
    sc.pp.normalize_total(view_adata, target_sum=1e4)
    sc.pp.log1p(view_adata)
    sc.pp.highly_variable_genes(view_adata, flavor='cell_ranger')

In [None]:
# Check how a single view looks (here the one stored under the key 'B').
mdata.mod['B']

Run MOFA

In [None]:
# Fit a 5-factor MOFA model on the multi-view object.
# Later cells read the results from `mdata.obsm['X_mofa']` and
# `mdata.varm['LFs']`, and reopen the saved model file with mofax.
# NOTE(review): the output keeps the `.h5ad` extension because a later cell
# opens this exact path; confirm whether `.hdf5` would be the clearer name.
mu.tl.mofa(
    mdata,
    n_factors=5,
    use_obs='union',
    convergence_mode='medium',
    outfile='models/mofacell.h5ad',
    verbose=True,
)

In [None]:
# Variable loadings for index 0 (presumably the first factor - confirm
# against the `get_variable_loadings` docs), with ':' as the view separator.
factor1_loadings = get_variable_loadings(
    mdata,
    0,
    view_separator=':',
)
factor1_loadings

In [None]:
# Show the documentation of `get_variable_loadings`.
# `help()` is used instead of the IPython-only `?name` syntax so the cell is
# valid Python (e.g. when the notebook is exported to a script or linted).
help(get_variable_loadings)

In [None]:
# Variable names from the first 25 rows of the loadings table.
# NOTE(review): this assumes the table is already ordered by importance and
# may contain the same variable more than once across views - confirm.
top_genes = factor1_loadings['variable'].iloc[:25]

In [None]:
# Display the selected variable names.
top_genes

In [None]:
# Keep every row (across all views) whose variable is among the selected ones.
is_top_gene = factor1_loadings['variable'].isin(top_genes)
top_loadings = factor1_loadings[is_top_gene]

In [None]:
import plotnine as p9

In [None]:
# Tile heatmap of the selected loadings: views on x, variables on y,
# fill colour = loading on a diverging blue / light-gray / red scale.
loadings_heatmap = (
    p9.ggplot(top_loadings, p9.aes(x='view', y='variable', fill='loadings'))
    + p9.geom_tile()
    + p9.scale_fill_gradient2(low='blue', mid='lightgray', high='red')
    + p9.theme_minimal()
    + p9.theme(
        axis_text_x=p9.element_text(angle=90, hjust=0.5, vjust=0.5),
        figure_size=(5, 5),
    )
)
loadings_heatmap

In [None]:
# Collect the factor scores stored under `obsm['X_mofa']` into a table.
# NOTE(review): the plot below also reads a 'condition' column from this
# table - confirm that `get_factor_scores` attaches the obs annotations.
factor_scores = get_factor_scores(mdata, obsm_key='X_mofa')

In [None]:
# Violin plot with jittered points: 'Factor_1' scores split by condition.
factor1_by_condition = (
    p9.ggplot(
        factor_scores,
        p9.aes(x='condition', colour='condition', y='Factor_1'),
    )
    + p9.geom_violin()
    + p9.geom_jitter(size=4, width=0.2)
    + p9.theme_bw()
)
factor1_by_condition

In [None]:
# Dimensions of the factor matrix stored under `obsm['X_mofa']`.
mdata.obsm['X_mofa'].shape

R-squared per view

In [None]:
# Open the model file written by the MOFAcell run with mofax and show its summary.
# The model is closed again with `model.close()` once the plots below are drawn.
model = mofa.mofa_model("models/mofacell.h5ad")
model

In [None]:
# Plot R-squared with views placed on the x axis.
mofa.plot_r2(model, x='View')

Genes with high loadings

In [None]:
# Check the "Factor1" loadings in the 'CD14' view (CD14+ monocytes).
mofa.plot_weights(model, views=['CD14'], factors="Factor1", label_size=10)

In [None]:
# Close the mofax model now that it is no longer needed.
model.close()

In [None]:
# Neighbour graph and UMAP computed on the MOFA factors, coloured by
# condition and patient. `random_state=1` fixes the UMAP seed.
umap_colours = ['condition', 'patient']
sc.pp.neighbors(mdata, use_rep="X_mofa")
sc.tl.umap(mdata, random_state=1)
sc.pl.umap(mdata, color=umap_colours, frameon=False)

In [None]:
# Dimensions of the matrix stored under `varm['LFs']`.
mdata.varm['LFs'].shape

In [None]:
# Inspect the multi-view object after the MOFA and UMAP steps.
mdata

In [None]:
# Display the factor-score table.
factor_scores

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest used to check how well the factors separate the conditions.
clf = RandomForestClassifier(
    random_state=0,
    oob_score=True,     # `clf.oob_score_` is read after every fit below
    warm_start=True,    # keep already-fitted trees when `n_estimators` grows
    max_features=None,
)

In [None]:
# Features are the MOFA factor scores; labels are the per-observation condition.
X, y = mdata.obsm['X_mofa'], mdata.obs['condition']

In [None]:
# Bounds of the `n_estimators` sweep, and the list that collects
# one (n_estimators, oob_error) pair per fitted setting.
min_estimators, max_estimators = 2, 150
error_rate = list()

In [None]:
# Grow the forest in steps of 5 trees and record the out-of-bag error
# (1 - OOB score) reached at each size.
for n_trees in range(min_estimators, max_estimators + 1, 5):
    clf.set_params(n_estimators=n_trees).fit(X, y)
    error_rate.append((n_trees, 1 - clf.oob_score_))

In [None]:
# Show the classifier with the parameters left by the sweep.
clf

In [None]:
# Line plot of the out-of-bag error against the number of trees.
oob_curve = pd.DataFrame(error_rate, columns=['n_estimators', 'oob_error'])
(
    p9.ggplot(oob_curve, p9.aes(x='n_estimators', y='oob_error'))
    + p9.geom_line()
)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 8-fold cross-validation of the classifier on the factor scores,
# scored with macro-averaged F1.
scores = cross_val_score(
    clf,
    X,
    y,
    scoring='f1_macro',
    cv=8,
)

In [None]:
# Summarise the cross-validation result. The scores were computed with
# `scoring='f1_macro'`, so the message reports macro-F1 rather than accuracy.
print("%0.2f mean macro-F1 with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

## MOFAtalk

In [None]:
# Normalise total counts and log1p-transform the cell-level data before
# running liana.
# NOTE(review): both calls act on `adata` directly, so this cell should be
# run exactly once per loaded dataset - re-running it transforms the data again.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

In [None]:
# Run liana's rank_aggregate separately for every sample.
# NOTE(review): the results are presumably stored on `adata`, since the next
# step passes `adata` to `lrs_to_views` - confirm against the liana docs.
li.mt.rank_aggregate.by_sample(
    adata,
    sample_key=sample_key,  # sample key by which we wish to loop
    groupby=groupby,
    use_raw=False,
    n_perms=100,  # reduce permutations for speed
    return_all_lrs=False,  # important for how missing values are handled
    verbose=True,  # use 'full' to show all information
)

### lrs_to_views

In [None]:
# Build a new multi-view object from the per-sample results, using
# 'magnitude_rank' as the score. This rebinds `mdata`, replacing the
# MOFAcell object created earlier in the notebook.
mdata = lrs_to_views(
    adata,
    obs_keys=['patient', 'condition'],
    score_key='magnitude_rank',
    verbose=True,
)

In [None]:
# Fit a 5-factor MOFA model on the interaction views and save it
# (including metadata) to the path that the next cell reopens with mofax.
mu.tl.mofa(
    mdata,
    n_factors=5,
    use_obs='union',
    convergence_mode='medium',
    save_metadata=True,
    outfile='models/talk.h5ad',
)

In [None]:
# Open the saved MOFAtalk model file with mofax.
model = mofa.mofa_model('models/talk.h5ad')

In [None]:
# Read the R-squared table, then close the model as was done for the
# MOFAcell model earlier. Leaving it open keeps the model file in use, which
# can block re-running the MOFA cell that writes to the same path.
# NOTE(review): no later cell in this notebook section uses `model`; if one
# is added, move the `close()` call after it.
talk_r2 = model.get_r2()
model.close()
talk_r2

In [None]:
# Neighbour graph and UMAP on the MOFAtalk factors, coloured by condition
# and patient. `random_state=1` fixes the UMAP seed.
umap_colours = ['condition', 'patient']
sc.pp.neighbors(mdata, use_rep="X_mofa")
sc.tl.umap(mdata, random_state=1)
sc.pl.umap(mdata, color=umap_colours, frameon=False)

### Extract MOFA values

Both functions (`get_variable_loadings` and `get_factor_scores`) should work when `mdata` is passed to them directly.

In [None]:
# Loadings for index 0 (presumably the first factor - confirm), with the
# three separators given explicitly: ':' for view, '^' for variable and
# '&' for pair.
factor1_loadings = get_variable_loadings(
    mdata,
    0,
    view_separator=':',
    variable_separator='^',
    pair_separator='&',
)

In [None]:
# Remove the identifier columns that are not needed for the dotplot.
# The result is rebound instead of using `inplace=True`, and missing columns
# are ignored, so re-running this cell no longer raises a KeyError once the
# columns have already been dropped.
factor1_loadings = factor1_loadings.drop(
    columns=['view:variable', 'view', 'variable'],
    errors='ignore',
)

In [None]:
# Constant 'size' column: the dotplot below maps dot size to it, so every
# row gets the same value.
DOT_SIZE = 5
factor1_loadings['size'] = DOT_SIZE

In [None]:
# Dotplot of the 20 top rows ordered by descending loading, coloured by
# loading and restricted to the listed source labels.
my_plot = li.pl.dotplot(
    liana_res=factor1_loadings,
    colour='loadings',
    size='size',
    orderby='loadings',
    orderby_ascending=False,
    top_n=20,
    source_labels=['NK', 'B', 'CD4T', 'CD8T'],
    size_range=(0.1, 6),
    figure_size=(8, 8),
)

In [None]:
# Recolour with a diverging scale: blue (low), light gray (mid), red (high).
my_plot + p9.scale_color_gradient2(low='blue', mid='lightgray', high='red')

Get Factor Scores

In [None]:
# Collect the factor scores stored under `obsm['X_mofa']` into a table.
# NOTE(review): the plot below also reads a 'condition' column from this
# table - confirm that `get_factor_scores` attaches the obs annotations.
factor_scores = get_factor_scores(mdata, obsm_key='X_mofa')

In [None]:
# Violin plot with jittered points: 'Factor_1' scores split by condition.
factor1_by_condition = (
    p9.ggplot(
        factor_scores,
        p9.aes(x='condition', colour='condition', y='Factor_1'),
    )
    + p9.geom_violin()
    + p9.geom_jitter(size=4, width=0.2)
    + p9.theme_bw()
)
factor1_by_condition