In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import altair as alt
import novosparc
import random 
import os
# Seed both RNGs used by this notebook: the random marker subsets further down
# are drawn with np.random.shuffle, which random.seed() does not affect.
random.seed(20)
np.random.seed(20)

# Reconstructing Osteosarcoma MERFISH

Here we wish to reconstruct the spatial expression of a single batch of cultured human osteosarcoma cells, reported in the [MERFISH study](https://www.pnas.org/content/116/39/19490).
Since cells are dissociated and only briefly cultured side-by-side, we will use only markers for reconstruction.

Specifically, we show here:
1. Reconstruction with a specific marker
2. Improved reconstruction as the number of markers increases
3. Expression reconstructed with few markers, in particular, highlighting expression microenvironments

* Note: [altair](https://altair-viz.github.io/), the visualization library used here, currently [cannot display its plots within GitHub notebooks](https://github.com/altair-viz/altair/issues/2264). Therefore, plots are saved as HTML files in `output_osteosarcoma/`.

In [2]:
# params
# -- which dataset / batch to reconstruct
dataname = 'osteosarcoma'
batch = 1

# -- column names of the 2D coordinates in the geometry table
coords_cols = ['xcoord', 'ycoord']

# -- genes of interest: two genes showing the microenvironments according to MERFISH paper
gois = ['FGF18', 'SMAD3']
ngois = len(gois)

# -- input files (expression matrix and cell locations) for the chosen batch
data_path = 'novosparc/datasets/%s/dge_%d.txt' % (dataname, batch)
target_space_path = 'novosparc/datasets/%s/geometry_%d.txt' % (dataname, batch)

In [3]:
# plotting
# -- marker settings
max_pts = 5000
pt_size = 100  # passed to mark_circle(size=...) in plot_df_sp_chart

# -- font sizes consumed by to_paper / plot_df_sp_chart
labelFontSize = 15   # axis labels and legend text
labelFontSizeb = 17  # bold facet headers
titleFontSize = 20   # axis titles
fontSize = 20        # chart title

# -- directory where the html plots are written
outdir = 'output_osteosarcoma/'

def to_paper(pl):
    """
    Applies the notebook-wide font settings to a chart.
    : param pl: chart exposing configure_axis / configure_title / configure_legend
    : return: the chart returned by the last configure_* call
    """
    return (
        pl.configure_axis(labelFontSize=labelFontSize, titleFontWeight='normal', titleFontSize=titleFontSize)
        .configure_title(fontSize=fontSize)
        .configure_legend(titleFontSize=labelFontSize, labelFontSize=labelFontSize)
    )
                             
def to_zscore(df_goi):
    """
    Standardizes every column: subtracts the column mean and divides by the column std.
    : param df_goi: dataframe of numeric columns
    : return: dataframe of the same shape holding per-column z-scores
    """
    col_means = df_goi.mean(axis=0)
    col_stds = df_goi.std(axis=0)
    centered = df_goi.sub(col_means, axis='columns')
    return centered.div(col_stds, axis='columns')


def plot_df_sp_chart(df_sp, tit='', gois=gois):
    """
    Builds the (unconfigured) chart of spatial expression.

    Expression of each gene is z-scored separately within every value of the
    'true_pred' column before plotting; the input dataframe is left untouched.

    : param df_sp: dataframe containing locations ('xcoord', 'ycoord'), a 'true_pred'
                   label column and expression for genes-of-interest (gois)
    : param tit: chart title
    : param gois: names of the expression columns to plot (one facet row per gene)
    : return: chart faceted by gene (rows) and 'true_pred' label (columns)
    """
    plot_df = df_sp.copy()

    # standardize within each true/pred group so the groups share a colour scale
    for label in plot_df['true_pred'].unique():
        in_group = plot_df['true_pred'] == label
        plot_df.loc[in_group, gois] = to_zscore(plot_df.loc[in_group, gois])

    # settings shared by both positional axes
    position_scale = alt.Scale(nice=False)
    position_axis = alt.Axis(grid=False, values=[-1000, 0, 1000])
    # facet headers and diverging colour scale for z-scores
    facet_header = alt.Header(labelFontSize=labelFontSizeb, labelFontWeight='bold')
    color_scale = alt.Scale(domain=(-2, 0, 2), range=['blue', 'white', 'red'])

    points = alt.Chart(plot_df, title=tit).mark_circle(size=pt_size)
    # wide -> long: one record per (cell, gene) with columns 'goi' and 'Expression'
    folded = points.transform_fold(gois, ['goi', 'Expression'])

    return folded.encode(
        x=alt.X('xcoord:Q', scale=position_scale, axis=position_axis),
        y=alt.Y('ycoord:Q', scale=position_scale, axis=position_axis),
        row=alt.Row('goi:N', sort=None, title=None, header=facet_header),
        column=alt.Column('true_pred:N', title=None, header=facet_header),
        color=alt.Color('Expression:Q', scale=color_scale),
    )

def plot_df_sp(df_sp, tit='', gois=gois, fname=None):
    """
    Plots spatial expression, displays the chart and optionally saves it.
    : param df_sp: dataframe containing locations and expression for genes-of-interest (gois)
    : param tit: chart title
    : param gois: names of the expression columns to plot
    : param fname: file name (relative to outdir) to save the chart to; nothing is saved if falsy
    : return: the configured chart
    """
    pl = plot_df_sp_chart(df_sp=df_sp, tit=tit, gois=gois)
    pl = pl.configure_axis(grid=False).configure_view(strokeOpacity=0)
    pl = to_paper(pl)
    pl.display()
    if fname:
        fname = os.path.join(outdir, fname)
        # create the output directory on first use so saving works on a fresh checkout
        os.makedirs(os.path.dirname(fname) or '.', exist_ok=True)
        # pl already carries the to_paper styling, no need to apply it again
        pl.save(fname)
    return pl

In [4]:
# quantitative evaluation of reconstruction
def comp_corr(df_dge1, df_dge2):
    """
    Per-gene correlation between two expression tables (e.g. true vs. predicted).

    For every column of df_dge1, the correlation coefficient with the
    same-named column of df_dge2 is computed.
    Returns a dict mapping gene name -> correlation.
    """
    return {
        gene: np.corrcoef(df_dge1[gene], df_dge2[gene])[0, 1]
        for gene in df_dge1.columns
    }

def get_corrs_median(gene_corrs):
    """Returns the median over the values of a gene -> correlation dict"""
    corr_values = list(gene_corrs.values())
    return np.median(corr_values)

## Read Data

In [5]:
# read data
dataset = novosparc.io.load_data(data_path)
# normalize and log-transform; both calls are used for their effect on `dataset`
sc.pp.normalize_per_cell(dataset)
sc.pp.log1p(dataset)

locations = novosparc.io.load_target_space(target_space_path, coords_cols=coords_cols)

df_dge = dataset.to_df() # cells x genes
df_locs = pd.DataFrame(locations, columns=coords_cols)
# share one index so expression and locations line up row-by-row in the concat below
# NOTE(review): assumes the rows of the dge and geometry files are in the same order — confirm
df_dge.index = df_locs.index
ncells,ngenes = df_dge.shape
genenames = df_dge.columns

# `axis` must be passed by keyword: positional use is rejected by pandas >= 2.0
df_sp_true = pd.concat((df_dge[gois], df_locs), axis=1)
df_sp_true['true_pred'] = 'True'

# plotting the true expression
pl_true = plot_df_sp(df_sp_true, tit='True Spatial Expression', fname='gois_true.html')

## Reconstruction with a Single, Set Marker

Here we attempt reconstruction with a single gene. 

In [6]:
# construct tissue
tissue = novosparc.cm.Tissue(dataset=dataset, locations=locations)

# params
alpha_linear = 1.0
df_marker = df_dge # using expression from data as markers

# using a specific gene, e.g. PKM
marker_name = 'PKM'
marker_idx = np.where(genenames == marker_name)[0]
marker_names = [marker_name]

# compute linear cost only
tissue.setup_linear_cost(marker_idx, df_marker[marker_names].values)

# reconstruct
tissue.reconstruct(alpha_linear=alpha_linear, verbose=False)

# tissue.sdge is transposed so that columns are genes, matching df_dge
df_sdge = pd.DataFrame(tissue.sdge.T, columns=genenames)

# per-gene correlation between measured and reconstructed expression
gene_corrs = comp_corr(df_dge, df_sdge)

# `axis` must be passed by keyword: positional use is rejected by pandas >= 2.0
df_sp_pred = pd.concat((df_sdge[gois].astype('float32'), pd.DataFrame(locations, columns=coords_cols)), axis=1)
df_sp_pred['true_pred'] = 'Pred'

# title reports the median gene correlation; true and predicted rows are stacked for side-by-side facets
tit = 'Using %s Marker Only (%.02f)' %  (marker_name, get_corrs_median(gene_corrs))
pl_single_marker = plot_df_sp(pd.concat((df_sp_true, df_sp_pred), axis=0), tit=tit, fname='gois_pred_using_%s.html' % marker_name)

Trying with epsilon: 5.00e-04


We see that already with a single marker we get a really good reconstruction:
- the median expression correlation across all genes of the predicted and true expression is high
- expression microenvironments are recovered

## Assessing Reconstruction with Multiple Markers

Overall, we see an improvement in reconstruction when using more (random) marker genes. 
Note that using a single marker gives a relatively conservative correlation baseline.
When adding a second marker, there is large variability in how much the reconstruction improves. 
With 4-8 markers we already get a nearly perfect recovery.


In [7]:
# experiment settings: number of random marker sets per size, and the sizes to try
n_repeats = 20
alpha_linear = 1.0
n_markers = [1, 2, 4, 8]

# gene indices; shuffled in place below to draw random marker subsets
idx = np.arange(len(genenames))

# one record per reconstruction
results_sl = []

for n_marker in n_markers:
    for repeat in range(n_repeats):
        # random marker set = first n_marker entries of a fresh permutation
        np.random.shuffle(idx)
        marker_idx = idx[:n_marker]
        marker_names = genenames[marker_idx]

        # only choice of markers varies
        tissue.setup_linear_cost(marker_idx, df_marker[marker_names].values)
        tissue.reconstruct(alpha_linear=alpha_linear, verbose=False)

        df_sdge = pd.DataFrame(tissue.sdge.T, columns=genenames)

        # summarize reconstruction quality by the median per-gene correlation
        gene_corrs = comp_corr(df_dge, df_sdge)
        median = get_corrs_median(gene_corrs)

        record = {
            'median gene corr': median,
            'num markers': n_marker,
            'gois': df_sdge[gois].astype('float32'),
        }
        results_sl.append(record)

df_results_sl = pd.DataFrame(results_sl)

Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying with epsilon: 5.00e-04
Trying wit

In [8]:
# boxplot of median gene correlation, one box per number of markers
pl_width = 200
tit = 'Improving Reconstruction with Marker Genes'
fname = os.path.join(outdir, 'reconstruction_corr_nmarkers.html')

scale_y = alt.Scale(domain=[0.0,1.1])
# only the two scalar columns are handed to the chart (the 'gois' column holds dataframes)
base = alt.Chart(df_results_sl[['median gene corr', 'num markers']], title=tit, width=pl_width)
pl_sl = base.mark_boxplot().encode(
    x='num markers:N',
    y=alt.Y('median gene corr:Q', scale=scale_y),
    color='num markers:N',
)

to_paper(pl_sl).save(fname)

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


## Visualizing Recovery of Microenvironments

Visualizing the predicted expression, we see an overall agreement between the median gene correlation and the recovery of the microenvironments.
We also see the variability of the reconstruction depending on the selected (random) markers.

In [9]:
# plot recovery
goi_to_plot = gois

# subsetting plots: keep the first few reconstructions for every number of markers
n_plots_per_nmarker = 3
nmarker_groups = df_results_sl.groupby('num markers').groups
idx_plots = [list(v[:n_plots_per_nmarker]) for v in nmarker_groups.values()]

pls_nmarkers = []
for i,row in df_results_sl.loc[np.concatenate(idx_plots)].iterrows():
    # selecting with a list of columns always yields a dataframe, also for a single gene,
    # so no Series -> frame conversion is needed
    df_goi_pred = row['gois'][goi_to_plot]

    # `axis` must be passed by keyword: positional use is rejected by pandas >= 2.0
    df_sp_pred = pd.concat((df_goi_pred, pd.DataFrame(locations, columns=coords_cols)), axis=1)
    df_sp_pred['true_pred'] = 'Pred'

    tit = 'num marker %d, median gene corr %.02f' % (row['num markers'], row['median gene corr'])
    pls_nmarkers.append(plot_df_sp_chart(pd.concat((df_sp_true, df_sp_pred), axis=0), tit=tit, gois=goi_to_plot))
    
# concat and plot
pl_nmarkers = alt.vconcat(*pls_nmarkers)
pl_nmarkers = pl_nmarkers.configure_axis(grid=False).configure_view(strokeOpacity=0)
pl_nmarkers = to_paper(pl_nmarkers)
pl_nmarkers.display()
fname = os.path.join(outdir, 'gois_pred_nmarkers.html')
pl_nmarkers.save(fname)