# Score mouse regions in human survival data

In [None]:
import os
import warnings
warnings.filterwarnings('ignore','invalid value encountered in true_divide')

import pandas as pd
import numpy as np
import anndata as ad

import scanpy as sc

import seaborn as sns

import tacco as tc

from statsmodels.stats.multitest import multipletests
import lifelines
kmf = lifelines.KaplanMeierFitter()

In [None]:
import sys
# Make the project-local `helper` module importable. The notebook may be run
# from the sub-workflow directory or from the notebooks directory, so both
# candidate locations are put on the search path temporarily. After the
# splice, '../workflow/' sits at index 1 and '../' at index 2, i.e.
# '../workflow/' is searched first.
sys.path[1:1] = ['../workflow/', '../']
import helper
# Remove the two temporary entries again (indices 1 and 2).
del sys.path[1:3]

# Path resolver for the 'tcga' sub-workflow and the folder receiving the figures.
get_path = helper.get_paths('tcga')
figures_folder = get_path('plots')

## Load data and convert the genes to the common MGI homology classes

In [None]:
# Configure tacco's orthology converter with the MGI homology table
# (HOM_AllOrganism.rpt) found under the "human_sc" resources directory.
# NOTE(review): presumably this must run before any run_orthology_converter call - confirm.
tc.tl.setup_orthology_converter(f'{get_path("resources","human_sc")}/MGI/HOM_AllOrganism.rpt');

In [None]:
# Load the two mouse Slide-seq AnnData files from the "mouse_slideseq"
# resources directory: the full dataset and its "by compartment" variant.
mouse_slideseq = ad.read(f'{get_path("resources","mouse_slideseq")}/slideseq.h5ad')
mouse_slideseq_by_compartment = ad.read(f'{get_path("resources","mouse_slideseq")}/slideseq_by_compartment.h5ad')

In [None]:
# use only data from sufficiently covered beads
# Keep observations whose row sum over X is at least 100.
mouse_slideseq = mouse_slideseq[mouse_slideseq.X.sum(axis=1)>=100].copy()
# Restrict the by-compartment data to rows whose obs['index'] value names one
# of the retained beads (matched against the obs index of the filtered data).
mouse_slideseq_by_compartment = mouse_slideseq_by_compartment[mouse_slideseq_by_compartment.obs['index'].isin(mouse_slideseq.obs.index)].copy()

In [None]:
# Map the mouse genes of the by-compartment data through tacco's orthology
# converter; the result replaces the original object.
mouse_slideseq_by_compartment = tc.tl.run_orthology_converter(mouse_slideseq_by_compartment, 'mouse', use_synonyms=False) # no synonyms here to keep integer counts (also all counts are already used without synonyms)

In [None]:
# Load the TCGA expression data.
TCGA_adata = ad.read(f'{get_path("data")}/pancanatlas.h5ad')
# Keep only samples whose obs['type'] is READ or COAD.
TCGA_adata = TCGA_adata[TCGA_adata.obs['type'].isin(['READ','COAD'])]
# Drop every gene (column) that contains a NaN in any sample.
TCGA_adata = TCGA_adata[:,~np.isnan(TCGA_adata.X).any(axis=0)]
# Drop every sample (row) that contains a NaN.
# NOTE(review): after the gene-wise filter above no NaN can remain, so this
# line cannot remove anything; it only acts as a safeguard.
TCGA_adata = TCGA_adata[~np.isnan(TCGA_adata.X).any(axis=1),:]
# Drop every gene that has a negative value in any sample (log1p follows below).
TCGA_adata = TCGA_adata[:,~(TCGA_adata.X < 0).any(axis=0)]
TCGA_adata = tc.tl.run_orthology_converter(TCGA_adata, 'human', use_synonyms=True) # use synonyms here as it increases the amount of data used (and the integer nature of the data is irrelevant)
# Filter both datasets together, removing constant genes and returning copies
# rather than views.
# NOTE(review): presumably this also restricts both objects to their shared genes - confirm in the tacco docs.
mouse_slideseq_by_compartment,TCGA_adata = tc.pp.filter([mouse_slideseq_by_compartment,TCGA_adata], return_view=False, remove_constant_genes=True)
# preprocessing for score calculation
# Both scanpy calls modify TCGA_adata in place: log1p first, then per-gene scaling.
sc.pp.log1p(TCGA_adata)
sc.pp.scale(TCGA_adata)

# Get DE genes for the regions

In [None]:
def DEG_regions(selected_regions):
    """Return ranked marker-gene lists for the selected regions.

    Calls ``helper.marker_genes`` on the module-level
    ``mouse_slideseq_by_compartment`` data, grouped by the ``'region'``
    annotation and restricted to ``selected_regions`` (GO analysis disabled
    via ``rungo=False``).

    Parameters
    ----------
    selected_regions
        Region names handed to ``helper.marker_genes`` as ``restrict_groups``.

    Returns
    -------
    dict
        Maps each region with at least one result row to a pandas Series of
        upper-cased entries of the ``'value'`` column, ordered by ascending
        ``'p_fisher_fdr_bh'``.
    """
    group_key = 'region'

    enrichments = helper.marker_genes(
        mouse_slideseq_by_compartment,
        group_key,
        rungo=False,
        restrict_groups=selected_regions,
    )

    genes_by_region = {}
    for region, region_df in enrichments.groupby(group_key):
        # groupby can yield empty groups; those regions are left out
        if len(region_df) == 0:
            continue
        ranked = region_df.sort_values('p_fisher_fdr_bh')
        genes_by_region[region] = ranked['value'].str.upper()
    return genes_by_region
# Region selections used for scoring:
#   'aR' - every category of the 'region' annotation
#   'mR' - only the categories whose name contains 'Malignant'
region_sets = {
    'aR': list(mouse_slideseq_by_compartment.obs['region'].cat.categories),
    'mR': [
        name
        for name in mouse_slideseq_by_compartment.obs['region'].cat.categories
        if 'Malignant' in name
    ],
}
# Marker-gene lists per selection: {selection: {region: ranked genes}}
gene_lists = {
    set_name: DEG_regions(set_regions)
    for set_name, set_regions in region_sets.items()
}

# Score bulk data with the DE genes for the regions

In [None]:
def score_adata(adata, score, nDEG=200):
    """Add per-region expression scores and their quartiles to ``adata.obs``.

    For every region in the module-level ``gene_lists[score]`` two columns are
    written (existing columns of the same name are overwritten):

    * ``'{region}_{score}score_{nDEG}'`` - mean of ``adata.X`` over the first
      ``nDEG`` genes of the region's gene list, per observation.
    * ``'{region}_{score}q4_{nDEG}'`` - quartile of that score with the labels
      ``'q1'`` .. ``'q4'`` (from ``pd.qcut``).

    Parameters
    ----------
    adata
        AnnData object to score; modified in place.
    score
        Key into ``gene_lists`` selecting the set of regions.
    nDEG
        Number of genes taken from the head of each gene list.
    """
    quartile_labels = ['q1','q2','q3','q4']
    for region, genes in gene_lists[score].items():
        score_key = f'{region}_{score}score_{nDEG}'
        quartile_key = f'{region}_{score}q4_{nDEG}'

        top_genes = genes.head(nDEG)
        adata.obs[score_key] = adata[:, top_genes].X.mean(axis=1).copy()
        adata.obs[quartile_key] = pd.qcut(adata.obs[score_key], q=4, labels=quartile_labels)

In [None]:
def plotit(adata, selected_endpoints, score, nDEGs=None):
    """Plot Kaplan-Meier curves of the lowest vs. highest score quartile.

    The data is first scored with ``score_adata`` for every value in
    ``nDEGs`` (this adds columns to ``adata.obs``). Then, for every endpoint,
    observations with a missing event time or event indicator are dropped,
    and for every region in ``region_sets[score]``:

    * a log-rank test compares the ``'q4'`` and ``'q1'`` score quartiles,
    * the p-values of all regions are corrected together (Benjamini-Hochberg),
    * the Kaplan-Meier curves of ``'q1'`` and ``'q4'`` are drawn, with both
      p-values in the panel title.

    Parameters
    ----------
    adata
        AnnData object whose ``obs`` holds the columns ``'{endpoint}'`` (event
        indicator) and ``'{endpoint}.time'`` (event time) for every endpoint.
        Modified in place by the scoring step.
    selected_endpoints
        Endpoint names. Must contain exactly one entry when more than one
        ``nDEGs`` value is given.
    score
        Key into ``region_sets`` / ``gene_lists`` selecting the regions.
    nDEGs
        Gene-list lengths to score with; ``None`` means ``[200]``.

    Returns
    -------
    fig, axs
        The figure and axes returned by ``tc.pl.subplots``. ``axs`` is indexed
        as ``axs[row][region_index]`` with one row per endpoint (single nDEG
        value) or one row per nDEG value (single endpoint).

    Raises
    ------
    ValueError
        If several ``nDEGs`` values are combined with anything other than
        exactly one endpoint.
    """
    regions = region_sets[score]
    if nDEGs is None:
        nDEGs = [200]
    elif len(nDEGs) > 1 and len(selected_endpoints) != 1:
        # An explicit exception instead of `assert`, which is stripped under -O
        raise ValueError(
            'plotit can vary either the endpoints or the nDEG values, not both: '
            f'got {len(selected_endpoints)} endpoints and {len(nDEGs)} nDEG values'
        )
    single_nDEG = len(nDEGs) == 1
    n_y = len(selected_endpoints) if single_nDEG else len(nDEGs)

    fig, axs = tc.pl.subplots(len(regions), n_y, sharex=True, sharey=True)

    # Add the score and quartile columns to adata.obs for every requested nDEG
    for nDEG in nDEGs:
        score_adata(adata, score, nDEG=nDEG)

    for endpoint_i, endpoint in enumerate(selected_endpoints):
        time_key = f'{endpoint}.time'

        # Only obs columns are needed below, so work on the obs table directly
        # and drop the rows lacking either the event time or the event flag.
        obs = adata.obs
        obs = obs[~(obs[time_key].isna() | obs[endpoint].isna())]

        for i_nDEG, nDEG in enumerate(nDEGs):
            ax_arr = axs[endpoint_i] if single_nDEG else axs[i_nDEG]

            # Build the q1/q4 subsets once per region and reuse them for both
            # the log-rank test and the plotting.
            quartile_groups = []
            p_vals = []
            for region in regions:
                quartile = obs[f'{region}_{score}q4_{nDEG}']
                groups = {qx: obs[quartile == qx] for qx in ('q1', 'q4')}
                quartile_groups.append(groups)

                high_sub, low_sub = groups['q4'], groups['q1']
                p_vals.append(lifelines.statistics.logrank_test(high_sub[time_key], low_sub[time_key], high_sub[endpoint], low_sub[endpoint], alpha=.95).p_value)

            # Correct across the regions of this row of panels
            p_vals_corrected = multipletests(p_vals, alpha=0.05, method='fdr_bh')[1]

            for ax_i, region in enumerate(regions):
                ax = ax_arr[ax_i]
                # dict order is q1, then q4
                for qx, this_sub in quartile_groups[ax_i].items():
                    kmf.fit(this_sub[time_key], event_observed=this_sub[endpoint], label=f'{region}_{qx}')
                    kmf.plot_survival_function(ax=ax, show_censors=True)

                ax.set_title(f'{endpoint} nDEG={nDEG}\n{region},\np={p_vals[ax_i]:.3f}, p_fdr_bh={p_vals_corrected[ax_i]:.3f}')
                ax.legend(loc="lower left")

    return fig, axs

# Visualize the scores

In [None]:
# Survival plots over all samples: one figure per endpoint, saved as PDF.
selected_score = 'mR'
selected_endpoints = ['OS','PFI']
nDEG = 200

for selected_endpoint in selected_endpoints:
    fig, axs = plotit(
        adata=TCGA_adata,
        selected_endpoints=[selected_endpoint],
        score=selected_score,
        nDEGs=[nDEG],
    )
    fig.savefig(
        f'{figures_folder}/{selected_score}_tcga_survival_{selected_endpoint}.pdf',
        bbox_inches='tight',
    )

## Separately for subtypes

In [None]:
# Report the number of samples overall and per MSI status prefix.
print('n_samples')
print(f'all: {TCGA_adata.obs.shape[0]}')
for MSI_status in ['MSI','MSS']:
    # count the samples whose MSIstatus string starts with the given prefix
    print(f'{MSI_status}: {int(TCGA_adata.obs["MSIstatus"].astype(str).str.startswith(MSI_status).sum())}')

In [None]:
# Survival plots per MSI subtype: one figure per (subtype, endpoint), saved as PDF.
selected_score = 'mR'
selected_endpoints = ['OS','PFI']
nDEG = 200

for MSI_status in ['MSI','MSS']:
    for selected_endpoint in selected_endpoints:
        # plotit adds score columns to its input, so it gets a fresh copy of
        # the samples whose MSIstatus string starts with the subtype prefix
        fig, axs = plotit(
            adata=TCGA_adata[TCGA_adata.obs['MSIstatus'].astype(str).str.startswith(MSI_status)].copy(),
            selected_endpoints=[selected_endpoint],
            score=selected_score,
            nDEGs=[nDEG],
        )
        fig.savefig(
            f'{figures_folder}/{selected_score}_tcga_{MSI_status}_survival_{selected_endpoint}.pdf',
            bbox_inches='tight',
        )