In [None]:
import pandas as pd
import numpy as np
import scipy
import scanpy as sc
import re
import tacco as tc

In [None]:
import sys

# Make helper functions available: the notebook expects to be executed either
# in the sub-workflow directory or in the notebooks directory, so both
# candidate locations are put on the module search path for the import only.
sys.path.insert(1, '../')
sys.path.insert(1, '../workflow/')  # inserted last, hence searched before '../'
import helper
# drop the two temporary entries again
del sys.path[1:3]

# path resolver for this sub-workflow
get_path = helper.get_paths('mouse_slideseq')

# settings

## visualization settings

In [None]:
# One color mapping per annotation type used in the plots below; the palettes
# are requested from the helper module in the order of the names on the left.
(
    compartment_colors,
    labels_colors,
    cluster_colors,
    program_colors,
    region_colors,
    mostly_gray_region_colors,
) = (
    helper.get_colors(palette)
    for palette in ('compartment', 'labels', 'cluster', 'program', 'region', 'mostly_gray_region')
)

In [None]:
# directory that receives every exported figure
figures_folder = get_path('plots')
# point size passed to all puck scatter plots
puck_point_size = 3

In [None]:
# utility function for plotting
def state_plot_grid(padding=None):
    """Create a 2-row by 6-column axes grid and return ten of its axes as a (1,10) array.

    The returned array implements a fixed mapping of sample to axis to sort
    the samples by state: the first four slots are the first four axes of the
    second grid row, the remaining six slots are the six axes of the first
    grid row. The two unused axes at the end of the second row are switched
    off.

    Parameters
    ----------
    padding
        Forwarded as both `x_padding` and `y_padding` to `tc.pl.subplots`.

    Returns
    -------
    An array of shape (1,10) containing the selected axes. The figure itself
    is not returned.
    """
    fig, grid = tc.pl.subplots(6, 2, x_padding=padding, y_padding=padding)
    state_axes = np.empty_like(grid, shape=(1, 10))
    # second grid row first (4 axes), then the complete first grid row (6 axes)
    state_axes[0, :4] = grid[1, :4]
    state_axes[0, 4:] = grid[0, :6]
    # hide the two grid cells which do not hold a sample
    for unused_axis in (grid[1, 4], grid[1, 5]):
        unused_axis.axis('off')
    return state_axes

## analysis settings

In [None]:
# keyword arguments shared by all calls to tc.tl.enrichments in this notebook
enrichment_method = dict(
    reduction='sum',
    normalization='clr',
    method='welch',
    assume_counts=True,
)
# name of the p-value column which is plotted; it depends on the chosen method
p_key = f'p_{enrichment_method["method"]}_fdr_bh'

# Load mouse data

In [None]:
# annotated reference dataset (file name suggests scRNA-seq)
reference = sc.read(
    f'{get_path("resources","mouse_sc")}/scRNAseq.h5ad'
)
# spatial dataset which is analysed in the rest of the notebook
tdata = sc.read(
    f'{get_path("resources")}/slideseq.h5ad'
)

In [None]:
# Three-level classification per observation: everything with State 'normal'
# is 'normal'; the remaining observations are split by whether their region
# name contains 'Malignant-like'.
tdata.obs['regionState'] = np.where(
    tdata.obs['State'] == 'normal',
    'normal',
    np.where(
        tdata.obs['region'].str.contains('Malignant-like'),
        'AV: malignant-like',
        'AV: normal-like',
    ),
)
# copy of the cluster annotation with column names shortened by the helper
tdata.obsm['cluster_short'] = tdata.obsm['cluster'].rename(
    columns=helper.map_short,
)

In [None]:
# spatial data split by cluster (as the file name indicates)
pucks_by_cluster = sc.read(
    f'{get_path("resources")}/slideseq_by_cluster.h5ad'
)

In [None]:
# shortened cluster names, derived element-wise with the helper mapping
pucks_by_cluster.obs['cluster_short'] = (
    pucks_by_cluster.obs['cluster'].map(helper.map_short)
)

In [None]:
# spatial data split by compartment (as the file name indicates)
pucks_by_compartment = sc.read(
    f'{get_path("resources")}/slideseq_by_compartment.h5ad'
)

In [None]:
# regions ordered by increasing value in the 'premalignant' column of the
# State composition per region
region_malignancy_order = (
    tc.tl.get_compositions(tdata, 'State', 'region', reads=True)
    .sort_values('premalignant')
    .index
    .to_numpy()
)

In [None]:
# keep an unfiltered copy with a constant annotation; it is used to draw all
# beads as background in the scatter plots
full_tdata = tdata.copy()
full_tdata.obs['all_beads'] = 'all_beads'

# use only data from sufficiently covered beads (row sum of X of at least 100)
tdata = tdata[tdata.X.sum(axis=1)>=100].copy()

# restrict the split datasets to entries whose 'index' refers to a retained bead
pucks_by_cluster, pucks_by_compartment = (
    pucks[pucks.obs['index'].isin(tdata.obs.index)].copy()
    for pucks in (pucks_by_cluster, pucks_by_compartment)
)

In [None]:
def get_n_maximum_annotations(adata, obsm_key, max_annotation=1):
    """Keep only the largest annotation values per observation and renormalize.

    For every row of `adata.obsm[obsm_key]` all values smaller than the
    `max_annotation`-th largest value of that row are set to 0 and the row is
    divided by its remaining sum. Values tied with the threshold are kept, so
    more than `max_annotation` entries can survive in a row with ties. Rows
    which sum to 0 yield NaN, as the division is not guarded.

    Parameters
    ----------
    adata
        Object providing a mapping `.obsm`; `.obsm[obsm_key]` has to be a
        numeric `pandas.DataFrame`. It is not modified.
    obsm_key
        Key of the annotation table in `adata.obsm`.
    max_annotation
        Number of largest values to keep per row; has to be at least 1. Values
        larger than the number of columns keep all columns.

    Returns
    -------
    A new `pandas.DataFrame` with the index and columns of the input table.

    Raises
    ------
    ValueError
        If `max_annotation` is smaller than 1.
    """
    if max_annotation < 1:
        raise ValueError(f'`max_annotation` has to be at least 1, but got {max_annotation!r}!')

    annotation = adata.obsm[obsm_key]
    weights = annotation.to_numpy(copy=True)
    # in-place true division below is not possible for integer/bool data
    if not np.issubdtype(weights.dtype, np.floating):
        weights = weights.astype(float)

    # asking for more annotations than available keeps everything
    n_keep = min(max_annotation, weights.shape[1])
    if n_keep > 0:
        # after the partition, column -n_keep holds the n_keep-th largest value per row
        nth_largest = np.partition(weights, -n_keep, axis=-1)[:, -n_keep]
        weights[weights < nth_largest[:, None]] = 0
        weights /= weights.sum(axis=1)[:, None]

    return pd.DataFrame(weights, index=annotation.index, columns=annotation.columns)

In [None]:
# cluster annotation restricted to the 3 largest contributions per bead
tdata.obsm['cluster3'] = get_n_maximum_annotations(tdata, 'cluster', 3)
# merge the restricted cluster annotation to the 'labels' level, using the
# cluster-to-labels relation found in the reference
tc.utils.merge_annotation(
    tdata,
    'cluster3',
    {
        label: clusters.to_list()
        for label, clusters in reference.obs[['cluster','labels']].drop_duplicates().groupby('labels')['cluster']
    },
    result_key='labels3',
);

In [None]:
# prepare spatial sample split
tc.utils.split_spatial_samples(
    tdata,
    buffer_thickness=400,
    split_scheme=(2,2),
    sample_key='SampleID',
    result_key='SampleID_split',
    check_splits=False,
)

for group_key in ['region','State','regionState']:
    split_covered_group_key = f'split_covered_{group_key}'
    # temporary key identifying every combination of split sample and group
    tdata.obs['SampleID_split+group'] = tdata.obs['SampleID_split'].astype(str) + tdata.obs[group_key].astype(str)
    # remove all split+group with less than 100 observations
    lowly_covered = tdata.obs['SampleID_split+group'].value_counts()
    lowly_covered = lowly_covered.index[lowly_covered<100]
    tdata.obs[split_covered_group_key] = tdata.obs[group_key]
    tdata.obs.loc[tdata.obs['SampleID_split+group'].isin(lowly_covered),split_covered_group_key] = None
    tdata.obs[split_covered_group_key] = tdata.obs[split_covered_group_key].astype('category')

# the temporary key is not needed anymore
del tdata.obs['SampleID_split+group']

# Spatial overview

In [None]:
axs = state_plot_grid(padding=1.0,)
# background layer: all beads of every sample in light gray
full_tdatas = {
    sample: full_tdata[sample_obs.index]
    for sample, sample_obs in full_tdata.obs.groupby('SampleID')
}
fig = tc.pl.scatter(full_tdatas,'all_beads',colors={'all_beads':'#ddd'},joint=True,point_size=puck_point_size, ax=axs);
# foreground layer: region annotation of the covered beads which have no
# missing value in the 'Epithelial_programs' annotation
has_programs = ~tdata.obsm['Epithelial_programs'].isna().any(axis=1)
tdatas = {
    sample: tdata[sample_obs.index]
    for sample, sample_obs in tdata[has_programs].obs.groupby('SampleID')
}
fig = tc.pl.scatter(tdatas,'region',colors=region_colors,joint=True,point_size=puck_point_size, ax=axs);
fig.savefig(f'{figures_folder}/mouse_slideseq_scatter_region.pdf',bbox_inches='tight')

In [None]:
axs = state_plot_grid(padding=1.0,)
# background layer: all beads of every sample in light gray
full_tdatas = {
    sample: full_tdata[sample_obs.index]
    for sample, sample_obs in full_tdata.obs.groupby('SampleID')
}
fig = tc.pl.scatter(full_tdatas,'all_beads',colors={'all_beads':'#ddd'},joint=True,point_size=puck_point_size, ax=axs);
# foreground layer: 'labels' annotation of the covered beads
tdatas = {
    sample: tdata[sample_obs.index]
    for sample, sample_obs in tdata.obs.groupby('SampleID')
}
fig = tc.pl.scatter(tdatas,'labels',joint=True,point_size=puck_point_size, ax=axs, colors=labels_colors);
fig.savefig(f'{figures_folder}/mouse_slideseq_scatter_labels.pdf',bbox_inches='tight')

In [None]:
# one dataset per state for the spatial data ...
state_tdatas = {
    f'{state} (spatial)': tdata[state_obs.index].copy()
    for state, state_obs in tdata.obs.groupby('State')
    if len(state_obs) > 0
}
# ... and for the reference, restricted to the two states compared here
state_references = {
    f'{state} (reference)': reference[state_obs.index].copy()
    for state, state_obs in reference.obs.groupby('State')
    if len(state_obs) > 0 and state in ['normal','premalignant']
}
# reorder to have different data of the same state next to each other
state_all = {}
for state in ['normal','premalignant']:
    for name, source in ((f'{state} (reference)', state_references), (f'{state} (spatial)', state_tdatas)):
        state_all[name] = source[name]

fig = tc.pl.frequency_bar(
    state_all, 'cluster',
    colors=cluster_colors, horizontal=False, method_labels={'cluster':''}, axsize=(3,8), reads=True,
);

fig.savefig(f'{figures_folder}/mouse_slideseq_composition_cluster_vs_state.pdf',bbox_inches='tight')

In [None]:
# one dataset per state for the spatial data ...
state_tdatas = {
    f'{state} (spatial)': tdata[state_obs.index].copy()
    for state, state_obs in tdata.obs.groupby('State')
    if len(state_obs) > 0
}
# ... and for the reference, restricted to the two states compared here
state_references = {
    f'{state} (reference)': reference[state_obs.index].copy()
    for state, state_obs in reference.obs.groupby('State')
    if len(state_obs) > 0 and state in ['normal','premalignant']
}
# reorder to have different data of the same state next to each other
state_all = {}
for state in ['normal','premalignant']:
    for name, source in ((f'{state} (reference)', state_references), (f'{state} (spatial)', state_tdatas)):
        state_all[name] = source[name]
fig = tc.pl.frequency_bar(
    state_all, 'Epithelial_programs',
    colors=program_colors, horizontal=False, method_labels={'Epithelial_programs':''}, axsize=(3,5), reads=True,
);

fig.savefig(f'{figures_folder}/mouse_slideseq_composition_program_vs_state.pdf',bbox_inches='tight')

In [None]:
# Enrichment of the 'cluster3' annotation per region in the spatial data, shown
# next to the enrichment of the cluster annotation per state in the reference.
state_reference = reference[reference.obs['State'].isin(['normal','premalignant'])]

tdata_enrichments = tc.tl.enrichments(tdata,'cluster3','split_covered_region',sample_key='SampleID_split',**enrichment_method,reads=True,)
tdata_enrichments.rename(columns={'split_covered_region':'region','cluster3':'cluster'}, inplace=True)
reference_enrichments = tc.tl.enrichments(state_reference,'cluster','State',sample_key='SampleID',**enrichment_method,reads=True,)
# both tables get a common 'State' column which is used as grouping in the plot
tdata_enrichments['State'] = tdata_enrichments['region']
reference_enrichments['State'] = reference_enrichments['State'].map(lambda x: f'{x} (reference)')
enrichments = pd.concat([reference_enrichments,tdata_enrichments])
# A single cast to the final ordered categories is sufficient: reference states
# frame the regions, which are ordered by `region_malignancy_order`. Values
# which are not among the categories become NaN.
enrichments['State'] = enrichments['State'].astype(pd.CategoricalDtype(categories=['normal (reference)',*region_malignancy_order,'premalignant (reference)'], ordered=True))

for annotate_pvalues in [False]:
    fig = tc.pl.significances(enrichments, p_key, 'cluster', 'State', annotate_pvalues=annotate_pvalues);

    fig.savefig(f'{figures_folder}/mouse_slideseq_enrichment_cluster3_vs_region{"" if annotate_pvalues else "_plain"}.pdf',bbox_inches='tight')

In [None]:
# Enrichment of the 'Epithelial_programs' annotation per region in the spatial
# data, shown next to its enrichment per state in the reference.
state_reference = reference[reference.obs['State'].isin(['normal','premalignant'])]

tdata_enrichments = tc.tl.enrichments(tdata,'Epithelial_programs','split_covered_region',sample_key='SampleID_split',**enrichment_method,reads=True,)
tdata_enrichments.rename(columns={'split_covered_region':'region'}, inplace=True)
reference_enrichments = tc.tl.enrichments(state_reference,'Epithelial_programs','State',sample_key='SampleID',**enrichment_method,reads=True,)
# both tables get a common 'State' column which is used as grouping in the plot
tdata_enrichments['State'] = tdata_enrichments['region']
reference_enrichments['State'] = reference_enrichments['State'].map(lambda x: f'{x} (reference)')
enrichments = pd.concat([reference_enrichments,tdata_enrichments])
# A single cast to the final ordered categories is sufficient: reference states
# frame the regions, which are ordered by `region_malignancy_order`. Values
# which are not among the categories become NaN.
enrichments['State'] = enrichments['State'].astype(pd.CategoricalDtype(categories=['normal (reference)',*region_malignancy_order,'premalignant (reference)'], ordered=True))

# order the programs by their significance in the normal reference:
# ascending p-value of enrichment first, descending p-value of the other rows second
sorting_enr = enrichments.query(f'State=="normal (reference)" & enrichment=="enriched"').set_index('Epithelial_programs')[p_key]
sorting_dep = enrichments.query(f'State=="normal (reference)" & enrichment!="enriched"').set_index('Epithelial_programs')[p_key]
program_order = pd.DataFrame({'enr':sorting_enr,'dep':sorting_dep}).sort_values(['enr','dep'],ascending=[True,False]).index.to_numpy()
enrichments['Epithelial_programs'] = enrichments['Epithelial_programs'].astype(pd.CategoricalDtype(categories=program_order, ordered=True))

for annotate_pvalues in [False]:
    fig = tc.pl.significances(enrichments, p_key, 'Epithelial_programs', 'State', annotate_pvalues=annotate_pvalues);

    fig.savefig(f'{figures_folder}/mouse_slideseq_enrichment_program_vs_region{"" if annotate_pvalues else "_plain"}.pdf',bbox_inches='tight')

In [None]:
# enrichment of regions per state, relative to the 'normal' reference group
enrichments = tc.tl.enrichments(
    tdata, 'region', 'split_covered_State',
    sample_key='SampleID_split',
    **enrichment_method,
    reference_group='normal',
    reads=True,
)
enrichments.rename(columns={'split_covered_State':'State'}, inplace=True)
# show the regions in the order given by `region_malignancy_order`
region_dtype = pd.CategoricalDtype(categories=region_malignancy_order, ordered=True)
enrichments['region'] = enrichments['region'].astype(region_dtype)

for annotate_pvalues in [False]:
    suffix = "" if annotate_pvalues else "_plain"
    fig = tc.pl.significances(enrichments, p_key, 'region', 'State', annotate_pvalues=annotate_pvalues);

    fig.savefig(f'{figures_folder}/mouse_slideseq_enrichment_region_vs_state{suffix}.pdf',bbox_inches='tight')

In [None]:
# decadic logarithm of the row sums of X, per bead
umi_key = '$log_{10}(UMI)$'
tdata.obs[umi_key] = np.log10(tc.sum(tdata.X,axis=1))
tdatas = {
    sample: tdata[sample_obs.index]
    for sample, sample_obs in tdata.obs.groupby('SampleID')
}
axs = state_plot_grid(padding=1.5)
# common color scale across all samples
umi_range = [tdata.obs[umi_key].min(),tdata.obs[umi_key].max()]
fig = tc.pl.scatter(tdatas,umi_key,cmap='viridis',joint=True,point_size=puck_point_size, ax=axs, cmap_vmin_vmax=umi_range);
fig.savefig(f'{figures_folder}/mouse_slideseq_scatter_umis.pdf', bbox_inches='tight')

In [None]:
# decadic logarithm of the number of non-zero entries of X, per bead
ngenes_key = '$log_{10}(nGenes)$'
tdata.obs[ngenes_key] = np.log10(tc.sum(tdata.X!=0,axis=1))
tdatas = {
    sample: tdata[sample_obs.index]
    for sample, sample_obs in tdata.obs.groupby('SampleID')
}
axs = state_plot_grid(padding=1.5)
# common color scale across all samples
ngenes_range = [tdata.obs[ngenes_key].min(),tdata.obs[ngenes_key].max()]
fig = tc.pl.scatter(tdatas,ngenes_key,cmap='viridis',joint=True,point_size=puck_point_size, ax=axs, cmap_vmin_vmax=ngenes_range);
fig.savefig(f'{figures_folder}/mouse_slideseq_scatter_ngenes.pdf', bbox_inches='tight')

In [None]:
marker_genes = ['Epcam','Reg4','Ptprc','Cd14','Cd3e','Pecam1','Thy1','Grem1']
# densified expression of the marker genes as an annotation table per bead
tdata.obsm['marker genes'] = pd.DataFrame(
    tdata[:,marker_genes].X.toarray(),
    columns=marker_genes,
    index=tdata.obs.index,
)
# only two of the samples are shown
tdatas = {
    sample: tdata[sample_obs.index]
    for sample, sample_obs in tdata.obs.groupby('SampleID')
    if sample in ['cont8_P','AV10a_P']
}
fig = tc.pl.scatter(tdatas,'marker genes',joint=None,point_size=puck_point_size,padding=1.0);
fig.savefig(f'{figures_folder}/mouse_slideseq_scatter_marker.pdf', bbox_inches='tight')

In [None]:
# restrict both datasets to the two states which are compared
compared_states = ['normal','premalignant']
state_tdata = tdata[tdata.obs['State'].isin(compared_states)]
state_reference = reference[reference.obs['State'].isin(compared_states)]

# enrichment relative to the 'normal' group, separately for the spatial data ...
tdata_enrichments = tc.tl.enrichments(state_tdata,'cluster3','split_covered_State',sample_key='SampleID_split',**enrichment_method,reference_group='normal',reads=True,)
tdata_enrichments.rename(columns={'split_covered_State':'State','cluster3':'cluster'}, inplace=True)
tdata_enrichments['State'] = tdata_enrichments['State'].map(lambda x: f'{x} (spatial)')
# ... and for the reference
reference_enrichments = tc.tl.enrichments(state_reference,'cluster','State',sample_key='SampleID',**enrichment_method,reference_group='normal',reads=True)
reference_enrichments['State'] = reference_enrichments['State'].map(lambda x: f'{x} (reference)')

enrichments = pd.concat([reference_enrichments,tdata_enrichments])
# reference and spatial results of the same comparison next to each other
state_order = [
    f'{state} ({source})'
    for state in ['normal VS rest','premalignant VS normal']
    for source in ['reference','spatial']
]
enrichments['State'] = enrichments['State'].astype(pd.CategoricalDtype(categories=state_order, ordered=True))
# clusters in the order of the color mapping
enrichments['cluster'] = enrichments['cluster'].astype(pd.CategoricalDtype(categories=cluster_colors.keys(), ordered=True))

fig = tc.pl.significances(enrichments, p_key, 'cluster', 'State');

fig.savefig(f'{figures_folder}/mouse_slideseq_enrichment_cluster3_vs_state.pdf',bbox_inches='tight')

In [None]:
# restrict both datasets to the two states which are compared
compared_states = ['normal','premalignant']
state_tdata = tdata[tdata.obs['State'].isin(compared_states)]
state_reference = reference[reference.obs['State'].isin(compared_states)]

# enrichment relative to the 'normal' group, separately for the spatial data ...
tdata_enrichments = tc.tl.enrichments(state_tdata,'Epithelial_programs','split_covered_State',sample_key='SampleID_split',**enrichment_method,reference_group='normal',reads=True,)
tdata_enrichments.rename(columns={'split_covered_State':'State'}, inplace=True)
tdata_enrichments['State'] = tdata_enrichments['State'].map(lambda x: f'{x} (spatial)')
# ... and for the reference
reference_enrichments = tc.tl.enrichments(state_reference,'Epithelial_programs','State',sample_key='SampleID',**enrichment_method,reference_group='normal',reads=True,)
reference_enrichments['State'] = reference_enrichments['State'].map(lambda x: f'{x} (reference)')

enrichments = pd.concat([reference_enrichments,tdata_enrichments])
# reference and spatial results of the same comparison next to each other
state_order = [
    f'{state} ({source})'
    for state in ['normal VS rest','premalignant VS normal']
    for source in ['reference','spatial']
]
enrichments['State'] = enrichments['State'].astype(pd.CategoricalDtype(categories=state_order, ordered=True))

# order the programs by their significance in the 'normal VS rest' comparison of the reference:
# ascending p-value of enrichment first, descending p-value of the other rows second
sorting_enr = enrichments.query(f'State=="normal VS rest (reference)" & enrichment=="enriched"').set_index('Epithelial_programs')[p_key]
sorting_dep = enrichments.query(f'State=="normal VS rest (reference)" & enrichment!="enriched"').set_index('Epithelial_programs')[p_key]
program_order = (
    pd.DataFrame({'enr':sorting_enr,'dep':sorting_dep})
    .sort_values(['enr','dep'],ascending=[True,False])
    .index
    .to_numpy()
)
enrichments['Epithelial_programs'] = enrichments['Epithelial_programs'].astype(pd.CategoricalDtype(categories=program_order, ordered=True))

fig = tc.pl.significances(enrichments, p_key, 'Epithelial_programs', 'State');

fig.savefig(f'{figures_folder}/mouse_slideseq_enrichment_program_vs_state.pdf',bbox_inches='tight')

In [None]:
# region composition per sample
tdatas = {
    sample: tdata[sample_obs.index]
    for sample, sample_obs in tdata.obs.groupby('SampleID')
}

fig = tc.pl.frequency_bar(
    tdatas, 'region',
    colors=region_colors, horizontal=False, method_labels={'region':''}, reads=True,
);

fig.savefig(f'{figures_folder}/mouse_slideseq_composition_region_vs_sample.pdf',bbox_inches='tight')

In [None]:
# per-sample coordinates relative to the regions, stored under 'region_distance'
tc.tl.annotation_coordinate(
    tdata,
    annotation_key='region',
    result_key='region_distance',
    sample_key='SampleID',
    max_distance=1000,
    delta_distance=10,
    sparse=False,
);

In [None]:
# region annotation along the coordinate relative to 'Region 2 (Muscularis)', grouped by state
fig = tc.pl.annotation_coordinate(
    tdata,
    annotation_key='region',
    coordinate_key=('region_distance','Region 2 (Muscularis)'),
    group_key='State',
    colors=region_colors,
);
fig.savefig(f'{figures_folder}/mouse_slideseq_coord_region_vs_state.pdf',bbox_inches='tight')

In [None]:
# One coordinate plot per malignant-like region, using only the premalignant beads.
malignant_regions = [r for r in tdata.obs['region'].cat.categories if 'Malignant-like' in r]
# Subset with a boolean mask on .obs: AnnData objects are subset by indexing
# and do not provide a DataFrame-like `.query` method. The subset does not
# depend on the region, so it is created once.
premalignant_tdata = tdata[tdata.obs['State'] == 'premalignant']
for r in malignant_regions:
    fig = tc.pl.annotation_coordinate(premalignant_tdata,annotation_key='region',coordinate_key=('region_distance',r),group_key='State',colors=region_colors);
    fig.savefig(f'{figures_folder}/mouse_slideseq_coord_region_vs_{helper.map_short(r,1,join="")}.pdf',bbox_inches='tight')

In [None]:
# only sample 'cont8_P', restricted to beads without missing values in 'Epithelial_programs'
has_programs = ~tdata.obsm['Epithelial_programs'].isna().any(axis=1)
tdatas = {
    sample: tdata[sample_obs.index].copy()
    for sample, sample_obs in tdata[has_programs].obs.groupby('SampleID')
    if sample == 'cont8_P'
}
# negated y coordinate, used as first plotting coordinate below
tdatas['cont8_P'].obs['-y'] = -tdatas['cont8_P'].obs['y']
fig = tc.pl.scatter(
    tdatas, 'region',
    position_key=['-y','x'],
    colors=region_colors,
    joint=True,
    point_size=puck_point_size,
    noticks=True,
    show_only=['Region 3 (Normal)','Region 10 (Normal)','Region 12 (Normal)','Region 5 (Stem cell niche - normal)','Region 2 (Muscularis)'],
);
fig.savefig(f'{figures_folder}/mouse_slideseq_normal_tissue_regions.pdf',bbox_inches='tight')

In [None]:
# sort beads by number of reads
reads_per_bead = pd.Series(tc.sum(tdata.X,axis=1),index=tdata.obs.index)
tdata_sorted = tdata[reads_per_bead.sort_values(ascending=False).index].copy()
# plot heatmap for genes split as normal region DEGs with vbeads labeled according to region
# (the substring 'ormal' matches both 'Normal' and 'normal')
in_normal_region = tdata_sorted.obs['region'].str.contains('ormal')
fig = tc.pl.annotated_heatmap(
    adata=tdata_sorted[in_normal_region],
    obs_key='region', var_key='region',
    obs_colors=region_colors, var_colors=region_colors,
    n_genes=20,
    axsize=(4,4),
)
fig.savefig(f'{figures_folder}/mouse_slideseq_normal_tissue_regions_marker.pdf',bbox_inches='tight')

In [None]:
# region annotation of two samples, using the mostly gray color mapping
tdatas = {
    sample: tdata[sample_obs.index]
    for sample, sample_obs in tdata.obs.groupby('SampleID')
    if sample in ['AV10a_P','AV11a_P']
}
fig = tc.pl.scatter(
    tdatas, 'region',
    joint=True, point_size=puck_point_size, colors=mostly_gray_region_colors,
);
fig.savefig(f'{figures_folder}/mouse_slideseq_malignant_tissue_regions.pdf', bbox_inches='tight')

In [None]:
# sort beads by number of reads
reads_per_bead = pd.Series(tc.sum(tdata.X,axis=1),index=tdata.obs.index)
tdata_sorted = tdata[reads_per_bead.sort_values(ascending=False).index].copy()
# plot heatmap for genes split as malignant region DEGs with vbeads labeled according to region
in_malignant_region = tdata_sorted.obs['region'].str.contains('Malignant-like')
fig = tc.pl.annotated_heatmap(
    adata=tdata_sorted[in_malignant_region],
    obs_key='region', var_key='region',
    obs_colors=region_colors, var_colors=region_colors,
    n_genes=20,
    axsize=(4,4),
)
fig.savefig(f'{figures_folder}/mouse_slideseq_malignant_tissue_regions_marker_all.pdf',bbox_inches='tight')

In [None]:
# sort vbeads by number of reads
reads_per_vbead = pd.Series(tc.sum(pucks_by_compartment.X,axis=1),index=pucks_by_compartment.obs.index)
pucks_by_compartment_sorted = pucks_by_compartment[reads_per_vbead.sort_values(ascending=False).index].copy()
# the region selection is identical for all compartments
in_malignant_region = pucks_by_compartment_sorted.obs['region'].str.contains('Malignant-like')
# one figure per compartment
for compartment in ['epithelial','immune','stromal']:
    in_compartment = pucks_by_compartment_sorted.obs['compartment'].isin([compartment])
    # plot heatmap for genes split as malignant region DEGs with vbeads labeled according to region
    fig = tc.pl.annotated_heatmap(
        adata=pucks_by_compartment_sorted[in_compartment & in_malignant_region],
        obs_key='region', var_key='region',
        obs_colors=region_colors, var_colors=region_colors,
        n_genes=50,
        axsize=(4,4),
    )
    fig.savefig(f'{figures_folder}/mouse_slideseq_malignant_tissue_regions_marker_{compartment}.pdf',bbox_inches='tight')

In [None]:
# Enrichment of the 'Epithelial_programs' annotation per regionState in the
# spatial data, shown next to its enrichment per state in the reference.
state_reference = reference[reference.obs['State'].isin(['normal','premalignant'])]

tdata_enrichments = tc.tl.enrichments(tdata,'Epithelial_programs','split_covered_regionState',sample_key='SampleID_split',**enrichment_method,reads=True,)
tdata_enrichments.rename(columns={'split_covered_regionState':'regionState'}, inplace=True)
tdata_enrichments['State'] = tdata_enrichments['regionState']

reference_enrichments = tc.tl.enrichments(state_reference,'Epithelial_programs','State',sample_key='SampleID',**enrichment_method,reads=True,)
reference_enrichments['State'] = reference_enrichments['State'].map(lambda x: f'{x} (reference)')

enrichments = pd.concat([reference_enrichments,tdata_enrichments])
# fixed plotting order of the groups
state_dtype = pd.CategoricalDtype(
    categories=['normal (reference)', 'normal', 'AV: normal-like', 'premalignant (reference)', 'AV: malignant-like'],
    ordered=True,
)
enrichments['State'] = enrichments['State'].astype(state_dtype)

# order the programs by their significance in the normal reference:
# ascending p-value of enrichment first, descending p-value of the other rows second
sorting_enr = enrichments.query(f'State=="normal (reference)" & enrichment=="enriched"').set_index('Epithelial_programs')[p_key]
sorting_dep = enrichments.query(f'State=="normal (reference)" & enrichment!="enriched"').set_index('Epithelial_programs')[p_key]
program_order = (
    pd.DataFrame({'enr':sorting_enr,'dep':sorting_dep})
    .sort_values(['enr','dep'],ascending=[True,False])
    .index
    .to_numpy()
)
enrichments['Epithelial_programs'] = enrichments['Epithelial_programs'].astype(pd.CategoricalDtype(categories=program_order, ordered=True))

for annotate_pvalues in [False]:
    fig = tc.pl.significances(enrichments, p_key, 'Epithelial_programs', 'State', annotate_pvalues=annotate_pvalues);

    fig.savefig(f'{figures_folder}/mouse_slideseq_enrichment_program_vs_regionState{"" if annotate_pvalues else "_plain"}.pdf',bbox_inches='tight')

In [None]:
# Enrichment of the 'cluster3' annotation per regionState in the spatial data,
# shown next to the enrichment of the cluster annotation per state in the reference.
state_reference = reference[reference.obs['State'].isin(['normal','premalignant'])]

tdata_enrichments = tc.tl.enrichments(tdata,'cluster3','split_covered_regionState',sample_key='SampleID_split',**enrichment_method,reads=True,)
tdata_enrichments.rename(columns={'split_covered_regionState':'regionState','cluster3':'cluster'}, inplace=True)
tdata_enrichments['State'] = tdata_enrichments['regionState']

reference_enrichments = tc.tl.enrichments(state_reference,'cluster','State',sample_key='SampleID',**enrichment_method,reads=True,)
reference_enrichments['State'] = reference_enrichments['State'].map(lambda x: f'{x} (reference)')

enrichments = pd.concat([reference_enrichments,tdata_enrichments])
# fixed plotting order of the groups
state_dtype = pd.CategoricalDtype(
    categories=['AV: malignant-like', 'AV: normal-like', 'normal', 'premalignant (reference)', 'normal (reference)'],
    ordered=True,
)
enrichments['State'] = enrichments['State'].astype(state_dtype)
# clusters in the order of the color mapping
enrichments['cluster'] = enrichments['cluster'].astype(pd.CategoricalDtype(categories=cluster_colors.keys(), ordered=True))

for annotate_pvalues in [False]:
    fig = tc.pl.significances(enrichments, p_key, 'cluster', 'State', annotate_pvalues=annotate_pvalues);

    fig.savefig(f'{figures_folder}/mouse_slideseq_enrichment_cluster3_vs_regionState{"" if annotate_pvalues else "_plain"}.pdf',bbox_inches='tight')

In [None]:
# merge the per-cluster data to the 'labels' level
lfdata = tc.tl.merge_observations(pucks_by_cluster, obs_index_key='index', annotation_key='labels', min_counts=10)
# sort vbeads by number of reads
reads_per_vbead = pd.Series(tc.sum(lfdata.X,axis=1),index=lfdata.obs.index)
lfdata_sorted = lfdata[reads_per_vbead.sort_values(ascending=False).index].copy()
# make and export a dotplot of marker genes for a single region
# NOTE(review): the substring '6' would also match region names like '16' - confirm that only one region is selected
markers = ['Sparc','Ctss','Mmp12','Cxcl9','Chd5','Mcam','Ctsd']
in_region6 = lfdata_sorted.obs['region'].str.contains('6')
dotplot = sc.pl.dotplot(
    lfdata_sorted[in_region6], markers,
    groupby='labels', dendrogram=False, log=True, return_fig=True,
)
dotplot.savefig(f'{figures_folder}/mouse_slideseq_region6_labels_marker_dot.pdf',bbox_inches='tight')

In [None]:
# sort vbeads by number of reads
reads_per_vbead = pd.Series(tc.sum(pucks_by_compartment.X,axis=1),index=pucks_by_compartment.obs.index)
pucks_by_compartment_sorted = pucks_by_compartment[reads_per_vbead.sort_values(ascending=False).index].copy()
# make and export a dotplot of marker genes for a compartment:
# epithelial vbeads in malignant-like regions, grouped by region
markers = ['Vim','Prox1','Sox11']
is_epithelial = pucks_by_compartment_sorted.obs['compartment'].isin(['epithelial'])
in_malignant_region = pucks_by_compartment_sorted.obs['region'].str.contains('Malignant-like')
dotplot = sc.pl.dotplot(
    pucks_by_compartment_sorted[is_epithelial & in_malignant_region], markers,
    groupby='region', dendrogram=False, log=True, return_fig=True,
)
dotplot.savefig(f'{figures_folder}/mouse_slideseq_malignantregions_marker_dot.pdf',bbox_inches='tight')

In [None]:
# Co-occurrence of clusters within `max_distance`, separately per state, with
# z-scores from `n_permutation` permutations.
max_distance = 20
n_permutation = 10

what = 'cluster'
center = 'cluster'

state_tdatas = { state: tdata[df.index].copy() for state, df in tdata.obs.groupby('State') if len(df) > 0 }

# Fraction of every cluster within its family per state; the family is the
# leading run of letters of the cluster name.
comps = tc.tl.get_compositions(tdata,what,'State',reads=True)
# group_keys=False keeps the plain cluster names as index of the result. The
# default of pandas>=2.0 prepends the group key as an extra index level, and
# then the names in `big_clusters` cannot be used to select obsm columns.
comps = comps.T.groupby(lambda x: re.match('^[A-Za-z]+',x).group(0), group_keys=False).apply(lambda x: x/x.sum(axis=0).to_numpy())
# per state: clusters contributing more than 1% to their family
big_clusters = { c: comps[c][comps[c]>0.01].index.to_numpy() for c in comps.columns }

analysis_key=f'{what}-{center}'
for state, adata in state_tdatas.items():
    # restrict the annotation of the per-state copy to the big clusters
    adata.obsm[what] = adata.obsm[what][big_clusters[state]]
    tc.tl.co_occurrence_matrix(adata=adata, annotation_key=what, center_key=center, sample_key='SampleID',
        distance_key=None, position_key=('x','y'), max_distance=max_distance,numba_blocksize=max_distance,
        result_key=analysis_key, verbose=0,
        n_permutation=n_permutation,
    )
    fig = tc.pl.co_occurrence_matrix({state:adata}, score_key='z', cmap_vmin_vmax=(-5,5), cmap='bwr', analysis_key=analysis_key, y_padding=3.5);

    fig.savefig(f'{figures_folder}/mouse_slideseq_cooc_{state}_{max_distance}umP{n_permutation}.pdf',bbox_inches='tight')

In [None]:
# Histogram of the co-occurrence z-scores in the first distance interval for
# the normal and the premalignant state, followed by a one-sided test.
fig,axs=tc.pl.subplots(axsize=(3,3))
ax = axs[0,0]

z_by_state = { state: adata.uns['cluster-cluster']['z'] for state, adata in state_tdatas.items() }
# unit-width bins covering the z-scores of all states
bins = np.arange(
    np.floor(min(z.min() for z in z_by_state.values())),
    np.ceil(max(z.max() for z in z_by_state.values()))+1,
    1)

# use the upper triangle (including the diagonal) of the first interval only
normal_z0 = z_by_state['normal'][:,:,0]
normal_zs = normal_z0[np.triu_indices_from(normal_z0)]
premalignant_z0 = z_by_state['premalignant'][:,:,0]
premalignant_zs = premalignant_z0[np.triu_indices_from(premalignant_z0)]

for label, zs in (('normal', normal_zs), ('premalignant', premalignant_zs)):
    ax.hist(zs,bins=bins,alpha=0.5,label=label);
ax.legend()
ax.set_xlabel('$\\frac{log(N(annotation,center))-random expectation}{standard deviation}$')
ax.set_title(f'interval (0,{max_distance}): cluster-cluster')
fig.savefig(f'{figures_folder}/mouse_slideseq_cooc_{max_distance}umP{n_permutation}_hist.pdf',bbox_inches='tight')
# displayed as cell output
scipy.stats.mannwhitneyu(normal_zs, premalignant_zs, alternative='greater')

In [None]:
# Co-occurrence of clusters within `max_distance`, separately per state, with
# z-scores from `n_permutation` permutations.
max_distance = 40
n_permutation = 10

what = 'cluster'
center = 'cluster'

state_tdatas = { state: tdata[df.index].copy() for state, df in tdata.obs.groupby('State') if len(df) > 0 }

# Fraction of every cluster within its family per state; the family is the
# leading run of letters of the cluster name.
comps = tc.tl.get_compositions(tdata,what,'State',reads=True)
# group_keys=False keeps the plain cluster names as index of the result. The
# default of pandas>=2.0 prepends the group key as an extra index level, and
# then the names in `big_clusters` cannot be used to select obsm columns.
comps = comps.T.groupby(lambda x: re.match('^[A-Za-z]+',x).group(0), group_keys=False).apply(lambda x: x/x.sum(axis=0).to_numpy())
# per state: clusters contributing more than 1% to their family
big_clusters = { c: comps[c][comps[c]>0.01].index.to_numpy() for c in comps.columns }

analysis_key=f'{what}-{center}'
for state, adata in state_tdatas.items():
    # restrict the annotation of the per-state copy to the big clusters
    adata.obsm[what] = adata.obsm[what][big_clusters[state]]
    tc.tl.co_occurrence_matrix(adata=adata, annotation_key=what, center_key=center, sample_key='SampleID',
        distance_key=None, position_key=('x','y'), max_distance=max_distance,numba_blocksize=max_distance,
        result_key=analysis_key, verbose=0,
        n_permutation=n_permutation,
    )
    fig = tc.pl.co_occurrence_matrix({state:adata}, score_key='z', cmap_vmin_vmax=(-5,5), cmap='bwr', analysis_key=analysis_key, y_padding=3.5);

    fig.savefig(f'{figures_folder}/mouse_slideseq_cooc_{state}_{max_distance}umP{n_permutation}.pdf',bbox_inches='tight')

In [None]:
# Co-occurrence of clusters within `max_distance`, separately per state, with
# z-scores from `n_permutation` permutations.
max_distance = 60
n_permutation = 10

what = 'cluster'
center = 'cluster'

state_tdatas = { state: tdata[df.index].copy() for state, df in tdata.obs.groupby('State') if len(df) > 0 }

# Fraction of every cluster within its family per state; the family is the
# leading run of letters of the cluster name.
comps = tc.tl.get_compositions(tdata,what,'State',reads=True)
# group_keys=False keeps the plain cluster names as index of the result. The
# default of pandas>=2.0 prepends the group key as an extra index level, and
# then the names in `big_clusters` cannot be used to select obsm columns.
comps = comps.T.groupby(lambda x: re.match('^[A-Za-z]+',x).group(0), group_keys=False).apply(lambda x: x/x.sum(axis=0).to_numpy())
# per state: clusters contributing more than 1% to their family
big_clusters = { c: comps[c][comps[c]>0.01].index.to_numpy() for c in comps.columns }

analysis_key=f'{what}-{center}'
for state, adata in state_tdatas.items():
    # restrict the annotation of the per-state copy to the big clusters
    adata.obsm[what] = adata.obsm[what][big_clusters[state]]
    tc.tl.co_occurrence_matrix(adata=adata, annotation_key=what, center_key=center, sample_key='SampleID',
        distance_key=None, position_key=('x','y'), max_distance=max_distance,numba_blocksize=max_distance,
        result_key=analysis_key, verbose=0,
        n_permutation=n_permutation,
    )
    fig = tc.pl.co_occurrence_matrix({state:adata}, score_key='z', cmap_vmin_vmax=(-5,5), cmap='bwr', analysis_key=analysis_key, y_padding=3.5);

    fig.savefig(f'{figures_folder}/mouse_slideseq_cooc_{state}_{max_distance}umP{n_permutation}.pdf',bbox_inches='tight')

In [None]:
# Co-occurrence of clusters within `max_distance`, separately per state, with
# z-scores from `n_permutation` permutations.
max_distance = 20
n_permutation = 5

what = 'cluster'
center = 'cluster'

state_tdatas = { state: tdata[df.index].copy() for state, df in tdata.obs.groupby('State') if len(df) > 0 }

# Fraction of every cluster within its family per state; the family is the
# leading run of letters of the cluster name.
comps = tc.tl.get_compositions(tdata,what,'State',reads=True)
# group_keys=False keeps the plain cluster names as index of the result. The
# default of pandas>=2.0 prepends the group key as an extra index level, and
# then the names in `big_clusters` cannot be used to select obsm columns.
comps = comps.T.groupby(lambda x: re.match('^[A-Za-z]+',x).group(0), group_keys=False).apply(lambda x: x/x.sum(axis=0).to_numpy())
# per state: clusters contributing more than 1% to their family
big_clusters = { c: comps[c][comps[c]>0.01].index.to_numpy() for c in comps.columns }

analysis_key=f'{what}-{center}'
for state, adata in state_tdatas.items():
    # restrict the annotation of the per-state copy to the big clusters
    adata.obsm[what] = adata.obsm[what][big_clusters[state]]
    tc.tl.co_occurrence_matrix(adata=adata, annotation_key=what, center_key=center, sample_key='SampleID',
        distance_key=None, position_key=('x','y'), max_distance=max_distance,numba_blocksize=max_distance,
        result_key=analysis_key, verbose=0,
        n_permutation=n_permutation,
    )
    fig = tc.pl.co_occurrence_matrix({state:adata}, score_key='z', cmap_vmin_vmax=(-5,5), cmap='bwr', analysis_key=analysis_key, y_padding=3.5);

    fig.savefig(f'{figures_folder}/mouse_slideseq_cooc_{state}_{max_distance}umP{n_permutation}.pdf',bbox_inches='tight')

In [None]:
# Co-occurrence of clusters within `max_distance`, separately per state, with
# z-scores from `n_permutation` permutations.
max_distance = 20
n_permutation = 50

what = 'cluster'
center = 'cluster'

state_tdatas = { state: tdata[df.index].copy() for state, df in tdata.obs.groupby('State') if len(df) > 0 }

# Fraction of every cluster within its family per state; the family is the
# leading run of letters of the cluster name.
comps = tc.tl.get_compositions(tdata,what,'State',reads=True)
# group_keys=False keeps the plain cluster names as index of the result. The
# default of pandas>=2.0 prepends the group key as an extra index level, and
# then the names in `big_clusters` cannot be used to select obsm columns.
comps = comps.T.groupby(lambda x: re.match('^[A-Za-z]+',x).group(0), group_keys=False).apply(lambda x: x/x.sum(axis=0).to_numpy())
# per state: clusters contributing more than 1% to their family
big_clusters = { c: comps[c][comps[c]>0.01].index.to_numpy() for c in comps.columns }

analysis_key=f'{what}-{center}'
for state, adata in state_tdatas.items():
    # restrict the annotation of the per-state copy to the big clusters
    adata.obsm[what] = adata.obsm[what][big_clusters[state]]
    tc.tl.co_occurrence_matrix(adata=adata, annotation_key=what, center_key=center, sample_key='SampleID',
        distance_key=None, position_key=('x','y'), max_distance=max_distance,numba_blocksize=max_distance,
        result_key=analysis_key, verbose=0,
        n_permutation=n_permutation,
    )
    fig = tc.pl.co_occurrence_matrix({state:adata}, score_key='z', cmap_vmin_vmax=(-5,5), cmap='bwr', analysis_key=analysis_key, y_padding=3.5);

    fig.savefig(f'{figures_folder}/mouse_slideseq_cooc_{state}_{max_distance}umP{n_permutation}.pdf',bbox_inches='tight')

In [None]:
# Restrict to observations flagged as 'epi_domain' whose tc.sum over X along axis 1 is at
# least 50, then keep an independent copy of the (non-empty) 'normal' state only.
tdatas = {
    state_name: tdata[state_obs.index].copy()
    for state_name, state_obs in tdata[
        tdata.obs['epi_domain'] & (tc.sum(tdata.X, axis=1) >= 50)
    ].obs.groupby('State')
    if state_name == 'normal' and len(state_obs) > 0
}

# Key under which the co-occurrence result is stored and later looked up for plotting.
analysis_key = 'cluster-cluster'

for s, _tdata in tdatas.items():
    tc.tl.co_occurrence(
        _tdata,
        'cluster',
        sample_key='SampleID',
        delta_distance=20,
        max_distance=1000,
        sparse=False,
        result_key=analysis_key,
        verbose=0,
    )

In [None]:
# Compositions of 'cluster' per 'State' (as returned by tc.tl.get_compositions), transposed so
# that cluster names are on the index, then renormalized within every family of clusters
# sharing the same leading run of letters (e.g. the 'Epi' of 'Epi01 (...)').
comps = tc.tl.get_compositions(tdata,'cluster','State',reads=True)
# group_keys=False keeps the flat cluster-name index on the result. Since pandas 2.0 the default
# (group_keys=True) prepends the group key even for like-indexed results of apply; big_clusters
# would then contain (prefix, name) tuples and the membership test below would silently never match.
comps = comps.T.groupby(lambda x: re.match('^[A-Za-z]+',x).group(0), group_keys=False).apply(lambda x: x/x.sum(axis=0).to_numpy())
# Per state: clusters whose renormalized fraction exceeds 1%.
big_clusters = { c: comps[c][comps[c]>0.01].index.to_numpy() for c in comps.columns }

for state, adata in tdatas.items():
    # Epithelial clusters of interest, reduced to those abundant enough in this state.
    main_Epi = ['Epi01 (Dysplastic Stem Like)', 'Epi02 (Enterocytes)', 'Epi03 (Stem/Progenitors)', 'Epi04 (Secretory)', 'Epi05 (Dysplastic Secretory Like)',]
    main_Epi = [ c for c in main_Epi if c in big_clusters[state] ]
    # Uses `analysis_key` as set by the cell which computed the co-occurrence for `tdatas`.
    fig = tc.pl.co_occurrence(adata, analysis_key, score_key='log_occ', colors=cluster_colors, wspace=0.3, log_base=2, show_only=main_Epi, show_only_center=main_Epi, merged=False);
    fig.savefig(f'{figures_folder}/mouse_slideseq_cooc_{state}_epi_cluster_from_epi_cluster.pdf',bbox_inches='tight')

In [None]:
max_distance = 20
n_permutation = 10

what,center,group = 'Epithelial_programs','Epithelial_programs','State'

# Independent copy of the premalignant observations; its obsm annotation is filtered below.
tdata_premalignant = tdata.query(f'State=="premalignant"').copy()

# Compositions of `what` per `group` (as returned by tc.tl.get_compositions), transposed so that
# the annotation names are on the index, then renormalized within every family of annotations
# sharing the same leading run of letters.
comps = tc.tl.get_compositions(tdata_premalignant,what,group,reads=True)
# group_keys=False keeps the flat annotation index on the result. Since pandas 2.0 the default
# (group_keys=True) prepends the group key even for like-indexed results of apply, which would
# turn the names collected in big_what into (prefix, name) tuples and break the obsm selection below.
comps = comps.T.groupby(lambda x: re.match('^[A-Za-z]+',x).group(0), group_keys=False).apply(lambda x: x/x.sum(axis=0).to_numpy())
# Per group value: annotations whose renormalized fraction exceeds 1%.
big_what = { c: comps[c][comps[c]>0.01].index.to_numpy() for c in comps.columns }

analysis_key=f'{what}-{center}'

# Keep only the sufficiently abundant annotation columns.
tdata_premalignant.obsm[what] = tdata_premalignant.obsm[what][big_what['premalignant']]
tc.tl.co_occurrence_matrix(adata=tdata_premalignant, annotation_key=what, center_key=center, sample_key='SampleID',
    distance_key=None, position_key=('x','y'), max_distance=max_distance,numba_blocksize=max_distance,
    result_key=analysis_key, verbose=0,
    n_permutation=n_permutation,
)
fig = tc.pl.co_occurrence_matrix({'premalignant':tdata_premalignant}, score_key='z', cmap_vmin_vmax=(-5,5), cmap='bwr', analysis_key=analysis_key, y_padding=3.5, group_cluster=True, value_cluster=True);
fig.savefig(f'{figures_folder}/mouse_slideseq_cooc_premalignant_{max_distance}umP{n_permutation}_clustered.pdf',bbox_inches='tight')

In [None]:
# Premalignant observations flagged as 'epi_domain' whose tc.sum over X along axis 1 is at least 50.
tdata_premalignant = tdata[
    tdata.obs['epi_domain']
    & (tc.sum(tdata.X, axis=1) >= 50)
    & (tdata.obs['State'] == 'premalignant')
].copy()

# Key under which the co-occurrence result is stored and later looked up for plotting.
analysis_key = 'cluster-labels'

tc.tl.co_occurrence(
    tdata_premalignant,
    'cluster',
    'labels',
    sample_key='SampleID',
    delta_distance=20,
    max_distance=1000,
    sparse=False,
    result_key=analysis_key,
    verbose=0,
);

In [None]:
# Plot the stored co-occurrence result, restricted to the listed myeloid clusters
# around 'Endo' centers, and export it as PDF.
fig = tc.pl.co_occurrence(
    tdata_premalignant,
    analysis_key,
    score_key='log_occ',
    colors=cluster_colors,
    wspace=0.3,
    log_base=2,
    show_only=[
        'Mono02 (Dysplasia-Associated)',
        'Mono03 (Dysplasia-Associated, IFN)',
        'Gran01',
        'Gran02',
    ],
    show_only_center=['Endo'],
    merged=False,
);
fig.savefig(
    f'{figures_folder}/mouse_slideseq_cooc_premalignant_myeloid_cluster_from_endothelial.pdf',
    bbox_inches='tight',
)

In [None]:
max_distance, n_permutation = 20, 10

what = center = 'labels'

# All region names (keys of region_colors) which are tagged as 'Malignant-like'.
malignant_regions = [name for name in region_colors.keys() if 'Malignant-like' in name]

for region in malignant_regions:
    # Premalignant observations belonging to the current region only.
    tdata_premalignant = tdata.query(
        f'State=="premalignant" & region=="{region}"'
    ).copy()

    analysis_key = f'{what}-{center}'
    tc.tl.co_occurrence_matrix(
        adata=tdata_premalignant,
        annotation_key=what,
        center_key=center,
        sample_key='SampleID',
        distance_key=None,
        position_key=('x','y'),
        max_distance=max_distance,
        numba_blocksize=max_distance,
        result_key=analysis_key,
        verbose=0,
        n_permutation=n_permutation,
    )

    fig = tc.pl.co_occurrence_matrix(
        tdata_premalignant,
        score_key='z',
        cmap_vmin_vmax=(-5,5),
        cmap='bwr',
        analysis_key=analysis_key,
        y_padding=1.5,
        value_cluster=False,
        group_cluster=False,
    )

    # One PDF per region; the shortened region name, distance and permutation count go into the file name.
    fig.savefig(
        f'{figures_folder}/mouse_slideseq_cooc_premalignant_labels_in_'
        f'{helper.map_short(region,1,join="")}_{max_distance}umP{n_permutation}.pdf',
        bbox_inches='tight',
    )

# enrichments for split data

In [None]:
# The two groups compared downstream.
egroups = ['epithelial','notepithelial']

# Derive the coarser 'epiornotepi' annotation from 'compartment': 'immune' and 'stromal'
# are mapped onto 'notepithelial' (other values are presumably kept as they are - TODO confirm).
tc.utils.merge_annotation(
    pucks_by_compartment,
    annotation_key='compartment',
    result_key='epiornotepi',
    mapping={'notepithelial':['immune', 'stromal']},
)

# Merge the observations of pucks_by_compartment along the new annotation.
pucks_by_epithelial = tc.tl.merge_observations(
    pucks_by_compartment,
    obs_index_key='index',
    annotation_key='epiornotepi',
)

In [None]:
# Table of comparisons to run; one row per comparison:
#   name    - label of the comparison
#   rkey    - obs column holding the region annotation
#   regions - region names to include
#   adata   - dataset to run the comparison on
#   key     - obs column holding the group annotation
#   groups  - group names to process one after the other
comparisons = pd.DataFrame(
    [
        {
            'name': 'malignant_regions',
            'rkey': 'region',
            'regions': [region_name for region_name in region_colors.keys() if 'Malignant-like' in region_name],
            'adata': pucks_by_epithelial,
            'key': 'epiornotepi',
            'groups': egroups,
        },
    ],
    columns=['name','rkey','regions','adata','key','groups'],
)

In [None]:
def check_GO_findings(results,group):
    """Print whether a fixed list of expected GO findings shows up in `results`.

    All expectations are registered for the 'notepithelial' group and the
    'enriched' direction; for any other `group` nothing is printed.

    For every expected (region, term) pair the rows of `results` are selected
    whose 'group' column equals the region, whose 'finding' column equals
    'enriched' and whose 'GO_name' column contains the term. A line starting
    with 'XXX' (found) or '---' (not found) is printed, followed by the
    'group', 'p_value' and 'GO_name' columns of the selected rows.

    Parameters
    ----------
    results
        DataFrame with at least the columns 'group', 'finding', 'GO_name'
        and 'p_value'.
    group
        Name of the group the `results` were computed for.
    """
    expected_group = 'notepithelial'
    direction = 'enriched'
    # region -> GO name fragments expected to be enriched there
    expected_terms = {
        'Region 6 (Malignant-like, inflammation)': [
            'inflam',
            'tumor necrosis factor',
            'interleukin-1',
            'interferon-gamma',
            'neutrophil chemotaxis',
            'monocyte chemotaxis',
            'lymphocyte chemotaxis',
            'collagen',
        ],
        'Region 8 (Malignant-like, Deep crypt-like)': [
            'B cell activation',
            'B cell receptor',
        ],
        'Region 11 (Malignant-like, EMT)': [
            'oncostatin-M receptor complex',
            'MHC class I',
            'actin cytoskeleton',
            'actin filament',
            'actin binding',
        ],
    }

    if group != expected_group:
        return

    for where, terms in expected_terms.items():
        for term in terms:
            in_region = results['group'].isin([where])
            right_direction = results['finding'].isin([direction])
            names_term = results['GO_name'].str.contains(term)
            hits = results[in_region & right_direction & names_term]

            marker = 'XXX' if len(hits) > 0 else '---'
            print(f'{marker} {term} {direction} in {expected_group}/{where}')
            print(hits[['group','p_value','GO_name']])
def check_GENE_findings(enrichments, group):
    """Print whether a fixed list of expected marker genes shows up in `enrichments`.

    Only the expectations registered for `group` are checked. For each of them
    the rows of `enrichments` are selected whose 'region' column equals the
    expected region, whose 'enrichment' column equals 'enriched' and whose
    'GENE' column equals the expected gene. A line starting with 'XXX' (found)
    or '---' (not found) is printed, followed by the 'region',
    'p_fisher_fdr_bh' and 'GENE' columns of the selected rows.

    Parameters
    ----------
    enrichments
        DataFrame with at least the columns 'region', 'enrichment', 'GENE'
        and 'p_fisher_fdr_bh'.
    group
        Name of the group the `enrichments` were computed for.
    """
    direction = 'enriched'
    # (group, region, gene) triples expected to be enriched
    expected_genes = (
        ('notepithelial', 'Region 6 (Malignant-like, inflammation)', 'Sparc'),
        ('notepithelial', 'Region 6 (Malignant-like, inflammation)', 'Ctss'),
        ('notepithelial', 'Region 6 (Malignant-like, inflammation)', 'Mmp12'),
        ('notepithelial', 'Region 6 (Malignant-like, inflammation)', 'Cxcl9'),
        ('notepithelial', 'Region 11 (Malignant-like, EMT)', 'Osmr'),
        ('epithelial', 'Region 11 (Malignant-like, EMT)', 'Vim'),
        ('epithelial', 'Region 11 (Malignant-like, EMT)', 'Prox1'),
        ('epithelial', 'Region 11 (Malignant-like, EMT)', 'Sox11'),
    )

    for expected_group, where, gene in expected_genes:
        if group != expected_group:
            continue

        in_region = enrichments['region'].isin([where])
        right_direction = enrichments['enrichment'].isin([direction])
        is_gene = enrichments['GENE'].isin([gene])
        hits = enrichments[in_region & right_direction & is_gene]

        marker = 'XXX' if len(hits) > 0 else '---'
        print(f'{marker} {gene} {direction} in {expected_group}/{where}')
        print(hits[['region','p_fisher_fdr_bh','GENE']])

In [None]:
# For every comparison and every group: compute marker genes per region with
# helper.marker_genes, check them against the expected findings and export the GO results.
for comp in comparisons.itertuples():
    print(comp.name, comp.key)

    # Restrict to the regions of this comparison, then drop everything from the 'normal' state.
    comp_adata = comp.adata[comp.adata.obs[comp.rkey].isin(comp.regions)]
    comp_adata = comp_adata[~comp_adata.obs['State'].isin(['normal'])]

    for group in comp.groups:
        _sdata = comp_adata[comp_adata.obs[comp.key] == group].copy()
        print(group, len(_sdata))

        # Skip groups in which at least one of the requested regions has no observations.
        if (~pd.Series(comp.regions).isin(_sdata.obs[comp.rkey].unique())).any():
            print(f'not all regions populated, continuing with next iteration...')
            continue

        for categorical_column in (comp.rkey, "SampleID"):
            _sdata.obs[categorical_column] = _sdata.obs[categorical_column].astype('category')

        enrichments,results = helper.marker_genes(
            _sdata,
            comp.rkey,
            goa_working_directory=f'{get_path("resources", "mouse_sc")}/goa',
        )

        check_GENE_findings(enrichments,group)

        # GO results are only checked and exported for non-epithelial groups, and only if available.
        if results is None or group == 'epithelial':
            continue

        check_GO_findings(results,group)
        results.to_csv(f'{figures_folder}/mouse_slideseq_GO_6811_{group}.csv',index=False)

# EMT score

In [None]:
EMT_Puram17_genes = pd.Index(['SERPINE1','TGFBI','MMP10','LAMC2','P4HA2','PDPN','ITGA5','LAMA3','CDH13','TNC','MMP2','EMP3','INHBA','LAMB3','VIM','SEMA3C','PRKCDBP','ANXA5','DHRS7','ITGB1','ACTN1','CXCR7','ITGB6','IGFBP7','THBS1','PTHLH','TNFRSF6B','PDLIM7','CAV1','DKK3','COL17A1','LTBP1','COL5A2','COL1A1','FHL2','TIMP3','PLAU','LGALS1','PSMD2','CD63','HERPUD1','TPM1','SLC39A14','C1S','MMP1','EXT2','COL4A2','PRSS23','SLC7A8','SLC31A2','ARPC1B','APP','MFAP2','MPZL1','DFNA5','MT2A','MAGED2','ITGA6','FSTL1','TNFRSF12A','IL32','COPB2','PTK7','OCIAD2','TAX1BP3','SEC13','SERPINH1','TPM4','MYH9','ANXA8L1','PLOD2','GALNT2','LEPREL1','MAGED1','SLC38A5','FSTL3','CD99','F3','PSAP','NMRK1','FKBP9','DSG2','ECM1','HTRA1','SERINC1','CALU','TPST1','PLOD3','IGFBP3','FRMD6','CXCL14','SERPINE2','RABAC1','TMED9','NAGK','BMP1','ESYT1','STON2','TAGLN','GJA1']) # genes from https://www.cell.com/action/showPdf?pii=S0092-8674%2817%2931270-9 Table S7 p-EMT
# Convert the all-uppercase human symbols to mouse-style capitalization (first character
# uppercase, remainder lowercase), keep the first 50 genes of the list and of those only
# the ones present in the data.
# str.capitalize() replaces the former first-letter + str.split('',2) construction, which
# passed `n` positionally (keyword-only in pandas >= 2.0) and relied on empty-regex splitting.
EMT_Puram17_genes = EMT_Puram17_genes.str.capitalize()[:50].intersection(pucks_by_epithelial.var.index) # lowercase for mouse

EMT_Marjanovic20_genes = pd.Index(['Stmn2','Klhl1','Rgs17','Ifi211','Bmp7','Fam19a5','Zeb2','Dlx2','Qrfp','Plxdc2','Ncam1','Stac','Prl2c3','Flt4','Sdc3','Ifi205','Ifi204','3110039M20Rik','Pilra','Clip3','Scrn1','Cd109','Dkk3','Foxg1','Sox11','Uchl1','Gm26716','Dynap','Csn3','Edil3','Rgs20','Col5a1','Ndst3','Pcdh9','Fabp7','Mndal','AC166344.1','Pilrb2','Sp7','Flnc','Cdh6','AC161275.1','Rbfox1','Smoc2','Olfm2','Axl','Piezo2','Layn','Vim','Acan','Arap3','Colec12','Fgfr1','Nes','Twist1','Rgs5','Dbn1','Igf2bp1','Fscn1','Tmem200a','Hey1','Gm28875','Cdh2','Dpysl3','Parvb','H2-M1','Cnksr2','Slitrk5','Cttnbp2','Adgra1','Syn1','Itga5','Fst','Efcc1','Cacna2d1','Pilrb1','Rgs2','Klra4','Ifitm10','Timp1','Khdrbs3','Mcub','Ltbp1','Chst11','Upp1','Loxl2','Sept6','Erc2','Zfhx4','Oaf','Syne3','Gng11','Rbpms2','Inhba','Pkp1','Sh3kbp1','Ggta1','Lgr6','Tnc',])
# Already mouse symbols: keep the first 50 genes of the list and of those only the ones present in the data.
EMT_Marjanovic20_genes = EMT_Marjanovic20_genes[:50].intersection(pucks_by_epithelial.var.index)

# Signature name -> genes used for scoring.
signatures = pd.Series({
    'EMT_Puram17':EMT_Puram17_genes,
    'EMT_Marjanovic20':EMT_Marjanovic20_genes,
})

In [None]:
def scScore(adata, gene_list):
    """Score every observation of `adata` for a gene list with scanpy.

    The input is left untouched; a copy is run through the scanpy steps
    filter_genes (min_cells=3), normalize_per_cell (counts_per_cell_after=1e4),
    log1p and scale, and is then scored with sc.tl.score_genes using a fixed
    random_state of 42.

    Parameters
    ----------
    adata
        Dataset to score; must provide a `.copy()` usable by scanpy.
    gene_list
        Genes making up the signature.

    Returns
    -------
    The 'sc' column of the `.obs` of the processed copy.
    """
    scored = adata.copy()

    # Preprocessing steps, applied in place on the copy in exactly this order.
    preprocessing = (
        (sc.pp.filter_genes, {'min_cells': 3}),
        (sc.pp.normalize_per_cell, {'counts_per_cell_after': 1e4}),
        (sc.pp.log1p, {}),
        (sc.pp.scale, {}),
    )
    for step, options in preprocessing:
        step(scored, **options)

    sc.tl.score_genes(scored, gene_list, score_name='sc', random_state=42)
    return scored.obs['sc']

In [None]:
# Score the EMT signatures on the 'epithelial' group restricted to the 'Malignant-like'
# regions and print the regions for which tc.tl.enrichments reports a significant result.
# The single-row table has the same columns as `comparisons`, but only the 'epithelial' group.
for comp in pd.DataFrame([
    ('malignant_regions','region',[r for r in region_colors.keys() if 'Malignant-like' in r],pucks_by_epithelial,'epiornotepi',['epithelial']),
],columns=['name','rkey','regions','adata','key','groups']).itertuples():
    print(comp.key)
    # Restrict to the regions of this comparison.
    comp_adata = comp.adata[comp.adata.obs[comp.rkey].isin(comp.regions)]
    
    for group in comp.groups:
        _sdata = comp_adata[comp_adata.obs[comp.key] == group].copy()
        print(group, len(_sdata))
        
        # Skip groups in which at least one of the requested regions has no observations.
        if not pd.Series(comp.regions).isin(_sdata.obs[comp.rkey].unique()).all():
            print(f'not all regions populated, continuing with next iteration...')
            continue
        _sdata.obs[comp.rkey] = _sdata.obs[comp.rkey].astype('category')
        _sdata.obs["SampleID"] = _sdata.obs["SampleID"].astype('category')
        
        for signature_name, signature_genes in signatures.items():
            
            print(signature_name)
            
            # Per-observation signature score, stored in obs as 'sc<signature_name>';
            # only signature genes present in the data are passed to scScore.
            _sdata.obs[f'sc{signature_name}'] = scScore(_sdata,signature_genes.intersection(_sdata.var.index))
            # Arguments for tc.tl.enrichments: the score column in obs is used as the value,
            # grouped by the region annotation, without reduction, normalization or NaN filling.
            common_args = {
                'adata':_sdata,
                'value_key':[
                    f'sc{signature_name}',
                ],
                'group_key':comp.rkey,
                'value_location':'obs',
                'reduction':None,
                'normalization':None,
                'fillna':None,
                'restrict_groups':comp.regions,
                'restrict_values':None,
                'reads':False,
                'min_obs': 100,
            }
            # method='mwu' with p_corr='fdr_bh' yields the 'p_mwu_fdr_bh' column filtered on below
            # (presumably a Mann-Whitney U test with Benjamini-Hochberg correction - TODO confirm).
            enrichments = tc.tl.enrichments(**common_args,
                p_corr='fdr_bh',
                method='mwu',
            )
            # Keep only results with corrected p-value below 0.05.
            enrichments = enrichments[(enrichments['p_mwu_fdr_bh']<0.05)].sort_values(['region','value'])
            if len(enrichments) == 0:
                print(f'no significant enrichments, continuing with next iteration...')
                continue

            print(enrichments)