In [None]:
import pandas as pd
import numpy as np
import scipy
import scanpy as sc
from numba import njit
import matplotlib
import matplotlib.pyplot as plt
import re
import statsmodels.stats.multitest
import os
import csv
import tacco as tc

In [None]:
import sys

# Make helper functions available: the notebook expects to be executed either
# in the sub-workflow directory or in the notebooks directory, so both
# candidate locations are put on the module search path while importing.
# After the two inserts '../workflow/' sits at index 1 and '../' at index 2.
# NOTE(review): the original remark here was "prefer to look just one
# directory up"; with this insertion order '../workflow/' is searched before
# '../' - confirm that this is the intended priority.
sys.path.insert(1, '../')
sys.path.insert(1, '../workflow/')
import helper
# Remove the two temporary entries again (both are taken from position 1).
sys.path.pop(1)
sys.path.pop(1)

# Path lookup function for the 'mouse_sc' sub-workflow.
get_path = helper.get_paths('mouse_sc')

# settings

## visualization settings

In [None]:
# One color mapping per annotation type, fetched from the shared helper module
# in the order compartment, labels, cluster, program.
(compartment_colors,
 labels_colors,
 cluster_colors,
 program_colors) = map(helper.get_colors, ('compartment', 'labels', 'cluster', 'program'))

In [None]:
# Point size setting (not referenced by the cells visible in this notebook section).
puck_point_size = 3
# Directory into which all figures and tables of this notebook are written.
figures_folder = get_path('plots')

## analysis settings

In [None]:
# Keyword arguments shared by the tc.tl.enrichments calls below.
enrichment_method = dict(
    reduction='sum',
    normalization='clr',
    method='welch',
    assume_counts=True,
)
# Name of the p-value column selected for plotting; derived from the test method.
p_key = 'p_' + enrichment_method["method"] + '_fdr_bh'

# Load mouse data

In [None]:
# Read the single-cell reference dataset from the resources directory.
reference_file = f'{get_path("resources")}/scRNAseq.h5ad'
reference = sc.read(reference_file)

In [None]:
# log((tdTomato + 1) / sum of X along axis 1), stored per observation.
# Presumably the denominator is the total count per cell - TODO confirm.
tdTomato_plus_one = reference.obs['tdTomato'] + 1
row_sums = tc.sum(reference.X, axis=1)
reference.obs['log_frac_tdTomato1p'] = np.log(tdTomato_plus_one / row_sums)

In [None]:
# Build a copy of the epithelial program table whose column names are reduced
# to their first two whitespace-separated words.
programs_long = reference.obsm['Epithelial_programs']
# Start from a frame with the same index but no columns ...
reference.obsm['Epithelial_programs_short'] = programs_long[[]]
# ... and add the columns one by one under their shortened names.
for column in programs_long.columns:
    short_column = ' '.join(column.split()[:2])
    reference.obsm['Epithelial_programs_short'][short_column] = programs_long[column]

# Overall cell types

In [None]:
# Map every cluster name (keys of cluster_colors) to its short form for on-data legends.
short_cluster = pd.Series(
    pd.Index(cluster_colors.keys()).map(helper.map_short),
    index=cluster_colors.keys(),
)

# UMAP colored by cluster, with the short names drawn on the data.
fig = tc.pl.scatter(
    reference, 'cluster',
    position_key='X_umap', joint=True, colors=cluster_colors, on_data_legend=short_cluster,
)
fig.savefig(f'{figures_folder}/mouse_sc_umap_cluster.pdf', bbox_inches='tight')

# UMAP colored by the 'labels' annotation.
fig = tc.pl.scatter(
    reference, 'labels',
    position_key='X_umap', joint=True, colors=labels_colors, on_data_legend={},
)
fig.savefig(f'{figures_folder}/mouse_sc_umap_labels.pdf', bbox_inches='tight')

In [None]:
# split the data by state
state_splits = { state: reference[df.index] for state,df in reference.obs.groupby('State') }
# get smallest number of cells per state
# (min over an iterable also works when there is only a single state, whereas
# min(*[n]) would be called as min(n) with a bare int and raise a TypeError)
min_cells = min(adata.shape[0] for adata in state_splits.values())
# subsample the cells to the same number
state_splits = { state: adata[tc.utils.complete_choice(adata.obs.index, min_cells)] for state,adata in state_splits.items() }
# and plot them in the same plot
short_cluster = pd.Series(pd.Index(cluster_colors.keys()).map(helper.map_short), index=cluster_colors.keys(), )
fig = tc.pl.scatter(state_splits,'cluster', position_key='X_umap', joint=True, colors=cluster_colors, sharex=True, sharey=True, point_size=10, on_data_legend=short_cluster);
fig.savefig(f'{figures_folder}/mouse_sc_umap_cluster_by_state.pdf',bbox_inches='tight')

In [None]:
# Contribution ("skyline") plot of the 'labels' annotation per state, with SampleID as sample key.
fig = tc.pl.contribution(
    reference, 'labels', 'State',
    normalization='sum',
    log=False,
    sample_key='SampleID',
    colors=labels_colors,
)
fig.savefig(f'{figures_folder}/mouse_sc_skyline_labels_vs_state.pdf', bbox_inches='tight')

In [None]:
# Annotated heatmap for the 'labels' annotation (n_genes=20).
fig = tc.pl.annotated_heatmap(adata=reference, obs_key='labels', n_genes=20, obs_colors=labels_colors, axsize=(8, 8))
fig.savefig(f'{figures_folder}/mouse_sc_labels_marker.pdf', bbox_inches='tight')

In [None]:
# UMAP colored by state.
fig = tc.pl.scatter(
    reference, 'State',
    position_key='X_umap',
    joint=True,
)
fig.savefig(f'{figures_folder}/mouse_sc_umap_State.pdf', bbox_inches='tight')

In [None]:
# UMAP colored by compartment.
fig = tc.pl.scatter(
    reference, 'compartment',
    position_key='X_umap',
    joint=True,
)
fig.savefig(f'{figures_folder}/mouse_sc_umap_compartment.pdf', bbox_inches='tight')

In [None]:
# Contribution plot of the short cluster names per state; colors are the
# cluster colors passed through helper.map_short.
fig = tc.pl.contribution(
    reference, 'cluster_short', 'State',
    normalization='sum',
    log=False,
    sample_key='SampleID',
    colors=helper.map_short(cluster_colors),
)
fig.savefig(f'{figures_folder}/mouse_sc_skyline_cluster_vs_state.pdf', bbox_inches='tight')

In [None]:
# Enrichment of clusters across states with 'normal' as reference group,
# using the shared enrichment settings.
enrichments = tc.tl.enrichments(
    reference, 'cluster_short', 'State',
    sample_key='SampleID',
    **enrichment_method,
    reference_group='normal',
    reads=False,
)
fig = tc.pl.significances(enrichments, p_key, 'cluster_short', 'State')
fig.savefig(f'{figures_folder}/mouse_sc_enrichment_cluster_vs_state.pdf', bbox_inches='tight')

# Cell types in subsets

In [None]:
# Immune compartment: UMAP, contribution plot and marker heatmap.
sub_ref = reference[reference.obs['compartment'] == 'immune']
# Colors of the clusters that occur in the subset, in category order.
sub_col = {
    cat: cluster_colors[cat]
    for cat in pd.Index(sub_ref.obs['cluster'].cat.categories).intersection(sub_ref.obs['cluster'].unique())
}
short_cluster = pd.Series(
    pd.Index(cluster_colors.keys()).map(helper.map_short),
    index=cluster_colors.keys(),
)

fig = tc.pl.scatter(
    sub_ref, 'cluster',
    colors=sub_col, position_key='X_umap_Immune', joint=True, on_data_legend=short_cluster,
)
fig.savefig(f'{figures_folder}/mouse_sc_immune_umap_cluster.pdf', bbox_inches='tight')

fig = tc.pl.contribution(
    sub_ref, 'cluster_short', 'State',
    colors=helper.map_short(sub_col), normalization='sum', log=False, sample_key='SampleID',
)
fig.savefig(f'{figures_folder}/mouse_sc_immune_skyline_cluster_vs_state.pdf', bbox_inches='tight')

fig = tc.pl.annotated_heatmap(
    adata=sub_ref, obs_key='cluster_short', n_genes=20,
    obs_colors=helper.map_short(sub_col), axsize=(8, 8),
)
fig.savefig(f'{figures_folder}/mouse_sc_immune_cluster_marker.pdf', bbox_inches='tight')

In [None]:
# Cells labelled 'Mono' or 'Mac': UMAP, contribution plot and marker heatmap.
sub_ref = reference[reference.obs['labels'].isin(['Mono', 'Mac'])]
# Colors of the clusters that occur in the subset, in category order.
sub_col = {
    cat: cluster_colors[cat]
    for cat in pd.Index(sub_ref.obs['cluster'].cat.categories).intersection(sub_ref.obs['cluster'].unique())
}
short_cluster = pd.Series(
    pd.Index(cluster_colors.keys()).map(helper.map_short),
    index=cluster_colors.keys(),
)

fig = tc.pl.scatter(
    sub_ref, 'cluster',
    colors=sub_col, position_key='X_umap_Myeloid', joint=True, on_data_legend=short_cluster,
)
fig.savefig(f'{figures_folder}/mouse_sc_myeloid_umap_cluster.pdf', bbox_inches='tight')

fig = tc.pl.contribution(
    sub_ref, 'cluster_short', 'State',
    colors=helper.map_short(sub_col), normalization='sum', log=False, sample_key='SampleID',
)
fig.savefig(f'{figures_folder}/mouse_sc_myeloid_skyline_cluster_vs_state.pdf', bbox_inches='tight')

fig = tc.pl.annotated_heatmap(
    adata=sub_ref, obs_key='cluster_short', n_genes=20,
    obs_colors=helper.map_short(sub_col), axsize=(8, 8),
)
fig.savefig(f'{figures_folder}/mouse_sc_myeloid_cluster_marker.pdf', bbox_inches='tight')

In [None]:
# Vegfa expression per state, shown separately for 'Mono' and 'Mac' cells.
# A copy is taken because tc.utils.log1p is applied to the subset.
sub_ref = reference[reference.obs['labels'].isin(['Mono', 'Mac'])].copy()
tc.utils.log1p(sub_ref)
fig, axs = tc.pl.subplots(1, 2, axsize=(6, 2.5))

# First axes: 'Mono' cells.
sc.pl.violin(
    sub_ref[sub_ref.obs['labels'].isin(['Mono'])],
    keys=['Vegfa'], groupby='State', ax=axs[0,0], stripplot=True, show=False, xlabel=None,
)
# Second axes: 'Mac' cells.
sc.pl.violin(
    sub_ref[sub_ref.obs['labels'].isin(['Mac'])],
    keys=['Vegfa'], groupby='State', ax=axs[1,0], stripplot=True, show=False,
)
axs[0,0].set_ylabel('Vegfa (Mono)')
axs[1,0].set_ylabel('Vegfa (Mac)')
fig.savefig(f'{figures_folder}/mouse_sc_myeloid_Vegfa.pdf', bbox_inches='tight')

In [None]:
# Cells labelled 'TNK': UMAP, contribution plot and marker heatmap.
sub_ref = reference[reference.obs['labels'] == 'TNK']
# Colors of the clusters that occur in the subset (order of first appearance).
sub_col = {
    cat: cluster_colors[cat]
    for cat in sub_ref.obs['cluster'].unique()
}
short_cluster = pd.Series(
    pd.Index(cluster_colors.keys()).map(helper.map_short),
    index=cluster_colors.keys(),
)

fig = tc.pl.scatter(
    sub_ref, 'cluster',
    colors=sub_col, position_key='X_umap_TNK', joint=True, on_data_legend=short_cluster,
)
fig.savefig(f'{figures_folder}/mouse_sc_tnk_umap_cluster.pdf', bbox_inches='tight')

fig = tc.pl.contribution(
    sub_ref, 'cluster_short', 'State',
    colors=helper.map_short(sub_col), normalization='sum', log=False, sample_key='SampleID',
)
fig.savefig(f'{figures_folder}/mouse_sc_tnk_skyline_cluster_vs_state.pdf', bbox_inches='tight')

fig = tc.pl.annotated_heatmap(
    adata=sub_ref, obs_key='cluster_short', n_genes=20,
    obs_colors=helper.map_short(sub_col), axsize=(8, 8),
)
fig.savefig(f'{figures_folder}/mouse_sc_tnk_cluster_marker.pdf', bbox_inches='tight')

In [None]:
# Enrichment of TNK clusters across states, with 'normal' as reference group.
sub_ref = reference[reference.obs['labels'] == 'TNK']
enrichments = tc.tl.enrichments(
    sub_ref, 'cluster', 'State',
    sample_key='SampleID',
    **enrichment_method,
    reference_group='normal',
    reads=False,
)
fig = tc.pl.significances(enrichments, p_key, 'cluster', 'State')
fig.savefig(f'{figures_folder}/mouse_sc_tnk_enrichment_cluster_vs_state.pdf', bbox_inches='tight')

In [None]:
# Stromal compartment: UMAP, contribution plot and marker heatmap.
sub_ref = reference[reference.obs['compartment'] == 'stromal']
# Colors of the clusters that occur in the subset, in category order.
sub_col = {
    cat: cluster_colors[cat]
    for cat in pd.Index(sub_ref.obs['cluster'].cat.categories).intersection(sub_ref.obs['cluster'].unique())
}
short_cluster = pd.Series(
    pd.Index(cluster_colors.keys()).map(helper.map_short),
    index=cluster_colors.keys(),
)

fig = tc.pl.scatter(
    sub_ref, 'cluster',
    colors=sub_col, position_key='X_umap_Stromal', joint=True, on_data_legend=short_cluster,
)
fig.savefig(f'{figures_folder}/mouse_sc_stromal_umap_cluster.pdf', bbox_inches='tight')

fig = tc.pl.contribution(
    sub_ref, 'cluster_short', 'State',
    colors=helper.map_short(sub_col), normalization='sum', log=False, sample_key='SampleID',
)
fig.savefig(f'{figures_folder}/mouse_sc_stromal_skyline_cluster_vs_state.pdf', bbox_inches='tight')

fig = tc.pl.annotated_heatmap(
    adata=sub_ref, obs_key='cluster_short', n_genes=20,
    obs_colors=helper.map_short(sub_col), axsize=(8, 8),
)
fig.savefig(f'{figures_folder}/mouse_sc_stromal_cluster_marker.pdf', bbox_inches='tight')

In [None]:
# Enrichment of stromal clusters across states, with 'normal' as reference group.
sub_ref = reference[reference.obs['compartment'] == 'stromal']
enrichments = tc.tl.enrichments(
    sub_ref, 'cluster', 'State',
    sample_key='SampleID',
    **enrichment_method,
    reference_group='normal',
    reads=False,
)
fig = tc.pl.significances(enrichments, p_key, 'cluster', 'State')
fig.savefig(f'{figures_folder}/mouse_sc_stromal_enrichment_cluster_vs_state.pdf', bbox_inches='tight')

In [None]:
# Epithelial compartment: UMAP and contribution plot.
sub_ref = reference[reference.obs['compartment'] == 'epithelial']
# Colors of the clusters that occur in the subset (order of first appearance).
sub_col = {
    cat: cluster_colors[cat]
    for cat in sub_ref.obs['cluster'].unique()
}
short_cluster = pd.Series(
    pd.Index(cluster_colors.keys()).map(helper.map_short),
    index=cluster_colors.keys(),
)

fig = tc.pl.scatter(
    sub_ref, 'cluster',
    colors=sub_col, position_key='X_umap_Epithelial', joint=True, on_data_legend=short_cluster,
)
fig.savefig(f'{figures_folder}/mouse_sc_epithelial_umap_cluster.pdf', bbox_inches='tight')

fig = tc.pl.contribution(
    sub_ref, 'cluster_short', 'State',
    colors=helper.map_short(sub_col), normalization='sum', log=False, sample_key='SampleID',
)
fig.savefig(f'{figures_folder}/mouse_sc_epithelial_skyline_cluster_vs_state.pdf', bbox_inches='tight')

#### Marker-gene and GO enrichments for epithelial clusters

In [None]:
def check_GO_findings(results):
    """Print whether the expected GO term findings occur in `results`.

    Parameters
    ----------
    results
        Data frame with at least the columns 'group', 'finding', 'GO_name'
        and 'p_value'.

    For every expected finding, one status line is printed: it starts with
    'XXX' if at least one row matches and with '---' otherwise. A row matches
    if its 'group' contains the expected group pattern, its 'finding' equals
    the expected direction and its 'GO_name' contains the expected term.
    Finally the 'group', 'p_value' and 'GO_name' columns of all matching rows
    are printed. Nothing is returned.
    """
    expectations = pd.DataFrame(
        [('enriched', 'Epi', 'Wnt signaling')],
        columns=['enriched', 'where', 'what'],
    )
    matches = []
    for expected in expectations.itertuples():
        in_group = results['group'].str.contains(expected.where)
        has_direction = results['finding'].isin([expected.enriched])
        has_term = results['GO_name'].str.contains(expected.what)
        matched = results[in_group & has_direction & has_term]
        prefix = 'XXX' if len(matched) > 0 else '---'
        print(f'{prefix} {expected.what} {expected.enriched} in {expected.where}')
        matches.append(matched)
    print(pd.concat(matches, axis=0)[['group','p_value','GO_name']])
def check_GENE_findings(enrichments):
    """Print whether the expected marker genes occur in `enrichments`.

    Parameters
    ----------
    enrichments
        Data frame with at least the columns 'cluster_short', 'enrichment',
        'value' (gene name) and 'p_fisher_fdr_bh'.

    For every expected (direction, cluster, gene) combination, one status
    line is printed: it starts with 'XXX' if at least one row matches and with
    '---' otherwise. A row matches if its 'cluster_short' contains the
    expected cluster pattern, its 'enrichment' equals the expected direction
    and its 'value' equals the expected gene. Finally the 'cluster_short',
    'p_fisher_fdr_bh' and 'value' columns of all matching rows are printed.
    Nothing is returned.
    """
    # Each expected combination is listed exactly once, so no status line or
    # result row is reported twice.
    expected_markers = pd.DataFrame([
        ('enriched', 'Epi01', 'Axin2'),
        ('enriched', 'Epi01', 'Ascl2'),
        ('enriched', 'Epi01', 'Myc'),
        ('enriched', 'Epi01', 'Ccnd1'),
        ('enriched', 'Epi01', 'Lgr5'),
        ('enriched', 'Epi05', 'Lgr5'),
        ('enriched', 'Epi02', 'Muc3'),
        ('enriched', 'Epi02', 'Cdhr5'),
        ('enriched', 'Epi03', 'Muc3'),
        ('enriched', 'Epi01', 'Wnt6'),
        ('enriched', 'Epi01', 'Wnt10a'),
        ('enriched', 'Epi01', 'Fzd10'),
        ('enriched', 'Epi01', 'Dkk3'),
        ('enriched', 'Epi01', 'Wif1'),
        ('enriched', 'Epi01', 'Nkd1'),
        ('enriched', 'Epi01', 'Notum'),
        ('enriched', 'Epi01', 'Sox4'),
        ('enriched', 'Epi01', 'Prox1'),
        ('enriched', 'Epi04', 'Muc2'),
        ('enriched', 'Epi04', 'Reg4'),
        ('enriched', 'Epi05', 'Muc2'),
        ('enriched', 'Epi05', 'Reg4'),
        ('enriched', 'Epi05', 'Ccl9'),
        ('enriched', 'Epi05', 'Mmp7'),
    ],columns=['enriched','where','what',])
    found_all = []
    for finding in expected_markers.itertuples():
        found = enrichments[
            enrichments['cluster_short'].str.contains(finding.where)
            & enrichments['enrichment'].isin([finding.enriched])
            & enrichments['value'].isin([finding.what])
        ]
        status = 'XXX' if len(found) > 0 else '---'
        print(f'{status} {finding.what} {finding.enriched} in {finding.where}')
        found_all.append(found)
    found_all = pd.concat(found_all)
    print(found_all[['cluster_short','p_fisher_fdr_bh','value']])

In [None]:
# Marker-gene table (enr) and GO table (gos) per short cluster name within
# the epithelial compartment.
sub_ref = reference[reference.obs['compartment'] == 'epithelial']
enr, gos = helper.marker_genes(
    sub_ref,
    'cluster_short',
    goa_working_directory=f'{get_path("resources")}/goa',
)

In [None]:
# Report which of the expected marker genes are present in the marker-gene table `enr`.
check_GENE_findings(enr)

In [None]:
# Report which of the expected GO term findings are present in the GO table `gos`.
check_GO_findings(gos)

In [None]:
# Dot plot of selected marker genes per epithelial cluster; dots are marked
# according to whether the gene/cluster pair appears in the marker table `enr`.
markers = ['Epcam','Cdh1','Muc3','Cdhr5','Wnt6','Wnt10a','Fzd10','Dkk3','Wif1','Nkd1','Axin2', 'Ascl2', 'Myc', 'Ccnd1', 'Lgr5','Notum','Sox4','Prox1','Muc2','Reg4','Ccl9','Mmp7','Ifitm3']
sube = enr[enr['value'].isin(markers)].copy()
# Every gene/cluster pair present in `sube` is labelled 'significant', all
# remaining pairs 'not significant'. The reindex keeps a column for every
# cluster category even if none of the selected genes is listed for it,
# consistent with the extended-data dot plot further below.
marks = (
    sube.set_index(['value','cluster_short'])['p_fisher_fdr_bh']
    .map(lambda x: 'significant')
    .unstack(fill_value='not significant')
    .reindex(columns=enr['cluster_short'].cat.categories, fill_value='not significant')
)
fig = tc.pl.dotplot(sub_ref, markers, 'cluster_short', log1p=True, marks=marks, marks_colors=pd.Series({'not significant': '#aaa','significant': '#000'},name='enrichment'), swap_axes=False);
fig.savefig(f'{figures_folder}/mouse_sc_epithelial_marker_dot_extra_rotated.pdf',bbox_inches='tight')

In [None]:
# Second dot plot with a different marker gene selection.
markers = [
    'Epcam', 'Cdk1', 'Mki67', 'Lgr5', 'Axin2', 'Tacstd2', 'Sox11', 'Sox4',
    'Ccl25', 'Alpi', 'Apoa1', 'Fabp2', 'Spdef', 'Atoh1', 'Klk1', 'Reg4',
    'Dmbt1', 'Aqp8', 'Atp12a', 'Muc3', 'Chga', 'Scgn',
]
sube = enr[enr['value'].isin(markers)].copy()
# Gene/cluster pairs present in `sube` are labelled 'significant', all other
# pairs (including clusters without any listed gene) 'not significant'.
marks = (
    sube.set_index(['value','cluster_short'])['p_fisher_fdr_bh']
    .map(lambda x: 'significant')
    .unstack(fill_value='not significant')
    .reindex(columns=enr['cluster_short'].cat.categories, fill_value='not significant')
)
fig = tc.pl.dotplot(
    sub_ref, markers, 'cluster_short',
    log1p=True,
    marks=marks,
    marks_colors=pd.Series({'not significant': '#aaa','significant': '#000'},name='enrichment'),
    swap_axes=False,
)
fig.savefig(f'{figures_folder}/mouse_sc_epithelial_marker_dot_extra_ED_rotated.pdf', bbox_inches='tight')

In [None]:
# Scatter plot of the first two principal components for epithelial clusters.
# `sub_ref` is the epithelial subset defined in an earlier cell.
epiref = tc.utils.preprocess_single_cell_data(sub_ref)
# Store the first two columns of X_pca as per-observation annotations.
epiref.obs['PC1'], epiref.obs['PC2'] = epiref.obsm['X_pca'][:,:2].T
# NOTE(review): this takes the first five cluster categories; that equals the
# five largest clusters only if the categories are ordered by size - confirm.
large_epi_cluster = epiref.obs['cluster'].cat.categories[:5] # only show the 5 largest clusters
epiref = epiref[epiref.obs['cluster'].isin(large_epi_cluster)]
fig = tc.pl.scatter(epiref, 'cluster', position_key=['PC1','PC2'], noticks=True, axes_labels=['PC1','PC2'], point_size=1, colors={k:v for k,v in cluster_colors.items() if k in large_epi_cluster}, );
fig.savefig(f'{figures_folder}/mouse_sc_epithelial_PC.pdf',bbox_inches='tight')

### Epithelial program weights (top genes per program)

In [None]:
# Horizontal bar charts of the highest-weighted genes for selected epithelial programs.
profile_key = 'Epithelial_programs'
profiles = reference.varm[profile_key]
# Only the selected programs listed here are plotted.
profile_names = [
    'Program 2 (Enteroendocrine)',
    'Program 3 (Proliferation (G2/M))',
    'Program 4 (Wnt signaling)',
    'Program 5 (Transmembrane transport/Basolateral plasma membrane)',
    'Program 6 (Inflammatory response)',
    'Program 7 (Innate immune response)',
    'Program 8 (Apical plasma membrane)',
    'Program 11 (Proliferation (G1/S))',
    'Program 14 (Angiogenesis)',
    'Program 16 (Stem cells)',
] # uses only selected programs
n_genes = 30

fig,axs=tc.pl.subplots(len(profile_names),axsize=(2,5), x_padding=1.0)

for i,c in enumerate(profile_names):
    # The n_genes largest weights, reversed so the largest ends up at the top.
    top_genes = profiles[c].sort_values(ascending=False).head(n_genes)
    top_genes = top_genes[::-1]
    positions = np.arange(len(top_genes))
    # Draw all bars of the panel with a single call.
    axs[0,i].barh(positions, top_genes.to_numpy(), height=0.8, color='#555', linewidth=0)
    axs[0,i].set_yticks(positions)
    axs[0,i].set_yticklabels(top_genes.index)
    # Title: first two words on the first line, remainder of the name on the second.
    name_parts = c.split(' ',2)
    axs[0,i].set_title(' '.join(name_parts[:2]) + '\n' + name_parts[-1])
    axs[0,i].spines['right'].set_visible(False)
    axs[0,i].spines['top'].set_visible(False)
    axs[0,i].set_ylim(-0.6, n_genes-1 + 0.6)

fig.savefig(f'{figures_folder}/mouse_sc_epithelial_program_weights.pdf', bbox_inches='tight')

In [None]:
# Contribution plot of the shortened epithelial program annotation per state,
# restricted to the epithelial compartment.
fig = tc.pl.contribution(
    reference[reference.obs['compartment'] == 'epithelial'],
    'Epithelial_programs_short',
    'State',
    normalization='sum',
    log=False,
    sample_key='SampleID',
    colors=helper.map_short(program_colors, 1),
)
fig.savefig(f'{figures_folder}/mouse_sc_epithelial_skyline_program_vs_state.pdf', bbox_inches='tight')

# CONTINUE HERE

### epithelial stem marker

In [None]:
# select epithelial "stem" cells
stem_data = reference[~reference.obsm['Epithelial_programs'].isna().any(axis=1)]
cs = stem_data.obsm['Epithelial_programs']['Program 16 (Stem cells)'].sort_values(ascending=False).cumsum()
stem_data = stem_data[cs[cs < 0.9 * cs.tail(1).to_numpy()[0]].index].copy() # take 90
#stem_data = stem_data[stem_data.obsm['Epithelial_programs']['program_16']>0].copy()
tc.utils.merge_annotation(stem_data,'State', {'WT':'normal','AV&AKPV': ['premalignant','malignant (3weeks)','malignant (9weeks)']},result_key='CoarseState')
stem_data.obs['CoarseState'] = stem_data.obs['CoarseState'].astype(pd.CategoricalDtype(['WT','AV&AKPV'],ordered=True))
print(f'There are {len(stem_data)} cells with high stemness selected.')

In [None]:
# Marker genes and GO terms of the selected stem cells, grouped by CoarseState.
enr, gos = helper.marker_genes(
    stem_data,
    'CoarseState',
    goa_working_directory=f'{get_path("resources")}/goa',
)
# Top 50 genes by adjusted p-value: first for the non-WT rows, then for the WT rows.
# NOTE(review): `genes` is reassigned with the top 100 in the following cell.
genes = np.concatenate([
    enr[state_mask].sort_values('p_fisher_fdr_bh', ascending=True)['value'].to_numpy()[:50]
    for state_mask in (enr['CoarseState'] != 'WT', enr['CoarseState'] == 'WT')
])

In [None]:
# Top 100 genes by adjusted p-value: first for the non-WT rows, then for the WT rows.
# This is the gene selection used by the heatmap below.
genes = np.concatenate([
    enr[state_mask].sort_values('p_fisher_fdr_bh', ascending=True)['value'].to_numpy()[:100]
    for state_mask in (enr['CoarseState'] != 'WT', enr['CoarseState'] == 'WT')
])

In [None]:
# Look up the GO term 'negative regulation of Wnt signaling pathway' among the
# non-WT groups; if present, print its p-value(s) and whether the listed genes
# are contained in the gene set of the first hit.
hit = gos[
    (gos['group'] != 'WT')
    & gos['GO_name'].isin(['negative regulation of Wnt signaling pathway'])
]
if len(hit) > 0:
    print(hit['p_value'].astype(str))
    print(pd.Index(['Notum','Wif1','Nkd1']).isin(hit['genes'].iloc[0]))
else:
    print('enrichment not found')

In [None]:
# Look up the GO term 'cellular response to interferon-gamma' among the non-WT
# groups; if present, print its p-value(s) and whether the listed genes are
# contained in the gene set of the first hit.
hit = gos[
    (gos['group'] != 'WT')
    & gos['GO_name'].isin(['cellular response to interferon-gamma'])
]
if len(hit) > 0:
    print(hit['p_value'].astype(str))
    print(pd.Index(['Ccl6','Ccl9']).isin(hit['genes'].iloc[0]))
else:
    print('enrichment not found')

In [None]:
# Look up the GO term 'immune system process' among the non-WT groups; if
# present, print its p-value(s) and whether the listed genes are contained in
# the gene set of the first hit.
hit = gos[
    (gos['group'] != 'WT')
    & gos['GO_name'].isin(['immune system process'])
]
if len(hit) > 0:
    print(hit['p_value'].astype(str))
    print(pd.Index(['Ifitm1','Ifitm3']).isin(hit['genes'].iloc[0]))
else:
    print('enrichment not found')

In [None]:
# Heatmap of the selected genes in the stem cells, grouped by CoarseState,
# with a hand-picked set of genes highlighted.
fig = tc.pl.annotated_heatmap(
    stem_data[:,genes],
    'CoarseState',
    var_highlight=[
        'Ifitm1', 'Ifitm3', 'B2m', 'Pglyrp1', 'Nkd1', 'Wif1', 'Lcn2', 'Vim', 'H2-D1',
        'Gapdh', 'Notum', 'Ifitm2', 'Actg1', 'H2-K1', 'Lgals3', 'H2-Q7', 'S100a8', 'Stxbp1',
    ],
    axsize=(5,7),
    trafo=True,
)
fig.savefig(f'{figures_folder}/mouse_sc_epithelial_stem_marker.pdf', bbox_inches='tight')

## enrichments for Endo01 between states

In [None]:
# Marker genes and GO terms of the 'Endo01 (Vascular)' cluster, comparing cells
# with State 'normal' against all other cells.
sub_ref = reference[reference.obs['cluster'] == 'Endo01 (Vascular)'].copy()
# NOTE(review): the label 'dysplasic' is kept as is because it ends up in the
# exported tables; confirm whether 'dysplastic' was intended.
sub_ref.obs['normality'] = (
    (sub_ref.obs['State'] == 'normal')
    .map({True:'normal',False:'dysplasic'})
    .astype('category')
)
enr, gos = helper.marker_genes(
    sub_ref,
    'normality',
    goa_working_directory=f'{get_path("resources")}/goa',
)

In [None]:
# Export the marker-gene table (DEG) and the GO table (GO) as CSV files without the index.
for table, tag in ((enr, 'DEG'), (gos, 'GO')):
    table.to_csv(f'{figures_folder}/mouse_sc_Endo01_vs_normality_{tag}.csv', index=False)

# Epithelial programs

In [None]:
# Epithelial UMAP per state, with every state subsampled to the same number of cells.
sub_ref = reference[reference.obs['compartment'] == 'epithelial']
sub_col = { cat: cluster_colors[cat] for cat in sub_ref.obs['cluster'].unique() }
# split the data by state
state_splits = { state: sub_ref[df.index] for state,df in sub_ref.obs.groupby('State') }
# get smallest number of cells per state
# (min over an iterable also works when there is only a single state, whereas
# min(*[n]) would be called as min(n) with a bare int and raise a TypeError)
min_cells = min(adata.shape[0] for adata in state_splits.values())
# subsample the cells to the same number
state_splits = { state: adata[tc.utils.complete_choice(adata.obs.index, min_cells)] for state,adata in state_splits.items() }
# and plot them in the same plot
short_cluster = pd.Series(pd.Index(cluster_colors.keys()).map(helper.map_short), index=cluster_colors.keys(), )
fig = tc.pl.scatter(state_splits,'cluster', position_key='X_umap_Epithelial', joint=True, colors=sub_col, sharex=True, sharey=True, point_size=7, on_data_legend=short_cluster);
fig.savefig(f'{figures_folder}/mouse_sc_epithelial_umap_cluster_by_state.pdf',bbox_inches='tight')

In [None]:
# Pairwise comparison of log_frac_tdTomato1p between the selected epithelial
# clusters, restricted to the given flowcells and malignant states.
flowcells = ['HY2Y2CCXY','H55F7CCX2']
states = ['malignant (3weeks)','malignant (9weeks)']
compartments = ['epithelial']
ref_groups = [
    'Epi01 (Dysplastic Stem Like)',
    'Epi02 (Enterocytes)',
    'Epi03 (Stem/Progenitors)',
    'Epi04 (Secretory)',
    'Epi05 (Dysplastic Secretory Like)',
]
sub_ref = reference[reference.obs['Flowcell'].isin(flowcells) & reference.obs['State'].isin(states) & reference.obs['compartment'].isin(compartments) & reference.obs['cluster'].isin(ref_groups)]
enrichments = []
for ref_group in ref_groups:
    # Welch test of every cluster against the current reference cluster.
    enr = tc.tl.enrichments(sub_ref, 'log_frac_tdTomato1p', 'cluster', reference_group=ref_group, method='welch')
    # Drop the "... VS rest" rows; copy so that the column assignments below
    # act on an independent frame rather than on a slice of the result.
    enr = enr[~enr['cluster'].str.endswith('VS rest')].copy()
    # Keep only the part before ' VS '. `n` is passed by keyword because
    # recent pandas versions no longer accept it positionally.
    enr['cluster'] = enr['cluster'].str.split(' VS ', n=1, expand=True)[0]
    # The first column is overwritten with the label of the reference cluster.
    enr.iloc[:,0] = f'VS {ref_group}'
    enrichments.append(enr)
enrichments = pd.concat(enrichments,axis=0)
# Fix the display order of reference labels and clusters.
enrichments['log_frac_tdTomato1p'] = enrichments['log_frac_tdTomato1p'].astype(pd.CategoricalDtype([f'VS {ref_group}' for ref_group in ref_groups],ordered=True))
enrichments['cluster'] = enrichments['cluster'].astype(pd.CategoricalDtype(ref_groups,ordered=True))
# Benjamini-Hochberg correction across all pairwise comparisons.
enrichments['p_welch_fdr_bh'] = statsmodels.stats.multitest.multipletests(enrichments['p_welch'], alpha=0.05, method='fdr_bh')[1]
fig = tc.pl.significances(enrichments, p_key='p_welch_fdr_bh', group_key='log_frac_tdTomato1p', value_key='cluster');#, value_key='tdTomato', group_key='cluster');
fig.get_axes()[0].set_title('tdTomato enrichment')
fig.savefig(f'{figures_folder}/mouse_sc_epithelial_enrichment_tdTomato_vs_cluster.pdf',bbox_inches='tight')

In [None]:
# Violin plots of selected genes per epithelial cluster.
# A copy is taken because tc.utils.log1p is applied to the subset.
sub_ref = reference[reference.obs['compartment'] == 'epithelial'].copy()
tc.utils.log1p(sub_ref)
genes_of_interest = ['Reg4', 'Muc2', 'Mmp7', 'Ifitm3', 'Ccl9']
fig, axs = tc.pl.subplots(len(genes_of_interest), axsize=(5, 3))
# One axes per gene.
for gene, ax in zip(genes_of_interest, axs.flatten()):
    sc.pl.violin(
        sub_ref,
        keys=[gene],
        groupby='cluster_short',
        ax=ax,
        show=False,
        stripplot=False,
        rotation=90,
    )
fig.savefig(f'{figures_folder}/mouse_sc_epithelial_marker_violin.pdf', bbox_inches='tight')

In [None]:
# Violin plot of Ccr1 per immune cluster.
# A copy is taken because tc.utils.log1p is applied to the subset.
sub_ref = reference[reference.obs['compartment'] == 'immune'].copy()
tc.utils.log1p(sub_ref)
genes_of_interest = ['Ccr1']
fig, axs = tc.pl.subplots(len(genes_of_interest), axsize=(8, 3))
# One axes per gene.
for gene, ax in zip(genes_of_interest, axs.flatten()):
    sc.pl.violin(
        sub_ref,
        keys=[gene],
        groupby='cluster_short',
        ax=ax,
        show=False,
        stripplot=False,
        rotation=90,
    )
fig.savefig(f'{figures_folder}/mouse_sc_immune_marker_violin.pdf', bbox_inches='tight')

In [None]:
# For each selected program: epithelial UMAPs colored by the program value and
# by the log1p expression of a few genes.
# Work on a real copy, as done for the other modified subsets in this notebook:
# .obsm is written to below, which should not happen on a view of `reference`.
sub_ref = reference[reference.obs['compartment'] == 'epithelial'].copy()
for short, program, genes_of_interest in [
    ['prog2', 'Program 2 (Enteroendocrine)', ['Chga','Chgb','Scg5',], ],
    ['prog5', 'Program 5 (Transmembrane transport/Basolateral plasma membrane)', ['Slc6a14','Slc5a8',], ],
    ['prog8', 'Program 8 (Apical plasma membrane)', ['Car4','Clca4b',], ],
    ['prog4', 'Program 4 (Wnt signaling)', ['Wnt6','Wnt10a','Lef1',], ],
    ['prog14', 'Program 14 (Angiogenesis)', ['Flt1','Adam8','Pgf','Hbegf'], ],
    ['prog6', 'Program 6 (Inflammatory response)', ['Il1rn','Itgb6','Itgav','Cxcl5'], ],
    ['prog7', 'Program 7 (Innate immune response)', ['Ifit2','Ifit3','Gbp5','Gbp7',], ],
]:
    # Temporary table under the obsm key '': log1p expression of the genes.
    # The first gene is selected twice so that the frame already has a leading
    # column named after the program; that column is replaced right below.
    sub_ref.obsm[''] = pd.DataFrame(np.log1p(sub_ref[:,[genes_of_interest[0],*genes_of_interest]].X.toarray()),columns=[program,*genes_of_interest], index=sub_ref.obs.index)
    sub_ref.obsm[''][program] = sub_ref.obsm['Epithelial_programs'][program]
    # One panel for the program plus one per gene.
    fig,axs = tc.pl.subplots(len(genes_of_interest)+1, axsize=(4,3), x_padding=1.0)
    tc.pl.scatter(sub_ref, '', position_key='X_umap_Epithelial', ax=axs.T, joint=False, cmap='viridis', noticks=True)
    fig.savefig(f'{figures_folder}/mouse_sc_epithelial_umap_{short}.pdf',bbox_inches='tight')

In [None]:
# Enrichment of epithelial programs across states, with 'normal' as reference group.
enrichments = tc.tl.enrichments(
    reference, 'Epithelial_programs', 'State',
    sample_key='SampleID',
    **enrichment_method,
    reference_group='normal',
    reads=True,
)

# Order the programs by their p-values in the "normal VS rest" comparison:
# ascending p-value of the 'enriched' rows first, then descending p-value of
# the remaining rows.
sorting_enr = (
    enrichments
    .query('State=="normal VS rest" & enrichment=="enriched"')
    .set_index('Epithelial_programs')[p_key]
)
sorting_dep = (
    enrichments
    .query('State=="normal VS rest" & enrichment!="enriched"')
    .set_index('Epithelial_programs')[p_key]
)
program_order = (
    pd.DataFrame({'enr': sorting_enr, 'dep': sorting_dep})
    .sort_values(['enr','dep'], ascending=[True,False])
    .index.to_numpy()
)
enrichments['Epithelial_programs'] = enrichments['Epithelial_programs'].astype(
    pd.CategoricalDtype(categories=program_order, ordered=True)
)

fig = tc.pl.significances(enrichments, p_key, 'Epithelial_programs', 'State')

fig.savefig(f'{figures_folder}/mouse_sc_epithelial_enrichment_program_vs_state.pdf', bbox_inches='tight')