# Score mouse regions in human single-cell (sc) data

In [None]:
import warnings
warnings.filterwarnings('ignore','invalid value encountered in true_divide')

import pandas as pd
import numpy as np
import anndata as ad
import scipy

import scanpy as sc
import tacco as tc

from matplotlib.colors import LinearSegmentedColormap

In [None]:
import sys
# Make helper functions available: The notebook expects to be executed either in the sub-workflow directory or in the notebooks directory
# Both candidate directories are inserted at position 1 of sys.path only for the duration of the import;
# because of the insertion order, '../workflow/' ends up ahead of '../' in the search path.
sys.path.insert(1, '../'), sys.path.insert(1, '../workflow/'); # prefer to look just one directory up
import helper
# Remove the two temporary entries again so sys.path is left as it was found
sys.path.pop(1), sys.path.pop(1);

# get_path is called below with location names such as "resources", "data" and 'plots'
# (presumably resolving them to directories of the 'human_sc' sub-workflow - confirm in helper.get_paths)
get_path = helper.get_paths('human_sc')
# All figures and intermediate CSV files of this notebook are written below this folder
figures_folder = get_path('plots')

# settings

## analysis settings

In [None]:
# Keyword arguments that are unpacked into the tc.tl.enrichments calls further down
enrichment_method = dict(
    reduction='sum',
    normalization='clr',
    method='welch',
    assume_counts=True,
)
# Name of the p-value column used below; it is derived from the selected method
p_key = 'p_{}_fdr_bh'.format(enrichment_method['method'])

# Load data and convert the human data into mouse gene space

In [None]:
# Load the mouse scRNA-seq reference.
# `ad.read` is a deprecated alias of `ad.read_h5ad`; the explicit reader is used so the cell keeps working on current anndata releases.
mouse_scrna = ad.read_h5ad(f'{get_path("resources","mouse_sc")}/scRNAseq.h5ad')

In [None]:
# Load the mouse Slide-seq data, once as a whole and once split by compartment.
# `ad.read` is a deprecated alias of `ad.read_h5ad`; the explicit reader is used so the cell keeps working on current anndata releases.
mouse_pucks = ad.read_h5ad(f'{get_path("resources","mouse_slideseq")}/slideseq.h5ad')
mouse_pucks_by_compartment = ad.read_h5ad(f'{get_path("resources","mouse_slideseq")}/slideseq_by_compartment.h5ad')

In [None]:
# Load the human dataset.
# `ad.read` is a deprecated alias of `ad.read_h5ad`; the explicit reader is used so the cell keeps working on current anndata releases.
pelka = ad.read_h5ad(f'{get_path("data")}/Pelka.h5ad')
pelka = pelka[pelka.obs['PROCESSING_TYPE'].isin(['unsorted'])].copy() # subset to "unbiased" cell sampling

In [None]:
# Set up tacco's orthology converter from the MGI mouse/human homology report file; must run before run_orthology_converter is called
tc.tl.setup_orthology_converter(f'{get_path("resources")}/MGI/HOM_MouseHumanSequence.rpt');

In [None]:
# Convert the human data from 'human' to 'mouse' gene space, passing the gene index of the mouse scRNA-seq reference as target genes
pelka2mouse = tc.tl.run_orthology_converter(pelka, 'human', 'mouse', mouse_scrna.var.index, use_synonyms=True)

In [None]:
# Keep only beads with a total count of at least 100 ...
bead_totals = mouse_pucks.X.sum(axis=1)
mouse_pucks = mouse_pucks[bead_totals>=100].copy()
# ... and restrict the per-compartment data to the beads that survived (matched through its 'index' column)
kept_beads = mouse_pucks_by_compartment.obs['index'].isin(mouse_pucks.obs.index)
mouse_pucks_by_compartment = mouse_pucks_by_compartment[kept_beads].copy()

In [None]:
# Split every puck ('SampleID') spatially with a (2,2) scheme; the result is stored under 'SampleID_split'
tc.utils.split_spatial_samples(
    mouse_pucks,
    buffer_thickness=400,
    split_scheme=(2,2),
    sample_key='SampleID',
    result_key='SampleID_split',
    check_splits=False,
)
# Carry the split assignment over to the per-compartment data via its 'index' column
bead_ids = mouse_pucks_by_compartment.obs['index']
mouse_pucks_by_compartment.obs['SampleID_split'] = bead_ids.map(mouse_pucks.obs['SampleID_split'])

# compare TNK composition between human and mouse

In [None]:
# Select the T/NK cells of both species
is_tnk_mouse = mouse_scrna.obs['labels'].isin(['TNK'])
is_tnkilc_human = pelka2mouse.obs['clTopLevel'].isin(['TNKILC'])
tnk_mouse = mouse_scrna[is_tnk_mouse].copy()
tnkilc_human = pelka2mouse[is_tnkilc_human].copy()
# Annotate the mouse cells with the human 'cl295v11SubFull' annotation, using the human cells as reference
tc.tl.annotate(
    tnk_mouse,
    tnkilc_human,
    'cl295v11SubFull',
    result_key='cl295v11SubFull',
    method='OT',
    assume_valid_counts=True,
    bisections=4,
    max_annotation=1,
    verbose=0,
);

In [None]:
# Make the human annotation categorical before it is used in get_contributions
tnkilc_human.obs['cl295v11SubFull'] = tnkilc_human.obs['cl295v11SubFull'].astype('category')

# Contribution tables of the 'cl295v11SubFull' annotation, called with 'TMMR' and sample_key='PID' for human ...
# NOTE(review): the variables are named clr_* although normalization='sum' is passed - confirm this is intended
clr_human_contr = tc.tl.get_contributions(tnkilc_human,'cl295v11SubFull','TMMR',sample_key='PID',reduction='sum',normalization='sum', reads=False, assume_counts=True)

# ... and with 'State' and sample_key='SampleID' for mouse
clr_mouse_contr = tc.tl.get_contributions(tnk_mouse,'cl295v11SubFull','State',sample_key='SampleID',reduction='sum',normalization='sum', reads=False, assume_counts=True)

# Stack both tables; the dict keys become the outermost index level (the species)
clr_contr = pd.concat({'mouse':clr_mouse_contr,'human':clr_human_contr,})

# Wrap the stacked table in an AnnData so that scanpy/tacco tools can be applied to its rows
clr_contr_ad = ad.AnnData(clr_contr.to_numpy(),)

# Index level 0 is the species (from the concat keys above), level 1 is stored as 'State'
clr_contr_ad.obs['species'] = clr_contr.index.get_level_values(level=0).astype('category')
clr_contr_ad.obs['State'] = clr_contr.index.get_level_values(level=1).astype('category')
# Collapse the three listed mouse states into 'dysplastic'; the merged annotation is stored as 'cState'
tc.utils.merge_annotation(clr_contr_ad, 'State', {'dysplastic':['premalignant','malignant (3weeks)','malignant (9weeks)']}, result_key='cState')
# Overwrite 'State' with a combined "<cState> (<species>)" label
clr_contr_ad.obs['State'] = (clr_contr_ad.obs['cState'].astype(str) + ' (' + clr_contr_ad.obs['species'].astype(str) + ')').astype('category')

# Neighbourhood graph and UMAP embedding of the rows of the contribution table
sc.pp.neighbors(clr_contr_ad)

sc.tl.umap(clr_contr_ad)

# Scatter plot of the embedding, coloured by the combined state/species label
fig = tc.pl.scatter(clr_contr_ad,keys=['State',],position_key='X_umap',point_size=20,axsize=0.3,joint=True,margin=0.1,axes_labels=['UMAP1','UMAP2'], padding=0.7, noticks=False);
fig.get_axes()[0].set_title('embedding samples wrt. TNK compositions');

fig.savefig(f'{figures_folder}/human_sc_tnk_composition_umap.pdf',bbox_inches='tight')

In [None]:
# Parameters of the co-occurrence analysis on the UMAP coordinates
max_distance = 2
n_permutation = 100

# The same key is used as annotation and as center, i.e. 'State' is compared against itself
what = 'State'
center = 'State'

# Key under which the result is stored and from which the plot reads it
analysis_key=f'{what}-{center}'
tc.tl.co_occurrence_matrix(adata=clr_contr_ad, annotation_key=what, center_key=center,
    distance_key=None, position_key='X_umap', max_distance=max_distance, sparse=False,
    result_key=analysis_key, verbose=0,
    n_permutation=n_permutation,reads=False,
)

# Plot the 'z' score of the analysis with a symmetric colour range of (-5,5)
fig = tc.pl.co_occurrence_matrix({'':clr_contr_ad}, score_key='z', cmap_vmin_vmax=(-5,5), cmap='bwr', analysis_key=analysis_key, y_padding=3.5,axsize=(2,2),);

fig.get_axes()[0].set_title('z-score on neighbourships in TNK composition UMAP');

fig.savefig(f'{figures_folder}/human_sc_tnk_composition_occ.pdf',bbox_inches='tight')

# Epithelial program similarity between human and mouse

In [None]:
# For each human program set: load the gene weights, translate them to mouse genes, and correlate them with the mouse epithelial programs
for program_name, filename in [('pEpi','EpiTGlobalv5ForceK43'),('pEpiTp','EpiTMSSv4ForceK32'),('pEpiTd','EpiTMSIv4ForceK29')]:
    program_file = f'{get_path("resources")}/Pelka/ccNMF_GeneWeight_Wmat/ccNMF_RawWeights_{filename}.csv.gz'
    
    weights = pd.read_csv(program_file, index_col=0)
    # drop rows whose numeric columns sum to zero or less
    weights = weights.loc[weights.select_dtypes(include=np.number).sum(axis=1)>0]
    # keep only index labels that occur exactly once
    weights = weights.loc[weights.index.value_counts()[weights.index.value_counts() == 1].index]
    
    ## Translate human programs to mouse genes
    pr2m = tc.tl.run_orthology_converter(ad.AnnData(weights.T, dtype=np.float32), 'human', 'mouse', mouse_scrna.var.index, use_synonyms=True)
    # transpose back: rows are taken from pr2m.var (genes), columns from pr2m.obs (programs)
    rhp = pd.DataFrame(pr2m.X.T, index=pr2m.var.index, columns=pr2m.obs.index)

    # NOTE(review): the two subsets do not depend on the loop variables but are recomputed in every iteration
    pelka2mouse_sub = pelka2mouse.query(f'compartment=="epithelial"')
    ref_sub = mouse_scrna.query(f'compartment=="epithelial"')

    min_counts=100
    (pelka2mouse_sub,ref_sub,) = tc.pp.filter((pelka2mouse_sub,ref_sub,),min_counts_per_gene=min_counts,assume_valid_counts=True,)
    # mouse program weights without rows containing missing values
    rmp = ref_sub.varm['Epithelial_programs'].loc[~ref_sub.varm['Epithelial_programs'].isna().any(axis=1)]
    # restrict everything to the genes present in both the mouse and the translated human programs
    good_genes = rmp.index.intersection(rhp.index)
    pelka2mouse_sub = pelka2mouse_sub[:,good_genes]
    ref_sub = ref_sub[:,good_genes]
    rhp = rhp.loc[good_genes].copy()
    rmp = rmp.loc[good_genes].copy()
    # scale every program (column) to a total of 1e4, then log1p-transform
    rhp /= (rhp.sum(axis=0).to_numpy() / 1e4)
    rmp /= (rmp.sum(axis=0).to_numpy() / 1e4)
    rhp = np.log1p(rhp)
    rmp = np.log1p(rmp)

    # subtract a background per species: the summed expression over the subset, scaled and log1p-transformed like the programs
    background = tc.sum(ref_sub.X,axis=0)
    background /= background.sum() / 1e4
    background = np.log1p(background)
    rmp -= background[:,None]
    background = tc.sum(pelka2mouse_sub.X,axis=0)
    background /= background.sum() / 1e4
    background = np.log1p(background)
    rhp -= background[:,None]

    # correlation (1 - correlation distance) between all programs; the slice keeps mouse programs as rows and human programs as columns
    concated = pd.concat([rhp,rmp],axis=1)
    s_corr = 1-tc.utils.cdist(concated.T,metric="correlation")[rhp.shape[1]:,:rhp.shape[1]]
    s_corr = pd.DataFrame(s_corr, index=rmp.columns, columns=rhp.columns)
    # keep only programs that have at least one correlation above 0.4
    s_corr = s_corr.loc[(s_corr>0.4).any(axis=1),(s_corr>0.4).any(axis=0)]
    # NOTE(review): n_corr is not used anywhere in the visible code
    n_corr = s_corr / s_corr.max(axis=0).to_numpy()

    cmap_br = LinearSegmentedColormap.from_list('cmap_br', ['#4d85b5', '#ffffff', '#ff1217'])
    fig,axs = tc.pl.subplots(axsize=[0.25*s_corr.shape[0],0.25*s_corr.shape[1]])
    tc.pl.heatmap(s_corr,None,None,cmap=cmap_br,cmap_center=None, ax=axs[0,0], value_order='diag', group_labels_rotation=45);
    axs[0,0].set_title(f'correlation between human and mouse programs')
    fig.savefig(f'{figures_folder}/human_sc_{program_name}_similarity_with_mouse_epi_programs.pdf',bbox_inches='tight')

# human program associations across human and mouse samples

In [None]:
# One row per human program set: query strings selecting the matching mouse and human cells, plus the tag of the weight file
_spec_columns = ['program_name','selector_mouse','selector_human','filename']
_spec_rows = [
    ['pEpiTd', 'labels=="Epi"', 'clTopLevel=="Epi"', 'EpiTMSIv4ForceK29'],
    ['pEpiTp', 'labels=="Epi"', 'clTopLevel=="Epi"', 'EpiTMSSv4ForceK32'],
    ['pTNI', 'labels=="TNK"', 'clTopLevel=="TNKILC"', 'T'],
    ['pM', 'labels=="Mono" | labels=="Mac"', 'clTopLevel=="Myeloid"', 'Myeloid'],
]
program_specs = pd.DataFrame(np.array(_spec_rows), columns=_spec_columns)
program_specs = program_specs.set_index('program_name')

In [None]:
# Annotate the mouse cells with every human program set listed in program_specs; results are collected per program set
prog_maps_mouse = {}
# NOTE(review): prog_maps_human is never filled or read in the visible code
prog_maps_human = {}
for program_name in program_specs.index:
    print(program_name)
    selector_mouse = program_specs.loc[program_name,'selector_mouse']
    selector_human = program_specs.loc[program_name,'selector_human']
    filename = program_specs.loc[program_name,'filename']
    
    program_file = f'{get_path("resources")}/Pelka/ccNMF_GeneWeight_Wmat/ccNMF_RawWeights_{filename}.csv.gz'
    
    weights = pd.read_csv(program_file, index_col=0)
    # drop rows whose numeric columns sum to zero or less
    weights = weights.loc[weights.select_dtypes(include=np.number).sum(axis=1)>0]
    # keep only index labels that occur exactly once
    weights = weights.loc[weights.index.value_counts()[weights.index.value_counts() == 1].index]
    
    ## Translate human programs to mouse genes
    pr2m = tc.tl.run_orthology_converter(ad.AnnData(weights.T, dtype=np.float32), 'human', 'mouse', mouse_scrna.var.index, use_synonyms=True)
    # transpose back: rows are taken from pr2m.var (genes), columns from pr2m.obs (programs)
    rhp = pd.DataFrame(pr2m.X.T, index=pr2m.var.index, columns=pr2m.obs.index)

    # the human subset is copied because its varm is written to below; the mouse subset is only passed on
    pelka2mouse_sub = pelka2mouse.query(selector_human).copy()
    ref_sub = mouse_scrna.query(selector_mouse)

    # attach the translated program weights to the human subset under the program set's name
    pelka2mouse_sub.varm[program_name] = rhp
    
    prog_names = pelka2mouse_sub.varm[program_name].columns
    n_progs = len(prog_names)
    # use flat prior
    annotation_prior=pd.Series(np.full(n_progs,1/n_progs),index=prog_names)
    
    # annotate the mouse subset, with the human subset (carrying the weights in varm) as reference
    prog_maps_mouse[program_name] = tc.tl.annotate(ref_sub, pelka2mouse_sub, program_name, method='OT', assume_valid_counts=True,bisections=4,min_counts_per_cell=50,verbose=0, annotation_prior=annotation_prior)

In [None]:
# clr-normalized program contributions per 'SampleID', computed on the cells selected by 'State != "normal"'
prog_comps_mouse = {}
for key, mapped in prog_maps_mouse.items():
    # store the mapping on the full mouse dataset, aligned to its observation index
    mouse_scrna.obsm[key] = mapped.reindex(mouse_scrna.obs.index)
    not_normal = mouse_scrna.query('State != "normal"')
    prog_comps_mouse[key] = tc.tl.get_contributions(
        not_normal,
        key,
        'SampleID',
        reads=False,
        normalization='clr',
    )

In [None]:
# Join the per-program-set tables side by side; this rebinds prog_comps_mouse from a dict to a single DataFrame
prog_comps_mouse = pd.concat(prog_comps_mouse.values(), axis=1, )
# Collection of composition tables; only the mouse entry is filled in the visible code
all_prog_comps = {'mouse_not_normal':prog_comps_mouse,}

# region-related analysis across human and mouse samples

In [None]:
# Split each dataset into independent copies, one per value of its 'compartment' annotation
## Get human data (converted to mouse gene space)
compartment_pelka2mouse = { comp: pelka2mouse[df.index].copy() for comp, df in pelka2mouse.obs.groupby('compartment') }
## Get spatial data
compartment_pucks = { comp: mouse_pucks_by_compartment[df.index].copy() for comp, df in mouse_pucks_by_compartment.obs.groupby('compartment') }
## Get mouse reference
compartment_refs = { comp: mouse_scrna[df.index].copy() for comp, df in mouse_scrna.obs.groupby('compartment') }

In [None]:
##### Transfer region annotation
# Shared arguments: the 'region' annotation of the reference is transferred and stored under 'region_anno'
kw_args = {
    'method':'OT',
    'annotation_key':'region',
    'result_key':'region_anno',
    'verbose':0,
    'multi_center': 10,
    'bisections': 7,
    'assume_valid_counts': True,
}
# For every compartment the pucks of that compartment are the reference (second argument);
# %time is an IPython magic, so this cell only runs inside IPython/Jupyter
for c in compartment_pucks.keys():
    %time tc.tl.annotate(compartment_pucks[c], compartment_pucks[c], **kw_args, ); # validation within pucks
    %time tc.tl.annotate(compartment_pelka2mouse[c], compartment_pucks[c], **kw_args, );
    %time tc.tl.annotate(compartment_refs[c], compartment_pucks[c], **kw_args, );
##### Write results
# One CSV per dataset label and compartment, written from obsm['region_anno']
for label,datasets in zip(['pucks','reference','human'],[compartment_pucks,compartment_refs,compartment_pelka2mouse]):
    for c in compartment_pucks.keys():
        datasets[c].obsm['region_anno'].to_csv(f'{figures_folder}/{label}_region_anno_{c}.csv')

In [None]:
# Read per compartment region annotation
# NOTE(review): the 'human' annotation is skipped here, so for human the in-memory result is kept - confirm this is intended
for label,datasets in zip(['pucks','reference','human'],[compartment_pucks,compartment_refs,compartment_pelka2mouse]):
    if label == 'human':
        continue
    for c in compartment_pucks.keys():
        df = pd.read_csv(f'{figures_folder}/{label}_region_anno_{c}.csv')
        # the first CSV column holds the observation names and becomes the index again
        datasets[c].obsm['region_anno'] = df.set_index(df.columns[0])
# organize data
# specs has one row per dataset label ('pucks','mapped','mouse','human') with the columns
# 'datasets' (dict compartment -> AnnData, filled below), 'region_key', 'sample_key' and 'anno_key'
specs = pd.DataFrame([[{},{},{},{}],['region','region_anno','region_anno','region_anno',],['SampleID_split','SampleID_split','SampleID','PatientTypeID'],['State','State','State','TMMR']],index=['datasets','region_key','sample_key','anno_key'],columns=['pucks','mapped','mouse','human']).T
# 'pucks' and 'mapped' point to the same puck objects and differ only in the region key ('region' vs 'region_anno')
for k in compartment_pucks.keys():
    specs.loc['pucks','datasets'][k] = compartment_pucks[k]
    specs.loc['mapped','datasets'][k] = compartment_pucks[k]
    specs.loc['mouse','datasets'][k] = compartment_refs[k]
    specs.loc['human','datasets'][k] = compartment_pelka2mouse[k]

In [None]:
def define_region_ordering(enrichments, p_value_key=None):
    """Order regions by their median signed log significance in the 'normal' states.

    Parameters
    ----------
    enrichments
        Long-format table with the columns 'region', 'State', 'enrichment'
        and a p-value column. Rows with `enrichment == 'enriched'` are
        treated as enrichment p-values, all other rows as the opposite
        direction.
    p_value_key
        Name of the p-value column. If `None` (default), the notebook-level
        setting `p_key` is used, which keeps existing calls unchanged.

    Returns
    -------
    A numpy array of region labels, sorted ascending by the median (over all
    'State' columns whose name starts with 'normal') of the signed log
    significance: `-log(p_enriched)` where the enrichment p-value is the
    smaller one, `log(p_other)` otherwise.
    """
    if p_value_key is None:
        p_value_key = p_key
    ### sort regions according to median signed log significance
    # keyword arguments are required: pandas >= 2.0 no longer accepts index/columns/values positionally
    enr_e = pd.pivot(enrichments[enrichments['enrichment']=='enriched'], index='region', columns='State', values=p_value_key)
    enr_p = pd.pivot(enrichments[enrichments['enrichment']!='enriched'], index='region', columns='State', values=p_value_key)
    # clip p-values from below so that the logarithm stays finite for p == 0
    small_value = 1e-300
    enr_e = np.maximum(enr_e,small_value)
    enr_p = np.maximum(enr_p,small_value)
    # align both tables to identical labels before the element-wise comparison
    enr_p = enr_p.reindex_like(enr_e)
    enr = pd.DataFrame(np.where(enr_e < enr_p, -np.log(enr_e), np.log(enr_p)),index=enr_e.index,columns=enr_e.columns)
    normal_columns = enr.columns[enr.columns.str.startswith('normal')]
    return enr[normal_columns].median(axis=1).sort_values().index.to_numpy()

In [None]:
# Region-vs-state enrichment for the selected dataset labels, with one figure per label covering all compartments
selected_labels = ['human']
# NOTE(review): regions_dtype is built here but not used anywhere in the visible code
regions_dtype = specs.loc['pucks','datasets']['epithelial'].obs[specs.loc['pucks','region_key']].dtype
regions_dtype = pd.CategoricalDtype(categories=regions_dtype.categories, ordered=True)
for label in selected_labels:
    enrichments = []
    for compartment in compartment_pucks.keys():

        # look up the dataset and the keys configured for this label in specs
        dataset = specs.loc[label,'datasets'][compartment]
        region_key = specs.loc[label,'region_key']
        sample_key = specs.loc[label,'sample_key']
        anno_key = specs.loc[label,'anno_key']

        dataset_enrichments = tc.tl.enrichments(dataset,region_key,anno_key,sample_key=sample_key,**enrichment_method,reference_group='normal',reads=True,)

        # harmonize column names across datasets: 'State' is tagged with the compartment, 'region' copies the region column
        dataset_enrichments['State'] = dataset_enrichments[anno_key].map(lambda x: f'{x} ({compartment})')
        dataset_enrichments['region'] = dataset_enrichments[region_key]
        
        enrichments.append(dataset_enrichments)

    enrichments = pd.concat(enrichments)
    # states in alphabetical order; regions in the order returned by define_region_ordering
    enrichments['State'] = enrichments['State'].astype(pd.CategoricalDtype(categories=sorted(enrichments['State'].unique()), ordered=True))
    enrichments['region'] = enrichments['region'].astype(pd.CategoricalDtype(define_region_ordering(enrichments), ordered=True))
    
    
    # only the variant without p-value annotation is produced (file name suffix "_plain")
    for annotate_pvalues in [False]:
        fig = tc.pl.significances(enrichments, p_key, 'region', 'State', annotate_pvalues=annotate_pvalues, value_cluster=False);
        
        fig.savefig(f'{figures_folder}/human_sc_enrichment_region_vs_State_per_compartment_{label}{"" if annotate_pvalues else "_plain"}.pdf',bbox_inches='tight')

In [None]:
# Region-vs-state enrichment with one figure per compartment, comparing all dataset labels side by side
selected_labels = ['pucks','mapped','mouse','human']
for compartment in compartment_pucks.keys():
    enrichments = []
    for label in selected_labels:

        # look up the dataset and the keys configured for this label in specs
        dataset = specs.loc[label,'datasets'][compartment]
        region_key = specs.loc[label,'region_key']
        sample_key = specs.loc[label,'sample_key']
        anno_key = specs.loc[label,'anno_key']

        dataset_enrichments = tc.tl.enrichments(dataset,region_key,anno_key,sample_key=sample_key,**enrichment_method,reference_group='normal',reads=True)

        # harmonize column names across datasets: 'State' is tagged with the dataset label, 'region' copies the region column
        dataset_enrichments['State'] = dataset_enrichments[anno_key].map(lambda x: f'{x} ({label})')
        dataset_enrichments['region'] = dataset_enrichments[region_key]
        
        enrichments.append(dataset_enrichments)

    enrichments = pd.concat(enrichments)
    # fixed display order of the states; values not listed here become missing in the categorical column
    enrichments['State'] = enrichments['State'].astype(pd.CategoricalDtype(categories=[ 'normal VS rest (pucks)', 'normal VS rest (mapped)', 'normal VS rest (mouse)', 'normal VS rest (human)', 'premalignant VS normal (pucks)', 'premalignant VS normal (mapped)', 'premalignant VS normal (mouse)', 'malignant (3weeks) VS normal (mouse)', 'malignant (9weeks) VS normal (mouse)', 'MMRd VS normal (human)', 'MMRp VS normal (human)', ], ordered=True))
    enrichments['region'] = enrichments['region'].astype(pd.CategoricalDtype(define_region_ordering(enrichments), ordered=True))

    # only the variant without p-value annotation is produced (file name suffix "_plain")
    for annotate_pvalues in [False]:
        fig = tc.pl.significances(enrichments, p_key, 'region', 'State', annotate_pvalues=annotate_pvalues, value_cluster=False);
    
        fig.savefig(f'{figures_folder}/human_sc_enrichment_region_vs_State_per_dataset_{compartment}{"" if annotate_pvalues else "_plain"}.pdf',bbox_inches='tight')

In [None]:
# Annotate the human epithelial cells with the mouse 'Epithelial_programs', using the mouse epithelial cells as reference
epi_mouse = mouse_scrna[mouse_scrna.obs['labels'].isin(['Epi'])]
epi_human = pelka2mouse[pelka2mouse.obs['clTopLevel'].isin(['Epi'])].copy()
tc.tl.annotate(epi_human, epi_mouse, 'Epithelial_programs', result_key='Epithelial_programs', method='OT', bisections=4, assume_valid_counts=True,verbose=0);

# Program enrichment per species: human against 'TMMR' (samples 'PID'), mouse against 'State' (samples 'SampleID')
human_enrichments = tc.tl.enrichments(epi_human,'Epithelial_programs','TMMR',sample_key='PID',**enrichment_method,reads=True,)
mouse_enrichments = tc.tl.enrichments(epi_mouse,'Epithelial_programs','State',sample_key='SampleID',**enrichment_method,reads=True,)
# Common 'State' column, tagged with the species
human_enrichments['State'] = human_enrichments['TMMR'].map(lambda x: f'{x} (human)')
mouse_enrichments['State'] = mouse_enrichments['State'].map(lambda x: f'{x} (mouse)')
enrichments = pd.concat([mouse_enrichments,human_enrichments])
# Display order: both normal states first, then the remaining human states, then the remaining mouse states
# (relies on the mapped 'State' columns still being categorical, see the .cat accessor)
state_order = [f'normal (human)',f'normal (mouse)']
for state in human_enrichments['State'].cat.categories:
    if state != 'normal (human)':
        state_order.append(f'{state}')
for state in mouse_enrichments['State'].cat.categories:
    if state != 'normal (mouse)':
        state_order.append(f'{state}')
enrichments['State'] = enrichments['State'].astype(pd.CategoricalDtype(categories=state_order, ordered=True))

# Order the programs by their p-values in 'normal (mouse)': ascending in the 'enriched' rows, ties broken by descending p-value of the other rows
sorting_enr = enrichments.query(f'State=="normal (mouse)"&enrichment=="enriched"').set_index('Epithelial_programs')[p_key]
sorting_dep = enrichments.query(f'State=="normal (mouse)"&enrichment!="enriched"').set_index('Epithelial_programs')[p_key]
program_order = pd.DataFrame({'enr':sorting_enr,'dep':sorting_dep}).sort_values(['enr','dep'],ascending=[True,False]).index.to_numpy()
enrichments['Epithelial_programs'] = enrichments['Epithelial_programs'].astype(pd.CategoricalDtype(categories=program_order, ordered=True))

fig = tc.pl.significances(enrichments, p_key, 'Epithelial_programs', 'State');
fig.savefig(f'{figures_folder}/human_sc_enrichment_epithelial_programs_vs_State_per_species.pdf',bbox_inches='tight')

# cell-type associations across samples

In [None]:
# Aliases only (no copies): annotations written to all_human below end up on pelka2mouse as well
all_mouse, all_human = mouse_scrna, pelka2mouse

In [None]:
# Annotate all human cells with the mouse 'cluster' annotation (lamb=1e-3); the result is stored as 'cluster1e-3'
tc.tl.annotate(all_human, all_mouse, 'cluster', result_key='cluster1e-3', method='OT', bisections=4, assume_valid_counts=True,verbose=1, lamb=1e-3);
# Merge the cluster annotation into 'labels1e-3'; the mapping lists, for every mouse label, the clusters that occur with it in all_mouse.obs
tc.utils.merge_annotation(all_human, annotation_key='cluster1e-3', result_key='labels1e-3', mapping=all_mouse.obs[['cluster','labels']].drop_duplicates().groupby('labels')['cluster'].agg(lambda x: list(x.to_numpy())));

In [None]:
# Derive 'mouseL' from the transferred labels; the mapping is empty, so no categories are explicitly merged here
for annoext in ['']:
    tc.utils.merge_annotation(all_human, annotation_key=f'labels1e-3{annoext}', result_key=f'mouseL{annoext}', mapping={
    });
    # store the maximum annotation per cell under the '_cat' key
    tc.utils.get_maximum_annotation(all_human, f'mouseL{annoext}', f'mouseL{annoext}_cat');
# Merge the human 'clMidwayPr' annotation into the coarser label names used on the mouse side; the result is stored as 'humanL'
tc.utils.merge_annotation(all_human, annotation_key='clMidwayPr', result_key='humanL', mapping={
    'B': ['B','Plasma'],
    'TNK': ['TCD4', 'NK', 'TCD8', 'TZBTB16', 'Tgd', 'ILC'],
    'Mono':['Mono', 'DC',],
    'Mac':['Macro'],
    'Gran':['Granulo'],
    'Endo':['Endo'],
    'Fibro':['Fibro','Peri','SmoothMuscle', 'Schwann'],
});

In [None]:
import matplotlib
# Diverging blue-white-red colormap used by the correlation heatmaps
cmap_br = matplotlib.colors.LinearSegmentedColormap.from_list('cmap_br', ['#4d85b5', '#ffffff', '#ff1217'])

# Make sure 'humanL' is categorical (a Series without the .cat accessor is converted)
if not hasattr(all_human.obs['humanL'], 'cat'):
    all_human.obs['humanL'] = all_human.obs['humanL'].astype('category')
# Labels shown in the heatmaps, in display order
good_labels = ['Epi','B','TNK','Mono','Mac','Gran','Mast','Endo','Fibro']

def compost(sub, norm, reads, labels, sample):
    """Return the composition or clr-contribution table of `labels` per `sample` for `sub`.

    `norm == 'clr'` selects `tc.tl.get_contributions(..., normalization='clr')`;
    every other value of `norm` selects `tc.tl.get_compositions`.
    `reads` is compared with the string 'reads' and the resulting boolean is
    forwarded as the `reads` flag, so any other value (e.g. 'cells' or False)
    means `reads=False`. `sub`, `labels` and `sample` are passed through
    positionally.
    """
    count_reads = (reads == 'reads')
    if norm == 'clr':
        return tc.tl.get_contributions(sub,labels,sample,reads=count_reads, normalization='clr')
    return tc.tl.get_compositions(sub,labels,sample,reads=count_reads)

# Correlation of label contributions across samples: human subsets in the annotation rows, mouse subsets in the last row
clustered = False
for norm in ['clr']:
    for reads in ['cells']:

        fig,axs = tc.pl.subplots(5,2,axsize=[0.3*len(good_labels),0.3*len(good_labels)],x_padding=1.5,y_padding=1)

        value_cluster,group_cluster=clustered,clustered

        # (title prefix, obs query or None for "no subsetting"), one entry per panel column
        human_panels = [
            ('all', None),
            ('normal', 'TMMR == "normal"'),
            ('not-normal', 'TMMR != "normal"'),
            ('MMRd', 'TMMR == "MMRd"'),
            ('MMRp', 'TMMR == "MMRp"'),
        ]
        # (title, obs query or None for "no subsetting"), one entry per panel column
        mouse_panels = [
            ('all mouse', None),
            ('normal mouse', 'State == "normal"'),
            ('not-normal mouse', 'State != "normal"'),
        ]

        for anno_i,anno in enumerate(['humanL',]):
            for col, (prefix, selector) in enumerate(human_panels):
                ax = axs[anno_i,col]
                ax.set_title(f'{prefix}: {anno}')
                subset = pelka2mouse if selector is None else pelka2mouse.query(selector)
                comps = compost(subset, norm=norm, reads=reads, labels=anno, sample='PID')
                # correlation = 1 - correlation distance between the label columns
                corr = 1-tc.utils.cdist(comps.T,metric="correlation")
                corr = pd.DataFrame(corr, index=comps.columns, columns=comps.columns)
                corr = corr.loc[good_labels,good_labels]
                tc.pl.heatmap(corr,None,None,cmap=cmap_br,cmap_center=0, ax=ax, value_cluster=value_cluster, group_cluster=group_cluster);

        for col, (title, selector) in enumerate(mouse_panels):
            ax = axs[-1,col]
            ax.set_title(title)
            subset = mouse_scrna if selector is None else mouse_scrna.query(selector)
            comps = compost(subset, norm=norm, reads=reads, labels='labels', sample='SampleID')
            corr = 1-tc.utils.cdist(comps.T,metric="correlation")
            corr = pd.DataFrame(corr, index=comps.columns, columns=comps.columns)
            corr = corr.loc[good_labels,good_labels]
            tc.pl.heatmap(corr,None,None,cmap=cmap_br,cmap_center=0, ax=ax, value_cluster=value_cluster, group_cluster=group_cluster);

        # the last row has only three panels; hide the two unused axes
        for col in (3, 4):
            axs[-1,col].axis('off')

        fig.savefig(f'{figures_folder}/human_sc_labels_associations_across_samples.pdf',bbox_inches='tight')

# program associations across samples

In [None]:
# Select the epithelial cells of both species and annotate the human cells with the mouse 'Epithelial_programs', using the mouse cells as reference
epi_mouse = mouse_scrna.query('labels=="Epi"')
epi_human = pelka2mouse[pelka2mouse.obs['clTopLevel'].isin(['Epi'])].copy()
tc.tl.annotate(epi_human, epi_mouse, 'Epithelial_programs', result_key='Epithelial_programs', method='OT', bisections=4, assume_valid_counts=True,verbose=0);

In [None]:
# Diverging blue-white-red colormap used by the correlation heatmaps
cmap_br = LinearSegmentedColormap.from_list('cmap_br', ['#4d85b5', '#ffffff', '#ff1217'])

# Keep only the programs whose name ends with ')'
progs = epi_mouse.obsm['Epithelial_programs'].columns
good_progs = progs[progs.str.endswith(')')]

# find clustering for mouse
# NOTE: this call runs before `compost` is redefined further down in this cell, so it uses whichever
# definition is currently in the namespace
comps = compost(epi_mouse.query('State != "normal"'), 'clr', reads=False, labels='Epithelial_programs', sample='SampleID')
corr = 1-tc.utils.cdist(comps.T,metric="correlation")
corr = pd.DataFrame(corr, index=comps.columns, columns=comps.columns)
corr = corr.loc[good_progs,good_progs]
# Hierarchical clustering of the correlation profiles; the dendrogram is only computed (no_plot=True) to obtain the leaf order
Z = scipy.cluster.hierarchy.linkage(corr.T, method='average', metric='cosine')
dn = scipy.cluster.hierarchy.dendrogram(Z, ax=None, orientation='right', color_threshold=0, above_threshold_color='tab:gray', no_plot=True)
# Without explicit labels the leaf labels in 'ivl' are positional indices given as strings.
# The builtin `int` replaces `np.int`, which was only an alias for it and was removed in NumPy 1.24.
reordering = pd.Series(dn['ivl']).astype(int).to_numpy()
good_progs = corr.iloc[reordering,reordering].columns.to_numpy()

def compost(sub, norm, reads, labels='clTopLevel', sample='PID'):
    """Return the composition or clr-contribution table of `labels` per `sample` for `sub`.

    Parameters
    ----------
    sub
        Dataset that is passed on to tacco.
    norm
        'frac' selects `tc.tl.get_compositions`, 'clr' selects
        `tc.tl.get_contributions(..., normalization='clr')`.
    reads
        Compared with the string 'reads'; the resulting boolean is forwarded
        as the `reads` flag, so any other value (e.g. 'cells' or False) means
        `reads=False`.
    labels, sample
        Annotation key and sample key, passed through positionally.

    Raises
    ------
    ValueError
        If `norm` is neither 'frac' nor 'clr'.
    """
    # A leftover debug print was removed here: it looked up the hard-coded column
    # 'Program 14 (Angiogenesis)' (KeyError for any other label set, including the
    # default `labels`) and computed the compositions a second time.
    reads = reads == 'reads'
    if norm == 'frac':
        return tc.tl.get_compositions(sub,labels,sample,reads=reads)
    elif norm == 'clr':
        return tc.tl.get_contributions(sub,labels,sample,reads=reads, normalization='clr')
    else:
        raise ValueError(f'unknown norm {norm!r}; expected "frac" or "clr"')

# Correlation of epithelial program contributions across samples, one panel per data subset in a single row
clustered = False

fig,axs = tc.pl.subplots(7,1,axsize=[0.3*len(good_progs),0.3*len(good_progs)],x_padding=0.5,y_padding=2, sharey=True)

value_cluster,group_cluster=clustered,clustered

# (panel title, dataset, obs query or None for "no subsetting", sample key)
panels = [
    ('all mouse', epi_mouse, None, 'SampleID'),
    ('normal mouse', epi_mouse, 'State == "normal"', 'SampleID'),
    ('not-normal mouse', epi_mouse, 'State != "normal"', 'SampleID'),
    ('all human', epi_human, None, 'PID'),
    ('normal human', epi_human, 'TMMR == "normal"', 'PID'),
    ('MMRd human', epi_human, 'TMMR == "MMRd"', 'PID'),
    ('MMRp only', epi_human, 'TMMR == "MMRp"', 'PID'),
]
last_col = len(panels) - 1

for col, (title, adata, selector, sample_key) in enumerate(panels):
    ax = axs[0,col]
    ax.set_title(title)
    subset = adata if selector is None else adata.query(selector)
    comps = compost(subset, 'clr', reads=False, labels='Epithelial_programs', sample=sample_key)
    # correlation = 1 - correlation distance between the program columns
    corr = 1-tc.utils.cdist(comps.T,metric="correlation")
    corr = pd.DataFrame(corr, index=comps.columns, columns=comps.columns)
    corr = corr.loc[good_progs,good_progs]
    # every panel except the last one is drawn with colorbar=False; the last one keeps the heatmap default
    colorbar_kw = {} if col == last_col else {'colorbar': False}
    tc.pl.heatmap(corr,None,None,cmap=cmap_br,cmap_center=0, ax=ax, value_cluster=value_cluster, group_cluster=group_cluster, **colorbar_kw);

fig.savefig(f'{figures_folder}/human_sc_program_associations_across_samples.pdf',bbox_inches='tight')