<font size="+3.8">Study-by-gene heatmap/dotplot</font>  
<font size="+1.5"></font>  

Aim: Combine and plot expression from processed scRNA-seq studies. For the processing, see the separate scripts.

In [None]:
from datetime import date
date.today().strftime('%d/%m/%Y')

In [None]:
import os
import getpass

# Record who ran the notebook. getpass.getuser() takes the name from the
# environment / password database, so it also works when the kernel has no
# controlling terminal (os.getlogin() raises OSError in that case).
getpass.getuser()

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
import glob
from pathlib import Path
import platform
import seaborn as sns
sc.settings.verbosity = 3

In [None]:
import utils

In [None]:
# Name of the active conda environment; falls back to a placeholder instead of
# raising KeyError when the kernel was not started from a conda environment.
os.environ.get('CONDA_DEFAULT_ENV', '<no conda environment active>')

In [None]:
platform.platform()

In [None]:
# Root folder of the project on the shared drive; its location depends on the OS.
if platform.system() == 'Windows':
    # UNC path. Raw string: the backslashes are literal, no invalid escape sequences.
    main_dir = r'\\isdsynnas.srv.med.uni-muenchen.de\BD-Dichgans\SF'
else:
    # Mount point used on non-Windows machines
    main_dir = '/Volumes/BD-Dichgans/SF'

In [None]:
# Study names, in the order handed to utils.clean_and_standardize_data for the
# analyses without EC zonation (presumably the display order in the plots — TODO confirm in utils).
study_order = ["OwnData", "Vanlandewijck2018", "Saunders2018", "Zeisel2018", 
        "TabulaMuris2018", "Winkler2022", "Yang2022", "Siletti2022", "Garcia2022"]
# Cell type labels, in the order handed to utils.clean_and_standardize_data.
celltype_order = ["Astrocytes", "Microglia/Mφ", "Oligodendrocytes", "OPCs", 
        "Endothelial cells", "Pericytes", "SMCs", "Fibroblasts", 
        "Neurons", "Neuroblasts/NSCs", "Ependymal cells"]   

In [None]:
# Subset of studies used for the "incl zonation" analyses.
study_order_zonation = ["Vanlandewijck2018","Winkler2022", "Yang2022", "Garcia2022"]
# Same cell types as celltype_order, but "Endothelial cells" is replaced by the
# three labels aECs / capECs / vECs.
celltype_order_zonation = ["Astrocytes","Microglia/Mφ", "Oligodendrocytes", "OPCs", 
        "aECs", "capECs", "vECs", "Pericytes", "SMCs", "Fibroblasts", 
        "Neurons", "Neuroblasts/NSCs", "Ependymal cells"]

# Foxf2

## Excl zonation

In [None]:
gene="Foxf2"
date_str_list = ["20250306"]
path = Path(main_dir) / 'P06_Foxf2_per_celltype' / 'Foxf2_summarized'

In [None]:
# load data: every summary CSV in `path` whose file name contains one of the
# date stamps and the gene name (case-insensitive).
# Sorted so that the row order does not depend on the directory listing order.
csv_files = sorted(
    f for f in path.glob("*.csv")
    if any(d in f.stem for d in date_str_list) and gene.lower() in f.stem.lower())
# pd.concat([]) would only report "No objects to concatenate"; name the real problem
if not csv_files:
    raise FileNotFoundError(f"No '{gene}' summary CSV with date {date_str_list} found in {path}")
df = pd.concat([pd.read_csv(f, sep=";") for f in csv_files], ignore_index=True)

In [None]:
df.head()

In [None]:
df.source.unique()

In [None]:
df_cleaned = utils.clean_and_standardize_data(df, gene, study_order, celltype_order)

In [None]:
# Heatmap (file name: study_by_celltype_heatmap), saved as PNG and SVG under one base name
plot_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'plots')
file_stem = f'{date.today().strftime("%Y%m%d")}_{gene}_study_by_celltype_heatmap'
out_path_heat = os.path.join(plot_dir, file_stem)
save_kwargs = dict(dpi=500, bbox_inches='tight')
with rc_context({'figure.figsize': (4.5, 2.5), 'figure.dpi': 120}):
    utils.create_heatmap(df_cleaned, gene, show=False)
    for ext in (".png", ".svg"):
        plt.savefig(out_path_heat + ext, **save_kwargs)
    plt.show()

In [None]:
# Dotplot (file name: study_by_celltype_dotplot), saved as PNG and SVG under one base name
plot_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'plots')
file_stem = f'{date.today().strftime("%Y%m%d")}_{gene}_study_by_celltype_dotplot'
out_path_heat = os.path.join(plot_dir, file_stem)
utils.create_dotplot(
    df_cleaned, gene,
    min_tile=15,  # min_tile=0 shows no dot if fraction of cells is 0
    figsize=(12, 4),
    show=False,
)
for ext in (".png", ".svg"):
    plt.savefig(out_path_heat + ext, dpi=500, bbox_inches='tight')
plt.show()

In [None]:
utils.get_cell_numbers(df_cleaned)

## Incl zonation

In [None]:
date_str_list = ["20250306"]
path = Path(main_dir) / 'P06_Foxf2_per_celltype' / 'Foxf2_summarized' / 'incl_zonation'
cluster_key = "clusters2"

In [None]:
# load data: every summary CSV in `path` whose file name contains one of the
# date stamps and the gene name (case-insensitive).
# Sorted so that the row order does not depend on the directory listing order.
csv_files = sorted(
    f for f in path.glob("*.csv")
    if any(d in f.stem for d in date_str_list) and gene.lower() in f.stem.lower())
# pd.concat([]) would only report "No objects to concatenate"; name the real problem
if not csv_files:
    raise FileNotFoundError(f"No '{gene}' summary CSV with date {date_str_list} found in {path}")
df = pd.concat([pd.read_csv(f, sep=";") for f in csv_files], ignore_index=True)

In [None]:
df.source.unique()

In [None]:
df_cleaned = utils.clean_and_standardize_data(df, gene, study_order_zonation, celltype_order_zonation, cluster_key=cluster_key)

In [None]:
# Heatmap including EC zonation, saved as PNG and SVG under one base name
plot_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'plots')
file_stem = f'{date.today().strftime("%Y%m%d")}_{gene}_study_by_celltype_heatmap_incl_zonation'
out_path_heat = os.path.join(plot_dir, file_stem)
save_kwargs = dict(dpi=500, bbox_inches='tight')
with rc_context({'figure.figsize': (4.5, 3), 'figure.dpi': 120}):
    utils.create_heatmap(df_cleaned, gene, cluster_key=cluster_key, show=False)
    for ext in (".png", ".svg"):
        plt.savefig(out_path_heat + ext, **save_kwargs)
    plt.show()

In [None]:
# Dotplot including EC zonation, saved as PNG and SVG under one base name
plot_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'plots')
file_stem = f'{date.today().strftime("%Y%m%d")}_{gene}_study_by_celltype_dotplot_incl_zonation'
out_path_heat = os.path.join(plot_dir, file_stem)
utils.create_dotplot(
    df_cleaned, gene,
    min_tile=15,  # min_tile=0 shows no dot if fraction of cells is 0
    figsize=(14, 5.5),
    show=False,
    cluster_key=cluster_key,
)
for ext in (".png", ".svg"):
    plt.savefig(out_path_heat + ext, dpi=500, bbox_inches='tight')
plt.show()

In [None]:
utils.get_cell_numbers(df_cleaned, cluster_key=cluster_key)

## EDA

In [None]:
date_str = "20250306"
path = Path(main_dir) / 'P06_Foxf2_per_celltype' / 'Foxf2_summarized'

In [None]:
# Summary CSVs in `path` whose name starts with the date stamp and contains the
# gene name (case-insensitive); sorted for a reproducible row order.
all_files = sorted(f for f in path.glob(f"{date_str}*.csv") if gene.lower() in f.stem.lower())
# pd.concat([]) would only report "No objects to concatenate"; name the real problem
if not all_files:
    raise FileNotFoundError(f"No '{gene}' summary CSV starting with {date_str} found in {path}")
# glob() already yields complete paths, so they are read as-is; joining them with
# `path` again would duplicate the directory prefix whenever `path` is relative.
dataframes = [pd.read_csv(csv_file, index_col=None, header=0, sep=";")
              for csv_file in all_files]
df = pd.concat(dataframes, axis=0, ignore_index=True)

In [None]:
df.source.unique()

In [None]:
df.gene.unique()

In [None]:
df.clusters.value_counts()

In [None]:
pd.options.display.max_columns=50
pd.crosstab(df.source, df.clusters)

## Weighted mouse/human mean (not used)

In [None]:
# mean expression and fraction of cells, weighted by the number of cells per study
# exploratory - not used

In [None]:
def weighted_mean(group):
    """Summarise one group of rows, weighting each row by its `cell_number`.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with the columns `mean_expression`, `fraction_of_cells` and
        `cell_number`.

    Returns
    -------
    pandas.Series
        `mean_expression` and `fraction_of_cells` as cell-number-weighted means,
        and `cell_number` as the summed cell number of the group.
    """
    weights = group['cell_number']
    total_cells = weights.sum()
    # sum(value * weight) / sum(weight) for each of the two summary columns
    summary = {
        column: (group[column] * weights).sum() / total_cells
        for column in ('mean_expression', 'fraction_of_cells')
    }
    summary['cell_number'] = total_cells
    return pd.Series(summary)

In [None]:
# Rows of the own dataset only.
# NOTE(review): study_order spells this study "OwnData", this filter uses "Own data" —
# confirm which label utils.clean_and_standardize_data writes to `source`;
# if it is "OwnData", this selection is empty.
all2_mean_own = df_cleaned[df_cleaned.source == "Own data"]
#all2_mean_own

In [None]:
all2_mean = df_cleaned[df_cleaned.source != "Own data"]

In [None]:
# Option: Exclude Yang and Saunders (as outlier studies)
# NOTE(review): these long labels differ from the short study names in study_order
# ("Yang2022", "Saunders2018") — confirm the labels actually present in `source`,
# otherwise both filters remove nothing.
all2_mean = all2_mean[all2_mean["source"] != "Yang, 2022, Nature"]
all2_mean = all2_mean[all2_mean["source"] != "Saunders, 2018, Cell"]

In [None]:
weighted_mean_df = all2_mean.groupby(['clusters', 'organism', 'gene']).apply(weighted_mean).reset_index()
weighted_mean_df = weighted_mean_df.dropna()

In [None]:
ECs = all2_mean[all2_mean.clusters == "Endothelial cells"]
ECs = ECs[ECs.organism == "Human"]
ECs

In [None]:
EC_mouse = all2_mean[all2_mean.clusters == "Endothelial cells"]
EC_mouse = EC_mouse[EC_mouse.organism == "Mouse"]
EC_mouse

In [None]:
# Mouse pericyte rows of the external studies.
# The label has to be spelled exactly like the "Pericytes" entry of celltype_order;
# the previous literal carried a stray "®" character and could not match it.
is_mouse_pericyte = (all2_mean.clusters == "Pericytes") & (all2_mean.organism == "Mouse")
PC_mouse = all2_mean[is_mouse_pericyte]
PC_mouse

In [None]:
# verify
(ECs['mean_expression'] * ECs['cell_number']).sum()/ECs['cell_number'].sum()

In [None]:
weighted_mean_df

In [None]:
# Dotplot

In [None]:
#all2_mean_own

In [None]:
weighted_mean_df["source"] = "meta-analysis"
#weighted_mean_df

In [None]:
weighted_means = pd.concat([weighted_mean_df, all2_mean_own])
weighted_means['source'] = weighted_means['source'] + "-" + weighted_means['organism']
weighted_means

In [None]:
weighted_means.cell_number.sum()

In [None]:
# create color palette: linear gradient from light grey to tomato.
# The colormap must be created with the name it is looked up by: register()
# stores it under `cmap.name`, and sns.color_palette("mycolormap", ...) resolves
# that name. (With the previous empty name "" the lookup could not find it.)
mycolormap = mpl.colors.LinearSegmentedColormap.from_list("mycolormap", ['#d1d1d1','tomato'])
mpl.colormaps.register(mycolormap, force = True)  # force=True: re-running the cell overwrites the entry
cpal2 = sns.color_palette("mycolormap", as_cmap=True)

In [None]:
weighted_means['fraction_of_cells'] = weighted_means['fraction_of_cells']*100

In [None]:
# Give the two plotted columns the multi-line labels that appear in the legend.
# rename() maps by column name, so the result does not depend on the column order
# or on additional columns being present after the concat (a positional
# `columns = [...]` assignment requires exactly 7 columns in a fixed order).
weighted_means = weighted_means.rename(columns={
    'mean_expression': 'Mean expression\n       in group',
    'fraction_of_cells': 'Fraction of cells\n   in group (%)',
})

In [None]:
# Dotplot of the weighted means: one column per source, one row per cluster;
# dot colour = mean expression, dot size = fraction of expressing cells.
sns.set(style="white")
pl=sns.relplot(data=weighted_means, x="source", y="clusters",
                hue='Mean expression\n       in group', size='Fraction of cells\n   in group (%)',
                #size_norm=(50, 500),
                palette=cpal2, 
                sizes=(15, 550), # smallest / largest dot size
                #marker="s", # round or squared
                linewidth=1,
                #legend=None,
                #aspect=0.9
              );
# No axis titles; the tick labels carry the information
pl.set(ylabel=None,xlabel=None)
# Move the x tick labels from the bottom to the top of the plot and hide the tick marks
plt.tick_params(axis='both', which='major', labelsize=10, labelbottom = False, bottom=False, top = False, labeltop=True)
plt.xticks(rotation=45, ha = 'left')
pl.set_xticklabels(fontsize=14.4, family="arial", color="black")
#plt.xlabel(family='Arial')
pl.set_yticklabels(fontsize=14.1, family="arial", color="black")
pl.fig.set_size_inches(8.7,4.2)
#plt.legend(loc="upper right")
#sns.despine(bottom = False, left = False, right = False, top = False)
# Remove the left and bottom axis spines
sns.despine(left=True, bottom=True);
#sns.despine(bottom=True, left=True, top=False)

# sns legend: drop the legend created by relplot ...
leg = pl._legend
#leg.set_bbox_to_anchor([1.5,0.53])
leg.remove();

# mpl legend: ... and replace it by a matplotlib legend titled with the gene name
legend = plt.legend(frameon=True, framealpha=0.2, borderpad=0.5, bbox_to_anchor=(1,1), title=gene, # handletextpad=0.7, 
           prop=mpl.font_manager.FontProperties(family='arial', size=10), labelcolor='black')
plt.setp(legend.get_title(), color='black', family='arial', size=13);

In [None]:
plt.show()

In [None]:
# Save the weighted-mean dotplot as PNG and SVG.
# The gene name in the file name comes from `gene` (the same variable that titles
# the legend) instead of a hard-coded "Foxf2", so plot and file name cannot disagree.
out_path_wm = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'plots',
                           f'{date.today().strftime("%Y%m%d")}_{gene}_by_celltype_weightedmean')
for ext in (".png", ".svg"):
    pl.savefig(out_path_wm + ext, dpi=500)

# Tek

In [None]:
gene="Tek"

## Excl zonation

In [None]:
date_str_list = ["20250306"]
path = Path(main_dir) / 'P06_Foxf2_per_celltype' / 'Other_genes_summarized'

In [None]:
# load data: every summary CSV in `path` whose file name contains one of the
# date stamps and the gene name (case-insensitive).
# Sorted so that the row order does not depend on the directory listing order.
csv_files = sorted(
    f for f in path.glob("*.csv")
    if any(d in f.stem for d in date_str_list) and gene.lower() in f.stem.lower())
# pd.concat([]) would only report "No objects to concatenate"; name the real problem
if not csv_files:
    raise FileNotFoundError(f"No '{gene}' summary CSV with date {date_str_list} found in {path}")
df = pd.concat([pd.read_csv(f, sep=";") for f in csv_files], ignore_index=True)

In [None]:
df.source.unique()

In [None]:
df_cleaned = utils.clean_and_standardize_data(df, gene, study_order, celltype_order)

In [None]:
# Heatmap (file name: study_by_celltype_heatmap), saved as PNG and SVG under one base name
plot_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'plots')
file_stem = f'{date.today().strftime("%Y%m%d")}_{gene}_study_by_celltype_heatmap'
out_path_heat = os.path.join(plot_dir, file_stem)
save_kwargs = dict(dpi=500, bbox_inches='tight')
with rc_context({'figure.figsize': (4.5, 2.5), 'figure.dpi': 120}):
    utils.create_heatmap(df_cleaned, gene, show=False)
    for ext in (".png", ".svg"):
        plt.savefig(out_path_heat + ext, **save_kwargs)
    plt.show()

In [None]:
# Dotplot (file name: study_by_celltype_dotplot), saved as PNG and SVG under one base name
plot_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'plots')
file_stem = f'{date.today().strftime("%Y%m%d")}_{gene}_study_by_celltype_dotplot'
out_path_heat = os.path.join(plot_dir, file_stem)
utils.create_dotplot(
    df_cleaned, gene,
    min_tile=15,  # min_tile=0 shows no dot if fraction of cells is 0
    figsize=(10, 4),
    show=False,
)
for ext in (".png", ".svg"):
    plt.savefig(out_path_heat + ext, dpi=500, bbox_inches='tight')
plt.show()

In [None]:
utils.get_cell_numbers(df_cleaned)

## Incl zonation

In [None]:
date_str_list = ["20250306"]
path = Path(main_dir) / 'P06_Foxf2_per_celltype' / 'Other_genes_summarized' / 'incl_zonation'
# Set explicitly, as in the other "Incl zonation" sections; without it this
# section only works if the Foxf2 zonation section has been run beforehand.
cluster_key = "clusters2"

In [None]:
# load data: every summary CSV in `path` whose file name contains one of the
# date stamps and the gene name (case-insensitive).
# Sorted so that the row order does not depend on the directory listing order.
csv_files = sorted(
    f for f in path.glob("*.csv")
    if any(d in f.stem for d in date_str_list) and gene.lower() in f.stem.lower())
# pd.concat([]) would only report "No objects to concatenate"; name the real problem
if not csv_files:
    raise FileNotFoundError(f"No '{gene}' summary CSV with date {date_str_list} found in {path}")
df = pd.concat([pd.read_csv(f, sep=";") for f in csv_files], ignore_index=True)

In [None]:
df.source.unique()

In [None]:
df_cleaned = utils.clean_and_standardize_data(df, gene, study_order_zonation, celltype_order_zonation, cluster_key=cluster_key)

In [None]:
# Heatmap including EC zonation, saved as PNG and SVG under one base name
plot_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'plots')
file_stem = f'{date.today().strftime("%Y%m%d")}_{gene}_study_by_celltype_heatmap_incl_zonation'
out_path_heat = os.path.join(plot_dir, file_stem)
save_kwargs = dict(dpi=500, bbox_inches='tight')
with rc_context({'figure.figsize': (4.5, 3), 'figure.dpi': 120}):
    utils.create_heatmap(df_cleaned, gene, cluster_key=cluster_key, show=False)
    for ext in (".png", ".svg"):
        plt.savefig(out_path_heat + ext, **save_kwargs)
    plt.show()

In [None]:
# Dotplot including EC zonation, saved as PNG and SVG under one base name
plot_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'plots')
file_stem = f'{date.today().strftime("%Y%m%d")}_{gene}_study_by_celltype_dotplot_incl_zonation'
out_path_heat = os.path.join(plot_dir, file_stem)
utils.create_dotplot(
    df_cleaned, gene,
    min_tile=15,  # min_tile=0 shows no dot if fraction of cells is 0
    figsize=(14, 5.5),
    show=False,
    cluster_key=cluster_key,
)
for ext in (".png", ".svg"):
    plt.savefig(out_path_heat + ext, dpi=500, bbox_inches='tight')
plt.show()

In [None]:
utils.get_cell_numbers(df_cleaned, cluster_key=cluster_key)

# Foxo1

In [None]:
gene="Foxo1"

## Excl zonation

In [None]:
date_str_list = ["20250306"]
path = Path(main_dir) / 'P06_Foxf2_per_celltype' / 'Other_genes_summarized'

In [None]:
# load data: every summary CSV in `path` whose file name contains one of the
# date stamps and the gene name (case-insensitive).
# Sorted so that the row order does not depend on the directory listing order.
csv_files = sorted(
    f for f in path.glob("*.csv")
    if any(d in f.stem for d in date_str_list) and gene.lower() in f.stem.lower())
# pd.concat([]) would only report "No objects to concatenate"; name the real problem
if not csv_files:
    raise FileNotFoundError(f"No '{gene}' summary CSV with date {date_str_list} found in {path}")
df = pd.concat([pd.read_csv(f, sep=";") for f in csv_files], ignore_index=True)

In [None]:
df.source.unique()

In [None]:
df_cleaned = utils.clean_and_standardize_data(df, gene, study_order, celltype_order)

In [None]:
# Heatmap (file name: study_by_celltype_heatmap), saved as PNG and SVG under one base name
plot_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'plots')
file_stem = f'{date.today().strftime("%Y%m%d")}_{gene}_study_by_celltype_heatmap'
out_path_heat = os.path.join(plot_dir, file_stem)
save_kwargs = dict(dpi=500, bbox_inches='tight')
with rc_context({'figure.figsize': (4.5, 2.5), 'figure.dpi': 120}):
    utils.create_heatmap(df_cleaned, gene, show=False)
    for ext in (".png", ".svg"):
        plt.savefig(out_path_heat + ext, **save_kwargs)
    plt.show()

In [None]:
# Dotplot (file name: study_by_celltype_dotplot), saved as PNG and SVG under one base name
plot_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'plots')
file_stem = f'{date.today().strftime("%Y%m%d")}_{gene}_study_by_celltype_dotplot'
out_path_heat = os.path.join(plot_dir, file_stem)
utils.create_dotplot(
    df_cleaned, gene,
    min_tile=15,  # min_tile=0 shows no dot if fraction of cells is 0
    figsize=(10, 4),
    show=False,
)
for ext in (".png", ".svg"):
    plt.savefig(out_path_heat + ext, dpi=500, bbox_inches='tight')
plt.show()

In [None]:
utils.get_cell_numbers(df_cleaned)

## Incl zonation

In [None]:
date_str_list = ["20250306"]
path = Path(main_dir) / 'P06_Foxf2_per_celltype' / 'Other_genes_summarized' / 'incl_zonation'
cluster_key = "clusters2"

In [None]:
# load data: every summary CSV in `path` whose file name contains one of the
# date stamps and the gene name (case-insensitive).
# Sorted so that the row order does not depend on the directory listing order.
csv_files = sorted(
    f for f in path.glob("*.csv")
    if any(d in f.stem for d in date_str_list) and gene.lower() in f.stem.lower())
# pd.concat([]) would only report "No objects to concatenate"; name the real problem
if not csv_files:
    raise FileNotFoundError(f"No '{gene}' summary CSV with date {date_str_list} found in {path}")
df = pd.concat([pd.read_csv(f, sep=";") for f in csv_files], ignore_index=True)

In [None]:
df.source.unique()

In [None]:
df_cleaned = utils.clean_and_standardize_data(df, gene, study_order_zonation, celltype_order_zonation, cluster_key=cluster_key)

In [None]:
# Heatmap including EC zonation, saved as PNG and SVG under one base name
plot_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'plots')
file_stem = f'{date.today().strftime("%Y%m%d")}_{gene}_study_by_celltype_heatmap_incl_zonation'
out_path_heat = os.path.join(plot_dir, file_stem)
save_kwargs = dict(dpi=500, bbox_inches='tight')
with rc_context({'figure.figsize': (4.5, 3), 'figure.dpi': 120}):
    utils.create_heatmap(df_cleaned, gene, cluster_key=cluster_key, show=False)
    for ext in (".png", ".svg"):
        plt.savefig(out_path_heat + ext, **save_kwargs)
    plt.show()

In [None]:
# Dotplot including EC zonation, saved as PNG and SVG under one base name
plot_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'plots')
file_stem = f'{date.today().strftime("%Y%m%d")}_{gene}_study_by_celltype_dotplot_incl_zonation'
out_path_heat = os.path.join(plot_dir, file_stem)
utils.create_dotplot(
    df_cleaned, gene,
    min_tile=15,  # min_tile=0 shows no dot if fraction of cells is 0
    figsize=(14, 5.5),
    show=False,
    cluster_key=cluster_key,
)
for ext in (".png", ".svg"):
    plt.savefig(out_path_heat + ext, dpi=500, bbox_inches='tight')
plt.show()

In [None]:
utils.get_cell_numbers(df_cleaned, cluster_key=cluster_key)

# Other genes

In [None]:
target_genes = ["Nos3", "Htra1", "Egfl8", "Flt1", "Kdr", "Ptprb", "Nrp1", "Nrp2", "Efnb2", "Itgb1", "Itga6", "Angpt2", "Cdh5", "Cldn5", "Ocln", "Ctnnb1"]

## Excl zonation

In [None]:
date_str_list = ["20250306"]
path = Path(main_dir) / 'P06_Foxf2_per_celltype' / 'Other_genes_summarized'

In [None]:
# Loop invariants, computed once: output folder, date stamp of the file names,
# and the date-matched candidate files (sorted for a reproducible row order).
plot_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'plots')
date_stamp = date.today().strftime("%Y%m%d")
dated_csvs = sorted(f for f in path.glob("*.csv")
                    if any(d in f.stem for d in date_str_list))

for gene in target_genes:
    print(f"Processing {gene}...")

    # Summary files of this gene (case-insensitive match on the file name)
    gene_csvs = [f for f in dated_csvs if gene.lower() in f.stem.lower()]
    if not gene_csvs:
        # pd.concat([]) would only report "No objects to concatenate"
        raise FileNotFoundError(f"No '{gene}' summary CSV with date {date_str_list} found in {path}")
    df = pd.concat([pd.read_csv(f, sep=";") for f in gene_csvs], ignore_index=True)

    # Process data
    df_cleaned = utils.clean_and_standardize_data(df, gene, study_order, celltype_order)

    # Heatmap (PNG + SVG); the figure is closed afterwards so figures do not pile up across genes
    with rc_context({'figure.figsize': (5.5, 2.5), 'figure.dpi': 120}):
        utils.create_heatmap(df_cleaned, gene, show=False)
        for ext in [".png", ".svg"]:
            out_path = os.path.join(plot_dir, f'{date_stamp}_{gene}_study_by_celltype_heatmap{ext}')
            plt.savefig(out_path, dpi=500, bbox_inches='tight')
        plt.close()

    # Dotplot (PNG + SVG)
    utils.create_dotplot(df_cleaned, gene, min_tile=15, figsize=(12, 4), show=False)
    for ext in [".png", ".svg"]:
        out_path = os.path.join(plot_dir, f'{date_stamp}_{gene}_study_by_celltype_dotplot{ext}')
        plt.savefig(out_path, dpi=500, bbox_inches='tight')
    plt.close()

    # cell numbers
    #utils.get_cell_numbers(df_cleaned)

    print(f"Completed processing {gene}")

## Incl zonation

In [None]:
cluster_key = "clusters2"
date_str_list = ["20250306"]
path = Path(main_dir) / 'P06_Foxf2_per_celltype' / 'Other_genes_summarized' / 'incl_zonation'

In [None]:
# Loop invariants, computed once: output folder, date stamp of the file names,
# and the date-matched candidate files (sorted for a reproducible row order).
plot_dir = os.path.join(main_dir, 'P06_Foxf2_per_celltype', 'plots')
date_stamp = date.today().strftime("%Y%m%d")
dated_csvs = sorted(f for f in path.glob("*.csv")
                    if any(d in f.stem for d in date_str_list))

for gene in target_genes:
    print(f"Processing {gene}...")

    # Summary files of this gene (case-insensitive match on the file name)
    gene_csvs = [f for f in dated_csvs if gene.lower() in f.stem.lower()]
    if not gene_csvs:
        # pd.concat([]) would only report "No objects to concatenate"
        raise FileNotFoundError(f"No '{gene}' summary CSV with date {date_str_list} found in {path}")
    df = pd.concat([pd.read_csv(f, sep=";") for f in gene_csvs], ignore_index=True)

    # Process data
    df_cleaned = utils.clean_and_standardize_data(df, gene, study_order_zonation, celltype_order_zonation, cluster_key=cluster_key)

    # Heatmap (PNG + SVG); the figure is closed afterwards so figures do not pile up across genes
    with rc_context({'figure.figsize': (4.5, 3), 'figure.dpi': 120}):
        utils.create_heatmap(df_cleaned, gene, cluster_key=cluster_key, show=False)
        for ext in [".png", ".svg"]:
            out_path = os.path.join(plot_dir, f'{date_stamp}_{gene}_study_by_celltype_heatmap_incl_zonation{ext}')
            plt.savefig(out_path, dpi=500, bbox_inches='tight')
        plt.close()

    # Dotplot (PNG + SVG)
    utils.create_dotplot(df_cleaned, gene, min_tile=15, cluster_key=cluster_key, figsize=(14, 5.5), show=False)
    for ext in [".png", ".svg"]:
        out_path = os.path.join(plot_dir, f'{date_stamp}_{gene}_study_by_celltype_dotplot_incl_zonation{ext}')
        plt.savefig(out_path, dpi=500, bbox_inches='tight')
    plt.close()

    # cell numbers
    #utils.get_cell_numbers(df_cleaned)

    print(f"Completed processing {gene}")

# Session Info

In [None]:
sc.logging.print_versions()

# Outdated code

### Merge EC zonation

Outdated. Previously used to calculate the weighted mean expression of ECs from data in which ECs were split into zonation clusters. The zonation clusters are now already merged in the AnnData object, which is more accurate.

In [None]:
# merge EC zonation as weighted mean expression 
# in Yang2022 Winkler2022 Vanlandewijck2018

In [None]:
#Vanlandewijck2018 = all[all.source == "Vanlandewijck2018"]
#Vanlandewijck2018

In [None]:
# rename
#Vanlandewijck2018["clusters"] = Vanlandewijck2018.loc[:,"clusters"].str.split("_",expand=True)[0].tolist()
#Vanlandewijck2018

In [None]:
#Winkler2022 = all[all.source == "Winkler2022"]
#Winkler2022

In [None]:
# rename
#Winkler2022["clusters"] = Winkler2022.loc[:,"clusters"].str.split("_",expand=True)[0].tolist()
#Winkler2022

In [None]:
# Yang2022 = all[all.source == "Yang2022"]
# Yang2022

In [None]:
# Yang2022["clusters"] = Yang2022.loc[:,"clusters"].str.split("_",expand=True)[0].tolist()
# Yang2022

In [None]:
# remove original rows, then append new ones

In [None]:
# all.loc[Yang2022.index[0]:Yang2022.index[-1],:] = None
# all.loc[Winkler2022.index[0]:Winkler2022.index[-1],:] = None
# all.loc[Vanlandewijck2018.index[0]:Vanlandewijck2018.index[-1],:] = None
# all = all.dropna()

In [None]:
# compute new rows via weighted mean

In [None]:
# wm = lambda x: np.average(x, weights=Yang2022.loc[x.index, "cell_number"])

# # Groupby and aggregate with namedAgg [1]:
# Yang2022_weighted_mean = Yang2022.groupby(["clusters", "gene", "source", "organism"]).agg(cell_number=("cell_number", "sum"),  
#                                                                  mean_expression=("mean_expression", wm),
#                                                                  fraction_of_cells=("fraction_of_cells", wm)
#                                                                 )
# Yang2022_weighted_mean=Yang2022_weighted_mean.reset_index()[Yang2022.columns]
# Yang2022_weighted_mean

Contains the correct weighted mean of mean_expression and fraction_of_cells (double-checked).

In [None]:
# wm = lambda x: np.average(x, weights=Vanlandewijck2018.loc[x.index, "cell_number"])

# # Groupby and aggregate with namedAgg [1]:
# Vanlandewijck2018_weighted_mean = Vanlandewijck2018.groupby(["clusters", "gene", "source", "organism"]).agg(cell_number=("cell_number", "sum"),  
#                                                                  mean_expression=("mean_expression", wm),
#                                                                  fraction_of_cells=("fraction_of_cells", wm)
#                                                                 )
# Vanlandewijck2018_weighted_mean=Vanlandewijck2018_weighted_mean.reset_index()[Vanlandewijck2018.columns]
# Vanlandewijck2018_weighted_mean

In [None]:
# wm = lambda x: np.average(x, weights=Winkler2022.loc[x.index, "cell_number"])

# # Groupby and aggregate with namedAgg [1]:
# Winkler2022_weighted_mean = Winkler2022.groupby(["clusters", "gene", "source", "organism"]).agg(cell_number=("cell_number", "sum"),  
#                                                                  mean_expression=("mean_expression", wm),
#                                                                  fraction_of_cells=("fraction_of_cells", wm)
#                                                                 )
# Winkler2022_weighted_mean=Winkler2022_weighted_mean.reset_index()[Winkler2022.columns]
# Winkler2022_weighted_mean

In [None]:
# # append to all
# all2 = pd.concat([all, Winkler2022_weighted_mean, Yang2022_weighted_mean, Vanlandewijck2018_weighted_mean], axis=0)
# all2 = all2.reset_index().drop("index",axis=1)

### Complement missing celltypes 

In [None]:
# complement missing celltypes as NA rows

In [None]:
# all_celltypes = pd.Series(all2.clusters.unique())
# for s in all2.source.unique():
#     # find cell types missing per source
#     key_diff = set(all_celltypes).difference(all2[all2.source==s].clusters)
#     where_diff = all_celltypes.isin(key_diff)
#     missing_celltypes=all_celltypes[where_diff]
#     # append these to all2 as NA row
#     for m in missing_celltypes:
#         all2.loc[len(all2)] = [m,None,None,None,s,None,None]

# assert(len(all2) == len(all2.clusters.unique())*len(all2.source.unique()))

---
# Export HTML

In [None]:
# Run this cell from the conda env that has nbconvert installed (the html_toc exporter must be available there)
! cp combine_foxf2_results.ipynb HTMLs/$(date '+%Y%m%d')_combine_foxf2_results.ipynb
! jupyter nbconvert HTMLs/$(date '+%Y%m%d')_combine_foxf2_results.ipynb --to html_toc
! rm HTMLs/$(date '+%Y%m%d')_combine_foxf2_results.ipynb