# Computing and plotting fetal gene expression scores

## Setup Directories

In [None]:
import scanpy as sc
from common_utils import setup_dirs, find_arial_font, add_gene_binary_status, load_msigdb
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import mannwhitneyu

# Font setup helper imported from common_utils.
# Presumably makes Arial available to matplotlib -- TODO confirm against common_utils.
find_arial_font()

# NOTE(review): OUTDIR_FETAL_SCORES is neither defined nor imported in the visible
# part of this file; it has to be provided before this cell runs.
outDir = OUTDIR_FETAL_SCORES
# setup_dirs returns three values, unpacked here as the figures/data/tables directories.
figuresDir, dataDir, tablesDir = setup_dirs(outDir)

# Point scanpy's figure output at the figures directory and save figures at 300 dpi.
sc.settings.figdir = figuresDir
sc.set_figure_params(dpi_save=300, vector_friendly=True)

In [None]:
# Utility plotting function
def plot_score_violin(df, score, figsize=(6, 6), fpath=None, annotation="p < 0.01"):
    """
    Plot a violin plot of a score, split by FOXA2 status.

    A significance bracket labelled with ``annotation`` is drawn above the two
    violins. The figure is either written to ``fpath`` or shown, and is always
    closed afterwards.

    Parameters
    ----------
    df: pd.DataFrame
        The dataframe containing the score and the FOXA2 status. The dataframe should have the following columns:
        - FOXA2_is_expressed: The FOXA2 status (False / True)
        - score: The score to plot
    score: str
        The score to plot (name of a column in ``df``)
    figsize: tuple
        Figure size passed to ``plt.figure``
    fpath: str, optional
        Where to save the figure. If falsy, the figure is shown instead.
    annotation: str
        Text written above the significance bracket. The caller is responsible
        for making sure it matches the test that was actually run.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``df`` lacks one of the required columns.
    """
    # Validate with an explicit exception: asserts disappear under ``python -O``
    required = ['FOXA2_is_expressed', score]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(f"Missing columns in the dataframe: {missing}")

    order = [False, True]
    top = df[score].max()
    # Offsets are sized from the magnitude of the maximum so that the bracket
    # and the padding point always sit above the data, also when the maximum
    # is negative. For a positive maximum this equals max*1.1 / max*.05 / max*1.2.
    scale = abs(top)
    if scale == 0:
        # Degenerate maximum of exactly 0: fall back to the data range
        scale = (top - df[score].min()) or 1.0
    x1, x2 = 0, 1
    y, h, col = top + 0.1 * scale, 0.05 * scale, 'k'

    # Work on a dedicated figure; do not clear whatever figure was current before
    fig = plt.figure(figsize=figsize)
    try:
        # NOTE: these seaborn calls change global plotting defaults
        sns.set(font_scale=1.5)
        sns.set_style("white")
        sns.violinplot(data=df, x='FOXA2_is_expressed', y=score, order=order, hue='FOXA2_is_expressed')
        # Significance bracket between the two violins
        plt.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
        plt.text((x1+x2)*.5, y+h, annotation, ha='center', va='bottom', color=col)
        # Set x-axis (FOXA2-, FOXA2+ )
        plt.xticks([0, 1], ['FOXA2-', 'FOXA2+'])
        # Y label: drop the common prefix and break the label after the third word
        words = score.replace('HE_LIM_SUN_FETAL_LUNG_', '').split('_') + ['Signature']
        first_line, second_line = ' '.join(words[:3]), ' '.join(words[3:])
        plt.ylabel(f'{first_line}\n{second_line}' if second_line else first_line)
        plt.xlabel('')
        # Hide the legend that the hue mapping adds
        plt.legend([], [], frameon=False)
        # Invisible point that stretches the y-axis so the annotation fits
        plt.scatter(0, top + 0.2 * scale, color='white')
        plt.tight_layout()
        if fpath:
            plt.savefig(fpath, bbox_inches='tight', dpi=300)
        else:
            plt.show()
    finally:
        # Close this figure even if plotting or saving failed
        plt.close(fig)

## Extract the gene sets reported in HE_LIM_SUN_FETAL 

Ref: He, Peng, et al. "A human fetal lung cell atlas uncovers proximal-distal gradients of differentiation and key regulators of epithelial fates." Cell 185.25 (2022): 4841-4860.


In [None]:
adata_path = ADATA_PATH_FETAL_SCORES
adata = sc.read(adata_path)

# Load MSigDB; the code below uses its 'geneset' and 'genesymbol' columns
msigdb = load_msigdb()
genesets = msigdb.geneset.unique().tolist()

# Keep the gene sets whose name mentions any of the query keywords
# (case-insensitive substring match). Using any() adds a gene set once even
# when it matches several keywords, so no duplicated scores are produced later.
query_keys = ['stem', 'fetal']
fetal_set = []
for geneset in genesets:
    if any(key in geneset.lower() for key in query_keys):
        print(geneset)
        fetal_set.append(geneset)

# Narrow down to the HE_LIM_SUN_FETAL lung gene sets, keeping the full MSigDB names
he_lim_orig = []
for geneset in fetal_set:
    lowered = geneset.lower()
    if 'lung' in lowered and 'HE_LIM_SUN_FETAL'.lower() in lowered:
        print(geneset)
        he_lim_orig.append(geneset)

# Now remove HE_LIM_SUN_FETAL_LUNG_ from the names
# These are from Big Cluster in supplementary figure 1
he_lim = [x.replace('HE_LIM_SUN_FETAL_LUNG_', '') for x in he_lim_orig]
# Find the cluster by splitting by _ and taking the first part
clusters = [x.split('_')[0] for x in he_lim]
# 'orig' holds the untouched MSigDB name, so the lookup below always uses a real gene set name
dd = pd.DataFrame({'cluster': clusters, 'geneset': he_lim, 'orig': he_lim_orig})

# Score every cell for each gene set of cluster C1; scores are written to adata.obs
score_names = dd['orig'][dd['cluster'] == 'C1'].values
for sig in score_names:
    print(sig)
    genes = msigdb[msigdb['geneset'] == sig]['genesymbol'].values
    print(len(genes))
    sc.tl.score_genes(adata, genes, score_name=sig, use_raw=False, copy=False)

In [None]:
# Extract the scores together with the sample and FOXA2 status, then test each score
df = adata.obs[['sample', 'FOXA2_is_expressed', *score_names]].copy()

# Row masks for the two groups, built once and reused for every score
foxa2_pos = df['FOXA2_is_expressed'] == True
foxa2_neg = df['FOXA2_is_expressed'] == False

# One-sided Mann-Whitney U test per score: FOXA2+ cells greater than FOXA2- cells
pvals = {}
for score in score_names:
    test_result = mannwhitneyu(
        df.loc[foxa2_pos, score],
        df.loc[foxa2_neg, score],
        alternative='greater',
    )
    pvals[score] = test_result.pvalue

# Tabulate the p-values, one row per score
pvals_df = pd.DataFrame({'score': list(pvals), 'p-value': list(pvals.values())})

# Flag the scores that pass the 0.01 threshold
pvals_df['significant'] = pvals_df['p-value'].lt(0.01)

In [None]:
# Names of the scores flagged as significant
significant_scores = pvals_df.loc[pvals_df['significant'], 'score'].tolist()

# Keep only the metadata columns and the significant scores
df_sub = df[['sample', 'FOXA2_is_expressed', *significant_scores]].copy()

# Write one violin plot (PDF) per significant score into the figures directory
for score in significant_scores:
    violin_pdf = os.path.join(figuresDir, f'{score}_violin.pdf')
    plot_score_violin(df_sub, score, fpath=violin_pdf)

