# Genes that are associated with FOXA2 expression


Find genes whose expression is correlated with FOXA2 expression, in a patient-specific manner, using ridge regression.

## Find genes associated with FOXA2 expression

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV, RidgeClassifierCV
from sklearn.preprocessing import StandardScaler
import scanpy as sc
from adjustText import adjust_text
import matplotlib.pyplot as plt
from common_utils import setup_dirs, find_arial_font

# Project helper imported from common_utils; presumably makes the Arial font
# available to matplotlib -- NOTE(review): confirm in common_utils.
find_arial_font()

# Root output directory for this analysis. OUTDIR_FOXA2_ASSOCIATION is not
# defined in the visible part of this file and has to exist before this cell runs.
outDir = OUTDIR_FOXA2_ASSOCIATION
# setup_dirs returns three values, used below as the figures, data and tables
# directories (presumably sub-folders of outDir -- confirm in common_utils).
figuresDir, dataDir, tablesDir = setup_dirs(outDir)

# Point scanpy's figure output at figuresDir and save figures at 300 dpi.
sc.settings.figdir = figuresDir
sc.set_figure_params(dpi_save=300, vector_friendly=True)

In [None]:
# Helper functions to run regression
def run_regression(X_in, y):
    """
    Run ridge regression to predict y from X_in.

    Parameters
    ----------
    X_in : np.ndarray
        Input data matrix. Cells by genes.
    y : np.ndarray
        Response, one value per row of X_in. May be 1-D or a single-column
        2-D array.

    Returns
    -------
    betas : np.ndarray
        Coefficients of the ridge regression model, one per column of X_in.
        In the binary branch the array is squeezed to 1-D. In the non-binary
        branch it keeps the shape of ``RidgeCV.coef_`` (presumably
        ``(1, n_features)`` for a column-vector response -- confirm), so
        callers that need a flat vector should squeeze the result.

    Notes
    -----
    Uses RidgeClassifierCV if y takes exactly two distinct values, else RidgeCV.
    Both models select the regularisation strength from 10 log-spaced alphas
    between 1e-3 and 1e3 with ``cv=None``.
    Scales X_in (and y if response is non-binary) before running regression.
    The beta coefficients are transformed back to the original scale.
    """
    alphas = np.logspace(-3, 3, 10)
    scaler_x = StandardScaler()
    X_scaled = scaler_x.fit_transform(X_in)

    # Flatten y once: this yields both the distinct values and a 0/1 label
    # vector (index into the sorted distinct values) for the binary case.
    classes, labels = np.unique(np.ravel(y), return_inverse=True)
    if len(classes) == 2:
        # Fit on the integer labels rather than on the raw response: the
        # classifier expects a 1-D label vector, and a two-valued response
        # with non-integer values would otherwise be treated as a continuous
        # target. The smaller value maps to label 0, the larger to label 1.
        clf = RidgeClassifierCV(alphas=alphas, cv=None)
        clf.fit(X_scaled, labels)
        coeffs = np.squeeze(clf.coef_)
        # Undo the feature scaling so betas refer to the original units of X_in.
        betas = coeffs / scaler_x.scale_
    else:
        scaler_y = StandardScaler()
        Y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))
        clf = RidgeCV(alphas=alphas, cv=None)
        clf.fit(X_scaled, Y_scaled)
        # Undo both the response scaling and the feature scaling.
        betas = scaler_y.scale_ * clf.coef_ / scaler_x.scale_
    return betas


def get_x_y(adata, sample_id, target_gene=None):
    """
    Extracts expression matrix X and target gene expression Y for a given sample.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix. Must have a 'sample' column in ``adata.obs``.
    sample_id : str
        Sample ID; must occur in ``adata.obs['sample']``.
    target_gene : str
        Gene to predict. Required despite the ``None`` default.

    Returns
    -------
    X : np.ndarray
        Dense expression matrix of the sample's cells for every gene except
        ``target_gene``.
    y : np.ndarray
        Dense expression of ``target_gene`` in the same cells.
    gene_names : pd.Index
        Names of the genes in the columns of ``X``, in column order.

    Raises
    ------
    AssertionError
        If ``target_gene`` is None, ``adata.obs`` has no 'sample' column, or
        ``sample_id`` is not present in that column.
    """
    assert target_gene is not None, "target_gene must be specified."
    assert 'sample' in adata.obs.columns, "adata.obs['sample'] must be present."
    assert sample_id in adata.obs['sample'].values, f"Sample {sample_id} not found in adata.obs['sample']."

    def _to_dense(matrix):
        # Sparse matrices expose ``toarray()``; the older ``.A`` alias is not
        # available on every SciPy version and does not exist on plain
        # ndarrays, so handle both storage types explicitly.
        if hasattr(matrix, 'toarray'):
            return matrix.toarray()
        return np.asarray(matrix)

    # Restrict to the cells of the requested sample.
    bdata = adata[adata.obs['sample'] == sample_id, :].copy()
    # y: expression of the target gene
    y = _to_dense(bdata[:, target_gene].X)
    # X: expression of all genes, except the target gene
    others = bdata[:, bdata.var_names != target_gene]
    gene_names = others.var_names
    X = _to_dense(others.X)
    return X, y, gene_names

In [None]:
# Helper function to plot the Z-score vs rank
def plot_scatter(al_beta_summary=None, x_col='rank', y_col='zscore', annotate_genes=None, filePath=None, annot_fsize=5, MAX_SIG_ITEMS=5):
    """
    Scatter plot of rank vs zscore, saved to ``filePath``.

    Parameters
    ----------
    al_beta_summary : pd.DataFrame
        Summary of the regression analysis, indexed by gene name.
        Columns: rank, zscore (plus ``x_col`` / ``y_col`` if they differ).
    x_col : str
        Column to use for x-axis.
    y_col : str
        Column to use for y-axis.
    annotate_genes : list
        List of genes (index labels) to annotate. If None, genes with
        ``zscore > 1`` are annotated instead.
    filePath : str
        Path to save the plot.
    annot_fsize : int
        Font size for gene annotations.
    MAX_SIG_ITEMS : int
        Maximum number of significant genes to annotate. Genes are taken in
        the row order of the selection; 0 or less disables annotation.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``al_beta_summary`` or ``filePath`` is not provided.
    """
    if al_beta_summary is None:
        raise ValueError("al_beta_summary must be provided.")
    if filePath is None:
        raise ValueError("filePath must be provided.")

    fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=300)
    try:
        fig.set_facecolor('white')
        ax.scatter(al_beta_summary[x_col], al_beta_summary[y_col], s=2)
        if x_col == 'rank':
            # Only show 5 evenly spaced labels for rank
            ax.set_xticks(np.linspace(0, al_beta_summary.shape[0], 5))

        # Annotate genes
        if MAX_SIG_ITEMS > 0:
            if annotate_genes is not None:
                significant = al_beta_summary.loc[annotate_genes]
            else:
                significant = al_beta_summary[al_beta_summary['zscore'] > 1]
            if len(significant) > MAX_SIG_ITEMS:
                print(f'Not naming more than {MAX_SIG_ITEMS} genes.')
            # Cap at exactly MAX_SIG_ITEMS labels.
            shown = significant.iloc[:MAX_SIG_ITEMS]
            texts = []
            for gene, x, y in zip(shown.index, shown[x_col], shown[y_col]):
                texts.append(ax.text(x=x, y=y, s=gene, fontsize=annot_fsize))
                # Highlight the annotated point on top of the base scatter.
                ax.scatter(x, y, color='red', s=5)
            if texts:
                # Nudge labels apart and connect them to their points.
                adjust_text(texts, ax=ax, arrowprops=dict(arrowstyle="-", color="k", lw=0.5))

        ax.set_xlabel('Rank' if x_col == 'rank' else x_col)
        ax.set_ylabel('Z-score' if y_col == 'zscore' else y_col)
        fig.savefig(filePath, bbox_inches='tight')
    finally:
        # Close this specific figure even if plotting or saving failed.
        plt.close(fig)

## Per sample, regress FOXA2 on highly variable genes

In [None]:
adata_path = ADATA_PATH_FOXA2_ASSOCIATION
adata = sc.read(adata_path)

target_gene = 'FOXA2'

# Distinct sample IDs, computed once and reused by both loops below.
sample_ids = adata.obs['sample'].unique()
n_samples = len(sample_ids)

# Run ridge regression per sample, regressing FOXA2 expression on all other genes
# in adata (presumably already restricted to highly variable genes -- confirm).
for i, ss in enumerate(sample_ids, start=1):
    print(f'Running regression for {ss} ({i}/{n_samples})')
    X, Y, gene_names = get_x_y(adata, sample_id=ss, target_gene=target_gene)
    betas = run_regression(X, Y)
    # Save the betas per sample, largest coefficient first
    betas_df = pd.DataFrame({'gene_name': gene_names, 'beta': betas.squeeze()})
    betas_df = betas_df.sort_values(by='beta', ascending=False)
    betas_df.to_csv(os.path.join(outDir, 'tables', f'betas_{ss}.csv'), index=False)

# Combine all betas: reload the per-sample tables and stack them in one concat
beta_frames = []
for i, ss in enumerate(sample_ids, start=1):
    print(f'Loading betas for {ss} ({i}/{n_samples})')
    betas_df = pd.read_csv(os.path.join(outDir, 'tables', f'betas_{ss}.csv'))
    beta_frames.append(betas_df)
all_betas = pd.concat(beta_frames)

# Compute the per gene mean and sd of beta across samples
all_beta_summary = all_betas.groupby('gene_name').agg({'beta': ['mean', 'std']})
all_beta_summary = all_beta_summary.sort_values(by=('beta', 'mean'), ascending=False)
# Flatten the two-level column index into 'beta_mean' / 'beta_std'
all_beta_summary.columns = ['_'.join(col).strip() for col in all_beta_summary.columns.values]
all_beta_summary.to_csv(os.path.join(outDir, 'tables', 'all_beta_summary.csv'), index=True)


## Plot the Z-score by rank

In [None]:
# Load the saved beta summary
summary_csv = os.path.join(outDir, 'tables', 'all_beta_summary.csv')
al_beta_summary = pd.read_csv(summary_csv, index_col=0)

# Z-score per gene: |mean beta| divided by the sd of beta
al_beta_summary["zscore"] = al_beta_summary["beta_mean"].abs().div(al_beta_summary["beta_std"])
# Rank 1 is the gene with the largest z-score
al_beta_summary = al_beta_summary.sort_values(by="zscore", ascending=False)
al_beta_summary['rank'] = range(1, len(al_beta_summary) + 1)

# Genes with a z-score above 1.
# NOTE(review): this list is not passed to plot_scatter below (annotate_genes=None).
annotate_genes = al_beta_summary.index[al_beta_summary['zscore'] > 1].tolist()

# Save the same figure once per output format
for ext in ['png', 'pdf']:
    filePath = os.path.join(figuresDir, f'zscore_vs_rank_all_genes_unannot.{ext}')
    plot_scatter(
        al_beta_summary=al_beta_summary,
        x_col='rank',
        y_col='zscore',
        annotate_genes=None,
        filePath=filePath,
        annot_fsize=10,
    )