# Transcription Factor Project - Differential Expression Analysis and Minimum Distortion Embedding (Pipeline Steps G-I)
**Robin Anwyl, UCSD Subramaniam Lab**

**Project Goal:** Analyze the hiPSC Perturb-seq dataset from the Mali lab (Nourreddine et al. preprint) to investigate the effects of transcription factor knockouts (TF KOs).

**Notebook Description:** 
-  Dataset: QC'd TF KO (and NTC) dataset
-  Analysis: pseudobulk differential expression analysis (DEA), pairwise Pearson correlation matrix, minimum distortion embedding (MDE)
***

# Import statements

In [1]:
# Using psp_env virtual environment
import sys
import os
# Put the local repository checkout at the front of sys.path (once) so that
# packages inside it can be imported.
repo_root = "/home/ranwyl/KOLF2.1J_Perturbation_Cell_Atlas/"
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

import psp
import gc
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'DejaVu Sans'

# Differential expression analysis - partitioning method

We will carry out differential expression analysis at the gRNA level with PyDESeq2 using a pseudobulk method. The cells for each gRNA are partitioned evenly into pseudoreplicates (two pseudoreplicates if the gRNA has 35 or fewer cells, otherwise three) and compared to an equal number of NTC cells.

In [2]:
import anndata as ad
import numpy as np
import pandas as pd
from scipy import sparse
from tqdm_joblib import tqdm_joblib
from joblib import Parallel, delayed
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from pydeseq2.default_inference import DefaultInference
import gc

def generate_pseudoreplicates_for_DE(adata: ad.AnnData, 
                                     target_value: str, 
                                     ntc_cell_indices: pd.Index, 
                                     rng: np.random.Generator, 
                                     target_column: str = "perturbation", 
                                     layer: str = "counts"):
    """
    Generate independent pseudoreplicates for a given target (gRNA or gene target) 
    and matched NTC cells. Manually set NTC as reference for DE analysis.

    Cells are sampled without replacement and split into disjoint groups
    (2 groups if the target has <= 35 cells, otherwise 3). The counts of each
    group are summed into one pseudobulk sample. The same number of NTC groups
    is built from `ntc_cell_indices`, with at most as many cells per group as
    the target groups have.

    adata: AnnData object holding target and NTC cells
    target_value: value in adata.obs[target_column] identifying the target cells
    ntc_cell_indices: index (anything accepted by adata[...]) selecting NTC cells
    rng: NumPy random Generator used for sampling
    target_column: adata.obs column to search for target_value
    layer: layer with counts; adata.X is used if the layer does not exist

    Returns (counts_df, metadata_df):
        counts_df: pseudobulk samples x genes, int64 counts
        metadata_df: one "condition" column, an ordered categorical with
            categories ["NTC", <target name>]; underscores in the target name
            are replaced by hyphens in sample names and condition labels.

    Raises ValueError if target_column is missing from adata.obs or if there
    are too few target/NTC cells to put at least one cell in every replicate.
    """
    # Fail loudly: callers unpack two return values, so returning None here
    # would only surface later as a confusing unpacking error.
    if target_column not in adata.obs:
        raise ValueError(f"{target_column} not in adata.obs")

    # Create views for target and NTC cells
    target_mask = adata.obs[target_column] == target_value
    target_view = adata[target_mask]
    ntc_view = adata[ntc_cell_indices]

    # Get data matrices from counts layer (fall back to X)
    if layer in adata.layers:
        target_data = target_view.layers[layer]
        ntc_data = ntc_view.layers[layer]
    else:
        target_data = target_view.X
        ntc_data = ntc_view.X

    # Calculate number of cells per replicate
    n_target_cells = target_data.shape[0]
    n_ntc_cells = ntc_data.shape[0]
    n_reps = 2 if n_target_cells <= 35 else 3
    target_rep_size = n_target_cells // n_reps
    ntc_rep_size = min(target_rep_size, n_ntc_cells // n_reps)
    if target_rep_size == 0 or ntc_rep_size == 0:
        # Empty replicates would give all-zero pseudobulk samples
        raise ValueError(
            f"Cannot build {n_reps} pseudoreplicates for {target_value}: "
            f"{n_target_cells} target cells, {n_ntc_cells} NTC cells"
        )
    if ntc_rep_size < target_rep_size:
        print("Warning: Unmatched number of target and NTC cells")

    # Sample cells for all replicates at once (target first, then NTC, so the
    # draws from rng happen in a fixed order). Row i of the reshaped array
    # holds the cells of replicate i.
    target_groups = rng.choice(
        n_target_cells, target_rep_size * n_reps, replace=False
    ).reshape(n_reps, target_rep_size)
    ntc_groups = rng.choice(
        n_ntc_cells, ntc_rep_size * n_reps, replace=False
    ).reshape(n_reps, ntc_rep_size)

    def _sum_rows(matrix, row_indices):
        # Sum only the sampled rows. This works for dense arrays and for scipy
        # sparse matrices (whose sum is 2-D, hence asarray + ravel) and avoids
        # converting the whole matrix to dense.
        summed = matrix[row_indices].sum(axis=0)
        return np.asarray(summed).ravel().astype(np.int64)

    target_bulk = np.vstack([_sum_rows(target_data, rows) for rows in target_groups])
    ntc_bulk = np.vstack([_sum_rows(ntc_data, rows) for rows in ntc_groups])

    # Create sample names (hyphens instead of underscores in the target name)
    target_value = target_value.replace("_", "-")
    sample_names = [f"{target_value}-rep{i+1}" for i in range(n_reps)]
    control_names = [f"NTC-rep{i+1}" for i in range(n_reps)]

    # Combine data
    combined_data = np.vstack([target_bulk, ntc_bulk])
    combined_names = sample_names + control_names

    # Create metadata DF
    metadata_df = pd.DataFrame({
        'condition': [target_value] * n_reps + ['NTC'] * n_reps
    }, index=combined_names)
    # Set NTC as reference for DESeq2 (first category)
    metadata_df["condition"] = pd.Categorical(
        metadata_df["condition"],
        categories=["NTC", target_value],
        ordered=True
        )

    # Create counts DF
    counts_df = pd.DataFrame(
        combined_data,
        index=combined_names,
        columns=adata.var_names
    )

    return counts_df, metadata_df

def differential_expression(adata: ad.AnnData, 
                    target_column: str = "perturbation", 
                    ntc_cells_delimiter: str = "NTC", 
                    alpha: float = 0.05, n_cpus: int = 20, 
                    layer: str = "counts", random_state: int = 42, 
                    shrink_lfcs: bool = False, debug: bool = False):
    """
    Run differential expression analysis on each perturbation in the dataset.

    Each perturbation in adata.obs[target_column] is turned into pseudobulk
    replicates (see generate_pseudoreplicates_for_DE) and compared against
    NTC pseudoreplicates with PyDESeq2. Perturbations are processed in
    parallel with joblib.

    adata: AnnData object with perturbed and NTC cells
    target_column: adata.obs column with the perturbation labels
    ntc_cells_delimiter: label in target_column that marks NTC cells
    alpha: significance threshold passed to DeseqStats
    n_cpus: total CPU budget, split between parallel jobs and each DE run
    layer: layer with counts; adata.X is used if the layer does not exist
    random_state: seed; one child RNG is spawned per perturbation
    shrink_lfcs: if True, apply LFC shrinkage before collecting results
    debug: if True, only the first 3 perturbations are analyzed

    Returns a dict mapping perturbation label -> DESeq2 results DataFrame.

    Raises ValueError if no cells are labelled ntc_cells_delimiter.
    """
    # Identify perturbations and NTC cells
    perturbations = list(adata.obs[target_column].unique())
    if ntc_cells_delimiter not in perturbations:
        raise ValueError(
            f"No '{ntc_cells_delimiter}' cells found in adata.obs['{target_column}']"
        )
    perturbations.remove(ntc_cells_delimiter) # Remove NTC group
    if debug: # Debug mode: run with 3 perturbations
        print(f"Running in debug mode with 3 perturbations")
        perturbations = perturbations[:3]
    ntc_cell_indices = np.where(adata.obs[target_column] == ntc_cells_delimiter)[0]

    # Create RNG object
    de_rng = np.random.default_rng(random_state)
    # Create one child RNG object per perturbation
    streams = de_rng.spawn(len(perturbations))

    # Determine how many CPUs to use per joblib Parallel job
    #   and per DE analysis run. The tiers are contiguous, so 11-19 CPUs
    #   use 3 per run instead of falling through to the largest tier.
    if n_cpus < 3:
        n_cpus_for_DE = max(1, n_cpus)
    elif n_cpus < 20:
        n_cpus_for_DE = 3
    elif n_cpus < 30:
        n_cpus_for_DE = 4
    elif n_cpus < 50:
        n_cpus_for_DE = 5
    else:
        n_cpus_for_DE = 6
    n_jobs = max(1, n_cpus // n_cpus_for_DE)

    # Warning if given layer not found
    if layer not in adata.layers:
         print(f"Warning: {layer} not in adata.layers, using adata.X instead")

    # The pseudoreplicate helper always labels control samples "NTC" in the
    # "condition" column, regardless of how NTC cells are named in
    # target_column, so the contrast must use that label as its reference.
    reference_level = "NTC"

    # Function to run DE analysis on a single perturbation
    quiet = True
    def process_perturbation(target_value: str, rng: np.random.Generator, quiet=quiet):
        # Generate pseudoreplicates
        pseudo_bulk_df, metadata_df = generate_pseudoreplicates_for_DE(
            adata, target_value, ntc_cell_indices, rng, 
            target_column=target_column, layer=layer
        )

        # Set number of CPUs to use for each DE run
        inference = DefaultInference(n_cpus=n_cpus_for_DE) 

        # Read counts modeling and fitting dispersions
        dds = DeseqDataSet(
            counts = pseudo_bulk_df, 
            metadata = metadata_df,
            refit_cooks=True,
            inference=inference,
            quiet=quiet
            )
        dds.deseq2()
        
        # Statistical testing (condition labels use hyphens, not underscores)
        target_hyphenated = target_value.replace('_', '-')
        contrast = ["condition", target_hyphenated, reference_level]
        stat_res = DeseqStats(
            dds, 
            contrast=contrast, 
            alpha=alpha,
            inference=inference,
            quiet=quiet)
        stat_res.summary()
        if shrink_lfcs:
            # Shrink LFCs for downstream analysis
            stat_res.lfc_shrink(coeff=f"condition_{contrast[1]}_vs_{contrast[2]}", 
                                adapt=False)
        results_df = stat_res.results_df

        # Clean up memory
        del pseudo_bulk_df, metadata_df, dds, stat_res
        gc.collect()

        # Results table (with shrunken LFCs only if shrink_lfcs is True)
        return results_df

    # Run DE analysis on all perturbations in parallel
    with tqdm_joblib(desc="Running DE analysis", total=len(perturbations)):
        results = Parallel(n_jobs=n_jobs)(delayed(process_perturbation)(target_value, rng)
            for target_value, rng in zip(perturbations, streams))
    
    results_dict = dict(zip(perturbations, results))
    return results_dict

def build_de_df(results_dict: dict):
    """
    Build DF of all DE results: baseMean, log2FoldChange, lfcSE, 
    stat (LFC divided by LFC SE), pvalue, padj.

    results_dict: maps perturbation name -> DE results DataFrame indexed by gene

    Returns one DataFrame indexed by gene (sorted) whose columns are named
    "<perturbation>_<original column>". Results are combined on the union of
    genes, so a gene absent from one perturbation's results is NaN in that
    perturbation's columns. An empty results_dict gives an empty DataFrame.
    """
    df_list = []
    for gene_target, df in results_dict.items():
        # Sort genes alphabetically
        df = df.sort_index()
        # Prepend perturbation name to each column label
        new_col_names = {col: f"{gene_target}_{col}" for col in df}
        df_list.append(df.rename(columns=new_col_names))

    # pd.concat raises on an empty list, so handle that case explicitly
    if not df_list:
        return pd.DataFrame()

    # Concatenate all DE result DataFrames column-wise (outer join on genes).
    # This also covers the single-perturbation case.
    de_df = pd.concat(df_list, axis=1)
    return de_df.sort_index()

def build_deg_df(de_df: pd.DataFrame, 
                 perturbation: str, 
                 padj_threshold: float = 0.05,
                 lfc_threshold: float = 0):
    """
    Return DE results filtered to only the given perturbation and only
    genes that pass p-adj threshold (alpha) and LFC threshold.

    de_df: wide DE table with columns named "<perturbation>_<statistic>"
    perturbation: perturbation name used as the column prefix
    padj_threshold: keep genes with padj strictly below this value
    lfc_threshold: keep genes with |log2FoldChange| strictly above this value

    Genes whose padj or log2FoldChange is NaN are dropped, because
    comparisons with NaN are False.
    """
    # Match on the exact "<perturbation>_" prefix. A substring/regex match
    # would also pick up other perturbations whose name contains this one
    # (e.g. "X_1" matching "X_10_padj").
    prefix = f"{perturbation}_"
    perturbation_cols = de_df.columns[de_df.columns.str.startswith(prefix)]
    perturbation_df = de_df[perturbation_cols]

    passes_padj = perturbation_df[f"{perturbation}_padj"] < padj_threshold
    passes_lfc = perturbation_df[f"{perturbation}_log2FoldChange"].abs() > lfc_threshold
    return perturbation_df[passes_padj & passes_lfc]


In [5]:
def benchmark_NTC_FDR(
    adata: ad.AnnData,
    gRNA_column: str = "gRNA",
    ntc_cells_delimiter: str = "Non-Targeting",
    layer: str = "counts",
    alpha: float = 0.05,
    n_cpus: int = 16,
    random_state: int = 42,
    debug: bool = True
):
    """
    Benchmark FDR control by comparing NTC sgRNAs against each other.
    For each NTC sgRNA, run DE analysis against all other NTC sgRNAs.
    Calculate distribution of DEGs (padj < alpha) per NTC sgRNA. 
    Determine threshold at which 95% of NTC sgRNAs have fewer DEGs.
    Not batch-aware. Nothing is written to adata.

    adata: AnnData object containing the NTC cells
    gRNA_column: adata.obs column with identifier for each sgRNA, default "gRNA"
    ntc_cells_delimiter: pattern contained in the gRNA column value of NTC
        sgRNAs, default "Non-Targeting"
    layer: AnnData object layer to use for DE analysis, default "counts"
    alpha: significance threshold for DESeq2, default 0.05
    n_cpus: total CPU budget, split between parallel jobs and each DE run
    random_state: seed; one child RNG is spawned per NTC sgRNA
    debug: if True, only the first 5 NTC sgRNAs are analyzed

    Returns (deg_threshold, benchmark_results): the 95th percentile of DEG
    counts as an int, and a DataFrame with columns 'n_DEGs' and 'sgRNA'
    sorted by n_DEGs descending. Returns None (after printing a message) if
    there are fewer than 3 NTC sgRNAs or no comparison could be run.

    Raises ValueError if gRNA_column is missing or no NTC cells are found.
    """
    # Validate the column before it is used to build the NTC mask
    if gRNA_column not in adata.obs.columns:
        raise ValueError(f"Column '{gRNA_column}' not found in adata.obs")

    # Get AnnData with only NTC cells based on gRNA_column and delimiter
    ntc_mask = adata.obs[gRNA_column].astype(str).str.contains(ntc_cells_delimiter)
    if not ntc_mask.any():
        raise ValueError(f"No NTC cells found in {gRNA_column} containing "
                         f"'{ntc_cells_delimiter}'")
    ntc_adata = adata[ntc_mask].copy()

    # Get unique NTC sgRNAs
    ntc_sgRNAs = list(ntc_adata.obs[gRNA_column].unique())
    if len(ntc_sgRNAs) < 3:
        print(f"Not enough unique NTC sgRNAs for benchmarking (need at least 3, found {len(ntc_sgRNAs)})")
        return
    if debug:
        print("Running in debug mode with 5 NTC sgRNA")
        ntc_sgRNAs = ntc_sgRNAs[:5]

    # Create RNG object from the seed
    de_rng = np.random.default_rng(random_state)
    # Create one child RNG object per NTC sgRNA
    streams = de_rng.spawn(len(ntc_sgRNAs))

    # Determine how many CPUs to use per joblib Parallel job
    #   and per DE analysis run. The tiers are contiguous, so 11-19 CPUs
    #   (including the default of 16) use 3 per run instead of falling
    #   through to the largest tier.
    if n_cpus < 3:
        n_cpus_for_DE = max(1, n_cpus)
    elif n_cpus < 20:
        n_cpus_for_DE = 3
    elif n_cpus < 30:
        n_cpus_for_DE = 4
    elif n_cpus < 50:
        n_cpus_for_DE = 5
    else:
        n_cpus_for_DE = 6
    n_jobs = max(1, n_cpus // n_cpus_for_DE)
    
    # Warning if given layer not found
    if layer not in adata.layers:
        print(f"Warning: {layer} not found in adata.layers, using adata.X instead")

    # generate_pseudoreplicates_for_DE labels its control samples "NTC" in the
    # "condition" column, so the contrast has to name that level (not the
    # sgRNA delimiter) as the reference.
    reference_level = "NTC"

    # Helper function: DE of one NTC sgRNA against all other NTC cells
    def process_ntc_gRNA(target_ntc_gRNA: str,
                         rng: np.random.Generator):
        is_target = (ntc_adata.obs[gRNA_column] == target_ntc_gRNA).to_numpy()

        # Check if there are enough target cells
        n_target_cells = int(is_target.sum())
        if n_target_cells < 10:
            print(f"Skipping {target_ntc_gRNA}: too few cells ({n_target_cells})")
            return
        
        # Get other NTC cells (excluding the target gRNA)
        other_ntc_cells = ntc_adata.obs.index[~is_target]
        if len(other_ntc_cells) < 10:
            print(f"Skipping {target_ntc_gRNA}: too few other NTC cells ({len(other_ntc_cells)})")
            return
        
        # Generate pseudoreplicates. The target sgRNA is selected directly
        # through gRNA_column and the remaining NTC cells are passed as the
        # control set, so no temporary copy of the AnnData is needed.
        pseudo_bulk_df, metadata_df = generate_pseudoreplicates_for_DE(
            ntc_adata, target_ntc_gRNA, other_ntc_cells, rng, 
            target_column=gRNA_column, layer=layer
        )

        # Set number of CPUs to use for each DE run
        inference = DefaultInference(n_cpus=n_cpus_for_DE) 

        # Read counts modeling and fitting dispersions
        dds = DeseqDataSet(
            counts = pseudo_bulk_df, 
            metadata = metadata_df,
            refit_cooks=True,
            inference=inference,
            quiet=True
            )
        dds.deseq2()
        
        # Statistical testing (condition labels use hyphens, not underscores)
        target_hyphenated = target_ntc_gRNA.replace('_', '-')
        contrast = ["condition", target_hyphenated, reference_level]
        stat_res = DeseqStats(
            dds, 
            contrast=contrast, 
            alpha=alpha,
            inference=inference,
            quiet=True)
        stat_res.summary()
        de_results = stat_res.results_df

        # Clean up memory
        del pseudo_bulk_df, metadata_df, dds, stat_res
        gc.collect()

        return de_results

    # Run DE analysis on all NTC sgRNA in parallel
    with tqdm_joblib(desc="Running DE analysis", total=len(ntc_sgRNAs)):
        de_results = Parallel(n_jobs=n_jobs)(
            delayed(process_ntc_gRNA)(target_value, rng) 
            for target_value, rng in zip(ntc_sgRNAs, streams))
    results_dict = dict(zip(ntc_sgRNAs, de_results))

    # Check for successful comparisons
    if not results_dict:
        print("No successful comparisons. Check if the data layer contains integer counts.")
        return
    
    # Count DEGs per sgRNA; skipped sgRNAs (None results) are left out.
    # NaN padj values compare as False and are therefore not counted.
    deg_counts = {
        sgRNA: int((result_df['padj'] < alpha).sum())
        for sgRNA, result_df in results_dict.items()
        if result_df is not None
    }

    # Check if we have any DEG counts
    if not deg_counts:
        print("No DEGs found in any comparison.")
        return

    # Create results DataFrame
    benchmark_results = pd.DataFrame.from_dict(deg_counts, orient='index', columns=['n_DEGs'])
    benchmark_results['sgRNA'] = benchmark_results.index

    benchmark_results.index.name = 'key'
    benchmark_results = benchmark_results.sort_values('n_DEGs', ascending=False)
    
    # Calculate 95th percentile threshold (FDR 0.05)
    deg_threshold = np.percentile(benchmark_results['n_DEGs'].values, 95)
    
    # Print statistics
    print(f"NTC Benchmark Statistics:")
    print(f"  • FDR 0.05 threshold: {int(deg_threshold)} DEGs")
    print(f"  • NTC comparisons analyzed: {len(benchmark_results)}")
    print(f"  • Mean DEGs per comparison: {benchmark_results['n_DEGs'].mean():.1f}")
    print(f"  • Median DEGs per comparison: {benchmark_results['n_DEGs'].median():.1f}")
    print(f"  • 95% of comparisons have < {int(deg_threshold)} DEGs")

    return int(deg_threshold), benchmark_results

### Test

In [3]:
# Read the filtered .h5ad file with the pipeline's reader and display the
# resulting AnnData summary.
data_path = "/home/ranwyl/data_tf_project/KOLF_Pan_Genome_Alpha_Knockdown_Energy_Test_Filtered.h5ad"
adata = psp.utils.read_anndata(data_path)
adata

AnnData object with n_obs × n_vars = 218299 × 20043
    obs: 'gRNA', 'n_gRNA', 'n_gRNA_UMIs', 'gene_target', 'celltype', 'perturbation_type', 'n_UMI_counts', 'n_genes', 'perturbed', 'channel', 'perturbation', 'gene_target_ensembl_id', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'gene_target_expression (CPM)', 'NTC_target_gene_expression (CPM)', 'target_knockdown', 'perturbation_edist', 'perturbation_pvalue', 'perturbation_significant'
    var: 'gene_ids', 'feature_types', 'n_UMI_counts', 'n_cells', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'etest_results', 'etest_results_params'
    layers: 'counts'

In [35]:
# Build a small test dataset: every cell whose obs.perturbed equals the string
# "False", plus cells whose gene_target is one of the selected test genes.
# .copy() makes test_adata an independent object rather than a view of adata.
test_kds = ["POU5F1", "NANOG", "ZNF521", "SMARCE1", "POU3F2", "PURG", "HOXD13"]
test_adata = adata[(adata.obs.perturbed == "False") | 
                   (adata.obs.gene_target.isin(test_kds))].copy()
test_adata

AnnData object with n_obs × n_vars = 14349 × 20043
    obs: 'gRNA', 'n_gRNA', 'n_gRNA_UMIs', 'gene_target', 'celltype', 'perturbation_type', 'n_UMI_counts', 'n_genes', 'perturbed', 'channel', 'perturbation', 'gene_target_ensembl_id', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'gene_target_expression (CPM)', 'NTC_target_gene_expression (CPM)', 'target_knockdown', 'perturbation_edist', 'perturbation_pvalue', 'perturbation_significant'
    var: 'gene_ids', 'feature_types', 'n_UMI_counts', 'n_cells', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'etest_results', 'etest_results_params'
    layers: 'counts'

In [36]:
# Number of cells per gRNA among cells whose obs.perturbed equals "True"
test_adata[test_adata.obs.perturbed == "True"].obs.gRNA.value_counts()

gRNA
PURG_1      94
ZNF521_2    93
ZNF521_1    83
NANOG_2     81
PURG_3      63
POU5F1_3    50
POU3F2_2    50
POU3F2_1    43
NANOG_1     38
POU5F1_2    29
POU5F1_1    27
Name: count, dtype: int64

In [37]:
# Run the pseudoreplicate DE analysis on the test dataset with default arguments
test_de_results = differential_expression(test_adata)

Running DE analysis:   0%|          | 0/11 [00:00<?, ?it/s]

  self.fit_dispersion_prior()
  self.fit_dispersion_prior()


In [38]:
# Merge the per-perturbation results into one wide table and preview it
test_de_df = build_de_df(test_de_results)
test_de_df.head(3)

Unnamed: 0,POU5F1_1_baseMean,POU5F1_1_log2FoldChange,POU5F1_1_lfcSE,POU5F1_1_stat,POU5F1_1_pvalue,POU5F1_1_padj,NANOG_1_baseMean,NANOG_1_log2FoldChange,NANOG_1_lfcSE,NANOG_1_stat,...,NANOG_2_lfcSE,NANOG_2_stat,NANOG_2_pvalue,NANOG_2_padj,POU3F2_1_baseMean,POU3F2_1_log2FoldChange,POU3F2_1_lfcSE,POU3F2_1_stat,POU3F2_1_pvalue,POU3F2_1_padj
A1BG,0.754488,-1.180151,3.099653,-0.380737,0.703399,,0.621777,-1.514038,2.350871,-0.644033,...,1.293201,-0.879252,0.379264,,0.892897,1.740662,2.014336,0.864137,0.387513,0.997639
A1BG-AS1,0.0,,,,,,0.184549,-1.042994,4.241415,-0.245907,...,2.130628,-0.063235,0.94958,,0.0,,,,,
A2M,0.0,,,,,,0.0,,,,...,,,,,0.0,,,,,


In [9]:
def get_deg_set(gRNA, de_df, adata, padj_threshold=0.05, lfc_threshold=0):
    """
    Collect the DEGs of one gRNA and print a one-line summary.

    gRNA: gRNA name; used as the column prefix in de_df and matched
        against adata.obs.gRNA to count cells
    de_df: wide DE table as produced by build_de_df
    adata: AnnData object whose obs has a 'gRNA' column
    padj_threshold, lfc_threshold: passed through to build_deg_df

    Returns (deg_df, degs): the filtered DE table for this gRNA and the
    set of gene names in its index.
    """
    deg_df = build_deg_df(de_df, gRNA,
                          padj_threshold=padj_threshold,
                          lfc_threshold=lfc_threshold)
    degs = set(deg_df.index)
    # Count matching cells from the obs column; no AnnData view is needed
    n_cells = int((adata.obs.gRNA == gRNA).sum())
    print(f"gRNA {gRNA} ({n_cells} cells) "
          f"has {len(degs)} DEGs")
    return deg_df, degs

In [10]:
# DEG table and DEG set for each of the three POU5F1 gRNAs, followed by the
# size of the union of the three sets.
POU5F1_1_deg_df, POU5F1_1_degs = get_deg_set("POU5F1_1", test_de_df, test_adata)
POU5F1_2_deg_df, POU5F1_2_degs = get_deg_set("POU5F1_2", test_de_df, test_adata)
POU5F1_3_deg_df, POU5F1_3_degs = get_deg_set("POU5F1_3", test_de_df, test_adata)
print(f"POU5F1 has {len(POU5F1_1_degs | POU5F1_2_degs | POU5F1_3_degs)} total DEGs")

gRNA POU5F1_1 (27 cells) has 29 DEGs
gRNA POU5F1_2 (29 cells) has 34 DEGs
gRNA POU5F1_3 (50 cells) has 92 DEGs
POU5F1 has 104 total DEGs


In [11]:
# DEGs found for exactly one POU5F1 gRNA: each set minus the other two.
print(f"POU5F1_1 has {len(POU5F1_1_degs - POU5F1_2_degs - POU5F1_3_degs)} unique DEGs")
print(f"POU5F1_2 has {len(POU5F1_2_degs - POU5F1_1_degs - POU5F1_3_degs)} unique DEGs")
# The label now matches the set being reported (POU5F1_3, not POU5F1_2)
print(f"POU5F1_3 has {len(POU5F1_3_degs - POU5F1_2_degs - POU5F1_1_degs)} unique DEGs")

POU5F1_1 has 5 unique DEGs
POU5F1_2 has 6 unique DEGs
POU5F1_2 has 58 unique DEGs


In [39]:
# Run the psp pipeline's own DE function on a separate copy of the test data
# so its results can be compared with the pseudoreplicate method.
# NOTE(review): the copy presumably protects test_adata from in-place changes
# made by the pipeline function — confirm.
test_adata_pipeline = test_adata.copy()
pipeline_de_results = psp.de.differential_expression(test_adata_pipeline, plot_degs=False)

Processing perturbations:   0%|          | 0/11 [00:00<?, ?it/s]




Updating adata object with DEG counts
2025-10-23 00:36:51 - INFO - Completed differential expression analysis in 117.68 seconds


In [40]:
# Preview the pipeline's result table for one gRNA
pipeline_de_results["POU5F1_1"].head(3)

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
CTNNBIP1,10.810082,1.777345,0.504368,3.523907,0.000425,0.009745
NUDC,16.745314,-1.107411,0.36814,-3.008124,0.002629,0.036655
CSMD2,9.110559,-3.178829,0.692571,-4.589896,4e-06,0.000229


In [41]:
def print_DE_results(de_results, adata, alpha=0.05):
    """
    Print the number of DEGs (padj < alpha) for every gRNA, in sorted
    gRNA order, and return a dict mapping gRNA -> set of DEG names.

    de_results: dict mapping gRNA -> DE results DataFrame with a 'padj' column
    adata: AnnData object whose obs.gRNA is used to report cell counts
    alpha: adjusted p-value cutoff
    """
    degs_by_gRNA = {}
    for name in sorted(de_results):
        table = de_results[name]
        significant = table.loc[table["padj"] < alpha]
        hits = set(significant.index.tolist())
        n_cells = adata[adata.obs.gRNA == name].shape[0]
        # Singular label only when exactly one DEG was found
        if len(hits) == 1:
            label = "DEG"
        else:
            label = "DEGs"
        print(f"gRNA {name} ({n_cells} cells)"
              f" has {len(hits)} {label}")
        degs_by_gRNA[name] = hits
    return degs_by_gRNA

In [42]:
# DEG counts per gRNA from the pseudoreplicate DE analysis
print("Independent pseudoreplicate DE analysis:")
my_degs = print_DE_results(test_de_results, test_adata)

Independent pseudoreplicate DE analysis:
gRNA NANOG_1 (38 cells) has 1 DEG
gRNA NANOG_2 (81 cells) has 38 DEGs
gRNA POU3F2_1 (43 cells) has 0 DEGs
gRNA POU3F2_2 (50 cells) has 52 DEGs
gRNA POU5F1_1 (27 cells) has 29 DEGs
gRNA POU5F1_2 (29 cells) has 39 DEGs


gRNA POU5F1_3 (50 cells) has 95 DEGs
gRNA PURG_1 (94 cells) has 0 DEGs
gRNA PURG_3 (63 cells) has 1 DEG
gRNA ZNF521_1 (83 cells) has 43 DEGs
gRNA ZNF521_2 (93 cells) has 0 DEGs


In [43]:
# DEG counts per gRNA from the pipeline's DE analysis, for comparison
print("Pipeline (bootstrapping method) DE analysis:")
pipeline_degs = print_DE_results(pipeline_de_results, test_adata_pipeline)

Pipeline (bootstrapping method) DE analysis:
gRNA NANOG_1 (38 cells) has 69 DEGs
gRNA NANOG_2 (81 cells) has 162 DEGs
gRNA POU3F2_1 (43 cells) has 0 DEGs
gRNA POU3F2_2 (50 cells) has 423 DEGs
gRNA POU5F1_1 (27 cells) has 463 DEGs
gRNA POU5F1_2 (29 cells) has 566 DEGs
gRNA POU5F1_3 (50 cells) has 516 DEGs
gRNA PURG_1 (94 cells) has 0 DEGs
gRNA PURG_3 (63 cells) has 174 DEGs
gRNA ZNF521_1 (83 cells) has 270 DEGs
gRNA ZNF521_2 (93 cells) has 0 DEGs


In [None]:
# Build a combined table from the pipeline results with the pipeline's
# private helper.
# NOTE(review): save=False presumably skips writing the table to disk — confirm.
from psp.de.differential_expression import _save_DEG_df
pipeline_de_df = _save_DEG_df(pipeline_de_results, save=False)

In [None]:
# Preview the combined pipeline table
pipeline_de_df.head()

Unnamed: 0,POU5F1_3_DEGs,POU5F1_3_L2FC,POU5F1_3_Adj_P,POU5F1_2_DEGs,POU5F1_2_L2FC,POU5F1_2_Adj_P,ZNF521_1_DEGs,ZNF521_1_L2FC,ZNF521_1_Adj_P,POU5F1_1_DEGs,...,POU5F1_1_Adj_P,NANOG_2_DEGs,NANOG_2_L2FC,NANOG_2_Adj_P,NANOG_1_DEGs,NANOG_1_L2FC,NANOG_1_Adj_P,ZNF521_2_DEGs,ZNF521_2_L2FC,ZNF521_2_Adj_P
0,VAT1L,-4.475211,0.042445,MGAT4C,-6.266861,0.02414,COX14,-3.915407,0.001929,ENSG00000260834,...,0.048577,ENSG00000228714,-4.129991,0.0007784607,MT1X,-2.055145,4.815207e-06,,,
1,USP31,-3.95413,0.010992,VRTN,-3.701271,0.018198,RAE1,-2.576468,0.042923,KCTD8,...,0.011971,ENSG00000253507,-2.51395,4.824561e-07,ADCY2,-1.807418,1.110138e-07,,,
2,LNCPRESS2,-3.92427,0.000737,GNL2,-3.499515,0.03384,ANTKMT,-2.573847,0.045386,UTP20,...,0.017877,FOXO3,-2.320816,0.003552015,ENSG00000289533,-1.727329,3.692167e-07,,,
3,MT2A,-3.785473,0.015254,RYR2,-3.44321,2.4e-05,SAMD4A,-2.271486,0.041627,LINC00698,...,0.001034,SLC7A11,-2.011189,0.01100124,ESRG,-1.680388,1.741802e-05,,,
4,UNC13A,-3.722482,0.000227,RRM2B,-3.425283,0.040953,MGAT4C,-1.918426,0.000167,OPCML,...,0.000307,ENSG00000253636,-1.95089,0.01409903,TMEM106C,-1.679746,0.001448709,,,


# Old DE code

Write out results

In [None]:
# Pickle the DE results table.
# NOTE(review): DE_results is not defined in the visible part of this notebook;
# it must come from an earlier (removed) cell — confirm before running.
filepath_pkl = "/home/ranwyl/results_tf_project/DE_results_10_2_2025.pkl"
DE_results.to_pickle(filepath_pkl)

Rename genes that are listed by Ensembl ID but have a gene name

In [None]:
# Collect the index entries that start with "ENSG", i.e. genes listed by
# Ensembl ID; print how many there are and the first five.
all_genes = DE_results.index.tolist()
ensg_genes = [g for g in all_genes if g.startswith("ENSG")]
print(len(ensg_genes))
print(ensg_genes[:5])

In [None]:
# Query gene symbols for all Ensembl IDs (human).
# NOTE(review): get_client is not imported in the visible notebook; presumably
# `from mygene import get_client` — confirm and add the import.
mg = get_client('gene')
ensembl_results_all = mg.querymany(ensg_genes, fields='symbol', species='human')

Manually search for the genes with duplicate hits on GeneCards. To break ties, use the highest GeneCards Inferred Functionality Score (GIFtS). If there is a tie between highest scoring gene symbols, keep the gene as its Ensembl ID.

In [None]:
# Change this
# Manually chosen symbols for Ensembl IDs with duplicate hits. These override
# the query results when the mapping is built.
# Change this whenever the query is re-run and the duplicate hits differ.
dup_hits = {'ENSG00000234352': 'LOC349160', 'ENSG00000249738':'IL12B-AS1', 'ENSG00000257545':'LOC100287944'}

Rename genes

In [None]:
# Map each queried Ensembl ID to its returned symbol, keeping only hits that
# actually carry a symbol; a later hit for the same query replaces an earlier one.
ensembl_to_gene = {
    hit.get('query'): hit.get('symbol')
    for hit in ensembl_results_all
    if hit.get('symbol')
}
# Manual choices for duplicate hits take precedence over the query results
ensembl_to_gene.update(dup_hits) # Change duplicate hits
print(len(ensembl_to_gene))

In [None]:
def rename_ensembl_genes(de_df, ensembl_to_gene_dict):
    """
    Rename genes in DataFrame.

    de_df: DataFrame indexed by gene name / Ensembl ID
    ensembl_to_gene_dict: maps Ensembl ID -> gene symbol

    Returns a new DataFrame whose index entries found in the mapping are
    replaced by their symbols; de_df itself is not modified. Prints how many
    index entries were actually renamed (mapping keys that do not occur in
    the index are not counted).
    """
    n_converted = sum(gene in ensembl_to_gene_dict for gene in de_df.index)
    de_df_renamed = de_df.rename(index=ensembl_to_gene_dict)
    print(f"Converted {n_converted} Ensembl IDs to gene symbols")
    return de_df_renamed

In [None]:
# Apply the Ensembl ID -> gene symbol mapping to the DE results index
DE_results_renamed = rename_ensembl_genes(DE_results, ensembl_to_gene)

In [None]:
# Pickle the renamed DE results table (this reassigns filepath_pkl)
filepath_pkl = "/home/ranwyl/results_tf_project/DE_results_gene_names_09-2025.pkl"
DE_results_renamed.to_pickle(filepath_pkl)