# Transcription Factor Project - Differential Expression Analysis and Minimum Distortion Embedding (Pipeline Steps G-I)
**Robin Anwyl, UCSD Subramaniam Lab**

**Project Goal:** Analyze the hiPSC Perturb-seq dataset from the Mali lab (Nourreddine et al preprint) to investigate the effects of transcription factor knockouts (TF KOs)

**Notebook Description:** 
-  Dataset: QC'd TF KO (and NTC) dataset
-  Analysis: pseudobulk differential expression analysis (DEA), pairwise Pearson correlation matrix, minimum distortion embedding (MDE)
***

# Import statements and global random seed

In [40]:
import anndata as ad
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import scipy as sp
import gc
from tqdm.auto import tqdm
from statsmodels.stats.multitest import multipletests
from scipy.stats import mannwhitneyu
from scipy.stats import ks_2samp
from scipy.stats import gamma
from scipy.stats import spearmanr
import seaborn as sns
from joblib import Parallel, delayed
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib
import networkx as nx
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
from pydeseq2.default_inference import DefaultInference
import pickle as pkl
from IPython.display import display  #type: ignore
from scipy.spatial.distance import pdist, squareform #type: ignore
import plotly.express as px #type: ignore
import plotly.graph_objects as go #type: ignore
import pymde #type: ignore
import plotly.io as pio #type: ignore
from sklearn.preprocessing import StandardScaler #type: ignore
from sklearn.neighbors import kneighbors_graph #type: ignore
from sklearn.manifold import SpectralEmbedding
import igraph as ig #type: ignore
import leidenalg #type: ignore
from biothings_client import get_client
import decoupler as dc
import sys
import os
sys.path.insert(0, "/home/ranwyl/data_tf_project/")
import perturb_seq_heuristic_pipeline as qc
import pan_genome_analysis_functions as qc2

# Set global random seed as safety measure for reproducibility
np.random.seed(92093)

# Step G: Differential expression analysis - partitioning method

We will carry out differential expression analysis with PyDESeq2 using a pseudobulk method. The cells for each KO are partitioned evenly into pseudoreplicates and compared to an equal number of NTC cells.

In [2]:
adata_alpha = ad.read_h5ad("/home/ranwyl/data_tf_project/Aggregate_ALPHA_Core_Cells_TF_KO.h5ad")
adata_beta = ad.read_h5ad("/home/ranwyl/data_tf_project/Aggregate_BETA_Core_Cells_TF_KO.h5ad")
print(f"Total cells per batch: ALPHA {adata_alpha.shape[0]}, BETA {adata_beta.shape[0]}")

Total cells per batch: ALPHA 65308, BETA 12604


In [1]:
def generate_pseudoreplicates_for_DE(adata, gene_target, ntc_cells, rng, n_reps=None):
    """
    Generate n_reps independent (partitioned) pseudoreplicates for a given gene target
    and the same number of size-matched NTC pseudoreplicates. NTC is manually set as
    the reference level for DE analysis.

    Cells are sampled without replacement and split into n_reps disjoint groups of
    equal size; the counts of each group are summed into one pseudobulk profile.

    Parameters
    ----------
    adata : object exposing .X (cells x genes), .obs['gene_target'] and .var_names
    gene_target : label in adata.obs['gene_target'] to build replicates for
    ntc_cells : positional row indices (into adata) of the NTC cells
    rng : numpy Generator object
    n_reps : number of pseudoreplicates per condition. If None (default), 2 is used
        when the gene target has <= 35 cells and 3 otherwise.

    Returns
    -------
    pseudo_bulk_df : DataFrame (2*n_reps samples x genes); genes whose summed count
        over all samples is < 1 are dropped
    metadata_df : DataFrame with one categorical 'condition' column
        (categories ["NTC", gene_target]), indexed like pseudo_bulk_df

    Raises
    ------
    ValueError : if n_reps < 1, if the gene target has fewer than n_reps cells, or if
        there are fewer NTC cells than sampled gene target cells
    """
    data_matrix = adata.X
    target_indices = np.where(adata.obs['gene_target'] == gene_target)[0]

    # Decide how many replicates to build and how many cells go in each
    n_cells = len(target_indices)
    if n_reps is None:
        n_reps = 2 if n_cells <= 35 else 3
    if n_reps < 1:
        raise ValueError(f"n_reps must be >= 1, got {n_reps}")
    rep_size = n_cells // n_reps
    if rep_size == 0:
        raise ValueError(
            f"{gene_target} has {n_cells} cells, too few for {n_reps} pseudoreplicates")
    sample_size = rep_size * n_reps
    if sample_size > len(ntc_cells):
        raise ValueError(
            f"Need {sample_size} NTC cells to match {gene_target}, "
            f"but only {len(ntc_cells)} are available")

    def _sum_partitions(cell_indices):
        # Draw sample_size cells without replacement, then sum each disjoint slice
        sampled = rng.choice(cell_indices, sample_size, replace=False)
        profiles = []
        for rep in range(n_reps):
            rep_indices = sampled[rep * rep_size:(rep + 1) * rep_size]
            # asarray + ravel gives a flat vector whether .sum() returned an
            # ndarray or a 1 x genes matrix
            profiles.append(np.asarray(data_matrix[rep_indices].sum(axis=0)).ravel())
        return profiles

    # Gene target is sampled first, then NTC (keeps the RNG draw order fixed)
    target_bulk = _sum_partitions(target_indices)
    ntc_bulk = _sum_partitions(ntc_cells)

    sample_names = [f"{gene_target}_rep_{rep+1}" for rep in range(n_reps)]
    sample_names.extend(f"NTC_rep_{rep+1}" for rep in range(n_reps))

    # Build counts DataFrame
    pseudo_bulk_df = pd.DataFrame(
        np.vstack(target_bulk + ntc_bulk), index=sample_names, columns=adata.var_names)
    # Remove genes with zero counts across all perturbed and NTC pseudoreplicates
    pseudo_bulk_df = pseudo_bulk_df.loc[:, pseudo_bulk_df.sum(axis=0) >= 1]

    # Build metadata DataFrame. Conditions are assigned directly rather than parsed
    # back out of the sample names, so gene targets containing "_" are labelled correctly.
    conditions = [gene_target] * n_reps + ["NTC"] * n_reps
    metadata_df = pd.DataFrame({"condition": conditions}, index=sample_names)
    # Set NTC as reference
    metadata_df["condition"] = pd.Categorical(
        metadata_df["condition"],
        categories=["NTC", gene_target],
        ordered=True
        )

    return pseudo_bulk_df, metadata_df


def de_analysis(data, metadata, contrast, alpha=0.05, n_cpus=16):
    """
    Run differential expression analysis on a single gene target using PyDESeq2.

    data / metadata are passed to DeseqDataSet as counts / metadata with the design
    "~condition". contrast is forwarded to DeseqStats, and contrast[1] names the
    level whose coefficient is shrunk. Returns the results_df of the DeseqStats
    object after LFC shrinkage.
    """
    # A single inference object (limited to n_cpus CPUs) serves both fitting and testing
    engine = DefaultInference(n_cpus=n_cpus)

    # Model the read counts and fit dispersions
    dataset = DeseqDataSet(
        counts=data,
        metadata=metadata,
        design="~condition",
        refit_cooks=True,
        inference=engine,
        quiet=True,
    )
    dataset.deseq2()

    # Statistical testing for the requested contrast
    stats = DeseqStats(
        dataset,
        contrast=contrast,
        alpha=alpha,
        inference=engine,
        quiet=True,
    )
    stats.summary()

    # Shrink LFCs for downstream analysis
    shrink_coeff = f"condition[T.{contrast[1]}]"
    stats.lfc_shrink(coeff=shrink_coeff, adapt=False)
    return stats.results_df


def de_analysis_dataset(adata, n_cpus=16, n_reps=2, 
                        gene_target_obs_column="gene_target", ntc_cells_delimiter="NTC", 
                        alpha=0.05, de_rng=None):
    """
    Run differential expression analysis on each gene target in the dataset.

    Every label in adata.obs[gene_target_obs_column] other than ntc_cells_delimiter
    is compared against NTC pseudoreplicates. Returns a dict mapping each gene
    target to the result returned by de_analysis.

    Side effect: adata.X is overwritten with a copy of adata.layers["counts"].
    """
    # Work from raw counts
    adata.X = adata.layers["counts"].copy()

    # Positional indices of NTC cells, and the list of perturbed gene targets
    target_labels = adata.obs[gene_target_obs_column]
    ntc_cells = np.where(target_labels == ntc_cells_delimiter)[0]
    gene_targets = list(target_labels.unique())
    gene_targets.remove(ntc_cells_delimiter)  # Drop the control group

    # Fall back to a fixed-seed generator when the caller supplies none,
    # then derive one child generator per gene target
    if de_rng is None:
        de_rng = np.random.default_rng(42)
    streams = de_rng.spawn(len(gene_targets))

    def _run_single_target(gene_target, rng):
        # Pseudobulk one gene target against NTC, then test it
        counts_df, sample_info_df = generate_pseudoreplicates_for_DE(
            adata, gene_target, ntc_cells, rng=rng, n_reps=n_reps
        )
        return de_analysis(
            counts_df,
            sample_info_df,
            contrast=["condition", gene_target, ntc_cells_delimiter],
            alpha=alpha,
            n_cpus=n_cpus,
        )

    # NOTE(review): n_cpus is used both as the joblib worker count and as the
    # per-worker DefaultInference CPU count - confirm this nesting is intended.
    jobs = (
        delayed(_run_single_target)(gene_target, rng)
        for gene_target, rng in zip(gene_targets, streams)
    )
    with tqdm_joblib(desc="Running DE analysis", total=len(gene_targets)):
        results = Parallel(n_jobs=n_cpus)(jobs)

    return dict(zip(gene_targets, results))

def build_de_df(results_dict):
    """
    Combine per-target DE result tables into one wide DataFrame.

    Each table's columns are prefixed with "<gene_target>_" and the tables are
    joined column-wise on the gene index. pd.concat's default outer join keeps
    the union of genes, so a gene missing from one target's table gets NaN in
    that target's columns. Rows are returned sorted by gene name.
    """
    prefixed_tables = [
        target_df.sort_index().add_prefix(f"{gene_target}_")
        for gene_target, target_df in results_dict.items()
    ]
    combined = pd.concat(prefixed_tables, axis=1)
    return combined.sort_index()

def build_deg_df(de_df, tf_kd, lfc_threshold=0, padj_threshold=0.05):
    """
    Return the rows of de_df that are differentially expressed for one target.

    Only the columns belonging to tf_kd (those named "<tf_kd>_<stat>") are kept.
    A gene is retained when |log2FoldChange| > lfc_threshold and
    padj < padj_threshold; genes with NaN padj never pass.
    """
    # Match on the exact "<tf_kd>_" prefix. A substring/regex match would also pull
    # in columns of other targets whose name contains tf_kd (e.g. SOX2 vs SOX21).
    tf_kd_cols = de_df.columns[de_df.columns.str.startswith(f"{tf_kd}_")]
    tf_kd_df = de_df[tf_kd_cols]
    passes_lfc = tf_kd_df[f"{tf_kd}_log2FoldChange"].abs() > lfc_threshold
    passes_padj = tf_kd_df[f"{tf_kd}_padj"] < padj_threshold
    tf_kd_deg_df = tf_kd_df[passes_lfc & passes_padj]
    return tf_kd_deg_df

### Test

In [None]:
test_gene_targets = ["POU5F1", "NANOG", "NTC"]
adata_test = adata_alpha[adata_alpha.obs.gene_target.isin(test_gene_targets)].copy()
adata_test

AnnData object with n_obs × n_vars = 14034 × 20200
    obs: 'gRNA', 'n_gRNA', 'n_gRNA_UMIs', 'gene_target', 'celltype', 'perturbation_type', 'n_UMI_counts', 'n_genes', 'perturbed', 'channel', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'run', 'gene_target_ensembl_id', 'gene_target_expression (CPM)', 'NTC_target_gene_expression (CPM)', 'target_knockdown', 'target_knockdown_z_score', 'ed_category', 'anomaly_score'
    var: 'gene_ids', 'feature_types', 'n_UMI_counts', 'n_cells', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mean', 'std', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: '

In [None]:
adata_test.obs.gene_target.unique().tolist()

['NTC', 'POU5F1', 'NANOG']

In [None]:
adata_test[adata_test.obs.gene_target == "POU5F1"].shape[0]

96

In [None]:
adata_test[adata_test.obs.gene_target == "NANOG"].shape[0]

96

In [None]:
results_dict_test_2reps = de_analysis_dataset(adata_test, n_cpus=25, n_reps=2)

Running DE analysis:   0%|          | 0/2 [00:00<?, ?it/s]

  self._fit_parametric_dispersion_trend(vst)
  self.fit_dispersion_prior()
  self.fit_dispersion_prior()


In [None]:
test_de_df_2reps = build_de_df(results_dict_test_2reps)
test_de_df_2reps.head()

Unnamed: 0,POU5F1_baseMean,POU5F1_log2FoldChange,POU5F1_lfcSE,POU5F1_stat,POU5F1_pvalue,POU5F1_padj,NANOG_baseMean,NANOG_log2FoldChange,NANOG_lfcSE,NANOG_stat,NANOG_pvalue,NANOG_padj
A1BG,3.971359,-0.315204,1.01212,-0.635116,0.525353,,2.959269,-0.182179,1.008075,-0.376555,0.706505,
A1BG-AS1,0.229447,0.154118,1.967397,0.309621,0.756849,,0.986616,-0.010507,1.227681,-0.028067,0.977609,
A2M,0.498122,-0.019972,1.281926,-0.057165,0.954413,,,,,,,
A2M-AS1,0.229447,0.154118,1.967397,0.309621,0.756849,,0.239337,0.100094,2.483595,0.261954,0.793357,
A2ML1,1.5241,-0.519355,1.206868,-1.133351,0.257067,,1.985454,-0.216242,1.07907,-0.473599,0.635786,


In [None]:
pou5f1_df_2reps = build_deg_df(test_de_df_2reps, "POU5F1", lfc_threshold=0)
print(pou5f1_df_2reps.shape[0])
nanog_df_2reps = build_deg_df(test_de_df_2reps, "NANOG", lfc_threshold=0)
print(nanog_df_2reps.shape[0])

18
41


In [None]:
results_dict_test_3reps = de_analysis_dataset(adata_test, n_cpus=25, n_reps=3)

Running DE analysis:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
test_de_df_3reps = build_de_df(results_dict_test_3reps)
test_de_df_3reps.head()

Unnamed: 0,POU5F1_baseMean,POU5F1_log2FoldChange,POU5F1_lfcSE,POU5F1_stat,POU5F1_pvalue,POU5F1_padj,NANOG_baseMean,NANOG_log2FoldChange,NANOG_lfcSE,NANOG_stat,NANOG_pvalue,NANOG_padj
A1BG,2.638843,-0.42673,0.892003,-0.801785,0.422677,,1.954981,-0.215939,0.973807,-0.432866,0.665112,
A1BG-AS1,0.149793,0.122497,2.188104,0.185412,0.852906,,0.64503,-0.015579,1.18821,-0.037305,0.970242,
A2M,0.321964,-0.017672,1.315148,-0.044368,0.964611,,,,,,,
A2M-AS1,0.154794,0.123601,2.19271,0.185412,0.852906,,0.153256,0.126667,2.179549,0.205352,0.837297,
A2ML1,1.014696,-0.54194,1.209727,-1.124439,0.260827,,1.316343,-0.242587,1.043105,-0.506768,0.612318,


In [None]:
pou5f1_df_3reps = build_deg_df(test_de_df_3reps, "POU5F1", lfc_threshold=0)
print(pou5f1_df_3reps.shape[0])
nanog_df_3reps = build_deg_df(test_de_df_3reps, "NANOG", lfc_threshold=0)
print(nanog_df_3reps.shape[0])

443
64


Remove genes expressed in <100 cells

In [None]:
adata = ad.concat([adata_alpha, adata_beta])
adata

AnnData object with n_obs × n_vars = 77912 × 20200
    obs: 'gRNA', 'n_gRNA', 'n_gRNA_UMIs', 'gene_target', 'celltype', 'perturbation_type', 'n_UMI_counts', 'n_genes', 'perturbed', 'channel', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'run', 'gene_target_ensembl_id', 'gene_target_expression (CPM)', 'NTC_target_gene_expression (CPM)', 'target_knockdown', 'target_knockdown_z_score', 'ed_category', 'anomaly_score'
    obsm: 'X_pca'
    layers: 'counts', 'normalized_counts'

In [None]:
adata_genes_filt = adata.copy()
sc.pp.filter_genes(adata_genes_filt, min_cells=100)

filtered out 311 genes that are detected in less than 100 cells


In [None]:
adata_genes_filt

AnnData object with n_obs × n_vars = 77912 × 19889
    obs: 'gRNA', 'n_gRNA', 'n_gRNA_UMIs', 'gene_target', 'celltype', 'perturbation_type', 'n_UMI_counts', 'n_genes', 'perturbed', 'channel', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'run', 'gene_target_ensembl_id', 'gene_target_expression (CPM)', 'NTC_target_gene_expression (CPM)', 'target_knockdown', 'target_knockdown_z_score', 'ed_category', 'anomaly_score'
    var: 'n_cells'
    obsm: 'X_pca'
    layers: 'counts', 'normalized_counts'

In [None]:
adata_alpha_filt = adata_genes_filt[adata_genes_filt.obs.run == "ALPHA"].copy()
adata_beta_filt = adata_genes_filt[adata_genes_filt.obs.run == "BETA"].copy()

In [None]:
test_gene_targets = ["POU5F1", "NANOG", "NTC"]
adata_test_filt = adata_alpha_filt[adata_alpha_filt.obs.gene_target.isin(test_gene_targets)].copy()
adata_test_filt

AnnData object with n_obs × n_vars = 14034 × 19889
    obs: 'gRNA', 'n_gRNA', 'n_gRNA_UMIs', 'gene_target', 'celltype', 'perturbation_type', 'n_UMI_counts', 'n_genes', 'perturbed', 'channel', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'run', 'gene_target_ensembl_id', 'gene_target_expression (CPM)', 'NTC_target_gene_expression (CPM)', 'target_knockdown', 'target_knockdown_z_score', 'ed_category', 'anomaly_score'
    var: 'n_cells'
    obsm: 'X_pca'
    layers: 'counts', 'normalized_counts'

In [None]:
results_dict_test_2reps_filt = de_analysis_dataset(adata_test_filt, n_cpus=25, n_reps=2)

Running DE analysis:   0%|          | 0/2 [00:00<?, ?it/s]

  self._fit_parametric_dispersion_trend(vst)
  self.fit_dispersion_prior()
  self.fit_dispersion_prior()


In [None]:
test_de_df_2reps_filt = build_de_df(results_dict_test_2reps_filt)
pou5f1_df_2reps = build_deg_df(test_de_df_2reps_filt, "POU5F1", lfc_threshold=0)
print(pou5f1_df_2reps.shape[0])
nanog_df_2reps = build_deg_df(test_de_df_2reps_filt, "NANOG", lfc_threshold=0)
print(nanog_df_2reps.shape[0])

19
41


In [None]:
results_dict_test_3reps_filt = de_analysis_dataset(adata_test_filt, n_cpus=25, n_reps=3)

Running DE analysis:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
test_de_df_3reps_filt = build_de_df(results_dict_test_3reps_filt)
pou5f1_df_3reps = build_deg_df(test_de_df_3reps_filt, "POU5F1", lfc_threshold=0)
print(pou5f1_df_3reps.shape[0])
nanog_df_3reps = build_deg_df(test_de_df_3reps_filt, "NANOG", lfc_threshold=0)
print(nanog_df_3reps.shape[0])

446
64


Batch BETA

In [None]:
def build_deg_df(de_df, tf_kd, lfc_threshold=0, padj_threshold=0.05):
    """
    Return the rows of de_df that are differentially expressed for one target.

    Only the columns belonging to tf_kd (those named "<tf_kd>_<stat>") are kept.
    A gene is retained when |log2FoldChange| > lfc_threshold and
    padj < padj_threshold; genes with NaN padj never pass.
    """
    # Match on the exact "<tf_kd>_" prefix. A substring/regex match would also pull
    # in columns of other targets whose name contains tf_kd (e.g. SOX2 vs SOX21).
    tf_kd_cols = de_df.columns[de_df.columns.str.startswith(f"{tf_kd}_")]
    tf_kd_df = de_df[tf_kd_cols]
    passes_lfc = tf_kd_df[f"{tf_kd}_log2FoldChange"].abs() > lfc_threshold
    passes_padj = tf_kd_df[f"{tf_kd}_padj"] < padj_threshold
    tf_kd_deg_df = tf_kd_df[passes_lfc & passes_padj]
    return tf_kd_deg_df

def count_degs(de_df, tf_kd, lfc_threshold=0, padj_threshold=0.05):
    """Return how many genes build_deg_df keeps for tf_kd at the given thresholds."""
    return len(build_deg_df(de_df, tf_kd, lfc_threshold, padj_threshold))

def count_degs_for_tf_kd_list(de_df, tf_kd_list, lfc_threshold=0, padj_threshold=0.05):
    """Print one line per target in tf_kd_list giving its DEG count (singular "DEG" for exactly one)."""
    for tf_kd in tf_kd_list:
        n_degs = count_degs(de_df, tf_kd, lfc_threshold, padj_threshold)
        message = f"{tf_kd} has {n_degs} DEG" if n_degs == 1 else f"{tf_kd} has {n_degs} DEGs"
        print(message)

In [None]:
beta_kds = adata_beta_filt.obs.gene_target.unique().to_list()
beta_kds.remove("NTC")
print(f"Batch BETA has {len(beta_kds)} unique TF KDs")

Batch BETA has 7 unique TF KDs


In [None]:
print(f"Batch BETA TF KDs: {', '.join(beta_kds)}")

Batch BETA TF KDs: SNAPC5, TRAFD1, RBCK1, NAIF1, MTERF4, PIN1, ZBED6


In [None]:
beta_filt_cells_per_kd = adata_beta_filt.obs.gene_target.value_counts()
beta_filt_cells_per_kd = beta_filt_cells_per_kd.drop("NTC")
beta_filt_cells_per_kd

gene_target
PIN1      78
TRAFD1    77
RBCK1     52
SNAPC5    51
NAIF1     33
MTERF4    32
ZBED6     26
Name: count, dtype: int64

In [None]:
results_dict_beta = de_analysis_dataset(adata_beta_filt, n_cpus=25, n_reps=2)

Running DE analysis:   0%|          | 0/7 [00:00<?, ?it/s]

  self._fit_parametric_dispersion_trend(vst)
  self.fit_dispersion_prior()
  self.fit_dispersion_prior()
  self.fit_dispersion_prior()
  self.fit_dispersion_prior()
  self.fit_dispersion_prior()
  self.fit_dispersion_prior()
  self.fit_dispersion_prior()


In [None]:
beta_DE_df = build_de_df(results_dict_beta)

In [None]:
beta_DE_df.head()

Unnamed: 0,SNAPC5_baseMean,SNAPC5_log2FoldChange,SNAPC5_lfcSE,SNAPC5_stat,SNAPC5_pvalue,SNAPC5_padj,TRAFD1_baseMean,TRAFD1_log2FoldChange,TRAFD1_lfcSE,TRAFD1_stat,...,PIN1_lfcSE,PIN1_stat,PIN1_pvalue,PIN1_padj,ZBED6_baseMean,ZBED6_log2FoldChange,ZBED6_lfcSE,ZBED6_stat,ZBED6_pvalue,ZBED6_padj
A1BG,2.090794,0.657907,1.613191,1.318879,0.18721,0.99514,3.23531,-0.259582,1.013076,-0.531073,...,0.953159,0.174302,0.861628,0.997742,1.999285,-0.548611,1.104251,-1.04738,0.294925,0.998534
A1BG-AS1,,,,,,,0.252513,0.097314,2.543327,0.262381,...,2.238469,0.638538,0.523123,0.997742,,,,,,
A2M,0.690982,0.285339,2.327042,0.769845,0.441392,0.99514,1.239847,0.083991,1.278528,0.219586,...,1.446661,0.345858,0.729449,0.997742,0.506277,-0.338132,1.194558,-0.749747,0.453407,0.998534
A2M-AS1,0.232651,0.094468,2.500232,0.235438,0.813869,0.99514,,,,,...,2.485085,0.284172,0.776278,0.997742,0.234035,-0.102118,1.109997,-0.277981,0.781027,0.998534
A2ML1,1.469387,0.126239,1.343993,0.337321,0.735875,0.99514,2.630036,0.178879,1.150894,0.385241,...,2.238469,0.638538,0.523123,0.997742,0.529991,-0.008621,1.30798,-0.025116,0.979963,0.998534


In [None]:
count_degs_for_tf_kd_list(beta_DE_df, beta_kds)

SNAPC5 has 0 DEGs
TRAFD1 has 0 DEGs
RBCK1 has 0 DEGs
NAIF1 has 0 DEGs
MTERF4 has 1 DEG
PIN1 has 0 DEGs
ZBED6 has 0 DEGs


In [None]:
results_dict_beta_3reps = de_analysis_dataset(adata_beta_filt, n_cpus=25, n_reps=3)

Running DE analysis:   0%|          | 0/7 [00:00<?, ?it/s]

  self._fit_parametric_dispersion_trend(vst)


In [None]:
# Build the table from the 3-replicate run (not the 2-replicate results_dict_beta)
beta_DE_df_3reps = build_de_df(results_dict_beta_3reps)

In [None]:
count_degs_for_tf_kd_list(beta_DE_df_3reps, beta_kds)

SNAPC5 has 0 DEGs
TRAFD1 has 0 DEGs
RBCK1 has 0 DEGs
NAIF1 has 0 DEGs
MTERF4 has 1 DEG
PIN1 has 0 DEGs
ZBED6 has 0 DEGs


# Old DE code

Combine all DE results into one DataFrame

In [None]:
# Join the per-batch DE tables column-wise, then order genes alphabetically.
# NOTE(review): alpha_DE_df is not created in the cells shown above - confirm it exists.
DE_results = pd.concat([alpha_DE_df, beta_DE_df], axis=1)
DE_results = DE_results.sort_index()
DE_results.head()

Write out results

In [None]:
filepath_pkl = "/home/ranwyl/results_tf_project/DE_results_10_2_2025.pkl"
DE_results.to_pickle(filepath_pkl)

Rename genes that are listed by Ensembl ID but have a gene name

In [None]:
all_genes = DE_results.index.tolist()
ensg_genes = [g for g in all_genes if g.startswith("ENSG")]
print(len(ensg_genes))
print(ensg_genes[:5])

In [None]:
mg = get_client('gene')
ensembl_results_all = mg.querymany(ensg_genes, fields='symbol', species='human')

Manually search for the genes with duplicate hits on GeneCards. To break ties, use the highest GeneCards Inferred Functionality Score (GIFtS). If there is a tie between highest scoring gene symbols, keep the gene as its Ensembl ID.

In [None]:
# Change this
dup_hits = {'ENSG00000234352': 'LOC349160', 'ENSG00000249738':'IL12B-AS1', 'ENSG00000257545':'LOC100287944'}

Rename genes

In [None]:
# Map each queried Ensembl ID to its gene symbol, skipping hits that carry no symbol.
# When a query has several hits, the last one wins.
ensembl_to_gene = {
    hit.get('query'): hit.get('symbol')
    for hit in ensembl_results_all
    if hit.get('symbol')
}
ensembl_to_gene.update(dup_hits) # Override queries with duplicate hits using the manual picks
print(len(ensembl_to_gene))

In [None]:
def rename_ensembl_genes(de_df, ensembl_to_gene_dict):
    """
    Rename genes in DataFrame.

    Returns a new DataFrame whose index labels are mapped through
    ensembl_to_gene_dict; labels without an entry are left unchanged and de_df
    itself is not modified. Prints how many index labels were actually renamed.
    """
    # Count only IDs present in the index; the mapping may hold entries that
    # do not occur in this DataFrame.
    n_converted = int(de_df.index.isin(list(ensembl_to_gene_dict)).sum())
    de_df_renamed = de_df.rename(index=ensembl_to_gene_dict)
    print(f"Converted {n_converted} Ensembl IDs to gene symbols")
    return de_df_renamed

In [None]:
DE_results_renamed = rename_ensembl_genes(DE_results, ensembl_to_gene)

In [None]:
filepath_pkl = "/home/ranwyl/results_tf_project/DE_results_gene_names_09-2025.pkl"
DE_results_renamed.to_pickle(filepath_pkl)

# Step H: Batch Correction

In [None]:
adata_alpha = ad.read_h5ad("/home/ranwyl/data_tf_project/Aggregate_ALPHA_Core_Cells.h5ad")
adata_beta = ad.read_h5ad("/home/ranwyl/data_tf_project/Aggregate_BETA_Core_Cells.h5ad")
adata_gamma = ad.read_h5ad("/home/ranwyl/data_tf_project/Aggregate_GAMMA_Core_Cells.h5ad")

In [None]:
# Filter TF KO and NTC cells
def filter_tf_ko_and_ntc(adata):
    """Return a copy of adata keeping cells whose gene_target is in `tfs` or equals "NTC"."""
    # NOTE(review): `tfs` is read from module scope and is not defined in the
    # cells shown here - confirm it is set before this is called.
    targets = adata.obs["gene_target"]
    keep_cells = targets.isin(tfs) | (targets == "NTC")
    return adata[keep_cells].copy()

# Remove lowly expressed genes based on list
def filter_low_expr_genes(adata, genes_to_keep_list):
    """Return a copy of adata restricted to the genes (adata.var index) found in genes_to_keep_list."""
    keep_genes = adata.var.index.isin(genes_to_keep_list)
    return adata[:, keep_genes].copy()

def filter_cells_and_genes(adata, genes_to_keep_list):
    """Apply the TF KO / NTC cell filter first, then the gene filter, and return the result."""
    cells_kept = filter_tf_ko_and_ntc(adata)
    return filter_low_expr_genes(cells_kept, genes_to_keep_list)

# Genes that passed filtering out lowly expressed genes
genes_to_keep = pd.read_pickle("/home/ranwyl/data_tf_project/genes_filtered_30pct_100cells.pkl")
genes_to_keep = genes_to_keep[0].tolist()

In [None]:
adata_alpha_filtered = filter_cells_and_genes(adata_alpha, genes_to_keep)
adata_beta_filtered = filter_cells_and_genes(adata_beta, genes_to_keep)
adata_gamma_filtered = filter_cells_and_genes(adata_gamma, genes_to_keep)

In [None]:
adata_combined = ad.concat([adata_alpha_filtered, adata_beta_filtered, adata_gamma_filtered])
adata_combined.X = adata_combined.layers["counts"].copy()
adata_combined

Normalize to the median UMI count of all NTC cells

In [None]:
# Median of obs.n_UMI_counts over the object returned by qc2._get_ntc_view
# (presumably the NTC-only subset of adata_combined - confirm against the helper).
median_NTC_UMIs = np.median(qc2._get_ntc_view(adata_combined).obs.n_UMI_counts)
# Rescale every cell so its total count equals that median
sc.pp.normalize_total(adata_combined, target_sum=median_NTC_UMIs)

Perform log1p transformation and batch correction

In [None]:
# Log-transform the normalized counts in adata_combined
sc.pp.log1p(adata_combined)

# Batch correction via relative z-normalization
# Each sequencing run is copied out and normalized separately.
a = adata_combined[adata_combined.obs.run == 'ALPHA'].copy()
b = adata_combined[adata_combined.obs.run == 'BETA'].copy()
c = adata_combined[adata_combined.obs.run == 'GAMMA'].copy()
# Return values are discarded, so the helper presumably modifies its argument
# in place - TODO confirm against qc2.relative_z_normalization.
qc2.relative_z_normalization(a)
qc2.relative_z_normalization(b)
qc2.relative_z_normalization(c)

# Recombine the three per-run objects into a single AnnData
normalized_adata = ad.concat([a,b,c])
normalized_adata

In [None]:
normalized_adata.write("/home/ranwyl/data_tf_project/Final_Aggregate_TF_KO_NTC_Batch_Normalized.h5ad")