In [None]:
# Run on gpu5 for memory

# conda activate anndata

import os
import sys
import pandas as pd
import concurrent.futures

# Make the lab code directory importable before the star import below runs.
sys.path.append("/mnt/lareaulab/reliscu/code")

# NOTE(review): later cells use `np`, `ad`, `process_SJ_line` and
# `read_intron_file` without importing them by name in this file, so they are
# presumably supplied by this star import -- confirm against junction2psi.
from junction2psi import *

# Change the working directory. Every path in the visible cells is absolute,
# so they do not depend on it.
# NOTE(review): this points at BICCN/mouse_ACA while the inputs loaded below
# live under tasic_2018/ALM -- confirm this is intended.
os.chdir("/mnt/lareaulab/reliscu/projects/NSF_GRFP/data/scRNA-seq/BICCN/mouse_ACA")

Here I create pseudobulk splice junction (SJ) count data, using the same cells in each sample as were used to generate the gene-level pseudobulk samples. These pseudobulk SJ counts will be used to calculate a pseudobulk PSI (percent spliced in) value for each exon.

In [None]:
# Numeric dtype applied to the SJ count table built in a later cell.
# NOTE(review): `np` and `ad` are not imported by name in this file; they are
# presumably provided by `from junction2psi import *` -- confirm.
dtype = np.float64

# Shared path roots, defined once so the notebook can be re-pointed at another
# location by editing a single line instead of four near-identical literals.
lab_root = "/mnt/lareaulab/reliscu"
project_dir = f"{lab_root}/projects/NSF_GRFP"
alm_dir = f"{project_dir}/data/scRNA-seq/tasic_2018/ALM"

# Intron annotation file and the directory holding the STAR output.
intron_file = f"{lab_root}/data/GENCODE/GRCm39/psix_annotation/intron_file.tab.gz"
sj_dir = f"{alm_dir}/processed/STAR"

# AnnData object read from the STAR SJ counts .h5ad file.
sdata = ad.read_h5ad(f"{alm_dir}/tasic_2018_ALM_STAR_SJ_counts.h5ad")

# Legend of the synthetic pseudobulk dataset. Later cells read its
# 'Cell.name' column and treat the columns from the third onward as
# per-sample cell-inclusion flags.
pseudobulk_meta = pd.read_csv(
    f"{project_dir}/analyses/pseudobulk_test/data/SyntheticDatasets/"
    "SyntheticDataset1_20pcntCells_0.2pcntVar_100samples_legend_04-04-36.csv"
)

In [None]:
# Build a count table for the junctions listed in `{sj_dir}/SJ.out.tab`,
# restricted to the annotated introns.

# `intron_table` is otherwise only assigned in a later cell, so this cell
# raised NameError on a fresh kernel. Load it here to make the cell
# self-contained.
intron_table = read_intron_file(intron_file)

sj_name = []
sj_counts = []
with open(f"{sj_dir}/SJ.out.tab", "rb") as fh:
    for line in fh:
        # process_SJ_line returns the updated accumulators; presumably it
        # appends one junction name and its count per line -- TODO confirm.
        sj_name, sj_counts = process_SJ_line(line, sj_name, sj_counts)

# One "Counts" column indexed by junction name, cast to the configured dtype.
sj_table = pd.DataFrame({"Counts": sj_counts}, index=sj_name).astype(dtype)

# Left join on the annotation: every row of intron_table is kept, introns
# absent from the SJ file get 0 instead of NaN, and the "intron" join key
# column is dropped afterwards.
sj_table = pd.merge(
    intron_table,
    sj_table,
    left_on="intron",
    right_index=True,
    how="left",
).fillna(0).drop("intron", axis=1)

In [None]:
# Subset anndata to annotated exons

In [None]:
# Get SJ counts from each cell
#
# Note: this takes a while (~5000 cells)
#
# NOTE(review): only the cell names and the intron annotation are prepared
# here; no per-cell loading loop is visible in this cell -- confirm where the
# per-cell counts are read.

# Names of the cells listed in the pseudobulk legend
cells = pseudobulk_meta["Cell.name"].tolist()

# Annotated intron table parsed from the intron annotation file
intron_table = read_intron_file(intron_file)


In [None]:
def sum_sj_counts(mask, cell_counts=None):
    """Sum splice-junction counts over the cells selected by ``mask``.

    Parameters
    ----------
    mask : sequence of bool
        One flag per cell, in the same order as ``cell_counts``. A true value
        marks a cell that belongs to this pseudobulk sample.
    cell_counts : sequence of pandas objects, optional
        Per-cell SJ counts, one entry per cell. When omitted, the
        notebook-level ``sj_counts`` is used (looked up at call time).

    Returns
    -------
    pandas.Series
        Row-wise sum of the selected cells' counts.

    Raises
    ------
    ValueError
        If ``mask`` and ``cell_counts`` differ in length, or if ``mask``
        selects no cell at all.
    """
    if cell_counts is None:
        cell_counts = sj_counts

    # zip() would silently truncate on a length mismatch, so fail loudly.
    if len(mask) != len(cell_counts):
        raise ValueError(
            f"mask has {len(mask)} entries but there are {len(cell_counts)} cells"
        )

    # Keep only the cells used to make this pseudobulk sample. Previously the
    # mask was ignored and every sample summed over all cells.
    cell_subset = [counts for counts, keep in zip(cell_counts, mask) if keep]
    if not cell_subset:
        raise ValueError("mask selects no cells; cannot build a pseudobulk sample")

    # Align the selected cells side by side, then sum SJ counts across them.
    pseudo_sample = pd.concat(cell_subset, axis=1).sum(axis=1)
    return pseudo_sample

In [None]:
# Build the pseudobulk samples by summing SJ reads over each sample's cells

# Columns from the third onward are transposed into one boolean row per
# pseudobulk sample; each sample is made up of a different subset of cells.
cell_incl_status = pseudobulk_meta.iloc[:, 2:].transpose().values.astype(bool)

pseudo_list = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Executor.map yields results in input order, so pseudo_list[i]
    # corresponds to row i of cell_incl_status.
    pseudo_list = [*executor.map(sum_sj_counts, cell_incl_status)]

In [None]:
# Assemble the per-sample sums into one table with one column per pseudobulk
# sample, named Sample1..SampleN in list order.
# A dedicated name is used instead of rebinding `sj_counts`: sum_sj_counts()
# reads `sj_counts` as its per-cell input, so overwriting it with this table
# made the summation cell fail when re-run.
sample_names = [f"Sample{i+1}" for i in range(len(pseudo_list))]
pseudobulk_sj_counts = pd.concat(pseudo_list, axis=1, keys=sample_names)

# Keep only rows whose total count across all samples is greater than zero.
non_zero_count = (pseudobulk_sj_counts.sum(axis=1) > 0).values
sj_counts_filtered = pseudobulk_sj_counts.loc[non_zero_count]

In [None]:
# Write the filtered pseudobulk SJ counts to CSV in the analysis data directory
outdir = "/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/data"
sj_counts_path = f"{outdir}/sj_counts.csv"
sj_counts_filtered.to_csv(sj_counts_path)