In [1]:
# Environment: run with the `anndata` conda environment active (conda activate anndata)

import concurrent.futures
import os
import sys

import anndata as ad
# numpy is imported explicitly because later cells use `np` (np.float64, np.arange,
# np.repeat, np.vstack); previously the name was presumably only available as a
# side effect of the star import below.
import numpy as np
import pandas as pd

# Make the shared helper modules importable before importing junction2psi
sys.path.append("/mnt/lareaulab/reliscu/code")

# NOTE(review): the star import hides which names come from junction2psi; later cells
# call read_intron_file and process_SJ_line, which are presumably defined there.
# Consider importing those names explicitly once the module's exports are confirmed.
from junction2psi import *

# All relative paths in later cells are resolved against this analysis directory
os.chdir("/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM")

Here I create pseudobulk splice-junction (SJ) count data, using the same cells in each sample that were used to generate the gene-level pseudobulk. These pseudobulk SJ counts will be used to calculate pseudobulk percent-spliced-in (PSI) values for each exon.

In [2]:
# Name of the synthetic pseudobulk dataset; reused for the legend path here and for the
# output file name when the pseudobulk SJ counts are saved.
pseudobulk_data = "SyntheticDataset1_20pcntCells_35SD_200samples"

# Legend table for that dataset. The path is derived from `pseudobulk_data` so the two
# cannot drift apart; it resolves to the same file as the previously hard-coded path.
# Later cells read a 'Cell.name' column from it and treat the columns from the third
# onward as per-sample cell draw counts.
pseudobulk_meta = pd.read_csv(f"data/SyntheticDatasets/{pseudobulk_data}_legend_12-54-23.csv")

In [3]:
# Input locations (absolute paths on the lab filesystem)
sj_dir = "/mnt/lareaulab/reliscu/projects/NSF_GRFP/data/scRNA-seq/tasic_2018/ALM/processed/STAR"
intron_file = "/mnt/lareaulab/reliscu/data/GENCODE/GRCm39/psix_annotation/intron_file.tab.gz"

# Numeric type the SJ count table is cast to after it is read
dtype = np.float64

# Single-cell SJ counts as an AnnData object; its obs_names are looked up by cell name
# and its columns are matched to junction annotations in later cells
sdata = ad.read_h5ad("/mnt/lareaulab/reliscu/projects/NSF_GRFP/data/scRNA-seq/tasic_2018/ALM/tasic_2018_ALM_STAR_SJ_counts.h5ad")


In [4]:
# Build the junction annotation table and align it with the single-cell SJ columns

# Annotated introns (must contain an 'intron' column, which is used as the join key below)
intron_table = read_intron_file(intron_file)

# Accumulate junction names and counts line by line from SJ.out.tab
sj_name, sj_counts = [], []
with open(f"{sj_dir}/SJ.out.tab", "rb") as fh:
    for line in fh:
        sj_name, sj_counts = process_SJ_line(line, sj_name, sj_counts)

sj_table = pd.DataFrame({"Counts": sj_counts}, index=sj_name).astype(dtype)

# Force both join keys to str so the merge compares like with like
sj_table.index = sj_table.index.astype(str)
intron_table['intron'] = intron_table['intron'].astype(str)

# Left join keeps every annotated intron row and drops unannotated junctions.
# fillna(0) replaces every NaN in the merged frame, so annotated introns with
# no matching junction end up with Counts = 0.
sj_table = intron_table.merge(
    sj_table,
    how="left",
    left_on="intron",
    right_index=True,
).fillna(0)

# Record each junction's original column position in sdata so the count matrix can be
# re-indexed after merging with the annotations.
# Some junctions (rows) will be duplicated because a junction can be associated with multiple exons.
sdata.var['Row'] = np.arange(len(sdata.var))
sj_merge = sdata.var.merge(
    sj_table,
    how="inner",
    left_index=True,
    right_on="intron",
)

In [5]:
# Sanity check: the merged annotation table's index should be unique, since it is
# assigned as the `.var` of the filtered AnnData in the next step
sj_merge.index.is_unique

True

In [6]:
# Line up the single-cell SJ count columns with the rows of the annotation table.
# Note: some junctions will be selected more than once because the same junction can be
# associated with multiple exons.
row_positions = sj_merge['Row'].astype(int).tolist()
sdata_filt = sdata[:, row_positions].copy()

# Attach the annotations as .var; sj_merge's index becomes the var index
# (presumably exon-level identifiers — confirm against read_intron_file's output)
sdata_filt.var = sj_merge

  utils.warn_names_duplicates("var")


In [7]:
# Sanity check: after replacing .var with sj_merge, the var index should be unique
sdata_filt.var.index.is_unique

True

In [8]:
# Same uniqueness check through var_names, which is what later cells use for row labels
sdata_filt.var_names.is_unique

True

In [8]:
# Now create SJ pseudobulk:
# Get the cells comprising each pseudobulk gene expression sample, then sum up SJ counts over those cells

# Per-cell draw counts for every pseudobulk sample.
# Assumes the first two legend columns are metadata and all remaining columns are samples — TODO confirm
count_mat = pseudobulk_meta.iloc[:, 2:].astype(int).to_numpy() # shape: cells × pseudobulk samples
cell_ids = pseudobulk_meta['Cell.name'].astype(str).tolist()

# Map legend cells to row positions in sdata_filt once; this lookup does not depend on the sample
cell_pos = sdata_filt.obs_names.get_indexer(cell_ids)

# get_indexer returns -1 for labels it cannot find, and -1 would silently select the
# last row of X. Fail loudly instead, but only for cells that are actually drawn.
is_drawn = (count_mat > 0).any(axis=1)
missing_cells = [cid for cid, pos, drawn in zip(cell_ids, cell_pos, is_drawn) if drawn and pos < 0]
if missing_cells:
    raise KeyError(
        f"{len(missing_cells)} pseudobulk cell(s) not found in sdata_filt.obs_names, "
        f"e.g. {missing_cells[:5]}"
    )

pseudo_list = []
for i in range(count_mat.shape[1]):
    # Cells were drawn with replacement, so repeat each row position by its draw count
    idx = np.repeat(cell_pos, count_mat[:, i])
    X = sdata_filt.X[idx, :]
    # Flatten to 1-D so the stacked result is a plain ndarray whether X is dense or
    # sparse (a sparse sum returns a 2-D, single-row result)
    pseudo_list.append(np.asarray(X.sum(axis=0)).ravel())

# Stack to samples × junctions, then transpose to junctions × samples
pseudo_array = np.vstack(pseudo_list).T

In [9]:
# Label the pseudobulk matrix: one row per junction row of sdata_filt,
# one column per pseudobulk sample (Sample1, Sample2, ...)
row_idx = sdata_filt.var_names.tolist()
n_samples = pseudo_array.shape[1]
col_idx = [f"Sample{i+1}" for i in range(n_samples)]
pseudo_df = pd.DataFrame(data=pseudo_array, columns=col_idx, index=row_idx)

In [10]:
# Dimensions are (junction rows, pseudobulk samples)
pseudo_df.shape

(136261, 200)

In [11]:
# Drop junction rows whose summed count across all pseudobulk samples is not positive
has_counts = (pseudo_df.sum(axis=1) > 0).values
pseudo_df_filt = pseudo_df.loc[has_counts]

In [12]:
# Dimensions after removing zero-count junction rows: (junction rows, pseudobulk samples)
pseudo_df_filt.shape

(134164, 200)

In [16]:
# Dimensions of the annotated single-cell data before re-filtering: (cells, junction rows)
sdata_filt.shape

(9573, 136261)

In [17]:
# Restrict the single-cell data to the junction rows that survived the pseudobulk filter
kept_jxns = pseudo_df_filt.index
mask = sdata_filt.var_names.isin(kept_jxns)
sdata_refilt = sdata_filt[:, mask].copy()

In [18]:
# Column count should now equal the number of rows in pseudo_df_filt
sdata_refilt.shape

(9573, 134164)

In [14]:
# Write the filtered pseudobulk SJ counts (junction rows × samples) to CSV
outdir = "/mnt/lareaulab/reliscu/projects/NSF_GRFP/analyses/pseudobulk_test/tasic_2018/mouse_ALM/data"
pseudobulk_csv = f"{outdir}/tasic_2018_ALM_STAR_{pseudobulk_data}_SJ_pseudobulk.csv"
pseudo_df_filt.to_csv(pseudobulk_csv)

In [None]:
# Save annotated single-cell SJ counts (restricted to the junction rows kept in the pseudobulk table).
# Written via `outdir` so it lands next to the pseudobulk CSV regardless of the current
# working directory; with the working directory set at the top of the notebook this is
# the same location as the previous relative "data/" path.

sdata_refilt.write(f"{outdir}/tasic_2018_ALM_STAR_SJ_counts_annotated.h5ad")