In [16]:
# conda activate anndata

import numpy as np
import pandas as pd
import anndata as ad

In [17]:
# Path to the Psix intron annotation table (gzipped, tab-delimited going by its extension).
# It is not referenced by the other cells visible in this notebook.
intron_file = "/mnt/lareaulab/reliscu/data/GENCODE/GRCm39/psix_annotation/intron_file.tab.gz"

# Load the annotated splice-junction count matrix as an AnnData object.
sdata = ad.read_h5ad(filename="data/tasic_2018_ALM_STAR_SJ_counts_annotated.h5ad")

In [None]:
# Disabled (optional) step: restrict sdata to the events that are also present in a
# pseudobulk PSI table. Kept for reference; nothing below uses sdata_filt.
# pseudobulk_psi_data = "SyntheticDataset1_20pcntCells_35SD_200samples_SJ_pseudobulk_min_observed0.25_minPsi0.1_PSI"
# pseudobulk_psi = pd.read_csv(f"data/tasic_2018_ALM_STAR_{pseudobulk_psi_data}.csv", index_col=0)

# shared_events = pseudobulk_psi.index.intersection(sdata.var_names)
# sdata_filt = sdata[:, shared_events].copy()

In [None]:
# Sparse, labelled junction-count table. X is transposed so that rows are the
# variables of sdata (junction IDs) and columns are its observations.
SJ_counts_table = pd.DataFrame.sparse.from_spmatrix(
    sdata.X.T,
    index=sdata.var_names,
    columns=sdata.obs_names,
)

In [None]:
# Note: this takes a while

In [None]:
# Compute per-event PSI and supporting read counts from the junction count table.
#
# Junction IDs in SJ_counts_table.index are expected to look like "<event>_I1",
# "<event>_I2" or "<event>_SE" (presumably the two inclusion junctions and the
# skipping junction of a cassette exon -- confirm against the annotation).


def _events_with_suffix(junction_ids, suffix):
    """Return the event names (suffix stripped) of the IDs that end with `suffix`.

    The match is anchored to the end of the ID. A plain substring test would also
    accept IDs that merely contain `suffix` somewhere in the middle, and stripping
    the last characters of such an ID yields a bogus event name.
    """
    return pd.Index([j[:-len(suffix)] for j in junction_ids if j.endswith(suffix)])


events_i1 = _events_with_suffix(SJ_counts_table.index, '_I1')
events_i2 = _events_with_suffix(SJ_counts_table.index, '_I2')
events_se = _events_with_suffix(SJ_counts_table.index, '_SE')

# Only events for which all three junctions are present can be scored.
events = events_i1.intersection(events_i2).intersection(events_se)

# One table per junction type, each re-indexed by the bare event name so the
# three tables align row-for-row in the arithmetic below.
i1_events = [x + '_I1' for x in events]
I1_table = SJ_counts_table.loc[i1_events]
I1_table.index = events

i2_events = [x + '_I2' for x in events]
I2_table = SJ_counts_table.loc[i2_events]
I2_table.index = events

se_events = [x + '_SE' for x in events]
SE_table = SJ_counts_table.loc[se_events]
SE_table.index = events

# Drop events where any of the three junctions has zero reads across all columns.
I1_filt = I1_table.index[I1_table.sum(axis=1) > 0]
I2_filt = I2_table.index[I2_table.sum(axis=1) > 0]
SE_filt = SE_table.index[SE_table.sum(axis=1) > 0]
filtered_events = I1_filt.intersection(I2_filt).intersection(SE_filt)

I1_table = I1_table.loc[filtered_events]
I2_table = I2_table.loc[filtered_events]
SE_table = SE_table.loc[filtered_events]

# PSI = (I1 + I2) / (2*SE + I1 + I2); SE is doubled because inclusion is counted
# over two junctions and skipping over one.
# NOTE(review): fillna(0) turns 0/0 (no reads for that event in that column) into
# PSI = 0, which is indistinguishable from complete exclusion. `reads` below can be
# used to mask those entries -- confirm this is the intended downstream handling.
psi = ((I1_table + I2_table) / (2 * SE_table + I1_table + I2_table)).fillna(0)

# Total junction reads supporting each PSI value.
reads = SE_table + I1_table + I2_table

In [None]:
# Assemble the PSI values into an AnnData object (saved to disk in a later cell).

# One row per event; no per-event annotations are attached.
var = pd.DataFrame(index=psi.index)
# Reuse the observation metadata of the junction-count object.
obs = sdata.obs
# psi is events x observations; AnnData wants observations x variables, so the
# dense float32 copy is transposed when passed as X.
psi_mat = psi.to_numpy(dtype=np.float32)

adata_psi = ad.AnnData(X=psi_mat.T, obs=obs, var=var)

In [None]:
# Store the per-event read totals as a layer, explicitly aligned to the row and
# column labels of adata_psi before converting to a dense float32 array.
reads_df = reads.T.reindex(
    index=adata_psi.obs_names,
    columns=adata_psi.var_names,
)
adata_psi.layers["exon_counts"] = reads_df.to_numpy(dtype=np.float32)

In [None]:
# Write the PSI AnnData object (X = PSI, layer "exon_counts" = read totals) to disk.
# NOTE(review): the ".hd5" extension differs from the ".h5ad" used for the input
# file -- confirm it is intentional before anything downstream depends on it.
adata_psi.write("data/tasic_2018_ALM_STAR_SJ_counts_annotated_PSI.hd5")