# Notebook for MuData Creation from GE and ATSE AnnData

This notebook:
1. Reads and inspects ATSE and gene expression AnnData files.
2. Fixes NaNs in the splicing data.
3. Creates modality-specific `.obs`, `.var`, and `.layers` for each AnnData.
4. Creates a MuData object with modalities “rna”, “junc_counts”, “cell_by_junction_matrix”, 
    and “cell_by_cluster_matrix”.
5. Writes out the final MuData object for use with MULTIVISPLICE.

## 0. Set Paths and Configuration

In [1]:
ROOT_PATH = "/gpfs/commons/groups/knowles_lab/Karin/Leaflet-analysis-WD/MOUSE_SPLICING_FOUNDATION/MODEL_INPUT/052025/"

ATSE_DATA_PATH = ROOT_PATH + "aligned_splicing_data_20250513_035938.h5ad"
GE_DATA_PATH = ROOT_PATH + "aligned_gene_expression_data_20250513_035938.h5ad"
OUTPUT_MUDATA_PATH = ROOT_PATH + "aligned__ge_splice_combined_20250513_035938.h5mu"
REDO_JUNC_RATIO = False

print("ATSE data path:", ATSE_DATA_PATH)
print("GE data path:  ", GE_DATA_PATH)
print("Output MuData path:", OUTPUT_MUDATA_PATH)

ATSE data path: /gpfs/commons/groups/knowles_lab/Karin/Leaflet-analysis-WD/MOUSE_SPLICING_FOUNDATION/MODEL_INPUT/052025/aligned_splicing_data_20250513_035938.h5ad
GE data path:   /gpfs/commons/groups/knowles_lab/Karin/Leaflet-analysis-WD/MOUSE_SPLICING_FOUNDATION/MODEL_INPUT/052025/aligned_gene_expression_data_20250513_035938.h5ad
Output MuData path: /gpfs/commons/groups/knowles_lab/Karin/Leaflet-analysis-WD/MOUSE_SPLICING_FOUNDATION/MODEL_INPUT/052025/aligned__ge_splice_combined_20250513_035938.h5mu


## 1. Imports

In [4]:
import anndata as ad
import pandas as pd
import scipy.sparse as sp
import numpy as np
from scipy.sparse import csr_matrix, hstack, vstack
import h5py
import anndata as ad
import mudata as mu

## 2. Load ATSE and Gene Expression AnnData

In [5]:
atse_anndata = ad.read_h5ad(ATSE_DATA_PATH)
print("ATSE AnnData:", atse_anndata)

ATSE AnnData: AnnData object with n_obs × n_vars = 157418 × 34845
    obs: 'cell_id_index', 'age', 'cell_ontology_class', 'mouse.id', 'sex', 'subtissue', 'tissue', 'dataset', 'cell_name', 'cell_id', 'cell_clean', 'broad_cell_type', 'seqtech'
    var: 'junction_id', 'event_id', 'splice_motif', 'annotation_status', 'gene_name', 'gene_id', 'num_junctions', 'position_off_5_prime', 'position_off_3_prime', 'CountJuncs', 'junction_id_index'
    layers: 'cell_by_cluster_matrix', 'cell_by_junction_matrix', 'junc_ratio'


In [6]:
ge_anndata = ad.read_h5ad(GE_DATA_PATH)
print("GE AnnData:", ge_anndata)
print(ge_anndata.layers["length_norm"])

GE AnnData: AnnData object with n_obs × n_vars = 157418 × 19022
    obs: 'cell_id', 'age', 'cell_ontology_class', 'mouse.id', 'sex', 'tissue', 'dataset', 'batch', 'subtissue_clean', 'broad_cell_type', 'cell_id_index', 'cell_name', 'library_size'
    var: 'index', 'gene_name', 'gene_id', 'mean_transcript_length', 'mean_intron_length', 'num_transcripts', 'transcript_biotypes'
    obsm: 'X_library_size'
    layers: 'length_norm', 'log_norm', 'predicted_log_norm_tms', 'raw_counts'
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 593435257 stored elements and shape (157418, 19022)>
  Coords	Values
  (0, 21)	0.029308323563892142
  (0, 54)	0.0165346193592835
  (0, 137)	0.030781944014048136
  (0, 185)	0.004901960784313725
  (0, 340)	0.0016926201760324984
  (0, 372)	0.01990570979570456
  (0, 415)	0.000690766751093714
  (0, 419)	0.06815537745604965
  (0, 455)	0.0024908538958511714
  (0, 482)	0.10429304713019132
  (0, 510)	0.2241253644314869
  (0, 535)	0.0007496251874062968
  (0, 557

In [7]:
# rescale by overall median transcript length (didn't do this in preprocessing of GE AnnData)
ge_anndata.layers["length_norm"] = ge_anndata.layers["length_norm"] * np.median(ge_anndata.var["mean_transcript_length"])
# make sure to round down to get integer counts (this is CSR)
ge_anndata.layers["length_norm"].data = np.floor(ge_anndata.layers["length_norm"].data)
print(ge_anndata.layers["length_norm"])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 593435257 stored elements and shape (157418, 19022)>
  Coords	Values
  (0, 21)	51.0
  (0, 54)	29.0
  (0, 137)	54.0
  (0, 185)	8.0
  (0, 340)	3.0
  (0, 372)	35.0
  (0, 415)	1.0
  (0, 419)	120.0
  (0, 455)	4.0
  (0, 482)	184.0
  (0, 510)	397.0
  (0, 535)	1.0
  (0, 557)	5.0
  (0, 708)	120.0
  (0, 804)	133.0
  (0, 900)	47.0
  (0, 1018)	20.0
  (0, 1121)	37.0
  (0, 1146)	1.0
  (0, 1147)	17.0
  (0, 1178)	12.0
  (0, 1180)	109.0
  (0, 1189)	1.0
  (0, 1219)	7.0
  (0, 1231)	0.0
  :	:
  (157417, 18963)	14.0
  (157417, 18965)	45.0
  (157417, 18968)	2.0
  (157417, 18969)	67.0
  (157417, 18971)	55.0
  (157417, 18972)	52.0
  (157417, 18974)	31.0
  (157417, 18976)	1.0
  (157417, 18987)	5.0
  (157417, 18988)	53.0
  (157417, 18990)	52.0
  (157417, 18992)	1.0
  (157417, 18996)	8.0
  (157417, 18997)	9.0
  (157417, 18998)	1.0
  (157417, 19006)	35.0
  (157417, 19008)	20.0
  (157417, 19010)	0.0
  (157417, 19012)	9.0
  (157417, 19013)	6.0
  (157417,

In [8]:
# Recalculate library size using length normalized counts
ge_anndata.obsm["X_library_size"] = ge_anndata.layers["length_norm"].sum(axis=1)
print(ge_anndata.obsm["X_library_size"])

[[ 104530.]
 [ 415320.]
 [ 330365.]
 ...
 [ 873490.]
 [1368928.]
 [ 431221.]]




In [9]:
atse_anndata.obs

Unnamed: 0,cell_id_index,age,cell_ontology_class,mouse.id,sex,subtissue,tissue,dataset,cell_name,cell_id,cell_clean,broad_cell_type,seqtech
49089,0,18m,basal epithelial cell of tracheobronchial tree,18_47_F,female,,Trachea,TMS,A10_B000120,A10_B000120,A10_B000120,EPITHELIAL CELL,single_cell
146213,1,3m,bulge keratinocyte,3_39_F,female,Telogen,Skin,TMS,A10_B000126,A10_B000126,A10_B000126,KERATINOCYTE,single_cell
53084,2,3m,myeloid cell,3_38_F,female,Fat,SCAT,TMS,A10_B000127,A10_B000127,A10_B000127,MYELOID IMMUNE CELL,single_cell
60774,3,3m,basal cell,3_56_F,female,Mammary_Gland,Mammary_Gland,TMS,A10_B000166,A10_B000166,A10_B000166,BASAL CELL,single_cell
85264,4,18m,endothelial cell of coronary artery,18_53_M,male,RV,Heart,TMS,A10_B000169,A10_B000169,A10_B000169,ENDOTHELIAL CELL,single_cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22968,157413,2m,197_L5 IT CTX,184750,F,VISp,Brain_Non-Myeloid,AB,US-1250275_E2_S86,US-1250275_E2_S86,SRR16457632,Excitatory Neurons,single_nuclei
26583,157414,2m,204_L5/6 IT CTX,184756,M,VISp,Brain_Non-Myeloid,AB,US-1250275_E2_S87,US-1250275_E2_S87,SRR16457633,Excitatory Neurons,single_nuclei
17647,157415,2m,257_L5 PT CTX,185199,M,VISp,Brain_Non-Myeloid,AB,US-1250275_E2_S88,US-1250275_E2_S88,SRR16457635,Excitatory Neurons,single_nuclei
20549,157416,2m,204_L5/6 IT CTX,185200,F,VISp,Brain_Non-Myeloid,AB,US-1250275_E2_S89,US-1250275_E2_S89,SRR16457636,Excitatory Neurons,single_nuclei


In [10]:
ge_anndata.obs

Unnamed: 0,cell_id,age,cell_ontology_class,mouse.id,sex,tissue,dataset,batch,subtissue_clean,broad_cell_type,cell_id_index,cell_name,library_size
98264,A10_B000120,18m,basal epithelial cell of tracheobronchial tree,18_47_F,female,Trachea,tabula_muris_senis,tabula_muris_senis,,EPITHELIAL CELL,0,A10_B000120,59.409354
138404,A10_B000126,3m,bulge keratinocyte,3_39_F,female,Skin,tabula_muris_senis,tabula_muris_senis,Telogen,KERATINOCYTE,1,A10_B000126,234.908896
138405,A10_B000127,3m,myeloid cell,3_38_F,female,SCAT,tabula_muris_senis,tabula_muris_senis,Fat,MYELOID IMMUNE CELL,2,A10_B000127,186.789788
138406,A10_B000166,3m,basal cell,3_56_F,female,Mammary_Gland,tabula_muris_senis,tabula_muris_senis,Mammary_Gland,BASAL CELL,3,A10_B000166,1558.667439
103042,A10_B000169,18m,endothelial cell of coronary artery,18_53_M,male,Heart,tabula_muris_senis,tabula_muris_senis,RV,ENDOTHELIAL CELL,4,A10_B000169,95.256588
...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,US-1250275_E2_S86,2m,197_L5 IT CTX,184750,F,Brain_Non-Myeloid,allen_brain_exons,allen_brain_exons,VISp,Excitatory Neurons,157413,US-1250275_E2_S86,537.763486
139,US-1250275_E2_S87,2m,204_L5/6 IT CTX,184756,M,Brain_Non-Myeloid,allen_brain_exons,allen_brain_exons,VISp,Excitatory Neurons,157414,US-1250275_E2_S87,449.292871
140,US-1250275_E2_S88,2m,257_L5 PT CTX,185199,M,Brain_Non-Myeloid,allen_brain_exons,allen_brain_exons,VISp,Excitatory Neurons,157415,US-1250275_E2_S88,494.998073
141,US-1250275_E2_S89,2m,204_L5/6 IT CTX,185200,F,Brain_Non-Myeloid,allen_brain_exons,allen_brain_exons,VISp,Excitatory Neurons,157416,US-1250275_E2_S89,774.472866


## 3. Create `.var` DataFrames for Each Modality

Here we create modality-specific `.var` metadata. You might later use these to update the
corresponding AnnData objects inside the MuData container.

In [11]:
gene_expr_var = pd.DataFrame(
    {
        "ID": ge_anndata.var["gene_id"],  # from the GE AnnData
        "modality": "Gene_Expression",
    },
    index=ge_anndata.var.index
)

splicing_var = pd.DataFrame(
    {
        "ID": atse_anndata.var["junction_id"],  # from the ATSE AnnData
        "modality": "Splicing",
    },
    index=atse_anndata.var.index
)

ge_anndata.var = gene_expr_var.copy()
atse_anndata.var = splicing_var.copy()

## 4. Create a Common `.obs` DataFrame

You can decide which AnnData’s `.obs` to use (or merge them) if both contain the same information.
Here we assume ATSE and GE have matching `obs` indices; we take the ATSE `obs`.

In [12]:
common_obs = atse_anndata.obs.copy()
common_obs["modality"] = "paired"  # if needed; adjust as required
print("Common obs shape:", common_obs.shape)

# Update both AnnData objects:
ge_anndata.obs = common_obs.copy()
atse_anndata.obs = common_obs.copy()

Common obs shape: (157418, 14)


## 5. Compute or Fix Splicing `junc_ratio` Layer

Here we check if `junc_ratio` needs to be recomputed. It is computed as:
`junc_ratio = cell_by_junction_matrix / cell_by_cluster_matrix`
and any NaNs/Inf values are replaced by zeros.


In [13]:
# %% [markdown]
# ### 5.1 Build junc_ratio + psi_mask on the filtered data

# %%
import numpy as np
from scipy import sparse
from scipy.sparse import csr_matrix, issparse
import gc

# grab the splicing modality
splicing = atse_anndata  # if you later rename it to 'splicing', otherwise: atse_anndata

cell_by_junc    = splicing.layers["cell_by_junction_matrix"]
cell_by_cluster = splicing.layers["cell_by_cluster_matrix"]

# 1) ensure CSR format
if not issparse(cell_by_junc):
    cell_by_junc = csr_matrix(cell_by_junc)
if not issparse(cell_by_cluster):
    cell_by_cluster = csr_matrix(cell_by_cluster)

# 2) build psi_mask (1 wherever cluster>0)
mask = cell_by_cluster.copy()
mask.data = np.ones_like(mask.data, dtype=np.uint8)
splicing.layers["psi_mask"] = mask

# 3) compute junc_ratio = junction / cluster, nan→0
cj = cell_by_junc.toarray()
cc = cell_by_cluster.toarray()

junc_ratio = np.divide(
    cj,
    cc,
    out=np.zeros_like(cj, dtype=float),
    where=(cc != 0),
)
# 4) assign back as dense or sparse (dense is fine)
splicing.layers["junc_ratio"] = junc_ratio

print("New splicing layers:", list(splicing.layers.keys()))
print(f"  junc_ratio shape: {junc_ratio.shape}, psi_mask nnz: {mask.nnz}")

# 5) cleanup
del cell_by_junc, cell_by_cluster, cj, cc, mask
gc.collect()


New splicing layers: ['cell_by_cluster_matrix', 'cell_by_junction_matrix', 'junc_ratio', 'psi_mask']
  junc_ratio shape: (157418, 34845), psi_mask nnz: 1036427639


1020

In [14]:
print(atse_anndata.layers['junc_ratio'])

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.34285714 0.65714286]
 ...
 [1.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


## 6. Create a MuData Object

Instead of stacking into one AnnData, we create a MuData container.

For MULTIVISPLICE, the new setup expects modalities with the following keys:
- `rna` : gene expression counts,
- `junc_ratio` : raw splicing/junction count data,
- `cell_by_junction_matrix` and `cell_by_cluster_matrix` as additional layers.

We can use the GE AnnData for gene expression and the ATSE AnnData for all splicing-related data.
(If needed, make copies so that modalities are independent.)


Option 1: Use the GE AnnData for RNA and the ATSE AnnData for splicing modalities.
(You can also combine or pre-process further if desired.)

In [15]:
mdata = mu.MuData({
    "rna": ge_anndata,
    "splicing": atse_anndata
})

# assert "library_size" in ge_anndata.obs, "'library_size' not found in ge_anndata.obs"
mdata.obsm["X_library_size"] = ge_anndata.obsm["X_library_size"]

# # Confirm it's stored correctly
# print("Library size moved to mdata.obsm['library_size'] with shape:", mdata.obsm["library_size"].shape)


# List of shared obs fields to pull up
shared_obs_keys = [
    'cell_id', 'age', 'cell_ontology_class', 'mouse.id', 'sex', 'tissue', 'dataset', 'broad_cell_type', 'cell_id_index', 'cell_name', 'modality'
]

# We'll assume 'rna' modality has them all and they match 'splicing'
for key in shared_obs_keys:
    assert key in mdata["rna"].obs, f"{key} not found in 'rna' obs"
    assert key in mdata["splicing"].obs, f"{key} not found in 'splicing' obs"
    assert (mdata["rna"].obs[key] == mdata["splicing"].obs[key]).all(), f"{key} values differ between modalities"
    mdata.obs[key] = mdata["rna"].obs[key]
    
print("MuData object created with modalities:", list(mdata.mod.keys()))

  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)


MuData object created with modalities: ['rna', 'splicing']


## 7. Write Out the Final MuData Object

The combined MuData object is now ready for use with `MULTIVISPLICE`. Save it as an H5MU file.

In [16]:
mdata.write(OUTPUT_MUDATA_PATH)
print(f"MuData object written to {OUTPUT_MUDATA_PATH}")

  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)


MuData object written to /gpfs/commons/groups/knowles_lab/Karin/Leaflet-analysis-WD/MOUSE_SPLICING_FOUNDATION/MODEL_INPUT/052025/aligned__ge_splice_combined_20250513_035938.h5mu


## 8. Verify the Output

Read the MuData object back in to ensure everything is correct.

In [None]:
mdata_loaded = mu.read_h5mu(OUTPUT_MUDATA_PATH)
print("Loaded MuData modalities:", list(mdata_loaded.mod.keys()))
print(mdata_loaded)