# Aim of the notebook
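In this notebook we convert the raw output of a Xenium run into an AnnData object containing sufficient information to perform spatial RNA velocity: nuclear transcript counts are stored in the `unspliced` layer and cytoplasmic transcript counts in the `spliced` layer.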

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np

We define the function that reads a Xenium output folder (`cell_feature_matrix.h5`, `cells.csv.gz` and `transcripts.csv.gz`) and formats it as an AnnData object.

In [2]:
def xenium_to_adata_for_velocity(path:str='',max_dist_to_nucleus='all'):
    """Format a raw Xenium output folder as an AnnData object for spatial RNA velocity.

    Transcripts assigned to a cell are split by their ``overlaps_nucleus`` flag:
    nuclear transcripts are counted into the ``'unspliced'`` layer and the
    remaining (cytoplasmic) transcripts into the ``'spliced'`` layer, because
    scvelo looks for layers with those names. ``.X`` is set to the sum of both
    layers and the cell centroids are stored in ``.obsm['spatial']``.

    Only cells that have at least one nuclear AND one cytoplasmic transcript,
    and only genes detected in both compartments, are kept.

    Parameters
    ----------
    path : str
        Folder containing ``cell_feature_matrix.h5``, ``cells.csv.gz`` and
        ``transcripts.csv.gz``.
    max_dist_to_nucleus : number, 'all' or None
        If a number, only transcripts whose ``nucleus_distance`` is strictly
        smaller than this value are counted. ``'all'`` (default) or ``None``
        keeps every assigned transcript.

    Returns
    -------
    AnnData
        Object with ``obs`` taken from ``cells.csv.gz``, dense count layers
        ``'spliced'`` and ``'unspliced'``, ``.X`` as their sum and
        ``.obsm['spatial']`` holding ``x_centroid`` / ``y_centroid``.

    Raises
    ------
    ValueError
        If a cell of the expression matrix has no row in ``cells.csv.gz``.
    """
    obs_columns = ['cell_id', 'x_centroid', "y_centroid", "transcript_counts",
                   "total_counts", "cell_area", "nucleus_area"]

    # --- expression matrix and gene annotation -------------------------------
    adata = sc.read_10x_h5(path+'/cell_feature_matrix.h5')
    var = adata.var.rename(columns={"gene_ids": "Ensemble ID"}).reset_index(names="gene_name")
    var.index = var.index.astype(str)  # AnnData expects string indices
    adata.var = var

    # --- per-cell metadata, aligned to the matrix rows by cell id ------------
    # The barcodes of the matrix are the cell ids; matching on them (instead of
    # trusting the row order of cells.csv.gz) guarantees obs describes the right cell.
    barcodes = pd.Index(adata.obs_names.astype(str))
    cells = pd.read_csv(path+"/cells.csv.gz")[obs_columns]
    cells.index = cells['cell_id'].astype(str)
    obs = cells.reindex(barcodes)
    unmatched = obs['cell_id'].isna()
    if unmatched.any():
        raise ValueError(
            f"{int(unmatched.sum())} cells of cell_feature_matrix.h5 have no entry in cells.csv.gz"
        )
    obs.index = pd.RangeIndex(len(obs)).astype(str)
    adata.obs = obs
    del cells, obs

    # --- transcripts ---------------------------------------------------------
    # Only load the columns that are used; a callable keeps this tolerant of
    # files that lack the optional 'nucleus_distance' column.
    needed = {'cell_id', 'feature_name', 'overlaps_nucleus', 'nucleus_distance'}
    transcripts = pd.read_csv(path+"/transcripts.csv.gz", usecols=lambda col: col in needed)
    unassigned = transcripts["cell_id"] == "UNASSIGNED"

    # report the proportion of transcripts assigned to a cell
    cell_mapped = unassigned.map({True: "unassigned", False: "assigned"}).rename("cell_mapped")
    print(cell_mapped.value_counts())
    # Use the boolean mask directly: indexing value_counts() by position would
    # report whichever category happens to be less frequent.
    print('Percentage of unassigned_transcripts',np.round(unassigned.mean()*100,1), "%")

    tr = transcripts.loc[~unassigned]
    del transcripts

    # keep only transcripts located closer than a given distance to the nucleus edge
    if max_dist_to_nucleus is not None and max_dist_to_nucleus != 'all':
        tr = tr.loc[tr['nucleus_distance'] < max_dist_to_nucleus]

    # split the transcripts into nuclear and cytoplasmic
    tn = tr.loc[tr['overlaps_nucleus'] == 1]
    tc = tr.loc[tr['overlaps_nucleus'] == 0]

    # cell x gene count matrices for each compartment
    nuc = pd.crosstab(tn['cell_id'], tn['feature_name'])
    cyt = pd.crosstab(tc['cell_id'], tc['feature_name'])
    nuc.index = nuc.index.astype(str)
    cyt.index = cyt.index.astype(str)

    # --- restrict to cells and genes present in both compartments ------------
    obs_ids = adata.obs['cell_id'].astype(str)
    gene_names = adata.var['gene_name']
    keep_cells = (obs_ids.isin(nuc.index) & obs_ids.isin(cyt.index)).to_numpy()
    keep_genes = (gene_names.isin(nuc.columns) & gene_names.isin(cyt.columns)).to_numpy()
    # Subset once and copy so that the assignments below act on a real object
    # rather than on a view.
    adata = adata[keep_cells, keep_genes].copy()

    # --- layers, ordered exactly like adata.obs / adata.var ------------------
    cell_order = adata.obs['cell_id'].astype(str)
    gene_order = adata.var['gene_name']
    adata.layers['spliced'] = cyt.loc[cell_order, gene_order].to_numpy()
    adata.layers['unspliced'] = nuc.loc[cell_order, gene_order].to_numpy()

    # total counts per cell and gene are the sum of spliced and unspliced
    adata.X = adata.layers['spliced'] + adata.layers['unspliced']
    adata.obsm['spatial'] = adata.obs[['x_centroid', 'y_centroid']].to_numpy()
    return adata

In [3]:
def set_default_X(adata:'AnnData',to='all'):
    ''' Set the default expression matrix ``.X`` from the velocity layers.

    Parameters
    ----------
    adata:'AnnData object'
        Object whose ``layers`` contain the 'spliced' and 'unspliced' matrices.
    to:'str'
        Which matrix to store in ``.X``: 'all' for the sum of the spliced and
        unspliced layers, or 'spliced' / 'unspliced' to use only that layer.

    Returns
    -------
    adata:'AnnData object'
        The same object that was passed in, with ``.X`` redefined.

    Raises
    ------
    ValueError
        If ``to`` is not one of 'all', 'spliced' or 'unspliced'.
    '''

    if to=='all':
        adata.X=adata.layers['spliced']+adata.layers['unspliced']
    elif to in ('spliced','unspliced'):
        # Copy so that .X does not share memory with the layer: in-place
        # processing of .X must not silently alter the stored counts.
        adata.X=adata.layers[to].copy()
    else:
        raise ValueError(
            f"`to` must be 'all', 'spliced' or 'unspliced', got {to!r}"
        )

    return adata

# Define paths

In [70]:
# Xenium output folder to convert (machine-specific absolute path; adjust to your setup)
path = "/media/sergio/Meninges/meninges/20230705__20230705_MENINGES_ELIN_run2/output-XETG00047__0005264__HE27a__20230705__134603"
# Output folder; it ends with a separator because the file name is appended by plain concatenation
path_to_write = "/media/sergio/Meninges/SpatialData_formatted/"

In [None]:
# Convert the Xenium folder stored in `path`, counting only transcripts whose
# `nucleus_distance` is below 1000. (An annotation such as `path:str=''` is only
# valid in a function definition, not in a call.)
adata = xenium_to_adata_for_velocity(path=path, max_dist_to_nucleus=1000)

In [54]:
# Use the total counts (spliced + unspliced) as the default expression matrix
adata = set_default_X(adata, to="all")

AnnData object with n_obs × n_vars = 286581 × 389
    obs: 'cell_id', 'x_centroid', 'y_centroid', 'transcript_counts', 'total_counts', 'cell_area', 'nucleus_area'
    var: 'gene_name', 'Ensemble ID', 'feature_types', 'genome'
    layers: 'spliced', 'unspliced'

In [None]:
# Save the formatted object as an .h5ad file inside the output folder
adata.write(path_to_write + "adata_for_velocity.h5ad")