- Xenium outputs information: [10x Genomics - output understanding](https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/analysis/xoa-output-understanding-outputs)

to-do:
- check the spatialdata [library](https://spatialdata.scverse.org/en/latest/index.html#). It should be helpful for working with the spatial data: it apparently solves the memory issues and provides loaders that read all of the outputs into a SpatialData object.



* [Gene Panel](#gene-panel)
* [Cell Feature Matrix](#cell-feature-matrix---adata)
* [Cell Summary](#cell-summary-file)
* [Nucleus Boundaries](#nucleus-boundaries)
* [Cell Boundaries](#cell-boundaries)
* [Transcripts](#transcript-data)
    * [Filter](#filter)

In [1]:
import scvelo as scv
import scanpy as sc
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt



In [1]:
import os
from pathlib import Path

In [2]:
# Record the directory the kernel is currently running in.
# NOTE(review): a later cell calls os.chdir, so re-running this cell afterwards
# would capture the new working directory instead of the notebook folder.
notebook_dir = str(Path.cwd())
notebook_dir

'/Users/omercagatay/Desktop/Thesis/git_repo/SALMON/notebooks/data_outputs_exploration'

In [3]:
# Take the ancestor four levels above the notebook folder as the project root
# and make it the working directory, so the relative "data/..." paths used in
# the following cells resolve against it.
project_root = Path(notebook_dir).parents[3]
abs_path = str(project_root)
os.chdir(abs_path)
os.getcwd()

'/Users/omercagatay/Desktop/Thesis'

# Xenium Outputs

In [18]:
# Folder with the Xenium run outputs, located under the project root.
xenium_outs_subdir = "data/Xenium_V1_mouse_pup_outs"
data_dir = os.path.join(abs_path, xenium_outs_subdir)

In [19]:
# List the files present in the Xenium output folder.
os.listdir(data_dir)

['morphology_focus.ome.tif',
 'nucleus_boundaries.csv.gz',
 'cell_feature_matrix.tar.gz',
 'analysis.tar.gz',
 'metrics_summary.csv',
 'analysis_summary.html',
 'gene_panel.json',
 'cells.zarr.zip',
 'morphology_mip.ome.tif',
 'transcripts.parquet',
 'nucleus_boundaries.parquet',
 'cells.csv.gz',
 'cell_feature_matrix.zarr.zip',
 'morphology.ome.tif',
 'cell_feature_matrix.h5',
 'transcripts.zarr.zip',
 'cell_boundaries.csv.gz',
 'experiment.xenium',
 'cells.parquet',
 'transcripts.csv.gz',
 'cell_boundaries.parquet',
 'analysis.zarr.zip']

## Gene Panel

In [20]:
import json

# Parse the gene panel description. The context manager closes the file handle
# as soon as parsing is done (the handle was previously left open), and the
# encoding is pinned so the result does not depend on the platform default.
with open('data/Xenium_V1_mouse_pup_outs/gene_panel.json', encoding='utf-8') as f:
    data = json.load(f)

In [22]:
# Keep only the panel targets described as genes (control targets are skipped).
gene_targets = [
    target
    for target in data['payload']['targets']
    if target['type']['descriptor'] == "gene"
]

# Pull the three fields of interest out of each gene target.
gene = [target['type']['data']['name'] for target in gene_targets]
ensembl = [target['type']['data']['id'] for target in gene_targets]
cov = [str(target['info']['gene_coverage']) for target in gene_targets]  # coverage kept as text

# Assemble the gene table (written to CSV in a later cell).
out_df = pd.DataFrame(
    {'Gene name': gene, 'Ensembl ID': ensembl, 'Gene coverage': cov},
    columns=['Gene name', 'Ensembl ID', 'Gene coverage'],
)

In [23]:
# Show the gene table built from the panel JSON (gene name, Ensembl ID, coverage).
out_df

Unnamed: 0,Gene name,Ensembl ID,Gene coverage
0,0610005C13Rik,ENSMUSG00000109644,6
1,1110017D15Rik,ENSMUSG00000028441,7
2,2610528A11Rik,ENSMUSG00000096001,8
3,5330417C22Rik,ENSMUSG00000040412,8
4,6330403K07Rik,ENSMUSG00000018451,8
...,...,...,...
374,Vsnl1,ENSMUSG00000054459,8
375,Vsx2,ENSMUSG00000021239,8
376,Vwf,ENSMUSG00000001930,8
377,Wif1,ENSMUSG00000020218,8


In [24]:
# Write the gene table to CSV; index=False leaves the row index out of the file.
gene_table_csv = 'data/Xenium_V1_mouse_pup_outs/gene_ensembleID.csv'
out_df.to_csv(gene_table_csv, index=False)

## Cell Feature Matrix - adata

In [35]:
# Read the cell feature matrix from the 10x HDF5 file with scanpy and show
# the summary of the resulting object.
feature_matrix_h5 = 'data/Xenium_V1_mouse_pup_outs/cell_feature_matrix.h5'
adata = sc.read_10x_h5(feature_matrix_h5)
adata

AnnData object with n_obs × n_vars = 1355849 × 379
    var: 'gene_ids', 'feature_types', 'genome'

In [36]:
# Rename the "gene_ids" column of the var table to "Ensemble ID".
adata.var = adata.var.rename(columns={"gene_ids":"Ensemble ID"})
# Move the current var index into a regular column called "gene_name",
# modifying the var table in place; the index becomes a default integer one.
# NOTE(review): this cell does not look re-runnable - a second run would try to
# insert a "gene_name" column that already exists. Confirm and guard if needed.
adata.var.reset_index(inplace=True, drop = False, names= "gene_name")


In [37]:
# Check the var table after renaming "gene_ids" and adding the "gene_name" column.
adata.var

Unnamed: 0,gene_name,Ensemble ID,feature_types,genome
0,0610005C13Rik,ENSMUSG00000109644,Gene Expression,Unknown
1,1110017D15Rik,ENSMUSG00000028441,Gene Expression,Unknown
2,2610528A11Rik,ENSMUSG00000096001,Gene Expression,Unknown
3,5330417C22Rik,ENSMUSG00000040412,Gene Expression,Unknown
4,6330403K07Rik,ENSMUSG00000018451,Gene Expression,Unknown
...,...,...,...,...
374,Vsnl1,ENSMUSG00000054459,Gene Expression,Unknown
375,Vsx2,ENSMUSG00000021239,Gene Expression,Unknown
376,Vwf,ENSMUSG00000001930,Gene Expression,Unknown
377,Wif1,ENSMUSG00000020218,Gene Expression,Unknown


In [53]:
# Inspect the obs table of the feature matrix (its index holds the cell ids).
adata.obs

aaaabgpa-1
aaaabjde-1
aaaacikn-1
aaaackbp-1
aaaacllf-1
...
oinfhojg-1
oinfhphb-1
oinfieej-1
oinfkcfi-1
oinfkihb-1


## Cell Summary File

- The cell summary file (cells.csv.gz), in gzipped CSV format, contains data to help QC the transcript counts for each identified cell. The file contains one row per cell; its columns are shown in the table below.

In [27]:
# Load the per-cell summary table from the gzipped CSV.
cells_csv = "data/Xenium_V1_mouse_pup_outs/cells.csv.gz"
df_cells = pd.read_csv(cells_csv)

In [28]:
# Show the cell summary table (one row per cell_id).
df_cells

Unnamed: 0,cell_id,x_centroid,y_centroid,transcript_counts,control_probe_counts,control_codeword_counts,unassigned_codeword_counts,deprecated_codeword_counts,total_counts,cell_area,nucleus_area
0,aaaabgpa-1,850.288391,10613.065430,338,0,0,0,0,338,235.264071,49.310627
1,aaaabjde-1,837.167175,10610.744141,319,0,0,0,0,319,246.598290,42.266252
2,aaaacikn-1,835.130371,10622.626953,252,0,0,0,0,252,152.041099,40.821251
3,aaaackbp-1,823.748047,10620.277344,276,0,0,0,0,276,232.283758,17.069063
4,aaaacllf-1,842.690186,10630.465820,196,0,0,0,0,196,153.170006,28.990314
...,...,...,...,...,...,...,...,...,...,...,...
1355844,oinfhojg-1,9908.918945,19650.691406,85,0,0,0,0,85,59.064377,15.804688
1355845,oinfhphb-1,9911.258789,19618.589844,106,0,0,0,0,106,35.402501,22.442657
1355846,oinfieej-1,9911.923828,19655.921875,70,0,0,0,0,70,32.331876,13.456563
1355847,oinfkcfi-1,9913.118164,19636.181641,64,0,0,0,0,64,32.106095,11.243907


## Nucleus Boundaries

In [39]:
# Load the nucleus boundary vertices from the gzipped CSV.
nucleus_boundaries_csv = "data/Xenium_V1_mouse_pup_outs/nucleus_boundaries.csv.gz"
df_nucleus = pd.read_csv(nucleus_boundaries_csv)

In [40]:
# Show the nucleus boundary table (cell_id, vertex_x, vertex_y; several rows per cell).
df_nucleus

Unnamed: 0,cell_id,vertex_x,vertex_y
0,aaaabgpa-1,850.00000,10605.450
1,aaaabgpa-1,849.15000,10606.088
2,aaaabgpa-1,848.51250,10607.363
3,aaaabgpa-1,848.30005,10610.125
4,aaaabgpa-1,849.36255,10614.801
...,...,...,...
17623822,oinfkihb-1,9913.76300,19625.650
17623823,oinfkihb-1,9914.40000,19624.375
17623824,oinfkihb-1,9914.40000,19622.250
17623825,oinfkihb-1,9913.76300,19621.826


## Cell Boundaries

In [42]:
# Load the cell boundary vertices from the gzipped CSV.
cell_boundaries_csv = "data/Xenium_V1_mouse_pup_outs/cell_boundaries.csv.gz"
df_cell_bound = pd.read_csv(cell_boundaries_csv)

In [43]:
# Show the cell boundary table (cell_id, vertex_x, vertex_y; several rows per cell).
df_cell_bound

Unnamed: 0,cell_id,vertex_x,vertex_y
0,aaaabgpa-1,844.47504,10600.5625
1,aaaabgpa-1,842.56250,10602.0510
2,aaaabgpa-1,845.11255,10610.9760
3,aaaabgpa-1,845.11255,10621.3880
4,aaaabgpa-1,845.75000,10622.6630
...,...,...,...
17625516,oinfkihb-1,9914.40000,19624.8000
17625517,oinfkihb-1,9914.40000,19622.6760
17625518,oinfkihb-1,9914.82500,19622.2500
17625519,oinfkihb-1,9914.82500,19620.5500


In [46]:
def transform_group(group):
    """Collapse the boundary vertices of one group into a four-value row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group. Must provide 'vertex_x' and 'vertex_y'
        columns and at least four rows; shorter groups raise IndexError.

    Returns
    -------
    pandas.Series
        Labelled 'x1', 'x2', 'y1', 'y2'. x1/x2 are the x values of the first
        and second rows; y1/y2 are the y values of the third and fourth rows.

    NOTE(review): the x and y values come from different vertices (rows 0-1
    vs. rows 2-3). Confirm this is intended and not meant to be, for example,
    the same two vertices or a bounding box.
    """
    xs = group['vertex_x'].tolist()
    ys = group['vertex_y'].tolist()
    first_x, second_x = xs[0], xs[1]
    third_y, fourth_y = ys[2], ys[3]
    return pd.Series(
        [first_x, second_x, third_y, fourth_y],
        index=['x1', 'x2', 'y1', 'y2'],
    )

In [None]:
# Run transform_group on the vertices of every cell, then turn the cell_id
# group key back into an ordinary column.
vertices_by_cell = df_cell_bound.groupby('cell_id')
per_cell_summary = vertices_by_cell.apply(transform_group)
result_df = per_cell_summary.reset_index()

In [49]:
# Show the per-cell summary produced by transform_group (cell_id, x1, x2, y1, y2).
result_df

Unnamed: 0,cell_id,x1,x2,y1,y2
0,aaaabgpa-1,844.47504,842.56250,10610.976,10621.3880
1,aaaabjde-1,834.27500,831.08750,10608.426,10609.2750
2,aaaacikn-1,827.68750,830.87500,10631.588,10632.4375
3,aaaackbp-1,821.52500,816.00000,10619.900,10628.4000
4,aaaacllf-1,845.53754,838.10004,10632.650,10635.2000
...,...,...,...,...,...
1355844,oinfhojg-1,9910.36200,9907.81250,19648.389,19652.4260
1355845,oinfhphb-1,9911.42500,9907.81250,19619.062,19619.4880
1355846,oinfieej-1,9912.91300,9911.63800,19655.188,19655.4000
1355847,oinfkcfi-1,9913.97600,9910.78800,19636.275,19637.5500


## Transcript Data

- overlaps_nucleus = Binary value to indicate if the transcript falls within the segmented nucleus of the cell (1) or not (0)

In [3]:
import pandas as pd

In [4]:
# dask is helpful when reading big files
import dask.dataframe as dd

In [5]:
# Open the transcripts table as a dask DataFrame.
transcripts_parquet = "data/Xenium_V1_mouse_pup_outs/transcripts.parquet"
transcripts = dd.read_parquet(transcripts_parquet)

In [20]:
# Largest x coordinate among all transcripts; compute() triggers the evaluation.
x_location_max = transcripts['x_location'].max()
x_location_max.compute()

11470.7236328125

## Filter 

- Filtered Area (µm^2): 4718255.63
- Bounds: x ∈ [1829.62, 5763.16], y ∈ [4351.43, 5550.92]


In [31]:
# Bounds of the selected rectangle (inclusive on every side).
X_MIN, X_MAX = 1829.62, 5763.16
Y_MIN, Y_MAX = 4351.43, 5550.92

# Keep only the transcripts whose location falls inside the rectangle.
inside_x = (transcripts["x_location"] >= X_MIN) & (transcripts["x_location"] <= X_MAX)
inside_y = (transcripts["y_location"] >= Y_MIN) & (transcripts["y_location"] <= Y_MAX)
filtered_df = transcripts[inside_x & inside_y]

<img src = "../../../../data/images/selected_location.png" width="600">

In [34]:
# Destinations for the filtered transcripts (Parquet and CSV variants).
output_file_parquet, output_file_csv = (
    "data/Xenium_V1_mouse_pup_outs/transcripts_filtered.parquet",
    "data/Xenium_V1_mouse_pup_outs/transcripts_filtered.csv",
)

In [35]:
# The Parquet export is currently disabled; only the CSV is written.
#filtered_df.to_parquet(output_file_parquet)
# single_file=True asks dask to write a single CSV file; the displayed result
# is the list of written paths.
filtered_df.to_csv(output_file_csv,single_file=True)

['/Users/omercagatay/Desktop/Thesis/data/Xenium_V1_mouse_pup_outs/transcripts_filtered.csv']