In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
#import squidpy as sq
from tqdm import tqdm

In [2]:
import sys
import os 
from pathlib import Path

In [3]:
notebook_dir = os.getcwd()
notebook_dir

'/Users/omercagatay/Desktop/Thesis/git_repo/SALMON/notebooks/metrics'

In [4]:
Path(notebook_dir).parents[3]

PosixPath('/Users/omercagatay/Desktop/Thesis')

In [5]:
os.chdir(Path(notebook_dir).parents[3])

# Read Data 

## Cell Feature Matrix

In [6]:
adata = sc.read_10x_h5('data/Xenium_V1_mouse_pup_outs/cell_feature_matrix.h5') 
#adata = sc.read('data/Xenium_V1_mouse_pup_outs/adata_filtered.h5ad') 

In [7]:
adata.var = adata.var.rename(columns={"gene_ids":"Ensemble ID"})
#adata.var.reset_index(inplace=True, drop = False, names= "gene_name")
#adata.obs.reset_index(inplace=True, drop=False, names= "cell_id")

## Cell Summary 

In [8]:
df_cells = pd.read_csv("data/Xenium_V1_mouse_pup_outs/cells.csv.gz")

In [9]:
df_cells.cell_id.value_counts()

cell_id
aaaabgpa-1    1
jlcihdnp-1    1
jlcifjco-1    1
jlciffio-1    1
jlcienmn-1    1
             ..
enilhfik-1    1
enilhckc-1    1
enilegml-1    1
enildhge-1    1
oinfkihb-1    1
Name: count, Length: 1355849, dtype: int64

In [10]:
df_adata_obs = df_cells[['cell_id','x_centroid',"y_centroid", "transcript_counts", "total_counts", "cell_area", "nucleus_area"]]

In [11]:
adata.obs = df_adata_obs.copy()

In [12]:
adata.obs

Unnamed: 0,cell_id,x_centroid,y_centroid,transcript_counts,total_counts,cell_area,nucleus_area
0,aaaabgpa-1,850.288391,10613.065430,338,338,235.264071,49.310627
1,aaaabjde-1,837.167175,10610.744141,319,319,246.598290,42.266252
2,aaaacikn-1,835.130371,10622.626953,252,252,152.041099,40.821251
3,aaaackbp-1,823.748047,10620.277344,276,276,232.283758,17.069063
4,aaaacllf-1,842.690186,10630.465820,196,196,153.170006,28.990314
...,...,...,...,...,...,...,...
1355844,oinfhojg-1,9908.918945,19650.691406,85,85,59.064377,15.804688
1355845,oinfhphb-1,9911.258789,19618.589844,106,106,35.402501,22.442657
1355846,oinfieej-1,9911.923828,19655.921875,70,70,32.331876,13.456563
1355847,oinfkcfi-1,9913.118164,19636.181641,64,64,32.106095,11.243907


In [13]:
adata.obs.set_index("cell_id",inplace=True)

In [14]:
del df_adata_obs
del df_cells

## Transcripts

In [15]:
transcripts = pd.read_csv("data/Xenium_V1_mouse_pup_outs/transcripts_filtered_gut.csv")

In [16]:
transcripts["cell_mapped"] = transcripts["cell_id"].apply(lambda x: "unassigned" if x == "UNASSIGNED" else "assigned")

In [17]:
tr = transcripts[transcripts["cell_id"] != "UNASSIGNED" ]
tr.reset_index(inplace=True, drop=True)
tr = tr.drop(columns=["Unnamed: 0"])

In [18]:
tr

Unnamed: 0,transcript_id,cell_id,overlaps_nucleus,feature_name,x_location,y_location,z_location,qv,fov_name,nucleus_distance,cell_mapped
0,282179351358717,cjceeepn-1,0,Fibin,4848.1240,4498.7305,21.928871,40.000000,G8,0.584109,assigned
1,282179351417454,cjceeepn-1,0,Myoz2,4844.3447,4486.2490,20.381886,40.000000,G8,12.603382,assigned
2,282179351417474,cjceeepn-1,0,Myoz2,4845.1177,4487.4224,20.421465,40.000000,G8,11.202755,assigned
3,282179351569878,cjceeepn-1,1,Upk3b,4849.0360,4498.8696,22.161554,16.672361,G8,0.000000,assigned
4,282179351641159,cjceeepn-1,0,Calcrl,4846.2040,4488.2515,20.463503,40.000000,G8,9.954653,assigned
...,...,...,...,...,...,...,...,...,...,...,...
2318953,282192237251365,apnmcbdl-1,0,Myl9,6215.2710,5075.3950,21.892443,16.658100,H11,1.935795,assigned
2318954,282192237251366,liocdlff-1,0,Myl9,6215.2800,5055.4795,24.919008,27.758255,H11,1.933230,assigned
2318955,282192237251384,apnmeebl-1,1,Pck1,6216.5600,5071.6494,27.098642,7.151604,H11,0.000000,assigned
2318956,282192237251387,apnmeebl-1,0,Chodl,6216.8000,5076.4805,23.685452,40.000000,H11,3.255371,assigned


In [19]:
tr_both = tr.groupby('cell_id')['overlaps_nucleus'].agg(
    takes_0 = lambda x: 1 if 0 in x.values else 0,
    takes_1 = lambda x: 1 if 1 in x.values else 0,
    takes_both=lambda x: 1 if {0, 1}.issubset(x.values) else 0
).reset_index()

tr = tr.merge(tr_both, on="cell_id", how="left" )
tr = tr[tr["takes_both"]==1]


In [21]:
tr_both.takes_1.value_counts()

takes_1
1    9683
0     107
Name: count, dtype: int64

In [20]:
tr_both.takes_both.value_counts()

takes_both
1    9681
0     109
Name: count, dtype: int64

In [22]:
tr = tr[tr["takes_both"]==1]

In [23]:
tr = tr.drop(columns=["takes_0","takes_1","takes_both"])
tr = tr[tr["feature_name"]!="Fam25c"]

In [25]:
adata = adata[:,adata.var.index!="Fam25c"]

In [26]:
adata

View of AnnData object with n_obs × n_vars = 1355849 × 378
    obs: 'x_centroid', 'y_centroid', 'transcript_counts', 'total_counts', 'cell_area', 'nucleus_area'
    var: 'Ensemble ID', 'feature_types', 'genome'

In [27]:
# divide data in nuclei and cytopasm
tn = tr[tr['overlaps_nucleus'] == 1]
tc = tr[tr['overlaps_nucleus'] == 0]

# create the cellxgene matrix for nuc counts and cytoplasmic
nuc = pd.crosstab(tn['cell_id'],tn['feature_name'])
cyt = pd.crosstab(tc['cell_id'],tc['feature_name'])

In [28]:
#get the cells which has a transcripts mappep to it 
adata = adata[adata.obs.index.isin(nuc.index)]
adata = adata[adata.obs.index.isin(cyt.index)]

In [29]:
adata

View of AnnData object with n_obs × n_vars = 9681 × 378
    obs: 'x_centroid', 'y_centroid', 'transcript_counts', 'total_counts', 'cell_area', 'nucleus_area'
    var: 'Ensemble ID', 'feature_types', 'genome'

In [30]:
# create "spliced", "unspliced" layers since scvelo looks for them
nucsort = nuc.loc[adata.obs.index,adata.var.index]
cytsort = cyt.loc[adata.obs.index,adata.var.index]

In [33]:
adata.layers['spliced'] = np.array(cytsort)
adata.layers['unspliced'] = np.array(nucsort)

  adata.layers['spliced'] = np.array(cytsort)


In [34]:
adata.layers

Layers with keys: spliced, unspliced

In [35]:
adata

AnnData object with n_obs × n_vars = 9681 × 378
    obs: 'x_centroid', 'y_centroid', 'transcript_counts', 'total_counts', 'cell_area', 'nucleus_area'
    var: 'Ensemble ID', 'feature_types', 'genome'
    layers: 'spliced', 'unspliced'

In [37]:
adata.uns['spots'] = tr

In [38]:
adata.obsm["spatial"] = adata.obs[["x_centroid", "y_centroid"]].copy().to_numpy()

In [41]:
adata.write("data/Xenium_V1_mouse_pup_outs/adata_gut_transcripts.h5ad")