In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as scp
import collections
import h5py

# Kidney/Liver SLRRP scoring

In [6]:
# Load the liver 'peripheral' score table written by the scoring step and preview it.
liv_periph = pd.read_csv('../outputs/gene_cell/liver_peripheral.csv')
liv_periph.head()

Unnamed: 0,metric,cell_id,annotation,num_spots,gene,num_gene_spots,median_rank,score,variance
0,peripheral,100038311040592126253478462795232776943,pericentral hepatocyte,130,153.0,9,72.0,-0.100775,0.086593
1,peripheral,100038311040592126253478462795232776943,pericentral hepatocyte,130,126.0,11,110.0,-0.689922,0.07206
2,peripheral,100038311040592126253478462795232776943,pericentral hepatocyte,130,154.0,12,37.5,0.434109,0.061247
3,peripheral,100038311040592126253478462795232776943,pericentral hepatocyte,130,175.0,1,116.0,-0.782946,0.338501
4,peripheral,100038311040592126253478462795232776943,pericentral hepatocyte,130,149.0,13,90.0,-0.379845,0.061403


In [7]:
# (rows, columns) of the liver score table.
liv_periph.shape

(865936, 9)

In [8]:
# Number of distinct cell_id values in the score table.
liv_periph['cell_id'].nunique()

16175

In [10]:
# Number of score rows (not cells) per annotation label.
liv_periph['annotation'].value_counts()

periportal hepatocyte           268697
pericentral hepatocyte          256994
unannotated                     103847
Kupffer cell                     55943
other endothelial cell           48935
pericentral endothelial cell     42825
hepatic stellate cell            29763
other hepatocyte                 26709
periportal endothelial cell      16504
NK cell                          15355
myeloid leukocyte                  364
Name: annotation, dtype: int64

In [13]:
# Number of distinct values in the gene column.
liv_periph['gene'].nunique()

307

# I've decided I'm going to use the scANVI cell-type annotations

I could write multiple files, one per annotation source, but the scANVI and scVI annotations appear very similar to each other, and both are closer to the Tabula Muris annotations, which I'll need to compare against using ReadZs

I'm going to open the SLRRP-formatted HDF5 files and re-annotate their cells using the annotations stored in the h5ad objects

In [80]:
# Load the scANVI kidney AnnData object and display its summary
# (the 'predicted_cell_type' obs column is used below for re-annotation).
kidney_anndata = scp.read_h5ad('/oak/stanford/groups/horence/rob/isoform_localizations/sprawl/preprocessing/KidneyLiver_preprocessing/data/cell_gene_counts/scANVI_kidney_object.h5ad')
kidney_anndata

AnnData object with n_obs × n_vars = 34387 × 306
    obs: 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'n_counts', 'fov', 'volume', 'center_x', 'center_y', 'min_x', 'max_x', 'min_y', 'max_y', 'median average DAPI', 'mean average DAPI', 'dataset', 'barcodeCount', 'tech', 'celltype_scanvi', '_scvi_batch', '_scvi_labels', 'predicted_cell_type'
    var: 'n_cells-scRNA-seq'
    uns: 'C_scANVI_colors', '_scvi', 'neighbors', 'tech_colors', 'umap'
    obsm: 'X_scANVI', 'X_umap', 'spatial'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [81]:
# Build a lookup from cell id to predicted cell type for the kidney object.
# The obs index is split on '-' and only the leading piece is kept as the id.
predicted = kidney_anndata.obs['predicted_cell_type']
onts = predicted.values
cell_ids = predicted.index.str.split('-').str[0]
cell_id_to_ont = dict(zip(cell_ids, onts))

# Should equal the number of observations if the shortened ids are still unique.
len(cell_id_to_ont)

34387

In [82]:
# Count how many cells in this kidney HDF5 have an entry in cell_id_to_ont.
# The context manager closes the file even if reading raises; `f` stays bound
# afterwards as a closed handle, matching the original explicit close().
with h5py.File('../inputs/kidney_111921_gv_filt.hdf5','r') as f:
    # cell ids are stored as bytes, so decode before the dict lookup
    num_cells_mapping = sum(cell_id.decode() in cell_id_to_ont for cell_id in f['cell_ids'])

print(num_cells_mapping)

31253


In [83]:
#no cells in this other kidney file have an ontology mapping
# Same count as for the 111921 file; the context manager closes the file even
# if reading raises.
with h5py.File('../inputs/kidney_121021_gv_filt.hdf5','r') as f:
    # cell ids are stored as bytes, so decode before the dict lookup
    num_cells_mapping = sum(cell_id.decode() in cell_id_to_ont for cell_id in f['cell_ids'])

print(num_cells_mapping)

0


In [92]:
# Write an 'annotation' attribute onto every cell group in the kidney HDF5.
# Cells without an entry in cell_id_to_ont are labelled 'unannotated'.
# NOTE: this modifies the input file in place and relies on cell_id_to_ont
# still holding the kidney mapping (the liver cell below rebinds that name).
# The file is opened for writing, so use a context manager: it is closed
# even if the loop raises part-way through.
with h5py.File('../inputs/kidney_111921_gv_filt.hdf5','a') as f:
    for cell_id in f['cell_ids']:
        cell_id = cell_id.decode()  # ids are stored as bytes
        ont = cell_id_to_ont.get(cell_id, 'unannotated')
        f['cells'][cell_id].attrs['annotation'] = ont

In [93]:
#now doing the same thing for the liver
# NOTE: this rebinds onts, cell_ids and cell_id_to_ont, replacing the kidney
# mapping built earlier; re-running the kidney annotation cell after this one
# would use the liver mapping.
liver_anndata = scp.read_h5ad('/oak/stanford/groups/horence/rob/isoform_localizations/sprawl/preprocessing/KidneyLiver_preprocessing/data/cell_gene_counts/scANVI_liver_object.h5ad')
onts = liver_anndata.obs['predicted_cell_type'].values
cell_ids = liver_anndata.obs['predicted_cell_type'].index.str.split('-').str[0]
cell_id_to_ont = {c:o for c,o in zip(cell_ids,onts)}

# Tally of how many cells receive each annotation label.
used_onts = collections.Counter()

# The file is opened for writing (modified in place), so use a context manager:
# it is closed even if the loop raises part-way through.
with h5py.File('../inputs/liver_gv_filt.hdf5','a') as f:
    for cell_id in f['cell_ids']:
        cell_id = cell_id.decode()  # ids are stored as bytes
        ont = cell_id_to_ont.get(cell_id, 'unannotated')
        used_onts[ont] += 1
        f['cells'][cell_id].attrs['annotation'] = ont

In [94]:
# Per-label cell counts tallied while annotating the liver HDF5.
used_onts

Counter({'Kupffer cell': 1060,
         'NK cell': 276,
         'hepatic stellate cell': 541,
         'myeloid leukocyte': 7,
         'other endothelial cell': 870,
         'other hepatocyte': 441,
         'pericentral endothelial cell': 760,
         'pericentral hepatocyte': 4892,
         'periportal endothelial cell': 273,
         'periportal hepatocyte': 5325,
         'unannotated': 1730})

# Debugging SLRRM issue in scoring (how did this issue arise?)

In [1]:
# Debug setup: make the local sprawl package importable, load the liver sample,
# and fetch a handful of cells by id for inspection.
# Statement order matters: the path must be extended before sprawl is imported.
import sys
sys.path.append('../src')
import sprawl
from sprawl import scoring

# NOTE(review): importlib.reload does not recursively reload submodules, so the
# `scoring` name imported above still refers to the module object loaded
# before this reload.
import importlib
importlib.reload(sprawl)

sample = sprawl.HDF5('../inputs/liver_gv_filt.hdf5')

# Hard-coded cell ids; they match the first ten ids listed from the HDF5 file
# later in this notebook.
cell_ids = [
    '100005199661550619447465419803033850982',
    '100038311040592126253478462795232776943',
    '100053577078057646947687333568984053913',
    '100099503537996968024320038623424846280',
    '100129014936507204072415930555467950524',
    '100146770389838818636969302040055032402',
    '100162920576728168099408864835859286684',
    '100202832352906207372119570941630619327',
    '100328528037744692615065366839920065024',
    '100329676076613869680699081068056965514'
]

cells = sample.get_cells_by_id(cell_ids)
cells[0]

Cell-100005199661550619447465419803033850982-pericentral hepatocyte

In [2]:
# Inspect the `n` attribute of the first cell.
# NOTE(review): presumably the cell's spot count — confirm in sprawl.
cells[0].n

314

In [None]:
# NOTE(review): unexecuted scratch cell (no execution count). `cell` is not
# defined in any visible cell and `med_` looks like a truncated tab-completion;
# this would raise on Restart & Run All — consider deleting.
cell.med_

In [31]:
# Call the private scoring._iter_scores on a single cell and keep the first
# item it yields; the recorded output shows that item is a Cell object.
x = list(scoring._iter_scores([cells[0]],'peripheral'))
c = x[0]
c

Cell-100005199661550619447465419803033850982-pericentral hepatocyte

In [25]:
# Inspect gene_med_ranks on the first cell; it was an empty dict in the recorded run.
# NOTE(review): execution counts here are out of order, so this may not reflect
# the state after the scoring call shown above — confirm on a fresh run.
cells[0].gene_med_ranks

{}

In [3]:
# Run the public scoring entry point on a single cell and materialise the results.
# NOTE(review): the recorded output is dominated by debug prints ('woo', 'hello')
# that presumably come from inside sprawl — remove them and clear this output
# before sharing.
list(sprawl.iter_scores([cells[0]], metric='peripheral'))

woo 314
hello
157.5
157.5 112.0
woo 314
hello
157.5
157.5 106.5
woo 314
hello
157.5
157.5 141.0
woo 314
hello
157.5
157.5 178.0
woo 314
hello
157.5
157.5 77.0
woo 314
hello
157.5
157.5 231.0
woo 314
hello
157.5
157.5 142.0
woo 314
hello
157.5
157.5 94.0
woo 314
hello
157.5
157.5 150.0
woo 314
hello
157.5
157.5 230.0
woo 314
hello
157.5
157.5 112.5
woo 314
hello
157.5
157.5 285.0
woo 314
hello
157.5
157.5 308.0
woo 314
hello
157.5
157.5 257.0
woo 314
hello
157.5
157.5 195.0
woo 314
hello
157.5
157.5 273.0
woo 314
hello
157.5
157.5 138.0
woo 314
hello
157.5
157.5 147.0
woo 314
hello
157.5
157.5 47.0
woo 314
hello
157.5
157.5 194.0
woo 314
hello
157.5
157.5 207.5
woo 314
hello
157.5
157.5 215.0
woo 314
hello
157.5
157.5 226.5
woo 314
hello
157.5
157.5 143.0
woo 314
hello
157.5
157.5 245.0
woo 314
hello
157.5
157.5 8.0
woo 314
hello
157.5
157.5 208.0
woo 314
hello
157.5
157.5 262.0
woo 314
hello
157.5
157.5 176.0
woo 314
hello
157.5
157.5 264.0
woo 314
hello
157.5
157.5 294.0
woo 314
hello

[        metric                                  cell_id  \
 0   peripheral  100005199661550619447465419803033850982   
 1   peripheral  100005199661550619447465419803033850982   
 2   peripheral  100005199661550619447465419803033850982   
 3   peripheral  100005199661550619447465419803033850982   
 4   peripheral  100005199661550619447465419803033850982   
 5   peripheral  100005199661550619447465419803033850982   
 6   peripheral  100005199661550619447465419803033850982   
 7   peripheral  100005199661550619447465419803033850982   
 8   peripheral  100005199661550619447465419803033850982   
 9   peripheral  100005199661550619447465419803033850982   
 10  peripheral  100005199661550619447465419803033850982   
 11  peripheral  100005199661550619447465419803033850982   
 12  peripheral  100005199661550619447465419803033850982   
 13  peripheral  100005199661550619447465419803033850982   
 14  peripheral  100005199661550619447465419803033850982   
 15  peripheral  10000519966155061944746

In [113]:
# IPython help lookup (trailing `?`), not plain Python; leftover from
# development — remove before finalising the notebook.
sample.get_cells_by_id?

In [107]:
# Duplicate IPython help lookup (trailing `?`); leftover from development —
# remove before finalising the notebook.
sample.get_cells_by_id?

In [101]:
# Peek at the first ten raw (bytes) cell ids stored in the HDF5 file.
# NOTE(review): every visible cell that opens `f` also closes it, so this relies
# on `f` having been reopened in a cell not shown here — likely fails on a
# fresh Restart & Run All.
f['cell_ids'][:10]

array([b'100005199661550619447465419803033850982',
       b'100038311040592126253478462795232776943',
       b'100053577078057646947687333568984053913',
       b'100099503537996968024320038623424846280',
       b'100129014936507204072415930555467950524',
       b'100146770389838818636969302040055032402',
       b'100162920576728168099408864835859286684',
       b'100202832352906207372119570941630619327',
       b'100328528037744692615065366839920065024',
       b'100329676076613869680699081068056965514'], dtype='|S39')

# Kidney: mapping from Tabula Muris to vizgen ontologies

In [6]:
# Load the Tabula Muris droplet metadata and show the first five rows transposed.
# NOTE(review): the recorded output starts with the tail of a warning — possibly
# pandas' mixed-dtype warning; if so, consider passing explicit dtypes. Confirm.
muris_meta_df = pd.read_csv('/oak/stanford/groups/horence/Roozbeh/single_cell_project/utility_files/meta_data/Tabula_muris_senis/Tabula_muris_droplet_new.csv')
muris_meta_df.head().T

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,0,1,2,3,4
index,AAACCTGAGTCCTCCT-1-47-1-0,AAACCTGCAGCTATTG-1-47-1-0,AAAGATGAGCAGGCTA-1-47-1-0,AAAGCAAGTCTCTTAT-1-47-1-0,AAAGCAATCCACGAAT-1-47-1-0
age,1m,1m,1m,1m,1m
cell,10X_P5_4_AAACCTGAGTCCTCCT,10X_P5_4_AAACCTGCAGCTATTG,10X_P5_4_AAAGATGAGCAGGCTA,10X_P5_4_AAAGCAAGTCTCTTAT,10X_P5_4_AAAGCAATCCACGAAT
cell_ontology_class,bladder cell,bladder urothelial cell,bladder cell,bladder cell,bladder cell
cell_ontology_id,CL:1001319,CL:1001428,CL:1001319,CL:1001319,CL:1001319
free_annotation,bladder mesenchymal cell (Car3+),luminal bladder epithelial cell (umbrella cell),bladder mesenchymal cell (Car3+),bladder mesenchymal cell (Scara5+),bladder mesenchymal cell (Scara5+)
method,droplet,droplet,droplet,droplet,droplet
mouse.id,1-M-63,1-M-63,1-M-63,1-M-63,1-M-63
n_genes,3301,2236,3670,2699,2742
sex,male,male,male,male,male


In [12]:
# Distinct tissue labels present in the Tabula Muris metadata.
muris_meta_df['tissue'].unique()

array(['Bladder', 'Fat', 'Heart_and_Aorta', 'Kidney', 'Large_Intestine',
       'Limb_Muscle', 'Liver', 'Lung', 'Mammary_Gland', 'Marrow',
       'Pancreas', 'Spleen', 'Thymus', 'Tongue', 'Trachea'], dtype=object)

In [34]:
# Distinct Tabula Muris cell_ontology_class labels among rows whose tissue is 'Kidney'.
is_kidney = muris_meta_df['tissue'] == 'Kidney'
muris_kidney_onts = set(muris_meta_df.loc[is_kidney, 'cell_ontology_class'])

print('There are',len(muris_kidney_onts),'Kidney cell_ontology_classes by Muris')
muris_kidney_onts

There are 18 Kidney cell_ontology_classes by Muris


{'B cell',
 'NK cell',
 'T cell',
 'brush cell',
 'epithelial cell of proximal tubule',
 'fenestrated cell',
 'fibroblast',
 'kidney capillary endothelial cell',
 'kidney collecting duct principal cell',
 'kidney cortex artery cell',
 'kidney distal convoluted tubule epithelial cell',
 'kidney loop of Henle thick ascending limb epithelial cell',
 'kidney mesangial cell',
 'kidney proximal convoluted tubule epithelial cell',
 'lymphocyte',
 'macrophage',
 'plasma cell',
 'podocyte'}

In [45]:
#First trying the MERFISH_Kidney file
# Compare its 'tentative_annot' labels with the Tabula Muris kidney labels.
viz_anndata = scp.read_h5ad('/oak/stanford/groups/horence/rob/isoform_localizations/sprawl/preprocessing/KidneyLiver_preprocessing/data/cell_gene_counts/MERFISH_kidney_object.h5ad')
viz_onts = set(viz_anndata.obs['tentative_annot'])
shared_onts = muris_kidney_onts & viz_onts

print('{} onts from MERFISH kidney:\n'.format(len(viz_onts)), viz_onts)
print()
print('{} shared onts with Muris:\n'.format(len(shared_onts)), shared_onts)

6 onts from MERFISH kidney:
 {'podocyte', 'epithelial cell of proximal tubule', 'kidney distal convoluted tubule epithelial cell', 'kidney loop of Henle epithelial cell', 'endothelial cell', 'kidney collecting duct epithelial cell'}

3 shared onts with Muris:
 {'epithelial cell of proximal tubule', 'kidney distal convoluted tubule epithelial cell', 'podocyte'}


In [48]:
#scANVI
# Compare the scANVI 'predicted_cell_type' labels with the Tabula Muris kidney labels.
viz_anndata = scp.read_h5ad('/oak/stanford/groups/horence/rob/isoform_localizations/sprawl/preprocessing/KidneyLiver_preprocessing/data/cell_gene_counts/scANVI_kidney_object.h5ad')
viz_onts = set(viz_anndata.obs['predicted_cell_type'])
shared_onts = muris_kidney_onts & viz_onts

print('{} onts from scANVI kidney:\n'.format(len(viz_onts)), viz_onts)
print()
print('{} shared onts with Muris:\n'.format(len(shared_onts)), shared_onts)

11 onts from scANVI kidney:
 {'lymphocyte', 'fibroblast', 'pericyte', 'glomerular capillary endothelial cell', 'podocyte', 'epithelial cell of proximal tubule', 'kidney distal convoluted tubule epithelial cell', 'kidney loop of Henle epithelial cell', 'macrophage', 'kidney blood vessel cell', 'kidney collecting duct epithelial cell'}

6 shared onts with Muris:
 {'lymphocyte', 'fibroblast', 'kidney distal convoluted tubule epithelial cell', 'epithelial cell of proximal tubule', 'podocyte', 'macrophage'}


In [49]:
#scVI
# Compare the scVI 'predicted_cell_type' labels with the Tabula Muris kidney labels.
viz_anndata = scp.read_h5ad('/oak/stanford/groups/horence/rob/isoform_localizations/sprawl/preprocessing/KidneyLiver_preprocessing/data/cell_gene_counts/scVI_kidney_object.h5ad')
viz_onts = set(viz_anndata.obs['predicted_cell_type'].values)
shared_onts = muris_kidney_onts.intersection(viz_onts)

# Label fixed: this cell loads the scVI object, but the message previously said
# 'scANVI' (copy-paste from the cell above). Re-run to refresh the recorded output.
print('{} onts from scVI kidney:\n'.format(len(viz_onts)),viz_onts)
print('')
print('{} shared onts with Muris:\n'.format(len(shared_onts)),shared_onts)

11 onts from scANVI kidney:
 {'lymphocyte', 'fibroblast', 'pericyte', 'glomerular capillary endothelial cell', 'podocyte', 'epithelial cell of proximal tubule', 'kidney distal convoluted tubule epithelial cell', 'kidney loop of Henle epithelial cell', 'macrophage', 'kidney blood vessel cell', 'kidney collecting duct epithelial cell'}

6 shared onts with Muris:
 {'lymphocyte', 'fibroblast', 'kidney distal convoluted tubule epithelial cell', 'epithelial cell of proximal tubule', 'podocyte', 'macrophage'}


# Liver: mapping from Tabula Muris to vizgen ontologies

In [52]:
# Distinct Tabula Muris cell_ontology_class labels among rows whose tissue is 'Liver'.
is_liver = muris_meta_df['tissue'] == 'Liver'
muris_liver_onts = set(muris_meta_df.loc[is_liver, 'cell_ontology_class'])

print('There are',len(muris_liver_onts),'Liver cell_ontology_classes by Muris')
muris_liver_onts

There are 8 Liver cell_ontology_classes by Muris


{'B cell',
 'Kupffer cell',
 'NK cell',
 'endothelial cell of hepatic sinusoid',
 'hepatic stellate cell',
 'hepatocyte',
 'myeloid leukocyte',
 'plasmacytoid dendritic cell'}

In [53]:
#First trying the MERFISH_Liver file
# Compare its 'tentative_annot' labels with the Tabula Muris liver labels.
viz_anndata = scp.read_h5ad('/oak/stanford/groups/horence/rob/isoform_localizations/sprawl/preprocessing/KidneyLiver_preprocessing/data/cell_gene_counts/MERFISH_liver_object.h5ad')
viz_onts = set(viz_anndata.obs['tentative_annot'])
shared_onts = muris_liver_onts & viz_onts

print('{} onts from MERFISH liver:\n'.format(len(viz_onts)), viz_onts)
print()
print('{} shared onts with Muris:\n'.format(len(shared_onts)), shared_onts)

7 onts from MERFISH liver:
 {'immune cell', 'periportal endothelial cell', 'hepatic stellate cell', 'pericentral hepatocyte', 'pericentral endothelial cell', 'periportal hepatocyte', 'Kupffer cell'}

2 shared onts with Muris:
 {'Kupffer cell', 'hepatic stellate cell'}


In [54]:
#scANVI
# Compare the scANVI 'predicted_cell_type' labels with the Tabula Muris liver labels.
viz_anndata = scp.read_h5ad('/oak/stanford/groups/horence/rob/isoform_localizations/sprawl/preprocessing/KidneyLiver_preprocessing/data/cell_gene_counts/scANVI_liver_object.h5ad')
viz_onts = set(viz_anndata.obs['predicted_cell_type'])
shared_onts = muris_liver_onts & viz_onts

print('{} onts from scANVI liver:\n'.format(len(viz_onts)), viz_onts)
print()
print('{} shared onts with Muris:\n'.format(len(shared_onts)), shared_onts)

11 onts from scANVI liver:
 {'myeloid leukocyte', 'periportal endothelial cell', 'other hepatocyte', 'hepatic stellate cell', 'NK cell', 'pericentral hepatocyte', 'pericentral endothelial cell', 'periportal hepatocyte', 'plasmacytoid dendritic cell', 'Kupffer cell', 'other endothelial cell'}

5 shared onts with Muris:
 {'myeloid leukocyte', 'hepatic stellate cell', 'NK cell', 'plasmacytoid dendritic cell', 'Kupffer cell'}


In [55]:
#scVI
# Compare the scVI 'predicted_cell_type' labels with the Tabula Muris liver labels.
viz_anndata = scp.read_h5ad('/oak/stanford/groups/horence/rob/isoform_localizations/sprawl/preprocessing/KidneyLiver_preprocessing/data/cell_gene_counts/scVI_liver_object.h5ad')
viz_onts = set(viz_anndata.obs['predicted_cell_type'])
shared_onts = muris_liver_onts & viz_onts

print('{} onts from scVI liver:\n'.format(len(viz_onts)), viz_onts)
print()
print('{} shared onts with Muris:\n'.format(len(shared_onts)), shared_onts)

11 onts from scVI liver:
 {'myeloid leukocyte', 'periportal endothelial cell', 'other hepatocyte', 'hepatic stellate cell', 'NK cell', 'pericentral hepatocyte', 'pericentral endothelial cell', 'periportal hepatocyte', 'plasmacytoid dendritic cell', 'Kupffer cell', 'other endothelial cell'}

5 shared onts with Muris:
 {'myeloid leukocyte', 'hepatic stellate cell', 'NK cell', 'plasmacytoid dendritic cell', 'Kupffer cell'}
