In [2]:
!pip install anndata

Collecting anndata
  Downloading anndata-0.7.8-py3-none-any.whl (91 kB)
[K     |████████████████████████████████| 91 kB 13.9 MB/s eta 0:00:01
Collecting natsort
  Downloading natsort-8.0.2-py3-none-any.whl (37 kB)
Collecting xlrd<2.0
  Downloading xlrd-1.2.0-py2.py3-none-any.whl (103 kB)
[K     |████████████████████████████████| 103 kB 48.0 MB/s eta 0:00:01
Installing collected packages: xlrd, natsort, anndata
Successfully installed anndata-0.7.8 natsort-8.0.2 xlrd-1.2.0
You should consider upgrading via the '/home/roko/.cache/pypoetry/virtualenvs/spatial-G_n0JvVf-py3.8/bin/python -m pip install --upgrade pip' command.[0m


In [1]:
%load_ext autoreload
%autoreload 2
import pickle
import sys

import anndata
import h5py
import ipywidgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import scipy.sparse.linalg
import sklearn.linear_model
import sklearn.neighbors
import tqdm.notebook
%matplotlib inline
rng=np.random.default_rng()

# Remote URL and local file locations used by this notebook.
original_url = "https://datadryad.org/stash/downloads/file_stream/67671"
data_dir = '/data/spatial/moffit_merfish'  # shared parent directory of every file below
csv_location = data_dir + '/original_file.csv'
h5ad_location = data_dir + '/original_file.h5ad'
# printf-style template: fill in the neighbor count with `template % n`
connectivity_matrix_template = data_dir + '/connectivity_%dneighbors.h5ad'
genetypes_location = data_dir + '/genetypes.pkl'

# download csv

# munge into hdf5 file

# supplement hdf5 file with a column indicating "tissue id" for each cell

# create global graph (using 3 nearest neighbors)

# write down ligand/receptor sets

# run a simple experiment: use ligands and receptors to predict response genes in excitatory cells, with a linear model

In [2]:
# load data
nneigh = 3  # selects which precomputed connectivity file is read (see the filename template)
ad = anndata.read_h5ad(h5ad_location)
connectivity_path = connectivity_matrix_template % nneigh
connectivity_matrix = anndata.read_h5ad(connectivity_path).X
# map each gene name in ad.var.index to its position, used later as a column index into ad.X
gene_lookup = dict(zip(ad.var.index, range(len(ad.var.index))))

# NOTE(review): pickle.load can execute arbitrary code; only load this file from a trusted source.
with open(genetypes_location, 'rb') as f:
    genetypes = pickle.load(f)

In [3]:
# onehot encode cell classes
def oh_encode(lst):
    '''
    One-hot encode a sequence of labels.

    lst -- sequence of labels, one per item

    Returns (group_names, group_indexes): the sorted unique labels, and a
    boolean array of shape (len(lst), len(group_names)) whose entry [r, c]
    is True exactly when lst[r] equals group_names[c].
    '''
    labels = np.array(lst)
    group_names = np.unique(labels)
    # compare an (n, 1) column of labels against a (1, k) row of unique names
    group_indexes = labels[:, None] == group_names[None, :]
    return group_names, group_indexes
# one-hot encode the 'Cell_class' annotation of every cell
cell_class_labels = ad.obs['Cell_class']
cell_classes, cell_class_onehots = oh_encode(cell_class_labels)

In [4]:
# a function to construct a prediction problem for a subset of cells

def construct_problem(mask,target_gene,neighbor_genes,self_genes,cell_class='Excitatory'):
    '''
    Build covariates and a regression target for the cells of one class
    inside a subset of cells.

    Reads the notebook globals ad, connectivity_matrix, gene_lookup and
    cell_class_onehots.

    mask -- boolean selector over all cells; neighbor statistics only use edges
            between cells selected by mask
    target_gene -- gene to predict
    neighbor_genes -- names of genes which will be read from neighbors
    self_genes -- names of genes which will be read from target cell
    cell_class -- value of ad.obs['Cell_class'] whose cells form the rows of the
                  output (default 'Excitatory'); None keeps every cell in mask

    Returns (covariates, predict).  The columns of covariates are, in order:
    log1p expression of self_genes, neighbor-averaged log1p expression of
    neighbor_genes, fraction of neighbors in each cell class, and
    Centroid_X, Centroid_Y, Bregma.  predict is the log1p expression of
    target_gene for the same rows.

    Raises KeyError if a gene name is missing from gene_lookup.
    '''

    # load subset of data relevant to mask
    local_processed_expression=np.log1p(ad.X[mask].astype(float)) # log1p expression on subset of cells
    local_edges=connectivity_matrix[mask][:,mask]   # edges whose both endpoints are in the subset

    selfset_idxs=[gene_lookup[x] for x in self_genes] # column indexes of the self genes
    selfset_exprs = local_processed_expression[:,selfset_idxs] # each cell's own expression of self_genes

    neighborset_idxs=[gene_lookup[x] for x in neighbor_genes] # column indexes of the neighbor genes
    neighset_exprs = local_processed_expression[:,neighborset_idxs] # expression of neighbor_genes, before averaging

    # row sums of the edge matrix, i.e. each cell's neighbor count (for 0/1 edge weights).
    # NOTE: a cell without any neighbor inside mask has a count of 0, so the
    # divisions below are 0/0 for that cell.
    n_neighs=(local_edges@np.ones(local_edges.shape[0]))
    neigh_avgs = (local_edges@neighset_exprs) / n_neighs[:,None] # average neighbor_genes expression over neighbors

    neigh_cellclass_avgs = (local_edges@cell_class_onehots[mask]) / n_neighs[:,None] # celltype simplex

    positions=np.array(ad.obs[['Centroid_X','Centroid_Y','Bregma']])[mask] # get positions

    covariates=np.c_[selfset_exprs,neigh_avgs,neigh_cellclass_avgs,positions] # collect all covariates
    predict = local_processed_expression[:,gene_lookup[target_gene]] # collect what we're supposed to predict

    if cell_class is not None:
        # keep only the rows belonging to the requested cell class
        in_class=np.asarray((ad.obs['Cell_class']==cell_class)[mask])
        covariates=covariates[in_class]
        predict=predict[in_class]

    return covariates,predict

In [5]:
# ligands are read from neighbors; ligands and receptors are read from the cell itself
neighset = genetypes['ligands']
oset = np.concatenate([genetypes['ligands'], genetypes['receptors']])
# oset=neighset

# oset=[]
# neighset=[]

# fit on animals 2-4, evaluate on animal 1
animal_ids = ad.obs['Animal_ID']
train_mask = (animal_ids >= 2) & (animal_ids <= 4)
test_mask = animal_ids == 1
trainX, trainY = construct_problem(train_mask, 'Ace2', neighset, oset)
testX, testY = construct_problem(test_mask, 'Ace2', neighset, oset)

for features, target in ((trainX, trainY), (testX, testY)):
    print(features.shape, target.shape)

# whiten covariates
# mu=np.mean(trainX,axis=0)
# sig=np.std(trainX,axis=0)
# trainX=(trainX-mu)/sig
# testX=(testX-mu)/sig

(19855, 121) (19855,)
(11757, 121) (11757,)


In [6]:
# display the ligand gene names that are passed to construct_problem as neighbor_genes
neighset

array(['Cbln1', 'Cxcl14', 'Cbln2', 'Vgf', 'Scg2', 'Cartpt', 'Tac2',
       'Bdnf', 'Bmp7', 'Cyr61', 'Fn1', 'Fst', 'Gad1', 'Ntng1', 'Pnoc',
       'Selplg', 'Sema3c', 'Sema4d', 'Serpine1', 'Adcyap1', 'Cck', 'Crh',
       'Gal', 'Gnrh1', 'Nts', 'Oxt', 'Penk', 'Sst', 'Tac1', 'Trh', 'Ucn3'],
      dtype='<U8')

In [7]:
# ridge-regression baseline, scored by mean absolute error on the test split
model = sklearn.linear_model.Ridge(alpha=1.0)
model.fit(trainX, trainY)
ridge_abs_err = np.abs(model.predict(testX) - testY)
ridge_abs_err.mean()

0.17421921228055648

In [18]:
# HistGradientBoostingRegressor is a stable estimator in every scikit-learn release that
# accepts loss="absolute_error", so the deprecated sklearn.experimental enabling import
# is not needed here.
from sklearn.ensemble import HistGradientBoostingRegressor
model=HistGradientBoostingRegressor(
    loss="absolute_error",
    min_samples_leaf=2,
    verbose=1,
    random_state=129,
    max_iter=10000,
    n_iter_no_change=100,
)
model.fit(trainX,trainY)
# NOTE(review): the recorded log for this cell shows the same train/val loss on every
# listed iteration with 2-leaf trees -- check whether boosting improves on the initial
# prediction at all.
# mean absolute error on the test split
np.mean(np.abs(model.predict(testX)-testY))

Binning 0.017 GB of training data: 0.368 s
Binning 0.002 GB of validation data: 0.012 s
Fitting gradient boosted rounds:
[1/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.008s
[2/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.008s
[3/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.007s
[4/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.008s
[5/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.007s
[6/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.007s
[7/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.007s
[8/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.007s
[9/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.007s
[10/10000] 1 tree, 2 leaves, max depth = 1

[87/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.007s
[88/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.007s
[89/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.007s
[90/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.007s
[91/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.007s
[92/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.007s
[93/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.007s
[94/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.007s
[95/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.007s
[96/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10933, val loss: 0.11145, in 0.007s
[97/10000] 1 tree, 2 leaves, max depth = 1, train loss: 0.10

0.10130625125773991

### Same three cells as above, but with standardization this time.

In [13]:
# ligands are read from neighbors; ligands and receptors are read from the cell itself
neighset=genetypes['ligands']
oset=np.r_[genetypes['ligands'],genetypes['receptors']]
# oset=neighset

# oset=[]
# neighset=[]

# fit on animals 2-4, evaluate on animal 1
trainX,trainY=construct_problem((ad.obs['Animal_ID']>=2)&(ad.obs['Animal_ID']<=4),'Ace2',neighset,oset)
testX,testY=construct_problem((ad.obs['Animal_ID']==1),'Ace2',neighset,oset)

print(trainX.shape,trainY.shape)
print(testX.shape,testY.shape)

# whiten covariates, using statistics of the training split only
mu=np.mean(trainX,axis=0)
sig=np.std(trainX,axis=0)
# a column that is constant on the training split has sig == 0; dividing by it would give
# NaN/inf, so scale such columns by 1 instead (the same convention StandardScaler uses)
sig=np.where(sig==0,1.0,sig)
trainX=(trainX-mu)/sig
testX=(testX-mu)/sig

(19855, 121) (19855,)
(11757, 121) (11757,)


In [14]:
# ridge regression on the whitened covariates, scored by mean absolute error on the test split
model = sklearn.linear_model.Ridge(alpha=1.0)
model.fit(trainX, trainY)
ridge_abs_err = np.abs(model.predict(testX) - testY)
ridge_abs_err.mean()

0.17420094926196253

In [16]:
# HistGradientBoostingRegressor is a stable estimator in every scikit-learn release that
# accepts loss="absolute_error", so the deprecated sklearn.experimental enabling import
# is not needed here.
from sklearn.ensemble import HistGradientBoostingRegressor
model=HistGradientBoostingRegressor(loss="absolute_error")
model.fit(trainX,trainY)
# mean absolute error on the test split
np.mean(np.abs(model.predict(testX)-testY))

0.10130625125773991

### Comparison to `StandardScaler`

In [36]:
neighset=genetypes['ligands']
oset=np.r_[genetypes['ligands'],genetypes['receptors']]
# oset=neighset

# oset=[]
# neighset=[]

trainX,trainY=construct_problem((ad.obs['Animal_ID']>=2)&(ad.obs['Animal_ID']<=4),'Ace2',neighset,oset)
testX,testY=construct_problem((ad.obs['Animal_ID']==1),'Ace2',neighset,oset)

mu=np.mean(trainX,axis=0)
sig=np.std(trainX,axis=0)
trainX_Jackson=(trainX-mu)/sig
testX_Jackson=(testX-mu)/sig

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(trainX)
trainX_Roman = scaler.transform(trainX)