In [None]:
!pip install anndata

In [1]:
%load_ext autoreload
%autoreload 2
import json
import h5py
import anndata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import scipy.sparse.linalg
rng=np.random.default_rng()
import tqdm.notebook
import pickle
%matplotlib inline
import sys
import ipywidgets
import sklearn.neighbors
from scipy.sparse import csr_matrix
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
import time
import shap
from sklearn.preprocessing import StandardScaler


original_url= "https://datadryad.org/stash/downloads/file_stream/67671"
csv_location='data/spatial/moffit_merfish/original_file.csv'
h5ad_location='data/spatial/moffit_merfish/original_file.h5ad'
connectivity_matrix_template='data/spatial/moffit_merfish/connectivity_%d%s.h5ad'
genetypes_location='data/spatial/moffit_merfish/genetypes.pkl'



# download csv

In [None]:
import requests

with open(csv_location, "wb") as csvf:
    csvf.write(requests.get(original_url).content)

# munge into hdf5 file

In [None]:
dataframe = pd.read_csv(csv_location)

dct={}
for colnm, dtype in zip(dataframe.keys()[:9], dataframe.dtypes[:9]):
    if dtype.kind == "O":
        dct[colnm]=np.require(dataframe[colnm], dtype="U36")
    else:
        dct[colnm]=np.require(dataframe[colnm])
# change expression here to make it synthetic
expression = np.array(dataframe[dataframe.keys()[9:]]).astype(np.float16)
print(expression)
gene_names = np.array(dataframe.keys()[9:], dtype="U80")
cellid=dct.pop('Cell_ID')

ad=anndata.AnnData(
    X=expression,
    var=pd.DataFrame(index=gene_names),
    obs=pd.DataFrame(dct,index=cellid)
)

ad.write_h5ad(h5ad_location)

# supplement hdf5 file with a column indicating "tissue id" for each cell

In [None]:
ad=anndata.read_h5ad(h5ad_location)
animal_ids=np.unique(ad.obs['Animal_ID'])
bregmas=np.unique(ad.obs['Bregma'])
tissue_id=np.zeros(len(ad),dtype=int)
n_tissues=0
    
for aid in animal_ids:
    for bregma in bregmas:
        good=(ad.obs['Animal_ID']==aid)&(ad.obs['Bregma']==bregma)
        if np.sum(good)>0:
            tissue_id[good]=n_tissues
            n_tissues+=1
ad.obs['Tissue_ID']=tissue_id
ad.write_h5ad(h5ad_location)

# create global graph 

In [None]:
ad=anndata.read_h5ad(h5ad_location)
row=np.zeros(0,dtype=int)
col=np.zeros(0,dtype=int)
nneigh=10
radius=0
mode="rad"

for tid in tqdm.notebook.tqdm(np.unique(ad.obs['Tissue_ID'])):
    good=ad.obs['Tissue_ID']==tid
    pos=np.array(ad.obs[good][['Centroid_X','Centroid_Y']])
    idxs=np.where(good)[0]
    if mode == "neighbors":
        if nneigh == 0:
            E = csr_matrix(np.eye(pos.shape[0]))
        else:
            p=sklearn.neighbors.BallTree(pos)
            E=sklearn.neighbors.kneighbors_graph(pos,nneigh,mode='connectivity')
        col=np.r_[col,idxs[E.tocoo().col]]
        row=np.r_[row,idxs[E.tocoo().row]]
    if mode == "rad":
        p=sp.spatial.cKDTree(pos)
        # E=p.query_ball_point(pos, r=radius, return_sorted=False)
        edges=p.query_pairs(r=radius)
        col=np.r_[col,np.concatenate((idxs[[y for (x,y) in edges]], idxs[[x for (x,y) in edges]]))]
        row=np.r_[row,np.concatenate((idxs[[x for (x,y) in edges]], idxs[[y for (x,y) in edges]]))]

E=(scipy.sparse.diags([1] * len(ad), 0) + sp.sparse.coo_matrix((np.ones(len(col)),(row,col)),shape=(len(ad),len(ad)))).tocsr()

if mode == "neighbors":
    anndata.AnnData(E).write_h5ad(connectivity_matrix_template%(nneigh, mode))
if mode == "rad":
    anndata.AnnData(E).write_h5ad(connectivity_matrix_template%(radius, mode))

# write down ligand/receptor sets

In [None]:
ligands=np.array(['Cbln1', 'Cxcl14', 'Cbln2', 'Vgf', 'Scg2', 'Cartpt', 'Tac2',
       'Bdnf', 'Bmp7', 'Cyr61', 'Fn1', 'Fst', 'Gad1', 'Ntng1', 'Pnoc',
       'Selplg', 'Sema3c', 'Sema4d', 'Serpine1', 'Adcyap1', 'Cck', 'Crh',
       'Gal', 'Gnrh1', 'Nts', 'Oxt', 'Penk', 'Sst', 'Tac1', 'Trh', 'Ucn3'])

receptors=np.array(['Crhbp', 'Gabra1', 'Gpr165', 'Glra3', 'Gabrg1', 'Adora2a',
       'Avpr1a', 'Avpr2', 'Brs3', 'Calcr', 'Cckar', 'Cckbr', 'Crhr1',
       'Crhr2', 'Galr1', 'Galr2', 'Grpr', 'Htr2c', 'Igf1r', 'Igf2r',
       'Kiss1r', 'Lepr', 'Lpar1', 'Mc4r', 'Npy1r', 'Npy2r', 'Ntsr1',
       'Oprd1', 'Oprk1', 'Oprl1', 'Oxtr', 'Pdgfra', 'Prlr', 'Ramp3',
       'Rxfp1', 'Slc17a7', 'Slc18a2', 'Tacr1', 'Tacr3', 'Trhr'])

response_genes=np.array(['Ace2', 'Aldh1l1', 'Amigo2', 'Ano3', 'Aqp4', 'Ar', 'Arhgap36',
       'Baiap2', 'Ccnd2', 'Cd24a', 'Cdkn1a', 'Cenpe', 'Chat', 'Coch',
       'Col25a1', 'Cplx3', 'Cpne5', 'Creb3l1', 'Cspg5', 'Cyp19a1',
       'Cyp26a1', 'Dgkk', 'Ebf3', 'Egr2', 'Ermn', 'Esr1', 'Etv1',
       'Fbxw13', 'Fezf1', 'Gbx2', 'Gda', 'Gem', 'Gjc3', 'Greb1',
       'Irs4', 'Isl1', 'Klf4', 'Krt90', 'Lmod1', 'Man1a', 'Mbp', 'Mki67',
       'Mlc1', 'Myh11', 'Ndnf', 'Ndrg1', 'Necab1', 'Nnat', 'Nos1',
       'Npas1', 'Nup62cl', 'Omp', 'Onecut2', 'Opalin', 'Pak3', 'Pcdh11x',
       'Pgr', 'Plin3', 'Pou3f2', 'Rgs2', 'Rgs5', 'Rnd3', 'Scgn',
       'Serpinb1b', 'Sgk1', 'Slc15a3', 'Slc17a6', 'Slc17a8', 'Slco1a4',
       'Sln', 'Sox4', 'Sox6', 'Sox8', 'Sp9', 'Synpr', 'Syt2', 'Syt4',
       'Sytl4', 'Th', 'Tiparp', 'Tmem108', 'Traf4', 'Ttn', 'Ttyh2'])
cell_types = [
        "Ambiguous",
        "Astrocyte",
        "Endothelial 1",
        "Endothelial 2",
        "Endothelial 3",
        "Ependymal",
        "Excitatory",
        "Inhibitory",
        "Microglia",
        "OD Immature 1",
        "OD Immature 2",
        "OD Mature 1",
        "OD Mature 2",
        "OD Mature 3",
        "OD Mature 4",
        "Pericytes",
    ]

In [None]:
with open(genetypes_location,'wb') as f:
    pickle.dump(dict(ligands=ligands,receptors=receptors,response_genes=response_genes),f)

# run a simple experiment: use ligands and receptors to predict response genes in excitatory cells, with a linear model

In [None]:
# load data
# These are set above. You can change these here if you want though.
# radius=60
# mode="rad"
ad=anndata.read_h5ad(h5ad_location)
if mode == "neighbors":
    connectivity_matrix=anndata.read_h5ad(connectivity_matrix_template%(nneigh,mode)).X
if mode == "rad":
     connectivity_matrix=anndata.read_h5ad(connectivity_matrix_template%(radius,mode)).X
gene_lookup={x:i for (i,x) in enumerate(ad.var.index)}

In [None]:
with open(genetypes_location,'rb') as f:
    genetypes=pickle.load(f)

In [None]:
# onehot encode cell classes
def oh_encode(lst):
    lst=np.array(lst)
    group_names=np.unique(lst)
    group_indexes=np.zeros((len(lst),len(group_names)),dtype=bool)
    for i,nm in enumerate(group_names):
        group_indexes[lst==nm,i]=True
    return group_names,group_indexes
cell_classes,cell_class_onehots=oh_encode(ad.obs['Cell_class'])

In [None]:
# a function to construct a prediction problem for a subset of cells

def construct_problem(mask,target_gene,neighbor_genes,self_genes,filter_excitatory=False):
    '''
    mask -- set of cells
    target_gene -- gene to predict
    neighbor_genes -- names of genes which will be read from neighbors
    self_genes -- names of genes which will be read from target cell
    '''
    
    feature_names = []
    
    # load subset of data relevant to mask
    local_processed_expression=np.log1p(ad.X[mask].astype(float)) # get expression on subset of cells
    local_edges=connectivity_matrix[mask][:,mask]   # get edges for subset
    
    selfset_idxs=[gene_lookup[x] for x in self_genes] # collect the column indexes associated with them
    selfset_exprs = local_processed_expression[:,selfset_idxs] # collect ligand and receptor expressions
    
    feature_names += [x for x in self_genes]
    
    neighborset_idxs=[gene_lookup[x] for x in neighbor_genes] # collect the column indexes associated with them
    neighset_exprs = local_processed_expression[:,neighborset_idxs] # collect ligand and receptor expressions
    
    feature_names += [x + " from Neighbors" for x in neighbor_genes]
    
    n_neighs=(local_edges@np.ones(local_edges.shape[0]))
    # print(local_edges)
    # print(n_neighs)
    neigh_avgs = (local_edges@neighset_exprs) / n_neighs[:,None] # average ligand/receptor for neighbors
    
    neigh_cellclass_avgs = (local_edges@cell_class_onehots[mask]) / n_neighs[:,None] # celltype simplex
    
    feature_names += [f"Cell Class {cell_types[x]}" for x in range(16)]
    
    positions=np.array(ad.obs[['Centroid_X','Centroid_Y','Bregma']])[mask] # get positions
    
    feature_names += ['Centroid_X','Centroid_Y','Bregma']
    
    covariates=np.c_[selfset_exprs,neigh_avgs,neigh_cellclass_avgs,positions] # collect all covariates
    predict = local_processed_expression[:,gene_lookup[target_gene]] # collect what we're supposed to predict
    
    # print(selfset_exprs.shape, neigh_avgs.shape, neigh_cellclass_avgs.shape, positions.shape)
    
    if filter_excitatory:
    
        excites=(ad.obs['Cell_class']=='Excitatory')[mask] # get the subset of these cells which are excitatory
        covariates=covariates[excites] # subset to excites
        predict=predict[excites]       # subset to excites
    
    return covariates,predict,feature_names

In [None]:
neighset=genetypes['ligands']
oset=np.r_[genetypes['ligands'],genetypes['receptors']]
# oset=neighset

In [None]:
# oset=[]
# neighset=[]

trainX,trainY,feature_names=construct_problem((ad.obs['Animal_ID']>=2)&(ad.obs['Animal_ID']<=4),'Th',neighset,oset)
testX,testY,feature_names=construct_problem((ad.obs['Animal_ID']==1),'Th',neighset,oset)

print(trainX.shape,trainY.shape)
print(testX.shape,testY.shape)

# whiten covariates
# mu=np.mean(trainX,axis=0)
# sig=np.std(trainX,axis=0)
# trainX=(trainX-mu)/sig
# testX=(testX-mu)/sig

In [None]:
np.max(trainX)

In [None]:
model=sklearn.linear_model.Ridge(alpha=1.0)
model.fit(trainX,trainY)
np.mean(np.abs(model.predict(testX)-testY))

In [None]:
model=HistGradientBoostingRegressor(loss="squared_error", min_samples_leaf=32, verbose=1, random_state=129, max_iter=1000, n_iter_no_change=25)
model.fit(trainX,trainY)
np.mean(np.abs(model.predict(testX)-testY))

In [None]:
df = pd.DataFrame(testX, columns=feature_names)
shap_values = shap.LinearExplainer(model, trainX).shap_values(df)
shap.summary_plot(shap_values, df, show=False)
plt.title("OOF")
plt.rcParams['figure.dpi'] = 100
plt.rcParams['savefig.dpi'] = 100
plt.savefig('scratch.png', bbox_inches="tight")

### Same 3 cells as above but w/ standardizing this time.

In [None]:
response_genes=['Ace2', 'Aldh1l1', 'Amigo2', 'Ano3', 'Aqp4', 'Ar', 'Arhgap36',
       'Baiap2', 'Ccnd2', 'Cd24a', 'Cdkn1a', 'Cenpe', 'Chat', 'Coch',
       'Col25a1', 'Cplx3', 'Cpne5', 'Creb3l1', 'Cspg5', 'Cyp19a1',
       'Cyp26a1', 'Dgkk', 'Ebf3', 'Egr2', 'Ermn', 'Esr1', 'Etv1',
       'Fbxw13', 'Fezf1', 'Gbx2', 'Gda', 'Gem', 'Gjc3', 'Greb1',
       'Irs4', 'Isl1', 'Klf4', 'Krt90', 'Lmod1', 'Man1a', 'Mbp', 'Mki67',
       'Mlc1', 'Myh11', 'Ndnf', 'Ndrg1', 'Necab1', 'Nnat', 'Nos1',
       'Npas1', 'Nup62cl', 'Omp', 'Onecut2', 'Opalin', 'Pak3', 'Pcdh11x',
       'Pgr', 'Plin3', 'Pou3f2', 'Rgs2', 'Rgs5', 'Rnd3', 'Scgn',
       'Serpinb1b', 'Sgk1', 'Slc15a3', 'Slc17a6', 'Slc17a8', 'Slco1a4',
       'Sln', 'Sox4', 'Sox6', 'Sox8', 'Sp9', 'Synpr', 'Syt2', 'Syt4',
       'Sytl4', 'Th', 'Tiparp', 'Tmem108', 'Traf4', 'Ttn', 'Ttyh2']

all_MAEs = []

time_dict = {}
L1_loss_dict = {}

for animal in [1,2,3,4]:
    start = time.time()
    MAE_list = []
    for target_gene in response_genes:
        neighset=genetypes['ligands']
        oset=np.r_[genetypes['ligands'],genetypes['receptors']]
        # oset=neighset

        # oset=[]
        # neighset=[]
        
        train_animals = [1,2,3,4]
        train_animals.remove(animal)
        print(train_animals)
        # FIX THIS SO THAT ONLY FIRST 4 ANIMALS GET USED
        trainX,trainY,feature_names=construct_problem((ad.obs['Animal_ID']!=animal)&(ad.obs['Animal_ID']<=4),target_gene,neighset,oset)
        testX,testY,feature_names=construct_problem((ad.obs['Animal_ID']==animal),target_gene,neighset,oset)

        print(trainX.shape,trainY.shape)
        print(testX.shape,testY.shape)

        # whiten covariates
        mu=np.mean(trainX,axis=0)
        sig=np.std(trainX,axis=0)
        trainX=(trainX-mu)/sig
        testX=(testX-mu)/sig

        model=HistGradientBoostingRegressor(loss="absolute_error")
        model.fit(trainX,trainY)
        MAE_list.append(np.mean(np.abs(model.predict(testX)-testY)))

    end = time.time()
    time_dict[f"Female_Naive_{animal}"] = end-start
    L1_loss_dict[f"Female_Naive_{animal}"] = float(np.mean(MAE_list))

    with open("XGBoost_L1_time.json", "w") as outfile:
        json.dump(time_dict, outfile, indent=4)

    with open("XGBoost_L1_MAE.json", "w") as outfile:
        json.dump(L1_loss_dict, outfile, indent=4)
    
    all_MAEs.append(np.mean(MAE_list))
    
print(np.mean(all_MAEs))

In [None]:
response_genes=['Ace2', 'Aldh1l1', 'Amigo2', 'Ano3', 'Aqp4', 'Ar', 'Arhgap36',
       'Baiap2', 'Ccnd2', 'Cd24a', 'Cdkn1a', 'Cenpe', 'Chat', 'Coch',
       'Col25a1', 'Cplx3', 'Cpne5', 'Creb3l1', 'Cspg5', 'Cyp19a1',
       'Cyp26a1', 'Dgkk', 'Ebf3', 'Egr2', 'Ermn', 'Esr1', 'Etv1',
       'Fbxw13', 'Fezf1', 'Gbx2', 'Gda', 'Gem', 'Gjc3', 'Greb1',
       'Irs4', 'Isl1', 'Klf4', 'Krt90', 'Lmod1', 'Man1a', 'Mbp', 'Mki67',
       'Mlc1', 'Myh11', 'Ndnf', 'Ndrg1', 'Necab1', 'Nnat', 'Nos1',
       'Npas1', 'Nup62cl', 'Omp', 'Onecut2', 'Opalin', 'Pak3', 'Pcdh11x',
       'Pgr', 'Plin3', 'Pou3f2', 'Rgs2', 'Rgs5', 'Rnd3', 'Scgn',
       'Serpinb1b', 'Sgk1', 'Slc15a3', 'Slc17a6', 'Slc17a8', 'Slco1a4',
       'Sln', 'Sox4', 'Sox6', 'Sox8', 'Sp9', 'Synpr', 'Syt2', 'Syt4',
       'Sytl4', 'Th', 'Tiparp', 'Tmem108', 'Traf4', 'Ttn', 'Ttyh2']

all_MAEs = []

time_dict = {}
L1_loss_dict = {}

for animal in [1,2,3,4]:
    start = time.time()
    MAE_list = []
    for target_gene in response_genes:
        neighset=genetypes['ligands']
        oset=np.r_[genetypes['ligands'],genetypes['receptors']]
        # oset=neighset

        # oset=[]
        # neighset=[]
        
        train_animals = [1,2,3,4]
        train_animals.remove(animal)
        print(train_animals)
        # FIX THIS SO THAT ONLY FIRST 4 ANIMALS GET USED
        trainX,trainY=construct_problem((ad.obs['Animal_ID']!=animal)&(ad.obs['Animal_ID']<=4),target_gene,neighset,oset,True)
        testX,testY=construct_problem((ad.obs['Animal_ID']==animal),target_gene,neighset,oset,True)

        print(trainX.shape,trainY.shape)
        print(testX.shape,testY.shape)

        # whiten covariates
        mu=np.mean(trainX,axis=0)
        sig=np.std(trainX,axis=0)
        trainX=(trainX-mu)/sig
        testX=(testX-mu)/sig

        model=HistGradientBoostingRegressor(loss="absolute_error")
        model.fit(trainX,trainY)
        MAE_list.append(np.mean(np.abs(model.predict(testX)-testY)))

    end = time.time()
    time_dict[f"Female_Naive_{animal}"] = end-start
    L1_loss_dict[f"Female_Naive_{animal}"] = float(np.mean(MAE_list))

    with open("XGBoost_L1_time_excitatory.json", "w") as outfile:
        json.dump(time_dict, outfile, indent=4)

    with open("XGBoost_L1_MAE_excitatory.json", "w") as outfile:
        json.dump(L1_loss_dict, outfile, indent=4)
    
    all_MAEs.append(np.mean(MAE_list))
    
print(np.mean(all_MAEs))

In [None]:
neighset=genetypes['ligands']
oset=np.r_[genetypes['ligands'],genetypes['receptors']]
# oset=neighset

# oset=[]
# neighset=[]

trainX,trainY,feature_names=construct_problem(ad.obs['Animal_ID']<=30,'Slc17a8',neighset,oset)
testX,testY,feature_names=construct_problem((ad.obs['Animal_ID']>30),'Slc17a8',neighset,oset)

print(trainX.shape,trainY.shape)
print(testX.shape,testY.shape)

# whiten covariates
mu=np.mean(trainX,axis=0)
sig=np.std(trainX,axis=0)
trainX=(trainX-mu)/sig
testX=(testX-mu)/sig

In [None]:
model=sklearn.linear_model.Ridge(alpha=1.0)
model.fit(trainX,trainY)
np.mean(np.abs(model.predict(testX)-testY))

In [None]:
start = time.time()
model=HistGradientBoostingRegressor(loss="absolute_error")
model.fit(trainX,trainY)
print(np.mean(np.abs(model.predict(testX)-testY)))
end = time.time()
print(end-start)

### Comparison to Standard Scalar

In [None]:
neighset=genetypes['ligands']
oset=np.r_[genetypes['ligands'],genetypes['receptors']]
# oset=neighset

# oset=[]
# neighset=[]

trainX,trainY,feature_names=construct_problem((ad.obs['Animal_ID'] <= 30),'Pak3',neighset,oset)
testX,testY,feature_names=construct_problem((ad.obs['Animal_ID']>30),'Pak3',neighset,oset)

mu=np.mean(trainX,axis=0)
sig=np.std(trainX,axis=0)
trainX_Jackson=(trainX-mu)/sig
testX_Jackson=(testX-mu)/sig

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(trainX)
trainX_Roman = scaler.transform(trainX)

In [None]:
model=sklearn.linear_model.Ridge(alpha=1.0)
model.fit(trainX,trainY)
np.mean(np.abs(model.predict(testX)-testY))

In [None]:
model=sklearn.linear_model.Lasso(alpha=1.0)
model.fit(trainX,trainY)
np.mean(np.abs(model.predict(testX)-testY))

In [None]:
model=sklearn.linear_model.Elastic(alpha=1.0, l1_ratio=0.5)
model.fit(trainX,trainY)
np.mean(np.abs(model.predict(testX)-testY))

In [None]:
loss_dict = {}
for num_nodes in [10,50,100,250,500,1000,2500]:
    for lr in [0.001, 0.01, 0.1]:
        for l2 in [0, 1, 10]:
            model=HistGradientBoostingRegressor(loss="absolute_error", max_leaf_nodes=num_nodes, learning_rate=lr, l2_regularization=l2)
            model.fit(trainX,trainY)
            loss_dict[(num_nodes, lr, l2)] = np.mean(np.abs(model.predict(testX)-testY))
loss_dict

In [None]:
model=HistGradientBoostingRegressor(loss="absolute_error", max_leaf_nodes=1250)
model.fit(trainX,trainY)
np.mean(np.abs(model.predict(testX)-testY))

In [None]:
sum(model.predict(testX))

In [None]:
sum(testY)

# 0 vs. 60 LightGBM Test

### 0 Radius Graph

In [None]:
ad=anndata.read_h5ad(h5ad_location)
row=np.zeros(0,dtype=int)
col=np.zeros(0,dtype=int)
nneigh=10
radius=0
mode="rad"

ad=anndata.read_h5ad(h5ad_location)
if mode == "neighbors":
    connectivity_matrix=anndata.read_h5ad(connectivity_matrix_template%(nneigh,mode)).X
if mode == "rad":
     connectivity_matrix=anndata.read_h5ad(connectivity_matrix_template%(radius,mode)).X
gene_lookup={x:i for (i,x) in enumerate(ad.var.index)}

for tid in tqdm.notebook.tqdm(np.unique(ad.obs['Tissue_ID'])):
    good=ad.obs['Tissue_ID']==tid
    pos=np.array(ad.obs[good][['Centroid_X','Centroid_Y']])
    idxs=np.where(good)[0]
    if mode == "neighbors":
        if nneigh == 0:
            E = csr_matrix(np.eye(pos.shape[0]))
        else:
            p=sklearn.neighbors.BallTree(pos)
            E=sklearn.neighbors.kneighbors_graph(pos,nneigh,mode='connectivity')
        col=np.r_[col,idxs[E.tocoo().col]]
        row=np.r_[row,idxs[E.tocoo().row]]
    if mode == "rad":
        p=sp.spatial.cKDTree(pos)
        # E=p.query_ball_point(pos, r=radius, return_sorted=False)
        edges=p.query_pairs(r=radius)
        col=np.r_[col,np.concatenate((idxs[[y for (x,y) in edges]], idxs[[x for (x,y) in edges]]))]
        row=np.r_[row,np.concatenate((idxs[[x for (x,y) in edges]], idxs[[y for (x,y) in edges]]))]

E=(scipy.sparse.diags([1] * len(ad), 0) + sp.sparse.coo_matrix((np.ones(len(col)),(row,col)),shape=(len(ad),len(ad)))).tocsr()

if mode == "neighbors":
    anndata.AnnData(E).write_h5ad(connectivity_matrix_template%(nneigh, mode))
if mode == "rad":
    anndata.AnnData(E).write_h5ad(connectivity_matrix_template%(radius, mode))

In [None]:
results_Jackson_0 = {}
results_Roman_0 = {}

In [None]:
neighset=genetypes['ligands']
oset=np.r_[genetypes['ligands'],genetypes['receptors']]
# oset=neighset

# oset=[]
# neighset=[]

for response_gene in tqdm.notebook.tqdm(response_genes):

    trainX,trainY,feature_names=construct_problem((ad.obs['Animal_ID'] <= 30),response_gene,neighset,oset)
    testX,testY,feature_names=construct_problem((ad.obs['Animal_ID']>30),response_gene,neighset,oset)

    mu=np.mean(trainX,axis=0)
    sig=np.std(trainX,axis=0)
    trainX_Jackson=(trainX-mu)/sig
    testX_Jackson=(testX-mu)/sig
    
    scaler = StandardScaler().fit(trainX)
    trainX_Roman = scaler.transform(trainX)
    testX_Roman = scaler.transform(testX)
    
    model=HistGradientBoostingRegressor(loss="squared_error", min_samples_leaf=2, verbose=1, random_state=129, max_iter=1000, n_iter_no_change=25)
    model.fit(trainX_Roman,trainY)
    results_Roman_0[response_gene] = np.mean(np.abs(model.predict(testX_Roman)-testY))
    
    model=HistGradientBoostingRegressor(loss="squared_error", min_samples_leaf=2, verbose=1, random_state=129, max_iter=1000, n_iter_no_change=25)
    model.fit(trainX_Jackson,trainY)
    results_Jackson_0[response_gene] = np.mean(np.abs(model.predict(testX_Jackson)-testY))

### 60 Radius Graph

In [None]:
ad=anndata.read_h5ad(h5ad_location)
row=np.zeros(0,dtype=int)
col=np.zeros(0,dtype=int)
nneigh=10
radius=60
mode="rad"

ad=anndata.read_h5ad(h5ad_location)
if mode == "neighbors":
    connectivity_matrix=anndata.read_h5ad(connectivity_matrix_template%(nneigh,mode)).X
if mode == "rad":
     connectivity_matrix=anndata.read_h5ad(connectivity_matrix_template%(radius,mode)).X
gene_lookup={x:i for (i,x) in enumerate(ad.var.index)}

for tid in tqdm.notebook.tqdm(np.unique(ad.obs['Tissue_ID'])):
    good=ad.obs['Tissue_ID']==tid
    pos=np.array(ad.obs[good][['Centroid_X','Centroid_Y']])
    idxs=np.where(good)[0]
    if mode == "neighbors":
        if nneigh == 0:
            E = csr_matrix(np.eye(pos.shape[0]))
        else:
            p=sklearn.neighbors.BallTree(pos)
            E=sklearn.neighbors.kneighbors_graph(pos,nneigh,mode='connectivity')
        col=np.r_[col,idxs[E.tocoo().col]]
        row=np.r_[row,idxs[E.tocoo().row]]
    if mode == "rad":
        p=sp.spatial.cKDTree(pos)
        # E=p.query_ball_point(pos, r=radius, return_sorted=False)
        edges=p.query_pairs(r=radius)
        col=np.r_[col,np.concatenate((idxs[[y for (x,y) in edges]], idxs[[x for (x,y) in edges]]))]
        row=np.r_[row,np.concatenate((idxs[[x for (x,y) in edges]], idxs[[y for (x,y) in edges]]))]

E=(scipy.sparse.diags([1] * len(ad), 0) + sp.sparse.coo_matrix((np.ones(len(col)),(row,col)),shape=(len(ad),len(ad)))).tocsr()

if mode == "neighbors":
    anndata.AnnData(E).write_h5ad(connectivity_matrix_template%(nneigh, mode))
if mode == "rad":
    anndata.AnnData(E).write_h5ad(connectivity_matrix_template%(radius, mode))

In [None]:
results_Jackson_60 = {}
results_Roman_60 = {}

In [None]:
neighset=genetypes['ligands']
oset=np.r_[genetypes['ligands'],genetypes['receptors']]
# oset=neighset

# oset=[]
# neighset=[]

for response_gene in tqdm.notebook.tqdm(response_genes):

    trainX,trainY,feature_names=construct_problem((ad.obs['Animal_ID'] <= 30),response_gene,neighset,oset)
    testX,testY,feature_names=construct_problem((ad.obs['Animal_ID']>30),response_gene,neighset,oset)

    mu=np.mean(trainX,axis=0)
    sig=np.std(trainX,axis=0)
    trainX_Jackson=(trainX-mu)/sig
    testX_Jackson=(testX-mu)/sig

    scaler = StandardScaler().fit(trainX)
    trainX_Roman = scaler.transform(trainX)
    testX_Roman = scaler.transform(testX)
    
    model=HistGradientBoostingRegressor(loss="squared_error", min_samples_leaf=2, verbose=1, random_state=129, max_iter=1000, n_iter_no_change=25)
    model.fit(trainX_Roman,trainY)
    results_Roman_60[response_gene] = np.mean(np.abs(model.predict(testX_Roman)-testY))
    
    model=HistGradientBoostingRegressor(loss="squared_error", min_samples_leaf=2, verbose=1, random_state=129, max_iter=1000, n_iter_no_change=25)
    model.fit(trainX_Jackson,trainY)
    results_Jackson_60[response_gene] = np.mean(np.abs(model.predict(testX_Jackson)-testY))

In [None]:
# write 0 vs. 60 stuff here

with open("Jackson0.json", "r") as json_file:
    results_0_Jackson = json.load(json_file)
    
with open("Jackson60.json", "r") as json_file:
    results_60_Jackson = json.load(json_file)
    
with open("Roman0.json", "r") as json_file:
    results_0_Roman = json.load(json_file)
    
with open("Roman60.json", "r") as json_file:
    results_60_Roman = json.load(json_file)


gene_diffs = {}
for response_gene in response_genes:
    gene_diffs[response_gene] = results_0_Roman[response_gene] - results_60_Roman[response_gene]
gene_diffs

In [None]:
plt.hist(gene_diffs.values())

In [None]:
gene_diff_percent_dict = {}

for response_gene in response_genes:
    gene_diff_percent_dict[response_gene] = 100*(gene_diffs[response_gene])/results_0_Roman[response_gene]

In [None]:
plt.hist(gene_diff_percent_dict.values(), bins=100)

In [None]:
gene_diff_percent_dict

# Synthetic Experiment

In [2]:
dataframe = pd.read_csv(csv_location)

dct={}
for colnm, dtype in zip(dataframe.keys()[:9], dataframe.dtypes[:9]):
    if dtype.kind == "O":
        dct[colnm]=np.require(dataframe[colnm], dtype="U36")
    else:
        dct[colnm]=np.require(dataframe[colnm])
# change expression here to make it synthetic
expression = np.array(dataframe[dataframe.keys()[9:]]).astype(np.float16)
print(expression)
gene_names = np.array(dataframe.keys()[9:], dtype="U80")
cellid=dct.pop('Cell_ID')

ad=anndata.AnnData(
    X=expression,
    var=pd.DataFrame(index=gene_names),
    obs=pd.DataFrame(dct,index=cellid)
)

ad.write_h5ad(h5ad_location)

[[ 0.       1.639   21.3     ...  0.       0.       0.     ]
 [ 0.       0.       1.579   ...  0.       0.       0.     ]
 [ 0.       0.       2.701   ...  0.       0.       0.     ]
 ...
 [ 0.       0.       2.076   ...  0.       0.       0.     ]
 [ 0.       0.      14.516   ...  0.       0.       0.     ]
 [ 0.       0.       0.      ...  0.02643  0.       0.     ]]




In [3]:
ligands=np.array(['Cbln1', 'Cxcl14', 'Cbln2', 'Vgf', 'Scg2', 'Cartpt', 'Tac2',
       'Bdnf', 'Bmp7', 'Cyr61', 'Fn1', 'Fst', 'Gad1', 'Ntng1', 'Pnoc',
       'Selplg', 'Sema3c', 'Sema4d', 'Serpine1', 'Adcyap1', 'Cck', 'Crh',
       'Gal', 'Gnrh1', 'Nts', 'Oxt', 'Penk', 'Sst', 'Tac1', 'Trh', 'Ucn3'])

receptors=np.array(['Crhbp', 'Gabra1', 'Gpr165', 'Glra3', 'Gabrg1', 'Adora2a',
       'Avpr1a', 'Avpr2', 'Brs3', 'Calcr', 'Cckar', 'Cckbr', 'Crhr1',
       'Crhr2', 'Galr1', 'Galr2', 'Grpr', 'Htr2c', 'Igf1r', 'Igf2r',
       'Kiss1r', 'Lepr', 'Lpar1', 'Mc4r', 'Npy1r', 'Npy2r', 'Ntsr1',
       'Oprd1', 'Oprk1', 'Oprl1', 'Oxtr', 'Pdgfra', 'Prlr', 'Ramp3',
       'Rxfp1', 'Slc17a7', 'Slc18a2', 'Tacr1', 'Tacr3', 'Trhr'])

response_genes=np.array(['Ace2', 'Aldh1l1', 'Amigo2', 'Ano3', 'Aqp4', 'Ar', 'Arhgap36',
       'Baiap2', 'Ccnd2', 'Cd24a', 'Cdkn1a', 'Cenpe', 'Chat', 'Coch',
       'Col25a1', 'Cplx3', 'Cpne5', 'Creb3l1', 'Cspg5', 'Cyp19a1',
       'Cyp26a1', 'Dgkk', 'Ebf3', 'Egr2', 'Ermn', 'Esr1', 'Etv1',
       'Fbxw13', 'Fezf1', 'Gbx2', 'Gda', 'Gem', 'Gjc3', 'Greb1',
       'Irs4', 'Isl1', 'Klf4', 'Krt90', 'Lmod1', 'Man1a', 'Mbp', 'Mki67',
       'Mlc1', 'Myh11', 'Ndnf', 'Ndrg1', 'Necab1', 'Nnat', 'Nos1',
       'Npas1', 'Nup62cl', 'Omp', 'Onecut2', 'Opalin', 'Pak3', 'Pcdh11x',
       'Pgr', 'Plin3', 'Pou3f2', 'Rgs2', 'Rgs5', 'Rnd3', 'Scgn',
       'Serpinb1b', 'Sgk1', 'Slc15a3', 'Slc17a6', 'Slc17a8', 'Slco1a4',
       'Sln', 'Sox4', 'Sox6', 'Sox8', 'Sp9', 'Synpr', 'Syt2', 'Syt4',
       'Sytl4', 'Th', 'Tiparp', 'Tmem108', 'Traf4', 'Ttn', 'Ttyh2'])
cell_types = [
        "Ambiguous",
        "Astrocyte",
        "Endothelial 1",
        "Endothelial 2",
        "Endothelial 3",
        "Ependymal",
        "Excitatory",
        "Inhibitory",
        "Microglia",
        "OD Immature 1",
        "OD Immature 2",
        "OD Mature 1",
        "OD Mature 2",
        "OD Mature 3",
        "OD Mature 4",
        "Pericytes",
    ]

In [4]:
with open(genetypes_location,'rb') as f:
    genetypes=pickle.load(f)

In [5]:
neighset=genetypes['ligands']
oset=np.r_[genetypes['ligands'],genetypes['receptors']]
# oset=neighset

In [6]:
# onehot encode cell classes
def oh_encode(lst):
    lst=np.array(lst)
    group_names=np.unique(lst)
    group_indexes=np.zeros((len(lst),len(group_names)),dtype=bool)
    for i,nm in enumerate(group_names):
        group_indexes[lst==nm,i]=True
    return group_names,group_indexes
cell_classes,cell_class_onehots=oh_encode(ad.obs['Cell_class'])

In [7]:
# a function to construct a prediction problem for a subset of cells
import torch

def construct_problem(mask,target_gene,neighbor_genes,self_genes,filter_excitatory=False,synthetic_mode=0):
    '''
    mask -- set of cells
    target_gene -- gene to predict
    neighbor_genes -- names of genes which will be read from neighbors
    self_genes -- names of genes which will be read from target cell
    '''
    
    feature_names = []
    
    # load subset of data relevant to mask
    local_processed_expression=np.log1p(ad.X[mask].astype(float)) # get expression on subset of cells
    local_edges=connectivity_matrix[mask][:,mask] # get edges for subset
    
    true_local_edges=true_connectivity_matrix[mask][:,mask] # get edges for subset
    
    selfset_idxs=[gene_lookup[x] for x in self_genes] # collect the column indexes associated with them
    
    print(local_processed_expression.shape, len(selfset_idxs))
    
    if synthetic_mode == 0:
        cell_volume = 5
        local_processed_expression = (torch.distributions.negative_binomial.NegativeBinomial(1, 0.5).sample(local_processed_expression.shape)/cell_volume).numpy()
        selfset_exprs = local_processed_expression[:,selfset_idxs] # collect ligand and receptor expressions
        for i in range(local_processed_expression.shape[0]):
            neighboring_gene1_expr = np.sum(selfset_exprs[[x for x in true_local_edges[i].nonzero()[1]], 1])
            if neighboring_gene1_expr > 1:
                local_processed_expression[i, 0] = neighboring_gene1_expr
            else:
                local_processed_expression[i, 0] = 0

    if synthetic_mode == 1:
        local_processed_expression = torch.distributions.exponential.Exponential(10).rsample(local_processed_expression.shape).numpy()
        selfset_exprs = local_processed_expression[:,selfset_idxs] # collect ligand and receptor expressions
        for i in range(local_processed_expression.shape[0]):
            neighboring_gene1_expr = np.sum(selfset_exprs[[x for x in true_local_edges[i].nonzero()[1]], 1])
            if neighboring_gene1_expr > 1:
                local_processed_expression[i, 0] += neighboring_gene1_expr
    
    elif synthetic_mode == 2:
        local_processed_expression = torch.exp(torch.distributions.normal.Normal(0, 1).rsample(local_processed_expression.shape)).numpy()
        selfset_exprs = local_processed_expression[:,selfset_idxs] # collect ligand and receptor expressions
        for i in range(local_processed_expression.shape[0]):
            neighboring_gene_expr = np.mean(selfset_exprs[[x for x in true_local_edges[i].nonzero()[1]], 1:10])
            local_processed_expression[i, 0] += neighboring_gene_expr * (2 ** np.sign(neighboring_gene_expr - 1.6))

    elif synthetic_mode == 3:
        local_processed_expression = torch.exp(torch.distributions.normal.Normal(0, 0.25).rsample(local_processed_expression.shape)).numpy()
        selfset_exprs = local_processed_expression[:,selfset_idxs] # collect ligand and receptor expressions
        average_gene1_expr = np.mean(selfset_exprs[:, 1])
        average_gene2_expr = np.mean(selfset_exprs[:, 2])
        for i in range(local_processed_expression.shape[0]):
            neighboring_gene1_expr = np.mean(selfset_exprs[[x for x in true_local_edges[i].nonzero()[1]], 1])
            neighboring_gene2_expr = np.mean(selfset_exprs[[x for x in true_local_edges[i].nonzero()[1]], 2])
            if (np.sign(neighboring_gene1_expr - average_gene1_expr) - np.sign(neighboring_gene2_expr - average_gene2_expr)) > 0:
                local_processed_expression[i, 0] *= 2
            else:
                local_processed_expression[i, 0] *= 0.5
        
    local_processed_expression = np.log1p(local_processed_expression)
    selfset_exprs = np.log1p(selfset_exprs)
    feature_names += [x for x in self_genes]
    
    neighborset_idxs=[gene_lookup[x] for x in neighbor_genes] # collect the column indexes associated with them
    neighset_exprs = local_processed_expression[:,neighborset_idxs] # collect ligand and receptor expressions
    
    feature_names += [x + " from Neighbors" for x in neighbor_genes]
    
    n_neighs=(local_edges@np.ones(local_edges.shape[0]))
    # print(local_edges)
    # print(n_neighs)
    neigh_avgs = (local_edges@neighset_exprs) / n_neighs[:,None] # average ligand/receptor for neighbors
    
    neigh_cellclass_avgs = (local_edges@cell_class_onehots[mask]) / n_neighs[:,None] # celltype simplex
    
    feature_names += [f"Cell Class {cell_types[x]}" for x in range(16)]
    
    positions=np.array(ad.obs[['Centroid_X','Centroid_Y','Bregma']])[mask] # get positions
    
    feature_names += ['Centroid_X','Centroid_Y','Bregma']
    
    covariates=np.c_[selfset_exprs,neigh_avgs,neigh_cellclass_avgs,positions] # collect all covariates
    predict = local_processed_expression[:,gene_lookup[target_gene]] # collect what we're supposed to predict
    
    # print(selfset_exprs.shape, neigh_avgs.shape, neigh_cellclass_avgs.shape, positions.shape)
    
    if filter_excitatory:
    
        excites=(ad.obs['Cell_class']=='Excitatory')[mask] # get the subset of these cells which are excitatory
        covariates=covariates[excites] # subset to excites
        predict=predict[excites]       # subset to excites
    
    return covariates,predict,feature_names

# Synthetic Data Creation

In [8]:
dataframe = pd.read_csv(csv_location)

dct={}
for colnm, dtype in zip(dataframe.keys()[:9], dataframe.dtypes[:9]):
    if dtype.kind == "O":
        dct[colnm]=np.require(dataframe[colnm], dtype="U36")
    else:
        dct[colnm]=np.require(dataframe[colnm])
expression = np.array(dataframe[dataframe.keys()[9:]]).astype(np.float16)
gene_names = np.array(dataframe.keys()[9:], dtype="U80")
cellid=dct.pop('Cell_ID')

ad=anndata.AnnData(
    X=expression,
    var=pd.DataFrame(index=gene_names),
    obs=pd.DataFrame(dct,index=cellid)
)

ad.write_h5ad(h5ad_location)

ad=anndata.read_h5ad(h5ad_location)
animal_ids=np.unique(ad.obs['Animal_ID'])
bregmas=np.unique(ad.obs['Bregma'])
tissue_id=np.zeros(len(ad),dtype=int)
n_tissues=0
    
for aid in animal_ids:
    for bregma in bregmas:
        good=(ad.obs['Animal_ID']==aid)&(ad.obs['Bregma']==bregma)
        if np.sum(good)>0:
            tissue_id[good]=n_tissues
            n_tissues+=1
ad.obs['Tissue_ID']=tissue_id
ad.write_h5ad(h5ad_location)



# Building Radius 30 Graph (Ground Truth)

In [10]:
ad=anndata.read_h5ad(h5ad_location)
row_edge_idxs=np.zeros(0,dtype=int)
col_edge_idxs=np.zeros(0,dtype=int)
row_distance_idxs=np.zeros(0,dtype=int)
col_distance_idxs=np.zeros(0,dtype=int)
radius=30
mode="rad"

for tid in tqdm.notebook.tqdm(np.unique(ad.obs['Tissue_ID'])):
    good=ad.obs['Tissue_ID']==tid
    pos=np.array(ad.obs[good][['Centroid_X','Centroid_Y']])
    idxs=np.where(good)[0]
    if mode == "neighbors":
        if nneigh == 0:
            E = csr_matrix(np.eye(pos.shape[0]))
        else:
            p=sklearn.neighbors.BallTree(pos)
            E=sklearn.neighbors.kneighbors_graph(pos,nneigh,mode='connectivity')
        col_edge_idxs=np.r_[col_edge_idxs,idxs[E.tocoo().col]]
        row_edge_idxs=np.r_[row_edge_idxs,idxs[E.tocoo().row]]
    if mode == "rad":
        p = sklearn.neighbors.KDTree(pos)
        # E=p.query_ball_point(pos, r=radius, return_sorted=False)
        edges, distances = p.query_radius(pos, r=radius, return_distance=True)
        distances = np.concatenate(
            [
                np.c_[
                    np.repeat(i, len(distances[i])),
                    list(distances[i]),
                ]
                for i in range(len(distances))
            ],
            axis=0,
        )
        edges = np.concatenate(
            [
                np.c_[
                    np.repeat(i, len(edges[i])),
                    list(edges[i]),
                ]
                for i in range(len(edges))
            ],
            axis=0,
        )
        print(edges, distances)
        # construct edges
        col_edge_idxs=np.r_[col_edge_idxs,np.concatenate((idxs[[y for (x,y) in edges]], idxs[[x for (x,y) in edges]]))]
        row_edge_idxs=np.r_[row_edge_idxs,np.concatenate((idxs[[x for (x,y) in edges]], idxs[[y for (x,y) in edges]]))]
        # construct distances
        col_distance_idxs=np.r_[col_distance_idxs, np.concatenate((idxs[[y for (x,y) in edges]], idxs[[x for (x,y) in edges]]))]
        row_distance_idxs=np.r_[row_distance_idxs, np.concatenate((idxs[[x for (x,y) in edges]], idxs[[y for (x,y) in edges]]))]

E=(scipy.sparse.diags([1] * len(ad), 0) + sp.sparse.coo_matrix((np.ones(len(col_edge_idxs)),(row_edge_idxs,col_edge_idxs)),shape=(len(ad),len(ad)))).tocsr()
D=(scipy.sparse.diags([0] * len(ad), 0) + sp.sparse.coo_matrix((np.ones(len(col_edge_idxs)),(row_edge_idxs,col_edge_idxs)),shape=(len(ad),len(ad)))).tocsr()

if mode == "neighbors":
    anndata.AnnData(E).write_h5ad(connectivity_matrix_template%(nneigh, mode))
if mode == "rad":
    anndata.AnnData(E).write_h5ad(connectivity_matrix_template%(radius, mode))
    
# load data
# These are set above. You can change these here if you want though.
# radius=60
# mode="rad"
ad=anndata.read_h5ad(h5ad_location)
if mode == "neighbors":
    true_connectivity_matrix=anndata.read_h5ad(connectivity_matrix_template%(nneigh,mode)).X
if mode == "rad":
    true_connectivity_matrix=anndata.read_h5ad(connectivity_matrix_template%(radius,mode)).X
gene_lookup={x:i for (i,x) in enumerate(ad.var.index)}

  0%|          | 0/181 [00:00<?, ?it/s]

[[   0   16]
 [   0   22]
 [   0    0]
 ...
 [6508 5864]
 [6508 5926]
 [6508 6508]] [[   0.           14.26106082]
 [   0.           28.70564074]
 [   0.            0.        ]
 ...
 [6508.           29.17129923]
 [6508.           18.74110798]
 [6508.            0.        ]]
[[   0    0]
 [   0    3]
 [   0    1]
 ...
 [6411 5935]
 [6411 5931]
 [6411 6411]] [[   0.            0.        ]
 [   0.           26.1171927 ]
 [   0.            7.78001864]
 ...
 [6411.           15.13073063]
 [6411.           27.26031875]
 [6411.            0.        ]]
[[   0    2]
 [   0   41]
 [   0    0]
 ...
 [6506 6506]
 [6506 6004]
 [6506 4800]] [[   0.           28.48980905]
 [   0.           28.55507935]
 [   0.            0.        ]
 ...
 [6506.            0.        ]
 [6506.           17.42164136]
 [6506.           26.48132769]]
[[   0    4]
 [   0    2]
 [   0    8]
 ...
 [6604 6114]
 [6604 6604]
 [6604 6119]] [[   0.           20.23505585]
 [   0.           10.11973405]
 [   0.           27.84428

[[   0    1]
 [   0    3]
 [   0    6]
 ...
 [6235 6235]
 [6235 4655]
 [6235 4645]] [[   0.            9.79182618]
 [   0.           19.44744633]
 [   0.           25.72054996]
 ...
 [6235.            0.        ]
 [6235.           24.88805127]
 [6235.           16.83906556]]
[[   0    4]
 [   0   57]
 [   0    0]
 ...
 [5693 5267]
 [5693 5265]
 [5693 5693]] [[   0.           26.95722331]
 [   0.           13.76663715]
 [   0.            0.        ]
 ...
 [5693.           20.01847085]
 [5693.           28.2939286 ]
 [5693.            0.        ]]
[[   0   46]
 [   0    0]
 [   1    3]
 ...
 [6035 5557]
 [6035 5561]
 [6035 5560]] [[0.00000000e+00 1.31016727e+01]
 [0.00000000e+00 0.00000000e+00]
 [1.00000000e+00 1.63136586e+01]
 ...
 [6.03500000e+03 2.67390018e+01]
 [6.03500000e+03 2.41435555e+01]
 [6.03500000e+03 2.37262051e+01]]
[[   0    3]
 [   0    0]
 [   1    1]
 ...
 [4688 4355]
 [4688 4688]
 [4688 4351]] [[0.00000000e+00 1.63546431e+01]
 [0.00000000e+00 0.00000000e+00]
 [1.000000

[[   0    1]
 [   0    0]
 [   1    1]
 ...
 [5982 5422]
 [5982 5423]
 [5982 5424]] [[0.00000000e+00 2.13146457e+01]
 [0.00000000e+00 0.00000000e+00]
 [1.00000000e+00 0.00000000e+00]
 ...
 [5.98200000e+03 1.79088512e+01]
 [5.98200000e+03 1.08012009e+01]
 [5.98200000e+03 2.75733849e+01]]
[[   0    0]
 [   1    2]
 [   1    4]
 ...
 [6010 6010]
 [6010 6009]
 [6010 5548]] [[0.00000000e+00 0.00000000e+00]
 [1.00000000e+00 2.76101394e+01]
 [1.00000000e+00 2.73690483e+01]
 ...
 [6.01000000e+03 0.00000000e+00]
 [6.01000000e+03 2.04075716e+01]
 [6.01000000e+03 1.33498078e+01]]
[[   0    0]
 [   0   18]
 [   0   22]
 ...
 [5969 5472]
 [5969 5470]
 [5969 5880]] [[   0.            0.        ]
 [   0.           13.10954498]
 [   0.           29.59488554]
 ...
 [5969.           10.62696783]
 [5969.           29.58810578]
 [5969.           20.63027641]]
[[   0    3]
 [   0    4]
 [   0    7]
 ...
 [6416 5960]
 [6416 5982]
 [6416 5952]] [[   0.           25.81127605]
 [   0.           13.07728074]
 [

[[   0    1]
 [   0   43]
 [   0   40]
 ...
 [5725 5725]
 [5725 5317]
 [5725 5321]] [[   0.           25.80788628]
 [   0.           27.28094257]
 [   0.           18.93345905]
 ...
 [5725.            0.        ]
 [5725.            6.71161777]
 [5725.           24.52192985]]
[[   0    5]
 [   0    6]
 [   0    0]
 ...
 [5789 5351]
 [5789 5345]
 [5789 5789]] [[   0.           21.55957475]
 [   0.           14.97809096]
 [   0.            0.        ]
 ...
 [5789.           23.69349116]
 [5789.           18.64461517]
 [5789.            0.        ]]
[[   0   67]
 [   0   65]
 [   0    0]
 ...
 [5562 5080]
 [5562 5562]
 [5562 5124]] [[   0.           25.04782403]
 [   0.           10.48583939]
 [   0.            0.        ]
 ...
 [5562.           12.92224259]
 [5562.            0.        ]
 [5562.           26.1235385 ]]
[[   0    0]
 [   0    5]
 [   0   43]
 ...
 [5650 5650]
 [5650 5648]
 [5651 5651]] [[   0.            0.        ]
 [   0.           14.86303568]
 [   0.           27.08741

[[   0    0]
 [   0    3]
 [   0 4764]
 ...
 [5128 4746]
 [5128 4709]
 [5128 4738]] [[   0.            0.        ]
 [   0.           11.38585262]
 [   0.           26.17454736]
 ...
 [5128.           18.53769454]
 [5128.           28.51319153]
 [5128.           13.11081587]]
[[   0    0]
 [   0   55]
 [   0 5031]
 ...
 [5433 4979]
 [5433 4984]
 [5433 5425]] [[   0.            0.        ]
 [   0.           17.45806044]
 [   0.           11.40851962]
 ...
 [5433.           13.4429493 ]
 [5433.           28.06534436]
 [5433.           24.18200539]]
[[   0    0]
 [   0    9]
 [   1    4]
 ...
 [5768 5768]
 [5769 5284]
 [5769 5769]] [[0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 2.75624627e+01]
 [1.00000000e+00 2.31651638e+01]
 ...
 [5.76800000e+03 0.00000000e+00]
 [5.76900000e+03 2.44918793e+01]
 [5.76900000e+03 0.00000000e+00]]
[[   0    4]
 [   0    0]
 [   1    1]
 ...
 [5623 5098]
 [5623 5618]
 [5623 5623]] [[0.00000000e+00 2.29489688e+01]
 [0.00000000e+00 0.00000000e+00]
 [1.000000

[[   0    2]
 [   0    0]
 [   1    1]
 ...
 [5814 5814]
 [5814 5391]
 [5814 5392]] [[0.00000000e+00 2.13737757e+01]
 [0.00000000e+00 0.00000000e+00]
 [1.00000000e+00 0.00000000e+00]
 ...
 [5.81400000e+03 0.00000000e+00]
 [5.81400000e+03 1.50512977e+01]
 [5.81400000e+03 2.19427267e+01]]
[[   0    4]
 [   0    0]
 [   0 5062]
 ...
 [5443 5002]
 [5443 5443]
 [5443 5014]] [[0.00000000e+00 2.98990904e+01]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 4.31873388e+00]
 ...
 [5.44300000e+03 2.79104654e+01]
 [5.44300000e+03 0.00000000e+00]
 [5.44300000e+03 1.33273752e+01]]
[[   0    0]
 [   0    6]
 [   1    1]
 ...
 [5510 5457]
 [5510 5510]
 [5510 5034]] [[0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 2.15775446e+01]
 [1.00000000e+00 0.00000000e+00]
 ...
 [5.51000000e+03 9.40177178e+00]
 [5.51000000e+03 0.00000000e+00]
 [5.51000000e+03 1.66246802e+01]]
[[   0    1]
 [   0    6]
 [   0    4]
 ...
 [5752 5197]
 [5753 5314]
 [5753 5753]] [[   0.            7.64193782]
 [   0.           28.

[[   0    1]
 [   0    2]
 [   0   52]
 ...
 [5398 5398]
 [5398 5004]
 [5398 4968]] [[   0.           24.58939159]
 [   0.           13.77727661]
 [   0.           21.36914653]
 ...
 [5398.            0.        ]
 [5398.           20.59418215]
 [5398.           28.56316697]]




# Building Radius 0 Graph

In [None]:
ad=anndata.read_h5ad(h5ad_location)
row=np.zeros(0,dtype=int)
col=np.zeros(0,dtype=int)
radius=0
mode="rad"

for tid in tqdm.notebook.tqdm(np.unique(ad.obs['Tissue_ID'])):
    good=ad.obs['Tissue_ID']==tid
    pos=np.array(ad.obs[good][['Centroid_X','Centroid_Y']])
    idxs=np.where(good)[0]
    if mode == "neighbors":
        if nneigh == 0:
            E = csr_matrix(np.eye(pos.shape[0]))
        else:
            p=sklearn.neighbors.BallTree(pos)
            E=sklearn.neighbors.kneighbors_graph(pos,nneigh,mode='connectivity')
        col=np.r_[col,idxs[E.tocoo().col]]
        row=np.r_[row,idxs[E.tocoo().row]]
    if mode == "rad":
        p=sp.spatial.cKDTree(pos)
        # E=p.query_ball_point(pos, r=radius, return_sorted=False)
        edges=p.query_pairs(r=radius)
        col=np.r_[col,np.concatenate((idxs[[y for (x,y) in edges]], idxs[[x for (x,y) in edges]]))]
        row=np.r_[row,np.concatenate((idxs[[x for (x,y) in edges]], idxs[[y for (x,y) in edges]]))]

E=(scipy.sparse.diags([1] * len(ad), 0) + sp.sparse.coo_matrix((np.ones(len(col)),(row,col)),shape=(len(ad),len(ad)))).tocsr()

if mode == "neighbors":
    anndata.AnnData(E).write_h5ad(connectivity_matrix_template%(nneigh, mode))
if mode == "rad":
    anndata.AnnData(E).write_h5ad(connectivity_matrix_template%(radius, mode))
    
# load data
# These are set above. You can change these here if you want though.
# radius=60
# mode="rad"
ad=anndata.read_h5ad(h5ad_location)
if mode == "neighbors":
    connectivity_matrix=anndata.read_h5ad(connectivity_matrix_template%(nneigh,mode)).X
if mode == "rad":
    connectivity_matrix=anndata.read_h5ad(connectivity_matrix_template%(radius,mode)).X
gene_lookup={x:i for (i,x) in enumerate(ad.var.index)}

### Synthetic Experiment 0

In [None]:
results_0_0 = {}

response_gene = "Ace2"

trainX_0_0, trainY_0_0, feature_names = construct_problem((ad.obs['Animal_ID'] <= 30),response_gene,neighset,oset, synthetic_mode=0)
testX_0_0, testY_0_0, feature_names = construct_problem((ad.obs['Animal_ID'] > 30),response_gene,neighset,oset, synthetic_mode=0)

scaler = StandardScaler().fit(trainX_0_0)
trainX_0_0 = scaler.transform(trainX_0_0)
testX_0_0 = scaler.transform(testX_0_0)

model=HistGradientBoostingRegressor(loss="squared_error", min_samples_leaf=2, verbose=1, random_state=129, max_iter=1000, n_iter_no_change=10, tol=0.001)
model.fit(trainX_0_0, trainY_0_0)
results_0_0["LightGBM"] = np.mean(np.abs(model.predict(testX_0_0)-testY_0_0))

model=sklearn.linear_model.Ridge(alpha=1.0)
model.fit(trainX_0_0, trainY_0_0)
results_0_0["Ridge"] = np.mean(np.abs(model.predict(testX_0_0)-testY_0_0))

model=sklearn.linear_model.Lasso(alpha=1.0)
model.fit(trainX_0_0, trainY_0_0)
results_0_0["Lasso"] = np.mean(np.abs(model.predict(testX_0_0)-testY_0_0))

model=sklearn.linear_model.ElasticNet(alpha=1.0, l1_ratio=0.5)
model.fit(trainX_0_0, trainY_0_0)
results_0_0["ElasticNet"] = np.mean(np.abs(model.predict(testX_0_0)-testY_0_0))

In [None]:
results_0_0

### Synthetic Experiment 1

In [None]:
results_1_0 = {}

response_gene = "Ace2"

trainX_1_0, trainY_1_0, feature_names = construct_problem((ad.obs['Animal_ID'] <= 30),response_gene,neighset,oset, synthetic_mode=1)
testX_1_0, testY_1_0, feature_names = construct_problem((ad.obs['Animal_ID'] > 30),response_gene,neighset,oset, synthetic_mode=1)

scaler = StandardScaler().fit(trainX_1_0)
trainX_1_0 = scaler.transform(trainX_1_0)
testX_1_0 = scaler.transform(testX_1_0)

model = HistGradientBoostingRegressor(loss="squared_error", min_samples_leaf=2, verbose=1, random_state=129, max_iter=1000, n_iter_no_change=25)
model.fit(trainX_1_0, trainY_1_0)
results_1_0["LightGBM"] = np.mean(np.abs(model.predict(testX_1_0)-testY_1_0))

model=sklearn.linear_model.Ridge(alpha=1.0)
model.fit(trainX_1_0, trainY_1_0)
results_1_0["Ridge"] = np.mean(np.abs(model.predict(testX_1_0)-testY_1_0))

model=sklearn.linear_model.Lasso(alpha=1.0)
model.fit(trainX_1_0, trainY_1_0)
results_1_0["Lasso"] = np.mean(np.abs(model.predict(testX_1_0)-testY_1_0))

model=sklearn.linear_model.ElasticNet(alpha=1.0, l1_ratio=0.5)
model.fit(trainX_1_0, trainY_1_0)
results_1_0["ElasticNet"] = np.mean(np.abs(model.predict(testX_1_0)-testY_1_0))

### Synthetic Experiment 2

In [None]:
results_2_0 = {}

response_gene = "Ace2"

trainX_2_0, trainY_2_0, feature_names = construct_problem((ad.obs['Animal_ID'] <= 30),response_gene,neighset,oset, synthetic_mode=2)
testX_2_0, testY_2_0, feature_names = construct_problem((ad.obs['Animal_ID'] > 30),response_gene,neighset,oset, synthetic_mode=2)

scaler = StandardScaler().fit(trainX_2_0)
trainX_2_0 = scaler.transform(trainX_2_0)
testX_2_0 = scaler.transform(testX_2_0)

model=HistGradientBoostingRegressor(loss="squared_error", min_samples_leaf=2, verbose=1, random_state=129, max_iter=1000, n_iter_no_change=25)
model.fit(trainX_2_0, trainY_2_0)
results_2_0["LightGBM"] = np.mean(np.abs(model.predict(testX_2_0)-testY_2_0))

model=sklearn.linear_model.Ridge(alpha=1.0)
model.fit(trainX_2_0, trainY_2_0)
results_2_0["Ridge"] = np.mean(np.abs(model.predict(testX_2_0)-testY_2_0))

model=sklearn.linear_model.Lasso(alpha=1.0)
model.fit(trainX_2_0, trainY_2_0)
results_2_0["Lasso"] = np.mean(np.abs(model.predict(testX_2_0)-testY_2_0))

model=sklearn.linear_model.ElasticNet(alpha=1.0, l1_ratio=0.5)
model.fit(trainX_2_0, trainY_2_0)
results_2_0["ElasticNet"] = np.mean(np.abs(model.predict(testX_2_0)-testY_2_0))

### Synthetic Experiment 3

In [None]:
results_3_0 = {}

response_gene = "Ace2"

trainX_3_0, trainY_3_0, feature_names = construct_problem((ad.obs['Animal_ID'] <= 30),response_gene,neighset,oset, synthetic_mode=3)
testX_3_0, testY_3_0, feature_names = construct_problem((ad.obs['Animal_ID'] > 30),response_gene,neighset,oset, synthetic_mode=3)

scaler = StandardScaler().fit(trainX_3_0)
trainX_3_0 = scaler.transform(trainX_3_0)
testX_3_0 = scaler.transform(testX_3_0)

model=HistGradientBoostingRegressor(loss="squared_error", min_samples_leaf=2, verbose=1, random_state=129, max_iter=1000, n_iter_no_change=25)
model.fit(trainX_3_0, trainY_3_0)
results_3_0["LightGBM"] = np.mean(np.abs(model.predict(testX_3_0)-testY_3_0))

model=sklearn.linear_model.Ridge(alpha=1.0)
model.fit(trainX_3_0, trainY_3_0)
results_3_0["Ridge"] = np.mean(np.abs(model.predict(testX_3_0)-testY_3_0))

model=sklearn.linear_model.Lasso(alpha=1.0)
model.fit(trainX_3_0, trainY_3_0)
results_3_0["Lasso"] = np.mean(np.abs(model.predict(testX_3_0)-testY_3_0))

model=sklearn.linear_model.ElasticNet(alpha=1.0, l1_ratio=0.5)
model.fit(trainX_3_0, trainY_3_0)
results_3_0["ElasticNet"] = np.mean(np.abs(model.predict(testX_3_0)-testY_3_0))

In [None]:
results_0_0, results_1_0, results_2_0, results_3_0

### Building Radius 60 Graph

In [None]:
ad=anndata.read_h5ad(h5ad_location)
row=np.zeros(0,dtype=int)
col=np.zeros(0,dtype=int)
radius=60
mode="rad"

for tid in tqdm.notebook.tqdm(np.unique(ad.obs['Tissue_ID'])):
    good=ad.obs['Tissue_ID']==tid
    pos=np.array(ad.obs[good][['Centroid_X','Centroid_Y']])
    idxs=np.where(good)[0]
    if mode == "neighbors":
        if nneigh == 0:
            E = csr_matrix(np.eye(pos.shape[0]))
        else:
            p=sklearn.neighbors.BallTree(pos)
            E=sklearn.neighbors.kneighbors_graph(pos,nneigh,mode='connectivity')
        col=np.r_[col,idxs[E.tocoo().col]]
        row=np.r_[row,idxs[E.tocoo().row]]
    if mode == "rad":
        p=sp.spatial.cKDTree(pos)
        # E=p.query_ball_point(pos, r=radius, return_sorted=False)
        edges=p.query_pairs(r=radius)
        col=np.r_[col,np.concatenate((idxs[[y for (x,y) in edges]], idxs[[x for (x,y) in edges]]))]
        row=np.r_[row,np.concatenate((idxs[[x for (x,y) in edges]], idxs[[y for (x,y) in edges]]))]

E=(scipy.sparse.diags([1] * len(ad), 0) + sp.sparse.coo_matrix((np.ones(len(col)),(row,col)),shape=(len(ad),len(ad)))).tocsr()

if mode == "neighbors":
    anndata.AnnData(E).write_h5ad(connectivity_matrix_template%(nneigh, mode))
if mode == "rad":
    anndata.AnnData(E).write_h5ad(connectivity_matrix_template%(radius, mode))
    
# load data
# These are set above. You can change these here if you want though.
# radius=60
# mode="rad"
ad=anndata.read_h5ad(h5ad_location)
if mode == "neighbors":
    connectivity_matrix=anndata.read_h5ad(connectivity_matrix_template%(nneigh,mode)).X
if mode == "rad":
    connectivity_matrix=anndata.read_h5ad(connectivity_matrix_template%(radius,mode)).X
gene_lookup={x:i for (i,x) in enumerate(ad.var.index)}

### Synthetic Experiment 0

In [None]:
results_0_60 = {}

response_gene = "Ace2"

trainX_0_60, trainY_0_60, feature_names = construct_problem((ad.obs['Animal_ID'] <= 30),response_gene,neighset,oset, synthetic_mode=0)
testX_0_60, testY_0_60, feature_names = construct_problem((ad.obs['Animal_ID'] > 30),response_gene,neighset,oset, synthetic_mode=0)

scaler = StandardScaler().fit(trainX_0_60)
trainX_0_60 = scaler.transform(trainX_0_60)
testX_0_60 = scaler.transform(testX_0_60)

model=HistGradientBoostingRegressor(loss="squared_error", min_samples_leaf=2, verbose=1, random_state=129, max_iter=1000, n_iter_no_change=25)
model.fit(trainX_0_60, trainY_0_60)
results_0_60["LightGBM"] = np.mean(np.abs(model.predict(testX_0_60)-testY_0_60))

model=sklearn.linear_model.Ridge(alpha=1.0)
model.fit(trainX_0_60, trainY_0_60)
results_0_60["Ridge"] = np.mean(np.abs(model.predict(testX_0_60)-testY_0_60))

model=sklearn.linear_model.Lasso(alpha=1.0)
model.fit(trainX_0_60, trainY_0_60)
results_0_60["Lasso"] = np.mean(np.abs(model.predict(testX_0_60)-testY_0_60))

model=sklearn.linear_model.ElasticNet(alpha=1.0, l1_ratio=0.5)
model.fit(trainX_0_60, trainY_0_60)
results_0_60["ElasticNet"] = np.mean(np.abs(model.predict(testX_0_60)-testY_0_60))

### Synthetic Experiment 1

In [None]:
results_1_60 = {}

response_gene = "Ace2"

trainX_1_60, trainY_1_60, feature_names = construct_problem((ad.obs['Animal_ID'] <= 30),response_gene,neighset,oset, synthetic_mode=1)
testX_1_60, testY_1_60, feature_names = construct_problem((ad.obs['Animal_ID'] > 30),response_gene,neighset,oset, synthetic_mode=1)

scaler = StandardScaler().fit(trainX_1_60)
trainX_1_60 = scaler.transform(trainX_1_60)
testX_1_60 = scaler.transform(testX_1_60)

model=HistGradientBoostingRegressor(loss="squared_error", min_samples_leaf=2, verbose=1, random_state=129, max_iter=1000, n_iter_no_change=25)
model.fit(trainX_1_60, trainY_1_60)
results_1_60["LightGBM"] = np.mean(np.abs(model.predict(testX_1_60)-testY_1_60))

model=sklearn.linear_model.Ridge(alpha=1.0)
model.fit(trainX_1_60, trainY_1_60)
results_1_60["Ridge"] = np.mean(np.abs(model.predict(testX_1_60)-testY_1_60))

model=sklearn.linear_model.Lasso(alpha=1.0)
model.fit(trainX_1_60, trainY_1_60)
results_1_60["Lasso"] = np.mean(np.abs(model.predict(testX_1_60)-testY_1_60))

model=sklearn.linear_model.ElasticNet(alpha=1.0, l1_ratio=0.5)
model.fit(trainX_1_60, trainY_1_60)
results_1_60["ElasticNet"] = np.mean(np.abs(model.predict(testX_1_60)-testY_1_60))

### Synthetic Experiment 2

In [None]:
results_2_60 = {}

response_gene = "Ace2"

trainX_2_60, trainY_2_60, feature_names = construct_problem((ad.obs['Animal_ID'] <= 30),response_gene,neighset,oset, synthetic_mode=2)
testX_2_60, testY_2_60, feature_names = construct_problem((ad.obs['Animal_ID'] > 30),response_gene,neighset,oset, synthetic_mode=2)

scaler = StandardScaler().fit(trainX_2_60)
trainX_2_60 = scaler.transform(trainX_2_60)
testX_2_60 = scaler.transform(testX_2_60)

model=HistGradientBoostingRegressor(loss="squared_error", min_samples_leaf=2, verbose=1, random_state=129, max_iter=1000, n_iter_no_change=25)
model.fit(trainX_2_60, trainY_2_60)
results_2_60["LightGBM"] = np.mean(np.abs(model.predict(testX_2_60)-testY_2_60))

model=sklearn.linear_model.Ridge(alpha=1.0)
model.fit(trainX_2_60, trainY_2_60)
results_2_60["Ridge"] = np.mean(np.abs(model.predict(testX_2_60)-testY_2_60))

model=sklearn.linear_model.Lasso(alpha=1.0)
model.fit(trainX_2_60, trainY_2_60)
results_2_60["Lasso"] = np.mean(np.abs(model.predict(testX_2_60)-testY_2_60))

model=sklearn.linear_model.ElasticNet(alpha=1.0, l1_ratio=0.5)
model.fit(trainX_2_60, trainY_2_60)
results_2_60["ElasticNet"] = np.mean(np.abs(model.predict(testX_2_60)-testY_2_60))

### Synthetic Experiment 3

In [None]:
results_3_60 = {}

response_gene = "Ace2"

trainX_3_60, trainY_3_60, feature_names = construct_problem((ad.obs['Animal_ID'] <= 30),response_gene,neighset,oset, synthetic_mode=3)
testX_3_60, testY_3_60, feature_names = construct_problem((ad.obs['Animal_ID'] > 30),response_gene,neighset,oset, synthetic_mode=3)

scaler = StandardScaler().fit(trainX_3_60)
trainX_3_60 = scaler.transform(trainX_3_60)
testX_3_60 = scaler.transform(testX_3_60)

model=HistGradientBoostingRegressor(loss="squared_error", min_samples_leaf=2, verbose=1, random_state=129, max_iter=1000, n_iter_no_change=25)
model.fit(trainX_3_60, trainY_3_60)
results_3_60["LightGBM"] = np.mean(np.abs(model.predict(testX_3_60)-testY_3_60))

model=sklearn.linear_model.Ridge(alpha=1.0)
model.fit(trainX_3_60, trainY_3_60)
results_3_60["Ridge"] = np.mean(np.abs(model.predict(testX_3_60)-testY_3_60))

model=sklearn.linear_model.Lasso(alpha=1.0)
model.fit(trainX_3_60, trainY_3_60)
results_3_60["Lasso"] = np.mean(np.abs(model.predict(testX_3_60)-testY_3_60))

model=sklearn.linear_model.ElasticNet(alpha=1.0, l1_ratio=0.5)
model.fit(trainX_3_60, trainY_3_60)
results_3_60["ElasticNet"] = np.mean(np.abs(model.predict(testX_3_60)-testY_3_60))

In [None]:
results_0_60, results_1_60, results_2_60, results_3_60