In [1]:
import anndata
import torch
import stPlus

import squidpy as sq
import numpy as np
import scanpy as sc
import pandas as pd

from sklearn.model_selection import KFold
from transpa.eval_util import calc_corr
from transpa.util import expTransImp, leiden_cluster, compute_autocorr
from benchmark import SpaGE_impute, Tangram_impute
import warnings

# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Seed handed to the imputation methods below.
seed = 10

# Prefer the second GPU (index 1) as before, but fall back to GPU 0 on
# single-GPU machines: there `torch.cuda.is_available()` is True while
# "cuda:1" does not exist, which would only fail later when tensors are moved.
if torch.cuda.is_available():
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")

In [2]:
# Load the spatial (seqFISH) and the reference scRNA-seq AnnData objects.
spa_adata = sc.read_h5ad("../../data/ST/seqfish/seqfish_data.h5ad")
scrna_adata = sc.read_h5ad("../../data/scRNAseq/seqfish/scRNAseq_seqfish.h5ad")

# Cluster the reference cells; `classes` is stored in `.obs[cls_key]` below.
# NOTE(review): clustering runs on the data as loaded, before the
# normalisation applied here; whether `leiden_cluster` normalises
# internally is not visible in this notebook -- confirm.
classes, ct_list = leiden_cluster(scrna_adata)
cls_key = 'leiden'

# Apply the same preprocessing to both modalities: total-count
# normalisation followed by log1p.
sc.pp.normalize_total(spa_adata)
sc.pp.normalize_total(scrna_adata)
sc.pp.log1p(spa_adata)
sc.pp.log1p(scrna_adata)

# Rename the gene "Prkcdbp" to "Cavin3" in the reference
# (presumably to match the spatial panel's naming -- verify).
# Only rename when the gene is actually present: `np.argmax` on an all-False
# mask returns 0, which would silently rename the first gene instead.
var_name = scrna_adata.var_names.values.copy()
legacy_hits = np.flatnonzero(scrna_adata.var_names == "Prkcdbp")
if legacy_hits.size:
    var_name[legacy_hits[0]] = "Cavin3"
scrna_adata.var_names = var_name

scrna_adata.obs[cls_key] = classes

In [3]:
# Spatial expression as a dense frame (columns = spatial gene names),
# also written to disk.
raw_spatial_df = pd.DataFrame(
    data=spa_adata.X.toarray(),
    columns=spa_adata.var_names,
)
raw_spatial_df.to_csv('../../output/seqfish_raw.csv')

# Reference expression, stored with a sparse float32 dtype (fill value 0).
# Kept as one chained expression so no extra dense copy stays referenced.
raw_scrna_df = pd.DataFrame(
    data=scrna_adata.X.toarray(),
    columns=scrna_adata.var_names,
).astype(pd.SparseDtype("float32", 0))

# Sorted, unique gene names present in both frames.
raw_shared_gene = np.intersect1d(raw_spatial_df.columns, raw_scrna_df.columns)

# Displayed as the cell output.
(raw_spatial_df.shape, raw_scrna_df.shape, raw_shared_gene.shape)

((57536, 351), (32844, 29452), (351,))

In [4]:
# Assemble an (n_obs, 2) coordinate array from the two affine-position
# columns in `.obs`, store it under `.obsm['spatial']`, and save it to disk.
x_coords = spa_adata.obs.x_global_affine.values
y_coords = spa_adata.obs.y_global_affine.values
spa_adata.obsm['spatial'] = np.column_stack([x_coords, y_coords])
np.save('../../output/seqfish_locations.npy', spa_adata.obsm['spatial'])

# Build the spatial neighbour graph; the benchmark loop later reads
# `.obsp['spatial_connectivities']`.
sq.gr.spatial_neighbors(spa_adata)


In [5]:
# Gene-wise 5-fold cross-validation over the genes shared by both modalities.
# In every fold the `test_gene` columns are held out, each method predicts
# them, and the predictions are written into the matching columns of that
# method's result frame. After the loop every shared gene has been predicted
# exactly once per method.
kf = KFold(n_splits=5, shuffle=True, random_state=0)


def _zero_prediction_frame():
    """Return an all-zero frame: one row per spatial observation, one column per shared gene."""
    return pd.DataFrame(
        np.zeros((spa_adata.n_obs, len(raw_shared_gene))),
        columns=raw_shared_gene,
    )


def _fold_median_corr(df_pred, genes):
    """Median of `calc_corr` between the measured spatial frame and `df_pred` over `genes`."""
    return np.median(calc_corr(raw_spatial_df, df_pred, genes))


df_transImpSpa = _zero_prediction_frame()
df_transImpCls = _zero_prediction_frame()
df_transImpClsSpa = _zero_prediction_frame()
df_transImp = _zero_prediction_frame()
df_stplus_res = _zero_prediction_frame()
df_spaGE_res = _zero_prediction_frame()
df_tangram_res = _zero_prediction_frame()

for idx, (train_ind, test_ind) in enumerate(kf.split(raw_shared_gene)):
    print(f"\n===== Fold {idx+1} =====\nNumber of train genes: {len(train_ind)}, Number of test genes: {len(test_ind)}")
    train_gene = raw_shared_gene[train_ind]
    test_gene  = raw_shared_gene[test_ind]

    # stPlus and SpaGE receive only the training genes of the spatial data.
    spatial_df = raw_spatial_df[train_gene]
    scrna_df   = raw_scrna_df

    # Arguments common to all four TransImp configurations in this fold.
    transimp_kwargs = dict(
        df_ref=raw_scrna_df,
        df_tgt=raw_spatial_df,
        train_gene=train_gene,
        test_gene=test_gene,
        seed=seed,
        device=device,
    )

    # NOTE: the fold banner above is 1-based, while the per-method lines
    # below print the 0-based `idx`; kept as-is so logs stay comparable.

    # Cell-level signatures, low-rank mapping, with the spatial adjacency.
    df_transImpSpa[test_gene] = expTransImp(
        signature_mode='cell',
        mapping_mode='lowrank',
        mapping_lowdim=128,
        spa_adj=spa_adata.obsp['spatial_connectivities'].tocoo(),
        **transimp_kwargs)
    print(f'fold {idx}, median correlation: {_fold_median_corr(df_transImpSpa, test_gene)} (TransImpSpa)')

    # Cluster-level signatures, full mapping, no spatial adjacency.
    df_transImpCls[test_gene] = expTransImp(
        ct_list=ct_list,
        classes=classes,
        signature_mode='cluster',
        mapping_mode='full',
        **transimp_kwargs)
    print(f'fold {idx}, median correlation: {_fold_median_corr(df_transImpCls, test_gene)} (TransImpCls)')

    # Cell-level signatures, low-rank mapping, no spatial adjacency.
    df_transImp[test_gene] = expTransImp(
        signature_mode='cell',
        mapping_mode='lowrank',
        mapping_lowdim=128,
        **transimp_kwargs)
    print(f'fold {idx}, median correlation: {_fold_median_corr(df_transImp, test_gene)} (TransImp)')

    # Cluster-level signatures, full mapping, with the spatial adjacency.
    df_transImpClsSpa[test_gene] = expTransImp(
        ct_list=ct_list,
        classes=classes,
        spa_adj=spa_adata.obsp['spatial_connectivities'].tocoo(),
        signature_mode='cluster',
        mapping_mode='full',
        wt_spa=1.0,
        **transimp_kwargs)
    print(f'fold {idx}, median correlation: {_fold_median_corr(df_transImpClsSpa, test_gene)} (TransImpClsSpa)')

    # Baselines.
    df_stplus_res[test_gene] = stPlus.stPlus(spatial_df, scrna_df, test_gene, "tmp_ug", verbose=False, random_seed=seed, device=device)
    print(f'\t\t\t{_fold_median_corr(df_stplus_res, test_gene)} (stPlus)')

    # SpaGE is given a densified copy of the sparse reference frame.
    df_spaGE_res[test_gene]  = SpaGE_impute(scrna_df.sparse.to_dense(), spatial_df, train_gene, test_gene)
    print(f'\t\t\t{_fold_median_corr(df_spaGE_res, test_gene)} (spaGE)')

    df_tangram_res[test_gene] = Tangram_impute(scrna_adata, spa_adata, train_gene, test_gene, device, cls_key)
    print(f'\t\t\t{_fold_median_corr(df_tangram_res, test_gene)} (Tangram)')

# Correlations over all shared genes, now that every gene has been held out once.
corr_transImpSpa_res = calc_corr(raw_spatial_df, df_transImpSpa, raw_shared_gene)
corr_transImp_res = calc_corr(raw_spatial_df, df_transImp, raw_shared_gene)
corr_transImpCls_res = calc_corr(raw_spatial_df, df_transImpCls, raw_shared_gene)
corr_transImpClsSpa_res = calc_corr(raw_spatial_df, df_transImpClsSpa, raw_shared_gene)
corr_res_stplus = calc_corr(raw_spatial_df, df_stplus_res, raw_shared_gene)
corr_res_spaGE = calc_corr(raw_spatial_df, df_spaGE_res, raw_shared_gene)
corr_res_tangram = calc_corr(raw_spatial_df, df_tangram_res, raw_shared_gene)

print(np.median(corr_transImpSpa_res), "(TransImpSpa)",
      np.median(corr_transImp_res), "(TransImp)",
      np.median(corr_transImpCls_res), "(TransImpCls)",
      np.median(corr_transImpClsSpa_res), "(TransImpClsSpa)",
      np.median(corr_res_stplus), "(stPlus)",
      np.median(corr_res_spaGE), "(spaGE)",
      np.median(corr_res_tangram), "(Tangram)"
      )


===== Fold 1 =====
Number of train genes: 280, Number of test genes: 71


[TransImp] Epoch: 1000/1000, loss: 0.847425, (IMP) 0.844075, (SPA) 1.0 x 0.003350: 100%|██████████| 1000/1000 [00:42<00:00, 23.55it/s]


fold 0, median correlation: 0.18080570627837816 (TransImpSpa)


[TransImp] Epoch: 1000/1000, loss: 0.902676, (IMP) 0.902676, (SPA) 1.0 x 0.000000: 100%|██████████| 1000/1000 [00:15<00:00, 65.04it/s]


fold 0, median correlation: 0.17367597002967328 (TransImpCls)


[TransImp] Epoch: 1000/1000, loss: 0.805868, (IMP) 0.805868, (SPA) 1.0 x 0.000000: 100%|██████████| 1000/1000 [00:20<00:00, 49.95it/s]


fold 0, median correlation: 0.18306017351851267 (TransImp)


[TransImp] Epoch: 1000/1000, loss: 0.913085, (IMP) 0.908033, (SPA) 1.0 x 0.005052: 100%|██████████| 1000/1000 [00:41<00:00, 24.25it/s]


fold 0, median correlation: 0.17034470761801615 (TransImpClsSpa)
			0.13058364246942666 (stPlus)
			0.10781539544899034 (spaGE)


INFO:root:280 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.
INFO:root:280 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.
INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.
INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.
INFO:root:Allocate tensors for mapping.
INFO:root:Begin training with 280 genes and rna_count_based density_prior in clusters mode...
INFO:root:Printing scores every 100 epochs.


Score: 0.383, KL reg: 0.062
Score: 0.568, KL reg: 0.001
Score: 0.570, KL reg: 0.001
Score: 0.570, KL reg: 0.001
Score: 0.570, KL reg: 0.001
Score: 0.570, KL reg: 0.001
Score: 0.570, KL reg: 0.001
Score: 0.570, KL reg: 0.001
Score: 0.570, KL reg: 0.001
Score: 0.570, KL reg: 0.001


INFO:root:Saving results..


			0.18467681550712478 (Tangram)

===== Fold 2 =====
Number of train genes: 281, Number of test genes: 70


[TransImp] Epoch: 1000/1000, loss: 0.858401, (IMP) 0.855613, (SPA) 1.0 x 0.002788: 100%|██████████| 1000/1000 [00:40<00:00, 24.49it/s]


fold 1, median correlation: 0.20823262726053696 (TransImpSpa)


[TransImp] Epoch: 1000/1000, loss: 0.916604, (IMP) 0.916604, (SPA) 1.0 x 0.000000: 100%|██████████| 1000/1000 [00:12<00:00, 77.22it/s]


fold 1, median correlation: 0.21322974189758756 (TransImpCls)


[TransImp] Epoch: 1000/1000, loss: 0.817290, (IMP) 0.817290, (SPA) 1.0 x 0.000000: 100%|██████████| 1000/1000 [00:16<00:00, 58.91it/s]


fold 1, median correlation: 0.21576731773613872 (TransImp)


[TransImp] Epoch: 1000/1000, loss: 0.926989, (IMP) 0.921963, (SPA) 1.0 x 0.005026: 100%|██████████| 1000/1000 [00:36<00:00, 27.70it/s]


fold 1, median correlation: 0.20518543235003572 (TransImpClsSpa)
			0.1705960399232355 (stPlus)
			0.15278006984415318 (spaGE)


INFO:root:281 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.
INFO:root:281 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.
INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.
INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.
INFO:root:Allocate tensors for mapping.
INFO:root:Begin training with 281 genes and rna_count_based density_prior in clusters mode...
INFO:root:Printing scores every 100 epochs.


Score: 0.393, KL reg: 0.063
Score: 0.566, KL reg: 0.001
Score: 0.568, KL reg: 0.001
Score: 0.568, KL reg: 0.001
Score: 0.568, KL reg: 0.001
Score: 0.568, KL reg: 0.001
Score: 0.568, KL reg: 0.001
Score: 0.569, KL reg: 0.001
Score: 0.569, KL reg: 0.001
Score: 0.569, KL reg: 0.001


INFO:root:Saving results..


			0.22339779836725637 (Tangram)

===== Fold 3 =====
Number of train genes: 281, Number of test genes: 70


[TransImp] Epoch: 1000/1000, loss: 0.872519, (IMP) 0.869531, (SPA) 1.0 x 0.002988: 100%|██████████| 1000/1000 [00:40<00:00, 24.57it/s]


fold 2, median correlation: 0.23272647058933282 (TransImpSpa)


[TransImp] Epoch: 1000/1000, loss: 0.932181, (IMP) 0.932181, (SPA) 1.0 x 0.000000: 100%|██████████| 1000/1000 [00:12<00:00, 78.97it/s]


fold 2, median correlation: 0.24174851829847532 (TransImpCls)


[TransImp] Epoch: 1000/1000, loss: 0.836944, (IMP) 0.836944, (SPA) 1.0 x 0.000000: 100%|██████████| 1000/1000 [00:17<00:00, 57.80it/s]


fold 2, median correlation: 0.24869675685527717 (TransImp)


[TransImp] Epoch: 1000/1000, loss: 0.942761, (IMP) 0.937593, (SPA) 1.0 x 0.005168: 100%|██████████| 1000/1000 [00:36<00:00, 27.35it/s]


fold 2, median correlation: 0.22653229916226403 (TransImpClsSpa)
			0.21195457113122587 (stPlus)
			0.1867355532571227 (spaGE)


INFO:root:281 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.
INFO:root:281 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.
INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.
INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.
INFO:root:Allocate tensors for mapping.
INFO:root:Begin training with 281 genes and rna_count_based density_prior in clusters mode...
INFO:root:Printing scores every 100 epochs.


Score: 0.379, KL reg: 0.063
Score: 0.559, KL reg: 0.001
Score: 0.560, KL reg: 0.001
Score: 0.561, KL reg: 0.001
Score: 0.561, KL reg: 0.001
Score: 0.561, KL reg: 0.001
Score: 0.561, KL reg: 0.001
Score: 0.561, KL reg: 0.001
Score: 0.561, KL reg: 0.001
Score: 0.561, KL reg: 0.001


INFO:root:Saving results..


			0.24052117073729795 (Tangram)

===== Fold 4 =====
Number of train genes: 281, Number of test genes: 70


[TransImp] Epoch: 1000/1000, loss: 0.862432, (IMP) 0.859247, (SPA) 1.0 x 0.003184: 100%|██████████| 1000/1000 [00:40<00:00, 24.50it/s]


fold 3, median correlation: 0.21398465707465497 (TransImpSpa)


[TransImp] Epoch: 1000/1000, loss: 0.913874, (IMP) 0.913874, (SPA) 1.0 x 0.000000: 100%|██████████| 1000/1000 [00:12<00:00, 78.52it/s]


fold 3, median correlation: 0.20390489464765393 (TransImpCls)


[TransImp] Epoch: 1000/1000, loss: 0.821982, (IMP) 0.821982, (SPA) 1.0 x 0.000000: 100%|██████████| 1000/1000 [00:17<00:00, 57.77it/s]


fold 3, median correlation: 0.2243135007735513 (TransImp)


[TransImp] Epoch: 1000/1000, loss: 0.924603, (IMP) 0.919673, (SPA) 1.0 x 0.004930: 100%|██████████| 1000/1000 [00:36<00:00, 27.68it/s]


fold 3, median correlation: 0.2016345485748509 (TransImpClsSpa)
			0.15835510257683405 (stPlus)
			0.13586503355193041 (spaGE)


INFO:root:281 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.
INFO:root:281 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.
INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.
INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.
INFO:root:Allocate tensors for mapping.
INFO:root:Begin training with 281 genes and rna_count_based density_prior in clusters mode...
INFO:root:Printing scores every 100 epochs.


Score: 0.382, KL reg: 0.063
Score: 0.562, KL reg: 0.001
Score: 0.563, KL reg: 0.001
Score: 0.564, KL reg: 0.001
Score: 0.564, KL reg: 0.001
Score: 0.564, KL reg: 0.001
Score: 0.564, KL reg: 0.001
Score: 0.564, KL reg: 0.001
Score: 0.564, KL reg: 0.001
Score: 0.564, KL reg: 0.001


INFO:root:Saving results..


			0.2161361202777044 (Tangram)

===== Fold 5 =====
Number of train genes: 281, Number of test genes: 70


[TransImp] Epoch: 1000/1000, loss: 0.872249, (IMP) 0.868744, (SPA) 1.0 x 0.003506: 100%|██████████| 1000/1000 [00:40<00:00, 24.55it/s]


fold 4, median correlation: 0.2305980252076621 (TransImpSpa)


[TransImp] Epoch: 1000/1000, loss: 0.921631, (IMP) 0.921631, (SPA) 1.0 x 0.000000: 100%|██████████| 1000/1000 [00:13<00:00, 76.57it/s]


fold 4, median correlation: 0.23718580644416634 (TransImpCls)


[TransImp] Epoch: 1000/1000, loss: 0.834083, (IMP) 0.834083, (SPA) 1.0 x 0.000000: 100%|██████████| 1000/1000 [00:17<00:00, 58.33it/s]


fold 4, median correlation: 0.24124146615120312 (TransImp)


[TransImp] Epoch: 1000/1000, loss: 0.932727, (IMP) 0.927221, (SPA) 1.0 x 0.005507: 100%|██████████| 1000/1000 [00:35<00:00, 28.11it/s]


fold 4, median correlation: 0.22591238449440773 (TransImpClsSpa)
			0.19697952322394793 (stPlus)
			0.18588047562777527 (spaGE)


INFO:root:281 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.
INFO:root:281 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.
INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.
INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.
INFO:root:Allocate tensors for mapping.
INFO:root:Begin training with 281 genes and rna_count_based density_prior in clusters mode...
INFO:root:Printing scores every 100 epochs.


Score: 0.380, KL reg: 0.063
Score: 0.561, KL reg: 0.001
Score: 0.562, KL reg: 0.001
Score: 0.563, KL reg: 0.001
Score: 0.563, KL reg: 0.001
Score: 0.563, KL reg: 0.001
Score: 0.563, KL reg: 0.001
Score: 0.563, KL reg: 0.001
Score: 0.563, KL reg: 0.001
Score: 0.563, KL reg: 0.001


INFO:root:Saving results..


			0.23447820100127753 (Tangram)
0.21232646691150722 (TransImpSpa) 0.21878445985173453 (TransImp) 0.20985943358857923 (TransImpCls) 0.20297103645067224 (TransImpClsSpa) 0.16653529498413613 (stPlus) 0.1514051289362372 (spaGE) 0.2190322064989254 (Tangram)


In [6]:
# Write each method's cross-validated prediction frame to its own CSV file.
csv_exports = [
    (df_transImp, '../../output/seqFISH_SingleCell_transImpute.csv'),
    (df_transImpSpa, '../../output/seqFISH_SingleCell_transImpSpa.csv'),
    (df_transImpCls, '../../output/seqFISH_SingleCell_transImpCls.csv'),
    (df_transImpClsSpa, '../../output/seqFISH_SingleCell_transImpClsSpa.csv'),
    (df_spaGE_res, '../../output/seqFISH_SingleCell_spaGE.csv'),
    (df_stplus_res, '../../output/seqFISH_SingleCell_stPlus.csv'),
    (df_tangram_res, '../../output/seqFISH_SingleCell_Tangram.csv'),
]
for imputed_df, csv_path in csv_exports:
    imputed_df.to_csv(csv_path)


In [9]:
# Method name -> cross-validated prediction frame, in reporting order.
dict_df = {"TransImp":df_transImp, 
           "TransImpSpa":df_transImpSpa, 
           "TransImpCls":df_transImpCls,
           "TransImpClsSpa":df_transImpClsSpa,
           "spaGE": df_spaGE_res, "stPlus": df_stplus_res,
            "Tangram":df_tangram_res
            }

# Densify the measured expression matrix in place. Guarded so that re-running
# this cell does not raise AttributeError once `.X` is already a dense ndarray
# (ndarrays have no `.toarray()`).
if hasattr(spa_adata.X, "toarray"):
    spa_adata.X = spa_adata.X.toarray()

# Spatial autocorrelation of the measured data: first with squidpy's default
# mode, then with Geary's C. The MSE cell below reads the results from
# `spa_adata.uns['moranI']` and `spa_adata.uns['gearyC']`.
sq.gr.spatial_autocorr(
    spa_adata,
    n_jobs=10,
)
sq.gr.spatial_autocorr(
    spa_adata,
    n_jobs=10,
    mode='geary',
)

# One AnnData per method, each built from a fresh copy of `spa_adata`.
# NOTE(review): `compute_autocorr` presumably recomputes the same statistics
# on the imputed values -- confirm against transpa.util.
dict_adata = {name: compute_autocorr(spa_adata.copy(), df) for name, df in dict_df.items()}


In [10]:
from sklearn.metrics import mean_squared_error
def _autocorr_mse(uns_key, stat_col):
    """Per-method MSE between measured and imputed values of one autocorrelation statistic.

    `uns_key` selects the table in `.uns` and `stat_col` the column within it;
    both tables are restricted to the shared genes before comparison.
    """
    measured = spa_adata.uns[uns_key].loc[raw_shared_gene][stat_col]
    return {
        method: mean_squared_error(
            measured,
            imputed_adata.uns[uns_key].loc[raw_shared_gene][stat_col],
        )
        for method, imputed_adata in dict_adata.items()
    }


moranIs = _autocorr_mse('moranI', 'I')
gearyCs = _autocorr_mse('gearyC', 'C')

# Report one line per method for each statistic.
moran_lines = [f"\tTrue vs {method}: {score:.6f}" for method, score in moranIs.items()]
geary_lines = [f"\tTrue vs {method}: {score:.6f}" for method, score in gearyCs.items()]

print("Mean Squared Error\nMoran's I:\n")
print("\n".join(moran_lines))
print("Geary's C:\n")
print("\n".join(geary_lines))


Mean Squared Error
Moran's I:

	True vs TransImp: 0.057495
	True vs TransImpSpa: 0.009693
	True vs TransImpCls: 0.049904
	True vs TransImpClsSpa: 0.007556
	True vs spaGE: 0.034764
	True vs stPlus: 0.051680
	True vs Tangram: 0.037711
Geary's C:

	True vs TransImp: 0.056753
	True vs TransImpSpa: 0.009619
	True vs TransImpCls: 0.049218
	True vs TransImpClsSpa: 0.007508
	True vs spaGE: 0.034072
	True vs stPlus: 0.051366
	True vs Tangram: 0.037165
