In [None]:
import os
import random
import sys

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import torch

# The Cell2Map package is imported from a local checkout, so its parent
# directory has to be on sys.path before the imports below.
sys.path.append('/home/graduates/cyn/Cell2Map')
from Cell2Map import common
from Cell2Map import map_utils
from Cell2Map import autoencoder
from Cell2Map import utils

# Load data

Read dataset

In [54]:
# --- Run configuration -------------------------------------------------------
# Every path is relative to the directory the notebook is executed from.

# Inputs handed to common.read_data
scRNA_path = "../data/MI/scfile.txt"       # single-cell RNA file
cell_type_path = "../data/MI/sclab.txt"    # cell-type label file
st_path = "../data/MI/spatial.h5ad"        # spatial transcriptomics file
coordinates_path = None                    # no separate coordinates file is given

# Folder the final assignment table is written to
output_folder = "../result/MI/"

# Value passed (as an int) to common.estimate_cell_number_RNA_reads
mean_cell_numbers = 5

In [55]:
# Column of the single-cell metadata that holds the cell-type label
celltype_col = 'ct'


def _drop_prefix(label):
    """Return `label` as a string with its first five characters removed."""
    return str(label)[5:]


# Load the inputs through the Cell2Map reader, which returns
#   sc_adata         : single-cell RNA-seq data
#   spatial_adata    : spatial transcriptomics data
#   coordinates_data : coordinates of each spot
sc_adata, spatial_adata, coordinates_data = common.read_data(
    scRNA_path, cell_type_path, st_path, coordinates_path, celltype_col
)

# Per-cell annotation table taken from the single-cell metadata.
# The cell identifiers (index) lose their first five characters, and a
# 'CellType' column is added holding the labels with the same slice applied.
# NOTE(review): confirm the labels really carry a five-character prefix;
# if they do not, 'CellType' ends up holding truncated labels.
cell_type = pd.DataFrame(sc_adata.obs[celltype_col])
cell_type.index = list(map(_drop_prefix, cell_type.index))
cell_type['CellType'] = list(map(_drop_prefix, cell_type[celltype_col]))

# Estimate how many cells sit in each spot from the transposed spatial
# expression table and the configured mean number of cells per spot
cell_number_to_node_assignment = common.estimate_cell_number_RNA_reads(
    spatial_adata.to_df().T,
    int(mean_cell_numbers),
)

In [56]:
# Run map_utils.process_adatas on both AnnData objects and rebind the two names
# to whatever it returns (presumably the preprocessed data — see the printed
# summaries in the next cell).
# NOTE(review): the inputs are overwritten, so re-running this cell on its own
# feeds already-processed objects back in; re-run the loading cell first.
sc_adata,spatial_adata=map_utils.process_adatas(sc_adata,spatial_adata)

  view_to_actual(adata)


In [57]:
# Show the summaries of both AnnData objects, one after the other
print(sc_adata, spatial_adata, sep="\n")

AnnData object with n_obs × n_vars = 4486 × 15053
    obs: 'ct', 'CellType'
    var: 'n_cells'
    uns: 'log1p', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'
AnnData object with n_obs × n_vars = 1940 × 15053
    obs: 'in_tissue', 'array_row', 'array_col', 'rna_count_based_density'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells'
    uns: 'spatial', 'log1p'
    obsm: 'spatial'


In [58]:
# Pass both AnnData objects through autoencoder.embedding_feature with
# k_cutoff=5 and rebind the two names to the objects it returns.
# NOTE(review): the recorded output reports "0 edges" for the 1940-observation
# graph — confirm that an empty graph is expected for the spatial data.
sc_adata,spatial_adata=autoencoder.embedding_feature(sc_adata,spatial_adata,k_cutoff=5)

The graph contains 35326 edges, 4486 cells.
7.8747213553276865 neighbors per cell on average.
The graph contains 0 edges, 1940 cells.
0.0 neighbors per cell on average.


100%|██████████| 500/500 [00:12<00:00, 39.12it/s]


In [59]:
# Train on the first CUDA GPU when one is available; otherwise fall back to the
# CPU so the notebook still runs (more slowly) on machines without CUDA instead
# of failing on a hard-coded 'cuda:0'.
map_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Learn the cell-to-spot mapping.
#   b_init : the per-spot cell-count estimate computed earlier in the notebook
#   lambda_d, lambda_g1, lambda_g2, alpha : passed through unchanged; presumably
#       weights of the loss terms printed during training — confirm in map_utils
Deep_map = map_utils.map_cell_to_space(
    sc_adata,
    spatial_adata,
    learning_rate=0.003,
    num_epochs=5000,
    lambda_d=0.9,
    lambda_g1=2,
    lambda_g2=1,
    b_init=cell_number_to_node_assignment,
    alpha=3.5,
    device=map_device,
)

  self.d=torch.tensor(d,device=device, dtype=torch.float32)


seed:1000
Total_loss: 23.391, gv_term: 1.073, vg_term: 0.725, density_term: 6.814, expression_term: 18.867, distance_term:0.493,
Total_loss: 23.233, gv_term: 1.127, vg_term: 0.727, density_term: 6.817, expression_term: 18.756, distance_term:0.487,
Total_loss: 23.104, gv_term: 1.151, vg_term: 0.728, density_term: 6.827, expression_term: 18.642, distance_term:0.485,
Total_loss: 22.983, gv_term: 1.164, vg_term: 0.729, density_term: 6.843, expression_term: 18.522, distance_term:0.488,
Total_loss: 22.868, gv_term: 1.168, vg_term: 0.730, density_term: 6.863, expression_term: 18.397, distance_term:0.494,
Total_loss: 22.757, gv_term: 1.164, vg_term: 0.731, density_term: 6.887, expression_term: 18.269, distance_term:0.504,
Total_loss: 22.652, gv_term: 1.155, vg_term: 0.732, density_term: 6.914, expression_term: 18.140, distance_term:0.515,
Total_loss: 22.551, gv_term: 1.141, vg_term: 0.733, density_term: 6.942, expression_term: 18.010, distance_term:0.526,
Total_loss: 22.452, gv_term: 1.126, vg

In [60]:
# Wrap the mapping result in a labelled table: one row per observation of
# sc_adata (cells) and one column per observation of spatial_adata (spots).
# Assumes Deep_map.X is a dense 2-D array of matching shape — TODO confirm.
sc_sp_map_df = pd.DataFrame(Deep_map.X, index=sc_adata.obs_names, columns=spatial_adata.obs_names)


In [61]:
# Drop the first five characters of every spot (column) and cell (row) name so
# they match the identifiers used by `cell_type` and `coordinates_data`.
# The names are derived from the AnnData objects the frame was built from,
# not from the frame's current labels, so re-running this cell does not strip
# a further five characters each time.
sc_sp_map_df.columns = [str(spot)[5:] for spot in spatial_adata.obs_names]
sc_sp_map_df.index = [str(cell)[5:] for cell in sc_adata.obs_names]

# For each spot keep the cells with the highest mapping values; how many are
# kept is that spot's entry in cell_number_to_node_assignment.
result_dict = {
    spot: sc_sp_map_df[spot].nlargest(cell_number_to_node_assignment[i]).index
    for i, spot in enumerate(sc_sp_map_df.columns.values)
}

# Long-format table: one row per (spot, selected cell) pair.
# 'Predict' holds the spot name, 'Values' the cell name.
max_column_names = pd.DataFrame({
    'Predict': list(result_dict.keys()),
    'Values': [list(cells) for cells in result_dict.values()],
}).explode('Values')

# Index the table by cell name (as strings)
max_column_names['Values'] = max_column_names['Values'].astype(str)
max_column_names = max_column_names.set_index('Values')

# A spot with no selected cells explodes to a missing value, which becomes the
# string "nan" above; drop those rows. The explicit copy makes the result an
# independent frame, so adding columns below does not raise
# SettingWithCopyWarning.
max_column_names = max_column_names.loc[max_column_names.index != "nan"].copy()

# Look up every cell's type in one label-based, vectorised step. This reads the
# same column the previous positional `[0]` lookup returned (the first column
# of `cell_type`, named by `celltype_col`), without the deprecated integer-key
# access and without a Python-level loop. Unknown cell ids still raise KeyError.
max_column_names['CellType'] = cell_type.loc[max_column_names.index, celltype_col].to_numpy()

# Coordinates of the spot each cell was assigned to, in row order
predict_coordinates = coordinates_data.reindex(max_column_names['Predict'])
max_column_names['predict_x'] = predict_coordinates['X'].values
max_column_names['predict_y'] = predict_coordinates['Y'].values

# Print the DataFrame
print(max_column_names)

  max_column_names.iloc[i,-1]=cell_type.loc[cell_id][0]


                               Predict CellType  predict_x  predict_y
Values                                                               
CAGCCGAGTATGAAAC.1  AAACAAGTATCTCCCA-1       CM       9172       7832
ACGGGCTCAGTACACT.1  AAACAAGTATCTCCCA-1       CM       9172       7832
TACACGAGTATAGGTA.1  AAACAAGTATCTCCCA-1       CM       9172       7832
GCATGCGGTCAATGTC.1  AAACAAGTATCTCCCA-1       CM       9172       7832
CAGCAGCGTTGACGTT.1  AAACAAGTATCTCCCA-1       CM       9172       7832
...                                ...      ...        ...        ...
CAGGTGCCAAGTCTAC.1  TTGTTTCACATCCAGG-1       CM       4532       8920
ACGGCCAAGACGCTTT.1  TTGTTTCCATACAACT-1     ENDO       3367       7173
CAAGAAATCAACGAAA.1  TTGTTTCCATACAACT-1       CM       3367       7173
CATGCCTCATGGTCTA.1  TTGTTTCCATACAACT-1       CM       3367       7173
CCTTCGAGTACCGCTG.1  TTGTTTCCATACAACT-1       CM       3367       7173

[8721 rows x 4 columns]


In [None]:
# Make sure the output folder exists; DataFrame.to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs(output_folder, exist_ok=True)

# Join folder and file name as separate components instead of concatenating
# them into a single argument (the resulting path is unchanged).
assigned_locations_path = os.path.join(output_folder, "Cell2Map_MI.csv")
max_column_names.to_csv(assigned_locations_path)