# 0. Import libraries

In [1]:
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")
from pyStrint.strint import strInt
from pyStrint import preprocess as pp

2025-03-27 14:52:49.926375: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-27 14:52:49.928591: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-27 14:52:49.976257: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-27 14:52:49.977798: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# 1. Load files

In [2]:
# Input directory with the demo tables, and the directory all results are written to.
# NOTE: outDir deliberately keeps its trailing '/', later cells build paths as f'{outDir}<file>'.
inputDir = './demo/'
outDir = f'{inputDir}/strint_results/'
# exist_ok=True replaces the racy "check, then create" pattern and keeps the cell re-runnable.
os.makedirs(outDir, exist_ok=True)


def _read_tsv(file_name):
    """Read a tab-separated table from ``inputDir``.

    The first row is used as the header and the first column as the index.
    """
    return pd.read_csv(f'{inputDir}/{file_name}', sep='\t', header=0, index_col=0)


sc_exp = _read_tsv('SC_exp.tsv')
sc_meta = _read_tsv('SC_meta.tsv')
st_exp = _read_tsv('ST_exp.tsv')
st_coord = _read_tsv('ST_coord.tsv')
st_decon = _read_tsv('ST_decon.tsv')
sc_distribution = _read_tsv('simu_sc_distribution.tsv')

In [3]:
# Append the additional LR table (lr_df.csv) to the LR database returned by pp.load_lr_df.
species = 'Human'
lr_list = pd.read_csv(f'{inputDir}/lr_df.csv', sep=',', header=0, index_col=0)
lr_db = pp.load_lr_df(species = species)
# Give the extra table the database's column names so the two frames stack column-for-column.
lr_list.columns = lr_db.columns
# ignore_index=True renumbers the combined rows 0..n-1 in the same step.
lr_db = pd.concat((lr_db, lr_list), axis=0, ignore_index=True)

# 2. Get cell model

In [6]:
# smurf is only needed for this cell; a later cell can load the saved result instead.
import smurf

operator = smurf.SMURF(n_features=15, estimate_only=True)
# The input is transposed on the way in and the result transposed back.
# This overwrites the sc_distribution read from simu_sc_distribution.tsv earlier.
sc_distribution = operator.smurf_impute(sc_exp.transpose()).transpose()

Running SCEnd on 2131 cells and 3247 genes
normalizing data by library size...
preprocessing data...
number of iteration:  1 / 10
number of iteration:  2 / 10
number of iteration:  3 / 10
number of iteration:  4 / 10
number of iteration:  5 / 10
number of iteration:  6 / 10
number of iteration:  7 / 10
number of iteration:  8 / 10
number of iteration:  9 / 10
number of iteration:  10 / 10


In [7]:
# Persist the smurf output so it can be reloaded without recomputing.
sc_distribution.to_csv(
    f'{outDir}SC_smurf.tsv',
    sep='\t',
    header=True,
    index=True,
)

In [None]:
# Alternative to running smurf: load the table previously saved as SC_smurf.tsv in outDir.
sc_distribution = pd.read_csv(
    f'{outDir}SC_smurf.tsv',
    sep='\t',
    header=0,
    index_col=0,
)

# 3. Preprocess

In [4]:
# Run pyStrint's combined preprocessing on all loaded tables.
# The four returned objects are the inputs of every later step.
sc_adata, st_adata, sc_ref, lr_df = pp.prep_all_adata(
    sc_exp=sc_exp,
    st_exp=st_exp,
    sc_distribution=sc_distribution,
    sc_meta=sc_meta,
    st_coord=st_coord,
    lr_df=lr_db,
    SP=species,
)

Data clean is done! Using 3244 shared genes .


# 4. Parameters

In [5]:
# Passed to strInt as st_tp.
st_tp = 'st'
# Passed to gradient_descent as delta and eta.
delta = 0.1
eta = 0.0005
# Used for the expected cell count and as mean_num_per_spot in select_cells.
num_per_spot = 6
# Passed to select_cells as repeat_penalty.
repeat_penalty = 10
# Number of gradient-descent iterations; choose accordingly.
max_rep = 10

In [6]:
# Number of rows in the preprocessed single-cell object.
ref_cell_num = sc_adata.shape[0]
# Rows of the spatial object times the configured cells per spot.
expected_cell_num = num_per_spot * st_adata.shape[0]

In [7]:
# Ratio of cells expected in the mapping to cells available in the reference.
cell_ratio = expected_cell_num / ref_cell_num
# Size of lr_df (returned by prep_all_adata) relative to lr_db (passed into it).
lr_ratio = len(lr_df) / len(lr_db)
# The four tuned values are used as p1..p4 in gradient_descent.
p1, p2, p3, p4 = pp.auto_tune_parameters(cell_ratio, lr_ratio)
print(p1, p2, p3, p4)

0.05 0.68 0.17 0.1


# 5. Cell selection

In [10]:
# Build the strInt object from the preprocessed data and the deconvolution table (st_decon),
# then run its prep() step before any cell selection.
obj_spex = strInt(
    save_path=outDir,
    st_adata=st_adata,
    weight=st_decon,
    sc_ref=sc_ref,
    sc_adata=sc_adata,
    cell_type_key='celltype',
    lr_df=lr_df,
    st_tp=st_tp,
    species=species,
)
obj_spex.prep()


The cell index of sc_ref is not str, changed to str for consistency.
sc_ref and sc_adata has different genes, both data are subset to 3244 genes.
Parameters checked!
By setting k as 6, each spot has average 5.80 neighbors.


## Use strint for cell selection

In [None]:
# Let strInt select the cells; the result is the mapping metadata used by the refinement step.
sc_agg_meta = obj_spex.select_cells(
    p=0.1,
    mean_num_per_spot=num_per_spot,
    repeat_penalty=repeat_penalty,
)

In [25]:
# Save the cell-selection metadata so it can be reloaded instead of recomputed.
sc_agg_meta.to_csv(
    f'{outDir}/cell_mapping_meta.tsv',
    sep='\t',
    header=True,
    index=True,
)

## Alternatively, load precomputed results from strint or other methods


In [11]:
# Reload a previously saved mapping table.
sc_agg_meta = pd.read_csv(
    f'{outDir}/cell_mapping_meta.tsv',
    sep='\t',
    header=0,
    index_col=0,
)
# Pull the rows of sc_exp named in the 'sc_id' column, in mapping order (repeats kept).
selected_sc_ids = sc_agg_meta['sc_id']
user_sc_exp = sc_exp.loc[selected_sc_ids]

In [12]:
# Hand the user-supplied expression and mapping metadata to strInt instead of running its own selection.
sc_agg_meta = obj_spex.select_cells(
    user_sc_exp=user_sc_exp,
    user_sc_agg_meta=sc_agg_meta,
)

# 6. Refinement process

In [14]:
# Show the auto-tuned values again before they are passed to gradient_descent.
print(p1, p2, p3, p4)

0.05 0.68 0.17 0.1


In [15]:
# Run strInt's gradient-descent refinement with the tuned values p1..p4 and the
# parameters set in section 4; max_rep is the number of iterations.
refined_sc_exp, sc_agg_meta = obj_spex.gradient_descent(
    p1=p1,
    p2=p2,
    p3=p3,
    p4=p4,
    delta=delta,
    eta=eta,
    init_sc_embed=False,
    iteration=max_rep,
    k=2,
    W_HVG=2,
    left_range=7,
    right_range=8,
    steps=1,
    dim=2,
)
# NOTE: this writes to the same path as the cell-selection step, so the
# earlier cell_mapping_meta.tsv is overwritten with the metadata returned by the refinement.
sc_agg_meta.to_csv(
    f'{outDir}/cell_mapping_meta.tsv',
    sep='\t',
    header=True,
    index=True,
)

Init sc_coord by affinity embedding...
Avearge shape correlation is: 0.883642643574808
Hyperparameters adjusted.
-----Start iteration 0 -----
Avearge shape correlation is: 0.883642643574808
The total loss after iteration 0 is 4.20348.
-----Start iteration 1 -----
Avearge shape correlation is: 0.8888774758541574
The total loss after iteration 1 is 4.21085.
-----Start iteration 2 -----
Avearge shape correlation is: 0.8909572245455412
The total loss after iteration 2 is 4.22284.
-----Start iteration 3 -----
Avearge shape correlation is: 0.8893506418851791
The total loss after iteration 3 is 4.20726.
-----Start iteration 4 -----
Avearge shape correlation is: 0.8587359508031193
The total loss after iteration 4 is 4.20025.
-----Start iteration 5 -----
Avearge shape correlation is: 0.8914543917889373
The total loss after iteration 5 is 4.19616.
-----Start iteration 6 -----
Avearge shape correlation is: 0.8964806112216385
The total loss after iteration 6 is 4.20275.
-----Start iteration 7 ----