In [2]:
import scanpy as sc
import pandas as pd
import numpy as np

# Load Visium data

In [3]:
# Read the 10x Visium dataset stored under ./1_Human_Breast_Cancer/ into `adata`.
adata = sc.read_visium('./1_Human_Breast_Cancer/')
# De-duplicate gene names in adata.var_names (the loader warns about duplicate "var" names).
adata.var_names_make_unique()

  utils.warn_names_duplicates("var")


In [4]:
# Keep the expression matrix as loaded in a 'counts' layer. The explicit copy keeps it
# separate from adata.X, which is normalized in place later in the notebook.
adata.layers['counts'] = adata.X.copy()

In [5]:
# Gene names were already made unique at load time; repeating the call is a harmless no-op.
adata.var_names_make_unique()

# Flag mitochondrial genes by their "MT-" name prefix and compute QC metrics for them.
# `pct_counts_mt`, used in the filter below, comes from this call.
adata.var["mt"] = adata.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True, percent_top=None, log1p=False)

# Drop spots with fewer than 500 total counts, then spots with >= 20% mitochondrial counts.
sc.pp.filter_cells(adata, min_counts=500)
# Boolean indexing returns a *view* of the parent AnnData. Take an explicit copy so the
# in-place normalization below works on a real object instead of triggering an implicit
# view-to-actual conversion (and its warning).
adata = adata[adata.obs["pct_counts_mt"] < 20].copy()

# Normalize total counts per spot, in place on adata.X.
sc.pp.normalize_total(adata, inplace=True)

  view_to_actual(adata)


# Cell type prediction: CellDART

CellDART repository: https://github.com/mexchy1000/CellDART

### Load scRNAseq reference

Reference dataset: GEO accession GSE148673 (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE148673)

In [6]:
# Load the processed scRNA-seq reference.
ref = sc.read_h5ad('BRCA_GSE148673/BRCA_GSE148673_processed.h5ad')

# Start from the stored counts. Copy them so that ref.X and ref.layers['counts'] are
# separate objects: the in-place normalization below would otherwise risk overwriting
# the 'counts' layer as well. This mirrors how the Visium 'counts' layer is created
# with `.copy()`.
ref.X = ref.layers['counts'].copy()

ref.var_names_make_unique()

# Normalize total counts per cell, then rank genes for each `celltype` group
# (Wilcoxon rank-sum); results are stored in ref.uns['rank_genes_groups'].
sc.pp.normalize_total(ref, inplace=True)
sc.tl.rank_genes_groups(ref, 'celltype', method='wilcoxon')

In [7]:
# Ranked marker-gene names from rank_genes_groups: one column per cell type,
# rows in rank order.
genelists = ref.uns['rank_genes_groups']['names']
df_genelists = pd.DataFrame.from_records(genelists)

# Number of top-ranked markers taken from each cell type.
num_markers = 20
top_markers = df_genelists.head(num_markers)

# Concatenate the top markers column by column (duplicates across cell types are kept here).
res_genes = [gene for column in top_markers.columns for gene in top_markers[column].tolist()]

# Unique marker genes. Sorted so the order is identical on every run; the iteration
# order of a plain set of strings can change between interpreter sessions.
res_genes_ = sorted(set(res_genes))

# Top 5 markers of each cell type. Placed last so the notebook actually displays it;
# as a bare expression in the middle of the cell it showed nothing.
top_markers.head(5)

# CellDART

In [8]:
from CellDART import da_cellfraction
from CellDART.utils import random_mix

In [9]:
def log_minmaxscale(arr):
    """Apply log1p, then min-max scale every row.

    Parameters
    ----------
    arr : 2-D array-like (e.g. ``np.ndarray`` or ``np.matrix``)
        Values are reduced along ``axis=1``, so each row is scaled independently.

    Returns
    -------
    Array of the same shape as ``arr`` holding, for each row,
    ``(log1p(x) - min) / (max - min + 1e-8)``.
    """
    n_rows = len(arr)
    eps = 1e-8  # keeps the division defined for rows whose values are all equal
    logged = np.log1p(arr)
    # np.reshape (rather than keepdims=True) is used so np.matrix inputs are handled too.
    row_min = np.reshape(np.min(logged, axis=1), (n_rows, 1))
    row_max = np.reshape(np.max(logged, axis=1), (n_rows, 1))
    return (logged - row_min) / ((row_max - row_min) + eps)

In [10]:
# Define intersection genes: marker genes that are also present in the Visium data.
# Sorted so the gene (column) order of every matrix below is the same on every run;
# the iteration order of a set of strings can change between interpreter sessions.
inter_genes = sorted(set(adata.var.index).intersection(res_genes))
mat_sp = adata[:, inter_genes].X.todense()
mat_sc = ref[:, inter_genes].X.todense()
df_sc = ref.obs
lab_sc_sub = df_sc.celltype

# Encode cell-type labels as integers.
#   sc_sub_dict  : index -> cell type
#   sc_sub_dict2 : cell type -> index
# Sorting the labels makes the index assignment reproducible across kernel restarts.
sc_sub_dict = dict(enumerate(sorted(set(lab_sc_sub))))
sc_sub_dict2 = {celltype: idx for idx, celltype in sc_sub_dict.items()}
lab_sc_num = np.asarray([sc_sub_dict2[ii] for ii in lab_sc_sub], dtype='int')

# Generate mixture
sc_mix, lab_mix = random_mix(mat_sc, lab_sc_num, nmix=8, n_samples=20000)

# Bring mixtures, Visium spots and single cells onto the same log + per-row min-max scale.
sc_mix_s = log_minmaxscale(sc_mix)
mat_sp_s = log_minmaxscale(mat_sp)
mat_sc_s = log_minmaxscale(mat_sc)
print(mat_sp_s.shape, mat_sc_s.shape, sc_mix_s.shape)

(2516, 137) (10359, 137) (20000, 137)


In [11]:
# Keyword arguments for the CellDART training call, gathered in one place.
train_kwargs = dict(
    enable_dann=True,
    alpha=1,
    alpha_lr=5,
    emb_dim=64,
    batch_size=512,
    n_iterations=2000,
    initial_train=True,
    initial_train_epochs=10,
)

# Source inputs: scaled mixtures and their labels; target inputs: scaled Visium spots.
embs, clssmodel = da_cellfraction.train(sc_mix_s, lab_mix, mat_sp_s, **train_kwargs)

# Model output for every Visium spot; its columns are written to adata.obs
# per cell type further down.
pred_sp = clssmodel.predict(mat_sp_s)

Train on 20000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
initial_train_done


  updates = self.state_updates


Iteration 99, source loss =  1.963, discriminator acc = 0.010
Iteration 199, source loss =  0.781, discriminator acc = 0.225
Iteration 299, source loss =  1.080, discriminator acc = 0.000
Iteration 399, source loss =  0.416, discriminator acc = 0.170
Iteration 499, source loss =  0.408, discriminator acc = 0.058
Iteration 599, source loss =  0.290, discriminator acc = 0.001
Iteration 699, source loss =  0.248, discriminator acc = 0.317
Iteration 799, source loss =  0.304, discriminator acc = 0.341
Iteration 899, source loss =  0.244, discriminator acc = 0.018
Iteration 999, source loss =  0.207, discriminator acc = 0.616
Iteration 1099, source loss =  0.199, discriminator acc = 0.481
Iteration 1199, source loss =  0.197, discriminator acc = 0.009
Iteration 1299, source loss =  0.188, discriminator acc = 0.614
Iteration 1399, source loss =  0.182, discriminator acc = 0.752
Iteration 1499, source loss =  0.167, discriminator acc = 0.192
Iteration 1599, source loss =  0.163, discriminator

  updates=self.state_updates,


In [12]:
# Store each prediction column in adata.obs under the name of the cell type
# that the column index maps to in sc_sub_dict.
for col_idx, celltype_name in sc_sub_dict.items():
    adata.obs[f'celltype_{celltype_name}'] = pred_sp[:, col_idx]

# Export

In [13]:
# log1p is not idempotent: running this cell twice would log-transform adata.X twice.
# scanpy records the transform under adata.uns['log1p'], so use that as a guard to make
# the cell safe to re-run.
if 'log1p' not in adata.uns:
    sc.pp.log1p(adata)

In [14]:
# Mark the object as preprocessed; the flag is saved with the file written below.
# NOTE(review): presumably read by a downstream consumer of the .h5ad — confirm.
adata.uns['preprocessed'] = True

In [25]:
# Write the processed AnnData (including the celltype_* columns in .obs) to disk.
# NOTE(review): the output name says "FFPE" while the input directory is
# './1_Human_Breast_Cancer/' — confirm the file name matches the dataset.
adata.write_h5ad('./10X_Visium_FFPE_Human_Breast_Cancer.h5ad')