In [1]:
import numpy as np
import scanpy as sc
import h5py
import os
from scbig.utils import read_data, setup_seed,louvain,calculate_metric
import warnings
warnings.filterwarnings('ignore')
import simba as si
import shutil

Using backend: pytorch


In [2]:
# Optional cleanup: uncomment to delete the 'result_simba_rnaseq' directory
# (the SIMBA working directory configured in the next cell) from an earlier run.
#shutil.rmtree('result_simba_rnaseq')  

In [3]:
# Point SIMBA at a dedicated working directory and set the figure defaults
# used by its plotting functions.
workdir = 'result_simba_rnaseq'
si.settings.set_workdir(workdir)
si.settings.set_figure_params(
    dpi=80,
    style='white',
    fig_size=[5, 5],
    rc={'image.cmap': 'viridis'},
)

Saving results in: result_simba_rnaseq


In [4]:
# Call scbig's setup_seed helper with seed 0 before any stochastic step.
# NOTE(review): presumably this seeds the random number generators for
# reproducibility — confirm which libraries it covers in scbig.utils.
setup_seed(0)

In [5]:
# Names of the real datasets this notebook can be run on; each name maps to
# a 'datasets/real/<name>.h5' file loaded further down.
datasets = [
    '10X_PBMC',
    'mouse_bladder_cell',
    'mouse_ES_cell',
    'human_kidney_counts',
    'Adam',
    'Human_pancreatic_islets',
    'Macosko_mouse_retina',
]

In [6]:
# Choose which dataset to process (index into `datasets`) and announce it.
dataset = datasets[0]
print('----------------real data: {} ----------------- '.format(dataset))

# `method` tags the saved record file; `dir0` is the root that the dataset and
# results paths are joined onto; `dir1` is simply the dataset name as a string.
method, dir0 = 'SIMBA', '../'
dir1 = '{}'.format(dataset)

----------------real data: 10X_PBMC ----------------- 


In [7]:
# Load the expression matrix X (cells x genes) and the per-cell labels Y for
# the selected dataset, then wrap them in an AnnData object.
data_path = os.path.join(dir0, 'datasets/real/{}.h5'.format(dataset))

if dataset in ['Adam']:
    # 'Adam' is read through scbig's read_data; its cell-type names live in
    # obs["cell_type1"] and are encoded here as integer labels.
    mat, obs, var, uns = read_data(data_path, sparsify=False, skip_exprs=False)
    X = np.array(mat.toarray())
    cell_name = np.array(obs["cell_type1"])
    cell_type, cell_label = np.unique(cell_name, return_inverse=True)
    Y = cell_label
else:
    # Open explicitly read-only: h5py releases before 3.0 default to append
    # mode when no mode is given, which can create or modify the file.
    with h5py.File(data_path, 'r') as data_mat:
        X = np.array(data_mat['X'])
        Y = np.array(data_mat['Y'])
    # Round fractional values up to integers and drop singleton label dimensions.
    X = np.ceil(X).astype(np.int_)
    Y = Y.astype(np.int_).squeeze()

adata = sc.AnnData(X.astype('float'))
# Keep the ground-truth labels on the AnnData object so they stay aligned
# with the cells through any later filtering.
adata.obs['cl_type'] = Y
n_clusters = len(np.unique(Y))
print(adata)

AnnData object with n_obs × n_vars = 4271 × 16653
    obs: 'cl_type'


In [8]:
# Quality control and normalization with SIMBA's preprocessing helpers.
# The return values are not used; `adata` itself is updated by each call
# (compare the printed object below with the one printed after loading).
# Filter genes using the min_n_cells=3 threshold.
si.pp.filter_genes(adata, min_n_cells=3)
# Filter cells using the min_n_genes=100 threshold.
si.pp.filter_cells_rna(adata, min_n_genes=100)
# Normalize with the 'lib_size' method, then log-transform.
si.pp.normalize(adata, method='lib_size')
si.pp.log_transform(adata)
print(adata)

Before filtering: 
4271 cells, 16653 genes
Filter genes based on min_n_cells
After filtering out low-expressed genes: 
4271 cells, 16486 genes
before filtering: 
4271 cells,  16486 genes
filter cells based on min_n_genes
after filtering out low-quality cells: 
4271 cells,  16486 genes
AnnData object with n_obs × n_vars = 4271 × 16486
    obs: 'cl_type', 'n_counts', 'n_genes', 'pct_genes'
    var: 'n_counts', 'n_cells', 'pct_cells'
    layers: 'raw'


In [9]:
# Discretize the expression values into 5 bins, then generate the cell-gene
# graph that will be used for training.
# NOTE(review): the recorded output lists five C->G relations, presumably one
# per bin — confirm against the SIMBA documentation.
si.tl.discretize(adata, n_bins=5)
# list_CG takes the cell-by-gene AnnData object(s); no highly-variable-gene
# selection is requested, and the graph is written under the 'graph0' directory.
si.tl.gen_graph(list_CG=[adata],use_highly_variable=False,dirname='graph0')

relation0: source: C, destination: G
#edges: 2342675
relation1: source: C, destination: G
#edges: 1984106
relation2: source: C, destination: G
#edges: 689952
relation3: source: C, destination: G
#edges: 313860
relation4: source: C, destination: G
#edges: 190589
Total number of edges: 5521182
Writing graph file "pbg_graph.txt" to "result_simba_rnaseq\pbg\graph0" ...
Finished.


In [10]:
# Train on a copy of SIMBA's current PBG settings with only the embedding
# dimension overridden.
dict_config = {**si.settings.pbg_params, 'dimension': 64}
si.tl.pbg_train(
    pbg_params=dict_config,
    auto_wd=True,    # weight decay is estimated automatically ...
    save_wd=True,    # ... and written back to si.settings.pbg_params['wd']
    output='model',
)

Auto-estimated weight decay is 0.006418
`.settings.pbg_params['wd']` has been updated to 0.006418
Converting input data ...
[2023-08-27 20:26:57.262595] Using the 5 relation types given in the config
[2023-08-27 20:26:57.262595] Searching for the entities in the edge files...
[2023-08-27 20:27:04.874813] Entity type C:
[2023-08-27 20:27:04.874813] - Found 4271 entities
[2023-08-27 20:27:04.875813] - Removing the ones with fewer than 1 occurrences...
[2023-08-27 20:27:04.876814] - Left with 4271 entities
[2023-08-27 20:27:04.876814] - Shuffling them...
[2023-08-27 20:27:04.879816] Entity type G:
[2023-08-27 20:27:04.880815] - Found 16486 entities
[2023-08-27 20:27:04.880815] - Removing the ones with fewer than 1 occurrences...
[2023-08-27 20:27:04.883817] - Left with 16486 entities
[2023-08-27 20:27:04.883817] - Shuffling them...
[2023-08-27 20:27:04.894818] Preparing counts and dictionaries for entities and relation types:
[2023-08-27 20:27:04.896818] - Writing count of entity type C a

In [11]:
# Load the trained embeddings. The printed result is a dict of AnnData objects
# keyed by entity type ('C' and 'G' in the recorded output), one row per entity
# and one column per embedding dimension.
dict_adata = si.read_embedding()
print(dict_adata)

{'C': AnnData object with n_obs × n_vars = 4271 × 64, 'G': AnnData object with n_obs × n_vars = 16486 × 64}


In [12]:
# Take the 'C' (cell) embedding and make its row names strings so they can be
# looked up with the names carried by `adata`.
adata_C = dict_adata['C']
adata_C.obs_names = adata_C.obs_names.astype('str')

# Reorder the embedding rows to follow adata's cell order before storing them
# as the 'feat' representation.
embedding_in_adata_order = adata_C[adata.obs_names, :]
adata.obsm['feat'] = np.array(embedding_in_adata_order.X)
adata.obsm['feat'].shape

(4271, 64)

In [13]:
# Louvain clustering on the SIMBA cell embedding (adata.obsm['feat'])
adata = louvain(adata, resolution=1, use_rep='feat')
y_pred_l = np.array(adata.obs['louvain'])
n_pred = len(np.unique(y_pred_l))
# Score against the labels stored in adata.obs rather than the raw `Y` array.
# The preprocessing cell may drop cells, after which `Y` would no longer have
# one entry per predicted label. adata.obs['cl_type'] stays aligned with the
# cells and is also what the saved record uses as the ground truth.
y_true_l = np.array(adata.obs['cl_type'])
nmi_l, ari_l = np.round(calculate_metric(y_true_l, y_pred_l), 4)
print('Clustering Louvain: NMI= %.4f, ARI= %.4f' % (nmi_l, ari_l))

Clustering Louvain: NMI= 0.7695, ARI= 0.7549


In [14]:
# Compute a UMAP layout for visualization and show the annotated object.
sc.tl.umap(adata)
print(adata)

# Save the metrics, UMAP coordinates and true/predicted labels for this
# dataset and method. The per-dataset output directory is created first,
# because np.savez does not create missing parent directories.
record_path = os.path.join(dir0,"results/visualization/{}/record_{}_{}.npz".format(dataset,dataset,method))
os.makedirs(os.path.dirname(record_path), exist_ok=True)
np.savez(record_path,
         ari=ari_l, nmi=nmi_l,
         umap=adata.obsm['X_umap'],
         true=np.array(adata.obs['cl_type'].values.astype(int)),
         louvain=np.array(adata.obs['louvain'].values.astype(int)))

print(nmi_l)
print(ari_l)
print(n_pred)

AnnData object with n_obs × n_vars = 4271 × 16486
    obs: 'cl_type', 'n_counts', 'n_genes', 'pct_genes', 'pbg_id', 'louvain'
    var: 'n_counts', 'n_cells', 'pct_cells', 'pbg_id'
    uns: 'disc', 'neighbors', 'louvain', 'umap'
    obsm: 'feat', 'X_umap'
    layers: 'raw', 'disc'
    obsp: 'distances', 'connectivities'
0.7695
0.7549
9
