In [1]:
import numpy as np
import scanpy as sc
import h5py
import os
from scbig.utils import setup_seed
import warnings
warnings.filterwarnings('ignore')
import simba as si
from simba.tools._pbg import pbg_train
import shutil
import time
from memory_profiler import profile

Using backend: pytorch


In [2]:
#shutil.rmtree('result_simba') 
workdir = 'result_simba'
si.settings.set_workdir(workdir)
si.settings.set_figure_params(dpi=80,style='white',fig_size=[5,5],rc={'image.cmap': 'viridis'})

Saving results in: result_simba


In [3]:
@profile
def run_simba():
    from memory_profiler import memory_usage
    pbg_train(auto_wd=True, save_wd=False, output='model')
    mem_used = memory_usage(-1, interval=.1, timeout=1)
    print(max(mem_used))
    return max(mem_used)

In [4]:
for dataset in ['2000','4000','8000','16000','32000','64000']:
    print('----------------real data: {} ----------------- '.format(dataset))
    setup_seed(0)
    method = 'SIMBA'
    dir0 = '../'
    dir1 = '{}'.format(dataset)

    with h5py.File(os.path.join(dir0, 'datasets/time/data_cell{}.h5'.format(dataset))) as data_mat:
        X = np.array(data_mat['X'])
        Y = np.array(data_mat['Y'])
        X = np.ceil(X).astype(np.int_)
        Y = np.array(Y).astype(np.int_).squeeze()

    adata = sc.AnnData(X.astype('float'))
    adata.obs['cl_type'] = Y
    n_clusters = len(np.unique(Y))
    si.pp.filter_genes(adata, min_n_cells=3)
    si.pp.filter_cells_rna(adata, min_n_genes=100)
    si.pp.normalize(adata, method='lib_size')
    si.pp.log_transform(adata)
    print(adata)
    si.tl.discretize(adata, n_bins=5)
    si.tl.gen_graph(list_CG=[adata],use_highly_variable=False,dirname='graph0')

    start_time = time.time()
    #train
    memory_usage=run_simba()
    end_time = time.time()
    total_time = end_time - start_time
    print("Run Done. Total Running Time: %s seconds" %(total_time))

    np.savez(os.path.join(dir0, "results/time_memory/{}/record_cell{}_{}.npz".format(dataset, dataset, method)),
             time=total_time, memory_usage=memory_usage)

----------------real data: 2000 ----------------- 
Before filtering: 
2000 cells, 10000 genes
Filter genes based on min_n_cells
After filtering out low-expressed genes: 
2000 cells, 9244 genes
before filtering: 
2000 cells,  9244 genes
filter cells based on min_n_genes
after filtering out low-quality cells: 
2000 cells,  9244 genes
AnnData object with n_obs × n_vars = 2000 × 9244
    obs: 'cl_type', 'n_counts', 'n_genes', 'pct_genes'
    var: 'n_counts', 'n_cells', 'pct_cells'
    layers: 'raw'
relation0: source: C, destination: G
#edges: 441649
relation1: source: C, destination: G
#edges: 460077
relation2: source: C, destination: G
#edges: 273163
relation3: source: C, destination: G
#edges: 271925
relation4: source: C, destination: G
#edges: 189004
Total number of edges: 1635818
Writing graph file "pbg_graph.txt" to "result_simba\pbg\graph0" ...
Finished.
ERROR: Could not find file <ipython-input-3-ce685a63275c>
NOTE: %mprun can only be used on functions defined in physical files, and