In [None]:
%load_ext autoreload
%autoreload 2

import os
import random
import time
import tracemalloc
from pathlib import Path

import anndata as ad
import numpy as np
import scplode as sc
from tqdm.auto import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Example Benchmarks

Here we compare scplode with anndata in backed mode in terms of speed and memory usage. 
Some rationale for the choices in comparison:
- anndata's backed mode was used because most of these single-cell datasets are too big to load entirely into memory (both in time and in memory cost). 
- A downstream processing step (taking the mean) was used to make sure we're actually "pulling" the data in a way that is realistic for data exploration.
- One additional point to note: CSR matrices accessed through anndata's backed mode have limitations for downstream processing (e.g. you can't call `.var` directly on them). scplode currently doesn't use CSR, so all of your usual numpy array functionality remains available. 

#### Notes
1. Running these benchmarks requires more packages than scplode itself needs. These packages were NOT included in the package requirements because they are not necessary for actually using scplode. 


In [None]:
# Download the competition support set (a zip archive) from this URL:
"https://storage.googleapis.com/vcc_data_prod/datasets/state/competition_support_set.zip"

# Then set `h5ad_path` in the next cell to the location of the downloaded files.

In [78]:
# Directory holding the downloaded competition support set. To run the notebook
# on another machine without editing it, export SCPLODE_BENCH_DATA_DIR before
# starting Jupyter; otherwise the original local path is used.
data_dir = Path(
    os.environ.get(
        "SCPLODE_BENCH_DATA_DIR",
        "/Volumes/T7/vcc_pp/data/raw/competition_support_set",
    )
)

# 1st dataset: X is stored as a CSR matrix
h5ad_path = data_dir / "competition_train.h5"
# 2nd dataset: X is not sparse (swap this line in to benchmark it instead)
#h5ad_path = data_dir / "k562_gwps.h5"

In [79]:
# Open the benchmark file through scplode; this handle is what the scplode
# benchmark cells below query with `.get(...)`.
scadata = sc.read_h5ad(
    h5ad_path,
)

[INFO] Loading index: obs
[INFO] Loading index: var
[INFO] Loading index: dat (implicitly)


In [80]:
# Open the same file with anndata in backed mode (mode "r"), the baseline the
# scplode handle is compared against in the benchmark cells below.
adata_backed = ad.read_h5ad(
    h5ad_path,
    backed="r",
)

In [81]:
%%time 

random.seed(0)
peaks = list()
for repeat in tqdm(range(0,5)):
    tracemalloc.start()
    indices = random.sample(list(adata_backed.obs.index), 5000)
    result = adata_backed[indices].X.mean(axis=0)
    
    #Assess peak memory
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    peak = peak / 1024**2  # MB
    peaks.append(peak)
print(f"peak memory: {np.mean(peaks):.2f} MB")

  0%|          | 0/5 [00:00<?, ?it/s]

peak memory: 999.19 MB
CPU times: user 7.61 s, sys: 10.9 s, total: 18.5 s
Wall time: 57.2 s


In [82]:
%%time 

random.seed(0)
peaks = list()
for repeat in tqdm(range(0,5)):
    tracemalloc.start()
    indices = random.sample(list(scadata.obs.index), 5000)
    result = scadata.get(indices).mean(axis=0)
    
    #Assess peak memory
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    peak = peak / 1024**2  # MB
    peaks.append(peak)
print(f"peak memory: {np.mean(peaks):.2f} MB")

  0%|          | 0/5 [00:00<?, ?it/s]

peak memory: 346.70 MB
CPU times: user 1.63 s, sys: 2.51 s, total: 4.14 s
Wall time: 27.4 s


In [74]:
%%time 

random.seed(0)
peaks = list()
for repeat in tqdm(range(0,5)):
    tracemalloc.start()
    index_low = random.randint(1, adata_backed.obs.shape[0]-10000)  
    result = adata_backed[index_low: index_low+10000].X.mean(axis=0)
    
    #Assess peak memory
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    peak = peak / 1024**2  # MB
    peaks.append(peak)
print(f"peak memory: {np.mean(peaks):.2f} MB")

  0%|          | 0/5 [00:00<?, ?it/s]

peak memory: 690.06 MB
CPU times: user 422 ms, sys: 2.18 s, total: 2.6 s
Wall time: 2.59 s


In [75]:
%%time 

random.seed(0)
peaks = list()
for repeat in tqdm(range(0,5)):
    tracemalloc.start()
    index_low = random.randint(1, scadata.obs.shape[0]-10000)  
    
    result = scadata.get(range(index_low, index_low+10000)).mean(axis=0)
    
    #Assess peak memory
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    peak = peak / 1024**2  # MB
    peaks.append(peak)
print(f"peak memory: {np.mean(peaks):.2f} MB")

  0%|          | 0/5 [00:00<?, ?it/s]

peak memory: 689.89 MB
CPU times: user 1.31 s, sys: 1.15 s, total: 2.45 s
Wall time: 2.45 s


### Conclusion

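The speed and memory advantage of scplode depends on the access pattern. 
For random access, scplode is ~2x faster in wall time (27.4 s vs 57.2 s) and uses ~3x less peak memory (347 MB vs 999 MB).
For contiguous index access there is not much difference (2.45 s vs 2.59 s, and ~690 MB peak memory for both).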