# Leaf variables

In [100]:
from IPython import get_ipython

if get_ipython():
    get_ipython().run_line_magic("load_ext", "autoreload")
    get_ipython().run_line_magic("autoreload", "2")

import latenta as la
import scanpy as sc
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
import torch

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [101]:
# Two named dimensions: 10000 cells by 1000 genes.
cells = la.Dim(10000, "cell")
genes = la.Dim(1000, "gene")
# Definition over both dimensions; its `.shape` is used below to allocate `value`.
definition = la.Definition([cells, genes])

In [106]:
import tempfile

In [116]:
# mkdtemp() creates the directory and leaves it on disk until it is removed
# explicitly. Taking `.name` from an unreferenced TemporaryDirectory() instead
# lets the directory be cleaned up as soon as that object is garbage-collected,
# which is why it previously had to be re-created by hand.
tmpdir = tempfile.mkdtemp()

In [121]:
# NOTE(review): `filesize` is first assigned in the "File loader" section further
# down, so this cell raises NameError on a fresh kernel — TODO move or remove.
filesize

3.2922792434692383

## Loaders

In [102]:
# One-hot style matrix: every row gets exactly one randomly chosen column set to 1.
value = np.zeros(definition.shape)
# Single vectorised assignment instead of a Python-level loop over all rows:
# one random column index is drawn per row, uniformly over the columns.
value[
    np.arange(value.shape[0]),
    np.random.randint(value.shape[1], size=value.shape[0]),
] = 1.0

In [103]:
value.sum()

10000.0

### Memory loader

In [87]:
# Loader built directly from the dense NumPy array `value`; the Fixed variable is
# given a fresh Definition over the same two dimensions.
loader = la.loaders.MemoryLoader(value)
fixed = la.Fixed(loader, definition=la.Definition([cells, genes]))

### Sparse loader

In [88]:
import sys
from types import ModuleType, FunctionType
from gc import get_referents

# Custom objects know their class.
# Function objects seem to know way too much, including modules.
# Exclude modules as well.
BLACKLIST = type, ModuleType, FunctionType


def getsize(obj):
    """Return the approximate deep size, in bytes, of ``obj`` and its referents.

    The object graph is walked breadth-first through ``gc.get_referents``. Every
    object is counted once (tracked by ``id``); classes, modules and functions
    are never followed. For torch tensors the element buffer
    (``element_size() * nelement()``) is added on top of ``sys.getsizeof``.

    Parameters
    ----------
    obj : object
        Root object to measure. Must not be a class, module or function.

    Returns
    -------
    int
        Accumulated size in bytes.

    Raises
    ------
    TypeError
        If ``obj`` itself is a class, module or function.
    """
    if isinstance(obj, BLACKLIST):
        raise TypeError("getsize() does not take argument of type: " + str(type(obj)))
    seen_ids = set()
    size = 0
    objects = [obj]
    while objects:
        need_referents = []
        for current in objects:
            if isinstance(current, BLACKLIST) or id(current) in seen_ids:
                continue
            seen_ids.add(id(current))
            size += sys.getsizeof(current)
            # Add the tensor payload inside the "first visit" branch so a tensor
            # reachable through several references is only counted once.
            if torch.is_tensor(current):
                size += current.element_size() * current.nelement()
            need_referents.append(current)
        objects = get_referents(*need_referents)
    return size

The dense and the sparse loader differ greatly in memory usage.

In [89]:
getsize(loader) / 1024**2

3814.6980419158936

Using 3GB (or more) on current GPUs would be prohibitive.

In [90]:
# Build the COO matrix from the dense array, wrap it in a sparse loader and attach
# that loader to a Fixed variable over the same two dimensions.
sparse = la.sparse.COOMatrix.from_numpy_array(value)
loader_sparse = la.loaders.SparseMemoryLoader(sparse)
fixed_sparse = la.Fixed(
    loader_sparse,
    definition=la.Definition([cells, genes]),
)

In [91]:
loader_sparse.initialize(fixed_sparse)

In [40]:
getsize(loader_sparse) / 1024**2

17.58980369567871

#### Initialization

There are several ways to initialize the COOMatrix:

In [41]:
la.sparse.COOMatrix.from_numpy_array(value)

COOMatrix(row=tensor([    0,     1,     2,  ..., 99997, 99998, 99999]), col=tensor([1931, 3859, 5623,  ...,  525, 5523, 9345]), values=tensor([1., 1., 1.,  ..., 1., 1., 1.], dtype=torch.float64), shape=(100000, 10000), mapping=None)

In [42]:
value_scipy_coo = scipy.sparse.coo_matrix(value)
la.sparse.COOMatrix.from_scipy_coo(value_scipy_coo)

COOMatrix(row=tensor([    0,     1,     2,  ..., 99997, 99998, 99999]), col=tensor([1931, 3859, 5623,  ...,  525, 5523, 9345]), values=tensor([1., 1., 1.,  ..., 1., 1., 1.], dtype=torch.float64), shape=(100000, 10000), mapping=None)

In [43]:
value_scipy_csr = scipy.sparse.csr_matrix(value)
la.sparse.COOMatrix.from_scipy_csr(value_scipy_csr)

COOMatrix(row=tensor([    0,     1,     2,  ..., 99997, 99998, 99999]), col=tensor([1931, 3859, 5623,  ...,  525, 5523, 9345]), values=tensor([1., 1., 1.,  ..., 1., 1., 1.], dtype=torch.float64), shape=(100000, 10000), mapping=None)

#### Timing

When data is accessed from a SparseMemoryLoader, it is converted to a dense tensor. This can cause a slowdown:

In [44]:
import timeit

In [47]:
time = timeit.timeit("loader.get()", number=1, globals=globals())
time

3.500143066048622e-06

In [48]:
time = timeit.timeit("loader_sparse.get()", number=1, globals=globals())
time

0.7765716260764748

Speed is however much more competitive when taking only a subsample, which is the typical use case anyway when working with large sparse tensors:

In [52]:
# Select the first 500 entries along the "cell" dimension.
idx = {"cell": torch.arange(500)}

In [53]:
time = timeit.timeit("loader.get(idx)", number=1, globals=globals())
time

0.001256783027201891

In [54]:
time = timeit.timeit("loader_sparse.get(idx)", number=1, globals=globals())
time

0.0043448100332170725

### File loader

Supply a path to a pickle file. The value itself is not saved along with the loader; only the path is.

In [60]:
import pickle
import tempfile
import pathlib
import os

In [61]:
# Pickle the dense array into a temporary file. `delete=False` keeps the file on
# disk after the handle is closed, and the `with` block guarantees the handle is
# flushed and closed before the file is measured or read again.
with tempfile.NamedTemporaryFile(delete=False) as handle:
    pickle.dump(value, handle)
path = pathlib.Path(handle.name)

In [62]:
filesize = os.path.getsize(path) / 1024**2
print(filesize, "Mb")

7629.3946924209595 Mb


In [None]:
# The pickle of a dense array stores every element, so the file cannot be smaller
# than the array's own buffer. Comparing against `value.nbytes` keeps the check
# valid for any choice of dimensions; a fixed 1024 Mb threshold fails for a
# 10000 x 1000 float64 array (~76 Mb).
assert filesize >= value.nbytes / 1024**2

In [63]:
fixed = la.Fixed(path, definition = la.Definition([cells, genes]))

In [64]:
# Pickle the Fixed variable into a temporary file that survives closing
# (`delete=False`); the `with` block closes the handle before the size is read.
with tempfile.NamedTemporaryFile(delete=False) as handle:
    pickle.dump(fixed, handle)
fixed_path = pathlib.Path(handle.name)

In [65]:
filesize = os.path.getsize(fixed_path) / 1024**2
print(filesize, "Mb")

2.060612678527832 Mb


In [67]:
assert filesize < 3

### Sparse File Loader

Similarly, we can load a sparse matrix from a file. This is especially useful with larger tensors, e.g. a billion values or more (a hundred thousand cells by ten thousand genes).

In [68]:
# Pickle the COO matrix into a temporary file that survives closing
# (`delete=False`); the `with` block closes the handle before the file is reused.
with tempfile.NamedTemporaryFile(delete=False) as handle:
    pickle.dump(sparse, handle)
path = pathlib.Path(handle.name)

In [69]:
filesize = os.path.getsize(path) / 1024**2
print(filesize, "Mb")

30.111458778381348 Mb


In [71]:
# NOTE(review): fixed 3 Mb threshold; presumably tuned for the larger matrix seen in
# the recorded outputs — confirm it still holds for the current 10000 x 1000 dims.
assert filesize > 3

In [72]:
fixed = la.Fixed(
    la.loaders.COOMatrixFileLoader(path),
    definition = la.Definition([cells, genes])
)

In [73]:
# Pickle the file-backed Fixed variable into a temporary file that survives
# closing (`delete=False`); the handle is closed before the size is measured.
with tempfile.NamedTemporaryFile(delete=False) as handle:
    pickle.dump(fixed, handle)
fixed_path = pathlib.Path(handle.name)

In [74]:
filesize = os.path.getsize(fixed_path) / 1024**2
print(filesize, "Mb")

2.0606231689453125 Mb


In [78]:
assert filesize < 3

In [79]:
# Read the pickled variable back; the `with` block closes the file handle again.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with fixed_path.open("rb") as handle:
    fixed2 = pickle.load(handle)

In [82]:
fixed2.loader.get()

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)