## CuPy Call Rate Benchmark

This is the same as the call rate benchmark notebook on CPU but swaps out numpy for dask chunks with CuPy.

In [1]:
import sys
sys.path.append(".")
from lib import api
from lib.io import plugins
from pathlib import Path
import warnings
import xarray as xr
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import dask
from dask.distributed import performance_report
%run nb/paths.py
xr.set_options(display_style='html');

In [2]:
# Path to PLINK dataset for demonstration
# path = PLINK_HAPMAP_PATH_01
path = PLINK_1KG_PATH_01
path

PosixPath('/lab/data/gwas/tutorial/2_PS_GWAS/ALL.2of4intersection.20100804.genotypes')

In [2]:
from dask.distributed import Client
client = Client(processes=True, n_workers=8, threads_per_worker=1)
client.register_worker_plugin(plugins.create_dask_codec_plugin())
client

0,1
Client  Scheduler: tcp://127.0.0.1:45787  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 8  Memory: 134.78 GB


### Export to Zarr

This step will load the PLINK dataset and write it back out to Zarr for more efficient downstream operations.

In [4]:
%%time
# Load a dataset to work with
ds = api.read_plink(path, chunks='auto', fam_sep=' ', bim_sep='\t')
ds



CPU times: user 3.75 s, sys: 403 ms, total: 4.15 s
Wall time: 26.3 s


Unnamed: 0,Array,Chunk
Bytes,16.03 GB,134.22 MB
Shape,"(25488488, 629)","(213382, 629)"
Count,241 Tasks,120 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 16.03 GB 134.22 MB Shape (25488488, 629) (213382, 629) Count 241 Tasks 120 Chunks Type int8 numpy.ndarray",629  25488488,

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,134.22 MB
Shape,"(25488488, 629)","(213382, 629)"
Count,241 Tasks,120 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,134.22 MB
Shape,"(25488488, 629)","(213382, 629)"
Count,241 Tasks,120 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 16.03 GB 134.22 MB Shape (25488488, 629) (213382, 629) Count 241 Tasks 120 Chunks Type bool numpy.ndarray",629  25488488,

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,134.22 MB
Shape,"(25488488, 629)","(213382, 629)"
Count,241 Tasks,120 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,5 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 5.03 kB 5.03 kB Shape (629,) (629,) Count 5 Tasks 1 Chunks Type object numpy.ndarray",629  1,

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,5 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,5 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 5.03 kB 5.03 kB Shape (629,) (629,) Count 5 Tasks 1 Chunks Type object numpy.ndarray",629  1,

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,5 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,5 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 5.03 kB 5.03 kB Shape (629,) (629,) Count 5 Tasks 1 Chunks Type int64 numpy.ndarray",629  1,

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,5 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,5 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 5.03 kB 5.03 kB Shape (629,) (629,) Count 5 Tasks 1 Chunks Type int64 numpy.ndarray",629  1,

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,5 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,5 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 5.03 kB 5.03 kB Shape (629,) (629,) Count 5 Tasks 1 Chunks Type int64 numpy.ndarray",629  1,

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,5 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,5 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 5.03 kB 5.03 kB Shape (629,) (629,) Count 5 Tasks 1 Chunks Type int64 numpy.ndarray",629  1,

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,5 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,20.86 MB
Shape,"(25488488,)","(2607671,)"
Count,50 Tasks,10 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 203.91 MB 20.86 MB Shape (25488488,) (2607671,) Count 50 Tasks 10 Chunks Type int64 numpy.ndarray",25488488  1,

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,20.86 MB
Shape,"(25488488,)","(2607671,)"
Count,50 Tasks,10 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,20.86 MB
Shape,"(25488488,)","(2607671,)"
Count,50 Tasks,10 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 203.91 MB 20.86 MB Shape (25488488,) (2607671,) Count 50 Tasks 10 Chunks Type object numpy.ndarray",25488488  1,

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,20.86 MB
Shape,"(25488488,)","(2607671,)"
Count,50 Tasks,10 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,20.86 MB
Shape,"(25488488,)","(2607671,)"
Count,50 Tasks,10 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 203.91 MB 20.86 MB Shape (25488488,) (2607671,) Count 50 Tasks 10 Chunks Type int64 numpy.ndarray",25488488  1,

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,20.86 MB
Shape,"(25488488,)","(2607671,)"
Count,50 Tasks,10 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,20.86 MB
Shape,"(25488488,)","(2607671,)"
Count,50 Tasks,10 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 203.91 MB 20.86 MB Shape (25488488,) (2607671,) Count 50 Tasks 10 Chunks Type int64 numpy.ndarray",25488488  1,

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,20.86 MB
Shape,"(25488488,)","(2607671,)"
Count,50 Tasks,10 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,20.86 MB
Shape,"(25488488,)","(2607671,)"
Count,50 Tasks,10 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 203.91 MB 20.86 MB Shape (25488488,) (2607671,) Count 50 Tasks 10 Chunks Type object numpy.ndarray",25488488  1,

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,20.86 MB
Shape,"(25488488,)","(2607671,)"
Count,50 Tasks,10 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,20.86 MB
Shape,"(25488488,)","(2607671,)"
Count,50 Tasks,10 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 203.91 MB 20.86 MB Shape (25488488,) (2607671,) Count 50 Tasks 10 Chunks Type object numpy.ndarray",25488488  1,

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,20.86 MB
Shape,"(25488488,)","(2607671,)"
Count,50 Tasks,10 Chunks
Type,object,numpy.ndarray


In [5]:
%%time
# TODO: How can we infer fixed dtypes for strings on load?
# TODO: Set consistent chunk sizes on read

# Match bim/fam arrays to chunks in `data`
chunks = dict(zip(['variant', 'sample'], ds.data.data.chunksize)) 
with warnings.catch_warnings():
    # Ignore:
    #    SerializationWarning: variable None has data in the form of a dask array with dtype=object, which means it is being loaded into memory to determine a data type that can be safely stored on disk. To avoid this, coerce this variable to a fixed-size dtype with astype() before saving it.
    warnings.filterwarnings(action='ignore', category=xr.conventions.SerializationWarning)
    # Rechunk so all da.Arrays have same chunk size to avoid:
    #    ValueError: Zarr requires uniform chunk sizes except for final chunk. Variable dask chunks ((2592741, 2590097, 2563466, 2607671, 2595721, 2594521, 2502277, 2513299, 2563743, 2364952),) are incompatible. Consider rechunking using `chunk()`.
    print('Rechunking to:', chunks)
    api.write_zarr(ds.chunk(chunks=chunks), f'/tmp/{path.name}.zarr', mode='w')

Rechunking to: {'variant': 213382, 'sample': 629}
CPU times: user 32.3 s, sys: 3.71 s, total: 36 s
Wall time: 1min 32s


Compare the resulting file sizes, noting that for 1KG the enitre zarr dataset is substantially smaller than the PLINK `.bed` file alone.  This is due to a custom bitpacking filter added by the API as well as default compression options in Zarr.

In [6]:
!du -sh {path}.bed

3.8G	/lab/data/gwas/tutorial/2_PS_GWAS/ALL.2of4intersection.20100804.genotypes.bed


In [7]:
!du -sh /tmp/{path.name}.zarr

2.4G	/tmp/ALL.2of4intersection.20100804.genotypes.zarr


### Dask + CuPy QC

In [3]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import dask.array as da
import cupy as cp
cluster = LocalCUDACluster()
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:42533  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 134.78 GB


In [4]:
ds = xr.open_zarr(f'/tmp/{path.name}.zarr')
ds

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,1.59 MB
Shape,"(25488488,)","(199129,)"
Count,129 Tasks,128 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 203.91 MB 1.59 MB Shape (25488488,) (199129,) Count 129 Tasks 128 Chunks Type object numpy.ndarray",25488488  1,

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,1.59 MB
Shape,"(25488488,)","(199129,)"
Count,129 Tasks,128 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,1.59 MB
Shape,"(25488488,)","(199129,)"
Count,129 Tasks,128 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 203.91 MB 1.59 MB Shape (25488488,) (199129,) Count 129 Tasks 128 Chunks Type object numpy.ndarray",25488488  1,

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,1.59 MB
Shape,"(25488488,)","(199129,)"
Count,129 Tasks,128 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,1.71 MB
Shape,"(25488488,)","(213382,)"
Count,121 Tasks,120 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 203.91 MB 1.71 MB Shape (25488488,) (213382,) Count 121 Tasks 120 Chunks Type int64 numpy.ndarray",25488488  1,

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,1.71 MB
Shape,"(25488488,)","(213382,)"
Count,121 Tasks,120 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,1.71 MB
Shape,"(25488488,)","(213382,)"
Count,121 Tasks,120 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 203.91 MB 1.71 MB Shape (25488488,) (213382,) Count 121 Tasks 120 Chunks Type int64 numpy.ndarray",25488488  1,

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,1.71 MB
Shape,"(25488488,)","(213382,)"
Count,121 Tasks,120 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,134.22 MB
Shape,"(25488488, 629)","(213382, 629)"
Count,121 Tasks,120 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 16.03 GB 134.22 MB Shape (25488488, 629) (213382, 629) Count 121 Tasks 120 Chunks Type int8 numpy.ndarray",629  25488488,

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,134.22 MB
Shape,"(25488488, 629)","(213382, 629)"
Count,121 Tasks,120 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 5.03 kB 5.03 kB Shape (629,) (629,) Count 2 Tasks 1 Chunks Type object numpy.ndarray",629  1,

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 5.03 kB 5.03 kB Shape (629,) (629,) Count 2 Tasks 1 Chunks Type int64 numpy.ndarray",629  1,

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,134.22 MB
Shape,"(25488488, 629)","(213382, 629)"
Count,121 Tasks,120 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 16.03 GB 134.22 MB Shape (25488488, 629) (213382, 629) Count 121 Tasks 120 Chunks Type bool numpy.ndarray",629  25488488,

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,134.22 MB
Shape,"(25488488, 629)","(213382, 629)"
Count,121 Tasks,120 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 5.03 kB 5.03 kB Shape (629,) (629,) Count 2 Tasks 1 Chunks Type int64 numpy.ndarray",629  1,

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 5.03 kB 5.03 kB Shape (629,) (629,) Count 2 Tasks 1 Chunks Type int64 numpy.ndarray",629  1,

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 5.03 kB 5.03 kB Shape (629,) (629,) Count 2 Tasks 1 Chunks Type int64 numpy.ndarray",629  1,

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,1.71 MB
Shape,"(25488488,)","(213382,)"
Count,121 Tasks,120 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 203.91 MB 1.71 MB Shape (25488488,) (213382,) Count 121 Tasks 120 Chunks Type int64 numpy.ndarray",25488488  1,

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,1.71 MB
Shape,"(25488488,)","(213382,)"
Count,121 Tasks,120 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 5.03 kB 5.03 kB Shape (629,) (629,) Count 2 Tasks 1 Chunks Type object numpy.ndarray",629  1,

Unnamed: 0,Array,Chunk
Bytes,5.03 kB,5.03 kB
Shape,"(629,)","(629,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,1.59 MB
Shape,"(25488488,)","(199129,)"
Count,129 Tasks,128 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 203.91 MB 1.59 MB Shape (25488488,) (199129,) Count 129 Tasks 128 Chunks Type object numpy.ndarray",25488488  1,

Unnamed: 0,Array,Chunk
Bytes,203.91 MB,1.59 MB
Shape,"(25488488,)","(199129,)"
Count,129 Tasks,128 Chunks
Type,object,numpy.ndarray


In [5]:
arr = ds.is_masked.data
arr

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,134.22 MB
Shape,"(25488488, 629)","(213382, 629)"
Count,121 Tasks,120 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 16.03 GB 134.22 MB Shape (25488488, 629) (213382, 629) Count 121 Tasks 120 Chunks Type bool numpy.ndarray",629  25488488,

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,134.22 MB
Shape,"(25488488, 629)","(213382, 629)"
Count,121 Tasks,120 Chunks
Type,bool,numpy.ndarray


In [6]:
arrc = arr.rechunk(block_size_limit='2 Gib')
arrc

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,2.15 GB
Shape,"(25488488, 629)","(3414112, 629)"
Count,129 Tasks,8 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 16.03 GB 2.15 GB Shape (25488488, 629) (3414112, 629) Count 129 Tasks 8 Chunks Type bool numpy.ndarray",629  25488488,

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,2.15 GB
Shape,"(25488488, 629)","(3414112, 629)"
Count,129 Tasks,8 Chunks
Type,bool,numpy.ndarray


In [7]:
da.to_zarr(arrc, '/tmp/arr.zarr', overwrite=True)



In [8]:
arrf = da.from_zarr('/tmp/arr.zarr')
arrf

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,2.15 GB
Shape,"(25488488, 629)","(3414112, 629)"
Count,9 Tasks,8 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 16.03 GB 2.15 GB Shape (25488488, 629) (3414112, 629) Count 9 Tasks 8 Chunks Type bool numpy.ndarray",629  25488488,

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,2.15 GB
Shape,"(25488488, 629)","(3414112, 629)"
Count,9 Tasks,8 Chunks
Type,bool,numpy.ndarray


In [9]:
def call_rate(x, axis):
    return 1 - x.mean(axis=axis)

def filter_by_variant_call_rate(x, threshold):
    return x[call_rate(x, axis=1) >= threshold, :]

def filter_by_sample_call_rate(x, threshold):
    return x[:, call_rate(x, axis=0) >= threshold]

def qc(x):
    x = filter_by_variant_call_rate(x, .8)
    x = filter_by_sample_call_rate(x, .8)
    x = filter_by_variant_call_rate(x, .98)
    x = filter_by_sample_call_rate(x, .98)
    x.compute_chunk_sizes()
    return x

In [10]:
# Expected 1KG shape: (8240745, 629)
# Expected HapMap shape: (1430443, 165)
%time qc(arrf)

CPU times: user 3.78 s, sys: 686 ms, total: 4.46 s
Wall time: 1min 46s


Unnamed: 0,Array,Chunk
Bytes,5.18 GB,725.95 MB
Shape,"(8240745, 629)","(1154128, 629)"
Count,131 Tasks,8 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 5.18 GB 725.95 MB Shape (8240745, 629) (1154128, 629) Count 131 Tasks 8 Chunks Type bool numpy.ndarray",629  8240745,

Unnamed: 0,Array,Chunk
Bytes,5.18 GB,725.95 MB
Shape,"(8240745, 629)","(1154128, 629)"
Count,131 Tasks,8 Chunks
Type,bool,numpy.ndarray
