### Dask Simulations

This notebook generates simulated genetic datasets on which to test Dask operations.

In [1]:
from dask.distributed import Client, progress
import dask.array as da
import matplotlib.pyplot as plt
import numpy as np
# Start a local cluster of single-threaded worker processes; memory_limit applies per worker.
# Alternative layout tried previously: n_workers=16, memory_limit='8GB'.
client = Client(
    n_workers=8,
    threads_per_worker=1,
    processes=True,
    memory_limit='16GB',
)
client

0,1
Client  Scheduler: tcp://127.0.0.1:37659  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 8  Memory: 128.00 GB


In [2]:
%run ../../init/codecs.py
client.register_worker_plugin(CodecPlugin())
None

In [3]:
# Reference dataset dimensions (variants x samples):
#   1KG : 25488488 x 629   (796,515.25 variants per partition on 32 partitions)
#   3KRG: 29635224 x 3024
# Exactly one (data_shape, chunk_shape) preset below should be active.

# -- Small simulations --
# data_shape, chunk_shape = (100000, 5000), (10000, 5000)

# -- 1KG simulations (active) --
data_shape, chunk_shape = (25488488, 629), (796516, 629)
# Alternative chunking for 1KG: chunk_shape = (500000, 629)

# -- 3KRG simulations --
# data_shape, chunk_shape = (29635224, 3024), (100000, 3024)

# Seed dask's random state so the simulated matrix is reproducible.
da.random.seed(1)

In [4]:
# Simulate a 0/1 matrix: an entry is 1 where a uniform random draw exceeds 0.2.
x = (da.random.random(data_shape, chunks=chunk_shape) > 0.2).astype(np.uint8)
x

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,501.01 MB
Shape,"(25488488, 629)","(796516, 629)"
Count,96 Tasks,32 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 16.03 GB 501.01 MB Shape (25488488, 629) (796516, 629) Count 96 Tasks 32 Chunks Type uint8 numpy.ndarray",629  25488488,

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,501.01 MB
Shape,"(25488488, 629)","(796516, 629)"
Count,96 Tasks,32 Chunks
Type,uint8,numpy.ndarray


In [5]:
# Baseline store: write the simulated matrix as plain uint8 with no custom filters.
x.to_zarr(url='/tmp/snp_zarr_uint8', overwrite=True)

In [6]:
# Location of the zarr store written with the PackGeneticBits filter and read back for QC.
path = '/tmp/snp_zarr_pgb'

In [7]:
%%time
# from numcodecs import Blosc
# Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE)
# from zarr.codecs import PackBits
# from numcodecs import PackBits
da.to_zarr(x, url=path, overwrite=True, filters=[PackGeneticBits()])

CPU times: user 3.47 s, sys: 698 ms, total: 4.17 s
Wall time: 43.9 s


In [5]:
# Round-trip check: a 100 x 100 corner read back from the packed store must equal
# the same corner of the simulated array `x`.
# Reads from `path` rather than repeating the literal, so the check always targets
# the store that was actually written. np.testing reports which elements differ
# and, unlike a bare `assert`, is not stripped when Python runs with -O.
np.testing.assert_array_equal(
    da.from_zarr(url=path)[:100, :100].compute(),
    x[:100, :100].compute(),
)

In [5]:
%%time
# Simulated QC pipeline that mirrors the filtering sequence PLINK reported for 1kg.
# Flow of 1kg QC from PLINK:
# 16481066 variants removed due to missing genotype data (--geno).
# 0 people removed due to missing genotype data (--mind).
# 766677 variants removed due to missing genotype data (--geno).
# 0 people removed due to missing genotype data (--mind).
# 8240745 = target variant count, 629 = target sample count
#
# Every step below only builds a lazy dask graph. Because rows/columns are selected
# with lazy boolean masks, the resulting shapes and chunk sizes are unknown (nan)
# until something is computed.
a1 = da.from_zarr(url=path)

# mv holds the row (variant) positions of a1 in a single chunk; it is filtered with
# the same row masks as the data so the surviving original row indices are tracked.
mv = da.arange(0, a1.shape[0], 1, chunks=-1)
r = a1.mean(axis=1) # per-variant call rates
# Threshold was precomputed so that this step drops the first --geno fraction of variants.
m1 = r > 0.8060413354531002 # np.quantile(r, 16481066.0/25488488)

# Stage 1: apply the variant mask to both the index vector and the matrix rows.
mv1 = mv[m1]
a2 = a1[m1,:]
r = a2.mean(axis=0) # per-sample call rates
# Stand-in for --mind; keeps every sample whose column mean is nonzero.
m2 = r > 0 # Create all true mask (eliminate no samples)

# Stage 2: apply the sample mask to columns, then recompute per-variant rates.
a3 = a2[:,m2]
r = a3.mean(axis=1)
# Threshold was precomputed for the second --geno pass.
m3 = r > 0.8076311605723371 # np.quantile(r, 766677/8591238)

# Stage 3: second variant filter; mv2 is the index vector reused by later cells.
mv2 = mv1[m3]
a4 = a3[m3,:]
r = a4.mean(axis=0) # per-sample call rates
m4 = r > 0 # Create all true mask (eliminate no samples)

# Stage 4: final sample filter; a5 is the fully filtered (still lazy) matrix.
a5 = a4[:,m4]
a5

CPU times: user 75.2 ms, sys: 11 ms, total: 86.2 ms
Wall time: 84 ms


Unnamed: 0,Array,Chunk
Bytes,unknown,unknown
Shape,"(nan, nan)","(nan, nan)"
Count,4068 Tasks,297 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes unknown unknown Shape (nan, nan) (nan, nan) Count 4068 Tasks 297 Chunks Type uint8 numpy.ndarray",,

Unnamed: 0,Array,Chunk
Bytes,unknown,unknown
Shape,"(nan, nan)","(nan, nan)"
Count,4068 Tasks,297 Chunks
Type,uint8,numpy.ndarray


In [5]:
%%time
# Materialize the surviving row indices (mv2.compute()) and use them to select rows
# of a1 by position; indexing with a concrete array gives a result with known shape.
a1[mv2.compute(),:]

CPU times: user 2.65 s, sys: 354 ms, total: 3 s
Wall time: 12.2 s


Unnamed: 0,Array,Chunk
Bytes,4.83 GB,151.44 MB
Shape,"(7675363, 629)","(240763, 629)"
Count,65 Tasks,32 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 4.83 GB 151.44 MB Shape (7675363, 629) (240763, 629) Count 65 Tasks 32 Chunks Type bool numpy.ndarray",629  7675363,

Unnamed: 0,Array,Chunk
Bytes,4.83 GB,151.44 MB
Shape,"(7675363, 629)","(240763, 629)"
Count,65 Tasks,32 Chunks
Type,bool,numpy.ndarray


In [11]:
%%time
mv2.compute_chunk_sizes()
mv2 = mv2.rechunk(chunks=-1)
mv2

CPU times: user 19.3 s, sys: 1.85 s, total: 21.1 s
Wall time: 2min 42s


Unnamed: 0,Array,Chunk
Bytes,34.28 MB,34.28 MB
Shape,"(4285342,)","(4285342,)"
Count,4858 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 34.28 MB 34.28 MB Shape (4285342,) (4285342,) Count 4858 Tasks 1 Chunks Type int64 numpy.ndarray",4285342  1,

Unnamed: 0,Array,Chunk
Bytes,34.28 MB,34.28 MB
Shape,"(4285342,)","(4285342,)"
Count,4858 Tasks,1 Chunks
Type,int64,numpy.ndarray


In [12]:
%%time
ar = a1[mv2.compute(),:]
ar = ar.rechunk(chunks=chunk_shape)
ar

CPU times: user 18.8 s, sys: 1.83 s, total: 20.7 s
Wall time: 2min 41s


Unnamed: 0,Array,Chunk
Bytes,12.96 GB,302.40 MB
Shape,"(4285342, 3024)","(100000, 3024)"
Count,722 Tasks,43 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 12.96 GB 302.40 MB Shape (4285342, 3024) (100000, 3024) Count 722 Tasks 43 Chunks Type uint8 numpy.ndarray",3024  4285342,

Unnamed: 0,Array,Chunk
Bytes,12.96 GB,302.40 MB
Shape,"(4285342, 3024)","(100000, 3024)"
Count,722 Tasks,43 Chunks
Type,uint8,numpy.ndarray


In [13]:
%%time
ar = a1[mv2.compute(),:]
ar = ar.rechunk(chunks=chunk_shape)
ar.to_zarr(url='/tmp/snp_zarr_res', overwrite=True, filters=[PackGeneticBits()])

CPU times: user 34.8 s, sys: 3.89 s, total: 38.7 s
Wall time: 5min 29s


In [15]:
# Show the final shape of the filtered array that was written out.
ar.shape

(4285342, 3024)

