In [1]:
import os
import numpy as np
import xarray as xr
import pandas as pd
from dask.array.core import normalize_chunks
from pathlib import Path
from sgkit_bgen import read_bgen
from bokeh.io import output_notebook, export_png
from dask.diagnostics import ResourceProfiler, ProgressBar, Profiler, visualize
%load_ext autoreload
%autoreload 2
output_notebook()

In [2]:
import sys
# Put the relative `../scripts` directory first on the module search path so the
# import below can be resolved from there (presumably where
# convert_genetic_data.py lives — this depends on the notebook's working directory).
sys.path.insert(0,'../scripts')
from convert_genetic_data import BGENPaths, Contig, load_bgen, load_bgen_samples, load_bgen_probabilities, save_dataset

In [3]:
# Directory holding the chrXY BGEN file plus its variant (mfi) and sample files.
# Defaults to the original local location; set the UKB_BGEN_DIR environment
# variable to run the notebook against a copy stored elsewhere.
bgen_dir = os.environ.get('UKB_BGEN_DIR', '/home/eczech/data/rs-ukb-local/bgen')

# Bundle of the three input file locations handed to the loader later on.
paths = BGENPaths(
    bgen_path=os.path.join(bgen_dir, 'ukb_imp_chrXY_v3.bgen'),
    variants_path=os.path.join(bgen_dir, 'ukb_mfi_chrXY_v3.txt'),
    samples_path=os.path.join(bgen_dir, 'ukb59384_imp_chrXY_v3_s486331.sample'),
)

In [4]:
# Target size of a single chunk, in bytes.
n_bytes = 512 * 1024 ** 2  # 512MiB
#n_bytes = 256 * 1024 ** 2  # 256MiB

# Fix the chunk length along the variants axis, then give the samples axis
# whatever still fits in the byte budget at 4 bytes per stored value.
n_variants = 1024
n_samples = n_bytes // (4 * n_variants)
(n_variants, n_samples)

(1024, 65536)

In [8]:
# Ask dask which chunking it would pick for a 512MiB budget on a uint8 array of
# this shape, and show the first chunk length along each axis.
[
    axis_chunks[0]
    for axis_chunks in normalize_chunks(
        '512MiB', shape=(90_000_000, 486443, 2), dtype='uint8'
    )
]

[16000, 16384, 2]

In [10]:
# `paths` is the BGENPaths bundle built in the earlier paths cell. It used to be
# re-declared here verbatim; the duplicate is dropped so the input locations are
# defined in exactly one place.
contig = Contig(index=25, name='XY')

# Load the contig with the chunk lengths chosen in the chunk-size cell
# (variants, samples, trailing axis).
# NOTE(review): the dataset repr shows a trailing dimension of size 2 while the
# chunk spec asks for 3 — confirm this is what load_bgen expects.
ds = load_bgen(paths, contig, chunks=(n_variants, n_samples, 3))
ds

Unnamed: 0,Array,Chunk
Bytes,178.65 GB,800.00 MB
Shape,"(45906, 486443, 2)","(10000, 10000, 2)"
Count,491 Tasks,245 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 178.65 GB 800.00 MB Shape (45906, 486443, 2) (10000, 10000, 2) Count 491 Tasks 245 Chunks Type float32 numpy.ndarray",2  486443  45906,

Unnamed: 0,Array,Chunk
Bytes,178.65 GB,800.00 MB
Shape,"(45906, 486443, 2)","(10000, 10000, 2)"
Count,491 Tasks,245 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,22.33 GB,100.00 MB
Shape,"(45906, 486443)","(10000, 10000)"
Count,1226 Tasks,245 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 22.33 GB 100.00 MB Shape (45906, 486443) (10000, 10000) Count 1226 Tasks 245 Chunks Type bool numpy.ndarray",486443  45906,

Unnamed: 0,Array,Chunk
Bytes,22.33 GB,100.00 MB
Shape,"(45906, 486443)","(10000, 10000)"
Count,1226 Tasks,245 Chunks
Type,bool,numpy.ndarray


In [11]:
# Sanity check: materialize only the first three entries along each of the two
# leading dimensions of the lazily-loaded probabilities.
ds.call_genotype_probability[:3, :3].compute()

In [7]:
# Same small corner as above, taken from the accompanying mask variable.
ds.call_genotype_probability_mask[:3, :3].compute()

### Test Compute

In [7]:
%%time
with ProgressBar(), ResourceProfiler() as prof:
    m = ds['call_dosage'][:2500,:].mean(dim='samples').compute(scheduler='processes')

[########################################] | 100% Completed |  2min 18.6s
CPU times: user 43.3 s, sys: 1.11 s, total: 44.4 s
Wall time: 2min 19s


In [9]:
# Summary statistics of the per-row mean dosages computed in the previous cell.
pd.Series(m).describe()

count    2500.000000
mean        0.115621
std         0.320614
min         0.000021
25%         0.000411
50%         0.001762
75%         0.011835
max         1.912868
dtype: float64

### Test Save Locally

In [6]:
# Small slice of the dataset along `variants` (up to 2500) used for all of the
# export experiments below.
dss = ds.sel(variants=slice(None, 2500))
dss

Unnamed: 0,Array,Chunk
Bytes,9.73 GB,536.87 MB
Shape,"(2500, 486443, 2)","(1024, 65536, 2)"
Count,745 Tasks,24 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 9.73 GB 536.87 MB Shape (2500, 486443, 2) (1024, 65536, 2) Count 745 Tasks 24 Chunks Type float32 numpy.ndarray",2  486443  2500,

Unnamed: 0,Array,Chunk
Bytes,9.73 GB,536.87 MB
Shape,"(2500, 486443, 2)","(1024, 65536, 2)"
Count,745 Tasks,24 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.22 GB,67.11 MB
Shape,"(2500, 486443)","(1024, 65536)"
Count,1825 Tasks,24 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 1.22 GB 67.11 MB Shape (2500, 486443) (1024, 65536) Count 1825 Tasks 24 Chunks Type bool numpy.ndarray",486443  2500,

Unnamed: 0,Array,Chunk
Bytes,1.22 GB,67.11 MB
Shape,"(2500, 486443)","(1024, 65536)"
Count,1825 Tasks,24 Chunks
Type,bool,numpy.ndarray


In [8]:
%%time
# Export sample WITHOUT GP compression filter (peaking at about 8G of 14G used on 4 workers w/ 256 MiB chunks)
with ProgressBar(), ResourceProfiler() as prof:
    save_dataset('/tmp/test.without_filter.zarr', dss, contig, scheduler='processes', remote=False, rescale_gp=False)

2020-08-20 14:55:11,433 | convert_genetic_data | INFO | Dataset for contig Contig(name='XY', index=25):
<xarray.Dataset>
Dimensions:                         (genotypes: 2, samples: 486443, variants: 2500)
Dimensions without coordinates: genotypes, samples, variants
Data variables:
    variant_id                      (variants) object 'X:60014_T_C' ... 'X:29...
    variant_rsid                    (variants) object 'rs370048753' ... 'rs28...
    variant_position                (variants) int32 60014 60014 ... 293124
    variant_allele1_ref             (variants) object 'T' 'T' 'C' ... 'G' 'A'
    variant_allele2_alt             (variants) object 'C' 'G' 'T' ... 'A' 'G'
    variant_maf                     (variants) float64 0.0003958 ... 0.0008407
    variant_minor_allele            (variants) object 'C' 'G' 'T' ... 'A' 'G'
    variant_info                    (variants) float64 0.7276 0.6867 ... 0.8284
    sample_id1                      (samples) int32 4476413 3205773 ... 4315851
    sam

In [7]:
%%time
# Export sample with GP compression filter (clevel 5)
with ProgressBar(), ResourceProfiler() as rprof, Profiler() as prof:
    save_dataset('/tmp/test.with_filter_clevel5.zarr', dss, contig, scheduler='processes', remote=False, rescale_gp=True)

2020-08-20 21:15:10,190 | convert_genetic_data | INFO | Dataset for contig Contig(name='XY', index=25):
<xarray.Dataset>
Dimensions:                         (genotypes: 2, samples: 486443, variants: 2500)
Dimensions without coordinates: genotypes, samples, variants
Data variables:
    variant_id                      (variants) object 'X:60014_T_C' ... 'X:29...
    variant_rsid                    (variants) object 'rs370048753' ... 'rs28...
    variant_position                (variants) int32 60014 60014 ... 293124
    variant_allele1_ref             (variants) object 'T' 'T' 'C' ... 'G' 'A'
    variant_allele2_alt             (variants) object 'C' 'G' 'T' ... 'A' 'G'
    variant_maf                     (variants) float64 0.0003958 ... 0.0008407
    variant_minor_allele            (variants) object 'C' 'G' 'T' ... 'A' 'G'
    variant_info                    (variants) float64 0.7276 0.6867 ... 0.8284
    sample_id1                      (samples) int32 4476413 3205773 ... 4315851
    sam

In [8]:
# Plot the task profile (prof) and resource profile (rprof) captured by the
# export cell directly above.
visualize([prof, rprof])

In [7]:
%%time
# Export sample with GP compression filter and clevel 7
with ProgressBar(), ResourceProfiler() as rprof, Profiler() as prof:
    save_dataset('/tmp/test.with_filter_clevel7.zarr', dss, contig, scheduler='processes', remote=False, rescale_gp=True)

2020-08-20 22:34:31,946 | convert_genetic_data | INFO | Dataset for contig Contig(name='XY', index=25):
<xarray.Dataset>
Dimensions:                         (genotypes: 2, samples: 486443, variants: 2500)
Dimensions without coordinates: genotypes, samples, variants
Data variables:
    variant_id                      (variants) object 'X:60014_T_C' ... 'X:29...
    variant_rsid                    (variants) object 'rs370048753' ... 'rs28...
    variant_position                (variants) int32 60014 60014 ... 293124
    variant_allele1_ref             (variants) object 'T' 'T' 'C' ... 'G' 'A'
    variant_allele2_alt             (variants) object 'C' 'G' 'T' ... 'A' 'G'
    variant_maf                     (variants) float64 0.0003958 ... 0.0008407
    variant_minor_allele            (variants) object 'C' 'G' 'T' ... 'A' 'G'
    variant_info                    (variants) float64 0.7276 0.6867 ... 0.8284
    sample_id1                      (samples) int32 4476413 3205773 ... 4315851
    sam

In [8]:
# Plot the task profile (prof) and resource profile (rprof) captured by the
# export cell directly above.
visualize([prof, rprof])

In [7]:
%%time
# Export sample with GP compression filter and high clevel (9 instead of 5)
with ProgressBar(), ResourceProfiler() as rprof, Profiler() as prof:
    save_dataset('/tmp/test.with_filter_clevel9.zarr', dss, contig, scheduler='processes', remote=False, rescale_gp=True)

2020-08-20 20:40:54,634 | convert_genetic_data | INFO | Dataset for contig Contig(name='XY', index=25):
<xarray.Dataset>
Dimensions:                         (genotypes: 2, samples: 486443, variants: 2500)
Dimensions without coordinates: genotypes, samples, variants
Data variables:
    variant_id                      (variants) object 'X:60014_T_C' ... 'X:29...
    variant_rsid                    (variants) object 'rs370048753' ... 'rs28...
    variant_position                (variants) int32 60014 60014 ... 293124
    variant_allele1_ref             (variants) object 'T' 'T' 'C' ... 'G' 'A'
    variant_allele2_alt             (variants) object 'C' 'G' 'T' ... 'A' 'G'
    variant_maf                     (variants) float64 0.0003958 ... 0.0008407
    variant_minor_allele            (variants) object 'C' 'G' 'T' ... 'A' 'G'
    variant_info                    (variants) float64 0.7276 0.6867 ... 0.8284
    sample_id1                      (samples) int32 4476413 3205773 ... 4315851
    sam

In [11]:
# Plot the task profile (prof) and resource profile (rprof) captured by the
# export cell directly above.
visualize([prof, rprof])

In [10]:
# TODO: compare to test.zarr with dosage
# Report on-disk size, one directory level deep, for each local store. The
# {a,b,...} list relies on shell brace expansion, so this needs a shell that
# supports it (e.g. bash).
! du -ch /tmp/test{,.with_filter_clevel5,.with_filter_clevel7,.with_filter_clevel9,.without_filter}.zarr --max-depth=1

20K	/tmp/test.zarr/variant_position
32K	/tmp/test.zarr/variant_info
16K	/tmp/test.zarr/variant_allele1_ref
16K	/tmp/test.zarr/variant_contig_name
16K	/tmp/test.zarr/variant_minor_allele
24K	/tmp/test.zarr/variant_id
1.5M	/tmp/test.zarr/sample_id1
28K	/tmp/test.zarr/variant_rsid
204K	/tmp/test.zarr/sample_sex
349M	/tmp/test.zarr/call_dosage
44K	/tmp/test.zarr/sample_missing
32K	/tmp/test.zarr/variant_maf
1.5M	/tmp/test.zarr/sample_id2
16K	/tmp/test.zarr/variant_allele2_alt
396K	/tmp/test.zarr/call_dosage_mask
16K	/tmp/test.zarr/variant_contig
352M	/tmp/test.zarr
20K	/tmp/test.with_filter_clevel5.zarr/variant_position
32K	/tmp/test.with_filter_clevel5.zarr/variant_info
16K	/tmp/test.with_filter_clevel5.zarr/variant_allele1_ref
16K	/tmp/test.with_filter_clevel5.zarr/variant_contig_name
16K	/tmp/test.with_filter_clevel5.zarr/variant_minor_allele
24K	/tmp/test.with_filter_clevel5.zarr/variant_id
1.6M	/tmp/test.with_filter_clevel5.zarr/sample_id1
28K	/tmp/test.with_filter_clevel5.zarr/varian

In [26]:
# Re-open a filtered store written above with CF decoding and mask/scale
# handling switched off, so the values and dtypes are shown as stored on disk.
# The path used here before ('/tmp/test.with_filter2.zarr') is not written by any
# cell in this notebook, so a fresh Restart & Run All failed at this point.
# NOTE(review): clevel5 is assumed to be the intended store — switch to the
# clevel7/clevel9 path if a different export should be inspected.
dsr = xr.open_zarr('/tmp/test.with_filter_clevel5.zarr', mask_and_scale=False, decode_cf=False)
dsr

Unnamed: 0,Array,Chunk
Bytes,2.43 GB,134.22 MB
Shape,"(2500, 486443, 2)","(1024, 65536, 2)"
Count,25 Tasks,24 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 2.43 GB 134.22 MB Shape (2500, 486443, 2) (1024, 65536, 2) Count 25 Tasks 24 Chunks Type uint8 numpy.ndarray",2  486443  2500,

Unnamed: 0,Array,Chunk
Bytes,2.43 GB,134.22 MB
Shape,"(2500, 486443, 2)","(1024, 65536, 2)"
Count,25 Tasks,24 Chunks
Type,uint8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.22 GB,67.11 MB
Shape,"(2500, 486443)","(1024, 65536)"
Count,25 Tasks,24 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 1.22 GB 67.11 MB Shape (2500, 486443) (1024, 65536) Count 25 Tasks 24 Chunks Type bool numpy.ndarray",486443  2500,

Unnamed: 0,Array,Chunk
Bytes,1.22 GB,67.11 MB
Shape,"(2500, 486443)","(1024, 65536)"
Count,25 Tasks,24 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.95 MB,243.22 kB
Shape,"(486443,)","(60806,)"
Count,9 Tasks,8 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 1.95 MB 243.22 kB Shape (486443,) (60806,) Count 9 Tasks 8 Chunks Type int32 numpy.ndarray",486443  1,

Unnamed: 0,Array,Chunk
Bytes,1.95 MB,243.22 kB
Shape,"(486443,)","(60806,)"
Count,9 Tasks,8 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.95 MB,243.22 kB
Shape,"(486443,)","(60806,)"
Count,9 Tasks,8 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 1.95 MB 243.22 kB Shape (486443,) (60806,) Count 9 Tasks 8 Chunks Type int32 numpy.ndarray",486443  1,

Unnamed: 0,Array,Chunk
Bytes,1.95 MB,243.22 kB
Shape,"(486443,)","(60806,)"
Count,9 Tasks,8 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.89 MB,486.45 kB
Shape,"(486443,)","(60806,)"
Count,9 Tasks,8 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.89 MB 486.45 kB Shape (486443,) (60806,) Count 9 Tasks 8 Chunks Type object numpy.ndarray",486443  1,

Unnamed: 0,Array,Chunk
Bytes,3.89 MB,486.45 kB
Shape,"(486443,)","(60806,)"
Count,9 Tasks,8 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.89 MB,486.45 kB
Shape,"(486443,)","(60806,)"
Count,9 Tasks,8 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.89 MB 486.45 kB Shape (486443,) (60806,) Count 9 Tasks 8 Chunks Type object numpy.ndarray",486443  1,

Unnamed: 0,Array,Chunk
Bytes,3.89 MB,486.45 kB
Shape,"(486443,)","(60806,)"
Count,9 Tasks,8 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 20.00 kB 20.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type object numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 20.00 kB 20.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type object numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.00 kB,5.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 5.00 kB 5.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type int16 numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,5.00 kB,5.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,int16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40.00 kB,40.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 40.00 kB 40.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,40.00 kB,40.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 20.00 kB 20.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type object numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 20.00 kB 20.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 20.00 kB 20.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 20.00 kB 20.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type object numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.00 kB,10.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 10.00 kB 10.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type int32 numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,10.00 kB,10.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 20.00 kB 20.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type object numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray


In [27]:
# Sanity check on the re-opened store: materialize the first three entries along
# each of the two leading dimensions of the probability array.
dsr.call_genotype_probability[:3, :3].compute()

In [15]:
# Mean over the `samples` dimension for rows 1 and 2 of the re-opened
# probability array.
dsr.call_genotype_probability[1:3, :].mean(dim='samples').compute()

In [18]:
%%time
with ProgressBar(), ResourceProfiler() as prof:
    m = dsr['call_dosage'][:2500,:].mean(dim='samples').compute(scheduler='processes')

[########################################] | 100% Completed |  9.0s
CPU times: user 232 ms, sys: 146 ms, total: 377 ms
Wall time: 10.1 s


In [20]:
# Summary statistics of the per-row mean dosages from the re-opened local store;
# these should match the figures obtained from the source dataset.
pd.Series(m).describe()

count    2500.000000
mean        0.115621
std         0.320614
min         0.000021
25%         0.000411
50%         0.001762
75%         0.011835
max         1.912868
dtype: float64

### Test Save Remotely

In [25]:
%%time
with ProgressBar(), ResourceProfiler() as prof:
    save_dataset('gs://rs-ukb/prep-data/test.zarr', dss, contig, scheduler='processes', remote=True)

2020-08-19 00:02:10,341 | convert | INFO | Dataset for contig Contig(name='XY', index=25):
<xarray.Dataset>
Dimensions:               (samples: 486443, variants: 2500)
Dimensions without coordinates: samples, variants
Data variables:
    variant_id            (variants) object 'X:60014_T_C' ... 'X:293124_A_G'
    variant_rsid          (variants) object 'rs370048753' ... 'rs28621836'
    variant_position      (variants) int32 60014 60014 60017 ... 293106 293124
    variant_allele1_ref   (variants) object 'T' 'T' 'C' 'G' ... 'G' 'C' 'G' 'A'
    variant_allele2_alt   (variants) object 'C' 'G' 'T' 'C' ... 'C' 'T' 'A' 'G'
    variant_maf           (variants) float64 0.0003958 0.0005032 ... 0.0008407
    variant_minor_allele  (variants) object 'C' 'G' 'T' 'C' ... 'C' 'T' 'A' 'G'
    variant_info          (variants) float64 0.7276 0.6867 ... 0.3437 0.8284
    sample_id1            (samples) int32 4476413 3205773 ... 2850971 4315851
    sample_id2            (samples) int32 4476413 3205773 ...

In [26]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so dependencies are visible in one place.
import gcsfs
# Uses whatever default credentials gcsfs discovers in the environment.
gcs = gcsfs.GCSFileSystem()
# Key/value mapping over the remote Zarr directory; check=False / create=False
# are passed so the mapping is built without checking for or creating the target.
store = gcsfs.GCSMap('gs://rs-ukb/prep-data/test.zarr', gcs=gcs, check=False, create=False)
# Rebinds `dsr` (previously the local store) to the remotely stored dataset.
dsr = xr.open_zarr(store)
dsr

Unnamed: 0,Array,Chunk
Bytes,4.86 GB,536.87 MB
Shape,"(2500, 486443)","(1024, 131072)"
Count,13 Tasks,12 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 4.86 GB 536.87 MB Shape (2500, 486443) (1024, 131072) Count 13 Tasks 12 Chunks Type float32 numpy.ndarray",486443  2500,

Unnamed: 0,Array,Chunk
Bytes,4.86 GB,536.87 MB
Shape,"(2500, 486443)","(1024, 131072)"
Count,13 Tasks,12 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.22 GB,134.22 MB
Shape,"(2500, 486443)","(1024, 131072)"
Count,13 Tasks,12 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 1.22 GB 134.22 MB Shape (2500, 486443) (1024, 131072) Count 13 Tasks 12 Chunks Type bool numpy.ndarray",486443  2500,

Unnamed: 0,Array,Chunk
Bytes,1.22 GB,134.22 MB
Shape,"(2500, 486443)","(1024, 131072)"
Count,13 Tasks,12 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.95 MB,243.22 kB
Shape,"(486443,)","(60806,)"
Count,9 Tasks,8 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 1.95 MB 243.22 kB Shape (486443,) (60806,) Count 9 Tasks 8 Chunks Type int32 numpy.ndarray",486443  1,

Unnamed: 0,Array,Chunk
Bytes,1.95 MB,243.22 kB
Shape,"(486443,)","(60806,)"
Count,9 Tasks,8 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.95 MB,243.22 kB
Shape,"(486443,)","(60806,)"
Count,9 Tasks,8 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 1.95 MB 243.22 kB Shape (486443,) (60806,) Count 9 Tasks 8 Chunks Type int32 numpy.ndarray",486443  1,

Unnamed: 0,Array,Chunk
Bytes,1.95 MB,243.22 kB
Shape,"(486443,)","(60806,)"
Count,9 Tasks,8 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.89 MB,486.45 kB
Shape,"(486443,)","(60806,)"
Count,9 Tasks,8 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.89 MB 486.45 kB Shape (486443,) (60806,) Count 9 Tasks 8 Chunks Type object numpy.ndarray",486443  1,

Unnamed: 0,Array,Chunk
Bytes,3.89 MB,486.45 kB
Shape,"(486443,)","(60806,)"
Count,9 Tasks,8 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.89 MB,486.45 kB
Shape,"(486443,)","(60806,)"
Count,9 Tasks,8 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 3.89 MB 486.45 kB Shape (486443,) (60806,) Count 9 Tasks 8 Chunks Type object numpy.ndarray",486443  1,

Unnamed: 0,Array,Chunk
Bytes,3.89 MB,486.45 kB
Shape,"(486443,)","(60806,)"
Count,9 Tasks,8 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 20.00 kB 20.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type object numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 20.00 kB 20.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type object numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.00 kB,5.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 5.00 kB 5.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type int16 numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,5.00 kB,5.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,int16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40.00 kB,40.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 40.00 kB 40.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,40.00 kB,40.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 20.00 kB 20.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type object numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 20.00 kB 20.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 20.00 kB 20.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 20.00 kB 20.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type object numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,10.00 kB,10.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 10.00 kB 10.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type int32 numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,10.00 kB,10.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 20.00 kB 20.00 kB Shape (2500,) (2500,) Count 2 Tasks 1 Chunks Type object numpy.ndarray",2500  1,

Unnamed: 0,Array,Chunk
Bytes,20.00 kB,20.00 kB
Shape,"(2500,)","(2500,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray


In [27]:
%%time
with ProgressBar(), ResourceProfiler() as prof:
    m = dsr['call_dosage'][:2500,:].mean(dim='samples').compute(scheduler='processes')

[########################################] | 100% Completed | 10.1s
CPU times: user 220 ms, sys: 142 ms, total: 362 ms
Wall time: 11.1 s


In [28]:
# Summary statistics of the per-row mean dosages from the remote store; these
# should match the figures obtained from the source dataset and the local store.
pd.Series(m).describe()

count    2500.000000
mean        0.115621
std         0.320614
min         0.000021
25%         0.000411
50%         0.001762
75%         0.011835
max         1.912868
dtype: float64