## Dask GWAS 1KG QC

Like the comparable Glow tutorial, this analysis runs QC on 1000 Genomes (1KG) data, here using Dask, so that the efficiency of the two approaches can be compared.

In [1]:
from pysnptools.snpreader import Bed
from dask.distributed import Client, progress
import dask.array as da
import os.path as osp
import matplotlib.pyplot as plt
import numpy as np
# Execute the shared codec script in this kernel's namespace; presumably this is
# where CodecPlugin and PackGeneticBits (used below) are defined -- confirm in codecs.py.
%run ../init/codecs.py
# Location of the PLINK dataset that is handed to the Bed reader below.
plink_file = osp.expanduser('~/data/gwas/tutorial/2_PS_GWAS/ALL.2of4intersection.20100804.genotypes')
# Shape the QC'd matrix is asserted to have in the timing cells below.
expected_shape = (8240745, 629)

In [2]:
class BedArray:
    """Minimal array-like view over a BED reader, oriented as (sid, iid).

    Provides the ``shape``/``dtype``/``ndim`` attributes and tuple indexing
    that are needed when the object is handed to ``da.from_array``. The
    reader's own axis order is the reverse of this one, so index tuples are
    flipped on the way in and the data is transposed on the way out.
    """

    def __init__(self, bed):
        """Keep a reference to ``bed`` and publish the array metadata.

        ``bed`` must expose ``sid_count``, ``iid_count`` and indexing that
        yields an object with ``read(dtype=...)`` (e.g. a pysnptools ``Bed``).
        """
        self.bed = bed
        n_sid, n_iid = bed.sid_count, bed.iid_count
        self.shape = (n_sid, n_iid)
        self.dtype = np.uint8
        self.ndim = 2

    def __getitem__(self, idx):
        """Read the block selected by the index tuple ``idx`` as ``uint8``.

        Every value is shifted up by one: NaN (missing) becomes 0 and a value
        ``v`` returned by the reader becomes ``v + 1``.
        """
        assert isinstance(idx, tuple)
        # The reader is indexed in the opposite axis order, hence the reversal.
        snp_data = self.bed[idx[::-1]].read(dtype=np.float32)
        # NaN -> -1 first, so that the +1 shift lands missing values on 0.
        shifted = np.nan_to_num(snp_data.val.T, nan=-1) + 1
        return shifted.astype(np.uint8)

In [3]:
def get_client(n_workers, n_threads=1, total_memory_gb=128):
    """Start a local, process-based Dask cluster and return a client for it.

    The memory budget is divided evenly between the workers using integer
    division, so each worker is limited to ``total_memory_gb // n_workers`` GB.

    Parameters
    ----------
    n_workers : int
        Number of worker processes; must be at least 1.
    n_threads : int, default 1
        Threads per worker.
    total_memory_gb : int, default 128
        Total memory budget, in GB, shared across all workers.

    Returns
    -------
    dask.distributed.Client
        Client for the new cluster, with a ``CodecPlugin`` registered as a
        worker plugin.

    Raises
    ------
    ValueError
        If ``n_workers`` is less than 1.
    """
    if n_workers < 1:
        raise ValueError(f'n_workers must be at least 1, got {n_workers!r}')
    # NOTE(review): when n_workers exceeds total_memory_gb the share rounds down
    # to 0 and '0GB' is passed through unchanged -- confirm how Dask treats that.
    per_worker_gb = total_memory_gb // n_workers
    client = Client(
        processes=True,
        threads_per_worker=n_threads,
        n_workers=n_workers,
        memory_limit=f'{per_worker_gb}GB',
    )
    client.register_worker_plugin(CodecPlugin())
    return client

In [4]:
def filter_by_variant_call_rate(m, threshold):
    """Keep the rows of ``m`` whose mean along axis 1 is at least ``threshold``."""
    row_rate = m.mean(axis=1)
    keep_rows = row_rate >= threshold
    return m[keep_rows, :]

def filter_by_sample_call_rate(m, threshold):
    """Keep the columns of ``m`` whose mean along axis 0 is at least ``threshold``."""
    column_rate = m.mean(axis=0)
    keep_columns = column_rate >= threshold
    return m[:, keep_columns]

def qc(m, thresholds=(.8, .98)):
    """Apply successive call-rate filters to ``m`` and return the result.

    For each threshold, in order, rows are filtered with
    ``filter_by_variant_call_rate`` and then columns with
    ``filter_by_sample_call_rate``.

    Parameters
    ----------
    m : dask array
        2-D array to filter; must provide ``compute_chunk_sizes()``.
    thresholds : iterable of float, default (.8, .98)
        Minimum call rates, applied one pass at a time. The default
        reproduces the original two-pass (0.8, then 0.98) schedule.

    Returns
    -------
    dask array
        The filtered array with its chunk sizes computed.
    """
    for threshold in thresholds:
        m = filter_by_variant_call_rate(m, threshold)
        m = filter_by_sample_call_rate(m, threshold)
    # Indexing with a lazy boolean mask leaves the chunk sizes unknown;
    # computing them here gives callers a concrete ``.shape``.
    m.compute_chunk_sizes()
    return m

### QC Over PLINK

In [5]:
# Wrap the PLINK reader as a lazy Dask array, chunked into blocks of up to
# 50000 rows by all 629 columns. lock=False: reads are not guarded by a lock.
gt = da.from_array(BedArray(Bed(plink_file, count_A1=True)), chunks=(50000, 629), lock=False)
gt

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,31.45 MB
Shape,"(25488488, 629)","(50000, 629)"
Count,511 Tasks,510 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 16.03 GB 31.45 MB Shape (25488488, 629) (50000, 629) Count 511 Tasks 510 Chunks Type uint8 numpy.ndarray",629  25488488,

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,31.45 MB
Shape,"(25488488, 629)","(50000, 629)"
Count,511 Tasks,510 Chunks
Type,uint8,numpy.ndarray


In [6]:
# Four worker processes; get_client divides 128 GB between them (32 GB each).
client = get_client(n_workers=4)
client

0,1
Client  Scheduler: tcp://127.0.0.1:40369  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 128.00 GB


In [7]:
%%time
# Time the whole QC pipeline on the boolean mask gt > 0 (BedArray encodes NaN
# as 0) and check that the filtered shape matches the expected one.
assert qc(gt > 0).shape == expected_shape

CPU times: user 20.6 s, sys: 7.21 s, total: 27.8 s
Wall time: 1min 24s


(8240745, 629)

### QC over Bitpacked Zarr

In [9]:
# Put the Zarr copy under /tmp, reusing the base name of the PLINK path.
zarr_path = osp.join('/tmp', osp.basename(plink_file))
zarr_path

'/tmp/ALL.2of4intersection.20100804.genotypes'

In [10]:
%%time
# Write the array to a Zarr store at zarr_path, replacing any existing one.
# PackGeneticBits is supplied as a filter; it is not defined in this notebook
# (presumably it comes from codecs.py -- confirm).
da.to_zarr(gt, url=zarr_path, overwrite=True, filters=[PackGeneticBits()])

CPU times: user 15.2 s, sys: 5.45 s, total: 20.6 s
Wall time: 1min 48s


*Reset Client*: Use the maximum number of workers (each with less memory) now that the far more memory-intensive PLINK read is done.

In [11]:
# Close the previous client, if one exists in this kernel, before starting anew.
if 'client' in globals():
    client.close()
# Sixteen workers; get_client divides 128 GB between them (8 GB each).
client = get_client(n_workers=16)
client

0,1
Client  Scheduler: tcp://127.0.0.1:38213  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 16  Cores: 16  Memory: 128.00 GB


In [12]:
# Rebind gt to the array stored at zarr_path; from here on gt no longer
# refers to the PLINK-backed array created earlier.
gt = da.from_zarr(url=zarr_path)
gt

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,31.45 MB
Shape,"(25488488, 629)","(50000, 629)"
Count,511 Tasks,510 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 16.03 GB 31.45 MB Shape (25488488, 629) (50000, 629) Count 511 Tasks 510 Chunks Type uint8 numpy.ndarray",629  25488488,

Unnamed: 0,Array,Chunk
Bytes,16.03 GB,31.45 MB
Shape,"(25488488, 629)","(50000, 629)"
Count,511 Tasks,510 Chunks
Type,uint8,numpy.ndarray


In [14]:
%%time
# Same QC run and shape check as before, now timed on the Zarr-backed gt.
assert qc(gt > 0).shape == expected_shape

CPU times: user 9.79 s, sys: 851 ms, total: 10.6 s
Wall time: 35.3 s
