# Understanding the operations and the sizes of arrays in PC-Relate

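Based on _pc_relate_dask_benchmark.ipynb_. Each step of PC-Relate is run in its own cell so that its wall time and the size of the array it produces can be inspected.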

In [1]:
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
#%pip install --upgrade dask distributed dask_ml

In [2]:
from pathlib import Path
import sys
# Put the parent directory on the import path (once) so `lib` can be imported.
module_path = Path("..").resolve().as_posix()
if sys.path.count(module_path) == 0:
    print(f"Adding {module_path} to python path")
    sys.path.append(module_path)

Adding /home/tom/repos/gwas-analysis/notebooks/platform/xarray to python path


In [3]:
import lib.io.pysnptools_backend
from lib.method.pc_relate.pc_relate import pc_relate, impute_with_variant_mean, gramian

In [4]:
from lib import api
import pandas as pd
import numpy as np
from dask_ml.decomposition import PCA
import dask
import dask.array as da
import seaborn as sns
import time
from distributed import Client
import timeit
from dataclasses import dataclass
import gc

## Dask client

In [7]:
# Start a local Dask cluster with 2 worker processes x 4 threads each;
# the trailing `c` renders the client summary.
c = Client(processes=True, n_workers=2, threads_per_worker=4)
c

Perhaps you already have a cluster running?
Hosting the HTTP server on port 44871 instead
  http_address["port"], self.http_server.port


0,1
Client  Scheduler: tcp://127.0.0.1:43417  Dashboard: http://127.0.0.1:44871/status,Cluster  Workers: 2  Cores: 8  Memory: 12.83 GB


## Util functions

In [8]:
def compute_pca(
    g,
    display_pc_12: bool = True,
    n_components: int = 8,
    n_pcs: int = 2,
    random_state: int = 42,
):
    """Fit a PCA on ``g`` and return its leading principal components.

    Parameters
    ----------
    g
        2-D array passed unchanged to ``PCA.fit``.
    display_pc_12
        When true, show a scatter plot of the first component against the
        second one.
    n_components
        Number of components the PCA is fitted with (default 8).
    n_pcs
        Number of leading components to return (default 2); must satisfy
        ``0 < n_pcs <= n_components``.
    random_state
        Seed forwarded to ``PCA`` (default 42).

    Returns
    -------
    The first ``n_pcs`` rows of ``pca.components_`` wrapped by
    ``da.from_array``.

    Raises
    ------
    ValueError
        If ``n_pcs`` is not in ``1..n_components``.
    """
    if not 0 < n_pcs <= n_components:
        raise ValueError(
            f"n_pcs must be between 1 and n_components={n_components}, got {n_pcs}"
        )
    pca = PCA(n_components=n_components, random_state=random_state)
    pca.fit(g)
    if display_pc_12:
        display(sns.scatterplot(x=pca.components_[0], y=pca.components_[1]))
    pcs = da.from_array(pca.components_)
    return pcs[:n_pcs, :]

In [5]:
def get_plink(samples: int, variants: int, population: int, data_dir=None) -> "da.Array":
    """Read a pre-generated PLINK dataset and return its underlying array.

    The dataset is looked up at
    ``<data_dir>/mt_{samples}_{variants}_{population}.mt/data``.

    Parameters
    ----------
    samples, variants, population
        Values embedded in the dataset's directory name.
    data_dir
        Directory holding the ``mt_*.mt`` folders. Defaults to
        ``~/projects/tmp``; an earlier revision of this notebook used
        ``~/data/tmp`` instead, which can now be passed explicitly.

    Returns
    -------
    ``ds.data.data`` of the dataset returned by ``api.read_plink``.
    """
    # This data was generated via hail using Balding-Nichols model
    if data_dir is None:
        base_dir = Path.home() / "projects" / "tmp"
    else:
        base_dir = Path(data_dir)
    path = (base_dir / f"mt_{samples}_{variants}_{population}.mt" / "data").as_posix()
    ds = api.read_plink(path, chunks='auto', fam_sep=' ', bim_sep='\t')
    return ds.data.data

## PC-Relate

In [10]:
# Problem size for this walkthrough; these values select which pre-generated
# PLINK dataset get_plink() reads.
samples = 10_000
variants = 2_000
population = 10
g = get_plink(samples=samples, variants=variants, population=population)
# Keep the loaded array in cluster memory so later cells reuse it.
g = g.persist()
g

Unnamed: 0,Array,Chunk
Bytes,20.00 MB,20.00 MB
Shape,"(2000, 10000)","(2000, 10000)"
Count,1 Tasks,1 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 20.00 MB 20.00 MB Shape (2000, 10000) (2000, 10000) Count 1 Tasks 1 Chunks Type int8 numpy.ndarray",10000  2000,

Unnamed: 0,Array,Chunk
Bytes,20.00 MB,20.00 MB
Shape,"(2000, 10000)","(2000, 10000)"
Count,1 Tasks,1 Chunks
Type,int8,numpy.ndarray


In [11]:
# Dump the array to disk as a stack of .npy files. The target directory is
# derived from the user's home directory and the sample count rather than a
# hard-coded absolute path, so the cell also works on other machines and
# for other values of `samples`.
da.to_npy_stack((Path.home() / "data" / f"pc_rel_{samples}").as_posix(), g)

In [61]:
%%time
# Fit the PCA without plotting; compute_pca returns the first two components,
# which are then kept in cluster memory.
pcs = compute_pca(g, False)
pcs = pcs.persist()
pcs

CPU times: user 401 ms, sys: 232 ms, total: 632 ms
Wall time: 6.04 s


Unnamed: 0,Array,Chunk
Bytes,160.00 kB,160.00 kB
Shape,"(2, 10000)","(2, 10000)"
Count,1 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 160.00 kB 160.00 kB Shape (2, 10000) (2, 10000) Count 1 Tasks 1 Chunks Type float64 numpy.ndarray",10000  2,

Unnamed: 0,Array,Chunk
Bytes,160.00 kB,160.00 kB
Shape,"(2, 10000)","(2, 10000)"
Count,1 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [57]:
# Threshold applied to `mu` further down: entries with mu <= maf or
# mu >= 1 - maf are masked out.
# (Name presumably stands for "minor allele frequency" -- confirm.)
maf = 0.01

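The cells below reproduce the code from `pc_relate.py` one operation at a time, so that each step can be timed and the size of its result inspected.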

In [62]:
%%time
# impute_with_variant_mean is a project helper; going by the names it returns
# a mask of missing genotypes and a mean-imputed copy of g -- TODO confirm.
missing_g_mask, imputed_g = impute_with_variant_mean(g)
# Prepend a row of ones to the PCs (2 x n -> 3 x n) and rechunk the result.
pcsi = da.concatenate(
        [da.from_array(np.ones((1, pcs.shape[1]))), pcs], axis=0
    ).rechunk()
# The computed value is discarded; the call only forces evaluation so that
# %%time measures real work. `pcsi` itself stays lazy.
pcsi.compute()
pcsi

CPU times: user 27.9 ms, sys: 11.9 ms, total: 39.8 ms
Wall time: 52.9 ms


Unnamed: 0,Array,Chunk
Bytes,240.00 kB,240.00 kB
Shape,"(3, 10000)","(3, 10000)"
Count,5 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 240.00 kB 240.00 kB Shape (3, 10000) (3, 10000) Count 5 Tasks 1 Chunks Type float64 numpy.ndarray",10000  3,

Unnamed: 0,Array,Chunk
Bytes,240.00 kB,240.00 kB
Shape,"(3, 10000)","(3, 10000)"
Count,5 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [79]:
%%time
# QR decomposition of the transposed `pcsi`.
q, r = da.linalg.qr(pcsi.T)
# Force evaluation of both factors for timing; the results are discarded.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt from an
# interrupted re-run, so no timing is recorded here.
da.compute(q, r)
q

KeyboardInterrupt: 

In [18]:
# Show the lazy R factor from the QR decomposition above.
r

Unnamed: 0,Array,Chunk
Bytes,72 B,72 B
Shape,"(3, 3)","(3, 3)"
Count,10 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 72 B 72 B Shape (3, 3) (3, 3) Count 10 Tasks 1 Chunks Type float64 numpy.ndarray",3  3,

Unnamed: 0,Array,Chunk
Bytes,72 B,72 B
Shape,"(3, 3)","(3, 3)"
Count,10 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [64]:
%%time
# Inverse of 2 * r (r is the small square factor from the QR step).
inv_2r = da.linalg.inv(2 * r)
# Evaluation is forced only for timing; the lazy array is what gets displayed.
inv_2r.compute()
inv_2r

CPU times: user 44.3 ms, sys: 20 ms, total: 64.3 ms
Wall time: 111 ms


Unnamed: 0,Array,Chunk
Bytes,72 B,72 B
Shape,"(3, 3)","(3, 3)"
Count,25 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 72 B 72 B Shape (3, 3) (3, 3) Count 25 Tasks 1 Chunks Type float64 numpy.ndarray",3  3,

Unnamed: 0,Array,Chunk
Bytes,72 B,72 B
Shape,"(3, 3)","(3, 3)"
Count,25 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [65]:
%%time
# (2R)^-1 Q^T: the operator that is applied to the imputed genotypes next.
inv_2r_dotqt = inv_2r.dot(q.T)
# Evaluation is forced only for timing; the result is discarded.
inv_2r_dotqt.compute()
inv_2r_dotqt

CPU times: user 54.5 ms, sys: 13.5 ms, total: 68 ms
Wall time: 137 ms


Unnamed: 0,Array,Chunk
Bytes,240.00 kB,240.00 kB
Shape,"(3, 10000)","(3, 10000)"
Count,29 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 240.00 kB 240.00 kB Shape (3, 10000) (3, 10000) Count 29 Tasks 1 Chunks Type float64 numpy.ndarray",10000  3,

Unnamed: 0,Array,Chunk
Bytes,240.00 kB,240.00 kB
Shape,"(3, 10000)","(3, 10000)"
Count,29 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [66]:
%%time
# (2R)^-1 Q^T applied to imputed_g.T. Since Q, R factor pcsi.T, this is half
# of the least-squares coefficients of imputed_g.T regressed on pcsi.T.
half_beta = inv_2r_dotqt.dot(imputed_g.T)
# Evaluation is forced only for timing; the result is discarded.
half_beta.compute()
half_beta

CPU times: user 104 ms, sys: 27.2 ms, total: 131 ms
Wall time: 460 ms


Unnamed: 0,Array,Chunk
Bytes,48.00 kB,48.00 kB
Shape,"(3, 2000)","(3, 2000)"
Count,42 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 48.00 kB 48.00 kB Shape (3, 2000) (3, 2000) Count 42 Tasks 1 Chunks Type float64 numpy.ndarray",2000  3,

Unnamed: 0,Array,Chunk
Bytes,48.00 kB,48.00 kB
Shape,"(3, 2000)","(3, 2000)"
Count,42 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [67]:
%%time
# Fitted values pcsi.T @ half_beta, transposed back to the layout of `g`
# (the output below shows the same shape as g, but as float64).
mu = pcsi.T.dot(half_beta).T
# Evaluation is forced only for timing; the result is discarded.
mu.compute()
mu

CPU times: user 209 ms, sys: 428 ms, total: 636 ms
Wall time: 1.15 s


Unnamed: 0,Array,Chunk
Bytes,160.00 MB,160.00 MB
Shape,"(2000, 10000)","(2000, 10000)"
Count,46 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 160.00 MB 160.00 MB Shape (2000, 10000) (2000, 10000) Count 46 Tasks 1 Chunks Type float64 numpy.ndarray",10000  2000,

Unnamed: 0,Array,Chunk
Bytes,160.00 MB,160.00 MB
Shape,"(2000, 10000)","(2000, 10000)"
Count,46 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [68]:
%%time
# Mask entries whose fitted value lies outside the open interval
# (maf, 1 - maf), plus every entry flagged in missing_g_mask.
mask = (mu <= maf) | (mu >= 1.0 - maf) | missing_g_mask
mu_mask = da.ma.masked_array(mu, mask=mask)
# Evaluation is forced only for timing; the result is discarded.
mu_mask.compute()
mu_mask

CPU times: user 414 ms, sys: 2.53 s, total: 2.95 s
Wall time: 8.12 s


Unnamed: 0,Array,Chunk
Bytes,160.00 MB,160.00 MB
Shape,"(2000, 10000)","(2000, 10000)"
Count,51 Tasks,1 Chunks
Type,float64,numpy.MaskedArray
"Array Chunk Bytes 160.00 MB 160.00 MB Shape (2000, 10000) (2000, 10000) Count 51 Tasks 1 Chunks Type float64 numpy.MaskedArray",10000  2000,

Unnamed: 0,Array,Chunk
Bytes,160.00 MB,160.00 MB
Shape,"(2000, 10000)","(2000, 10000)"
Count,51 Tasks,1 Chunks
Type,float64,numpy.MaskedArray


In [69]:
%%time
# Per-entry mu * (1 - mu), evaluated block by block on the masked array.
variance = mu_mask.map_blocks(lambda i: i * (1.0 - i))
# Replace masked entries with 0.0, turning the result back into a plain array.
variance = da.ma.filled(variance, fill_value=0.0)
# Evaluation is forced only for timing; the result is discarded.
variance.compute()
variance

CPU times: user 389 ms, sys: 523 ms, total: 912 ms
Wall time: 5.07 s


Unnamed: 0,Array,Chunk
Bytes,160.00 MB,160.00 MB
Shape,"(2000, 10000)","(2000, 10000)"
Count,53 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 160.00 MB 160.00 MB Shape (2000, 10000) (2000, 10000) Count 53 Tasks 1 Chunks Type float64 numpy.ndarray",10000  2000,

Unnamed: 0,Array,Chunk
Bytes,160.00 MB,160.00 MB
Shape,"(2000, 10000)","(2000, 10000)"
Count,53 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [70]:
%%time
# Element-wise square root of the variance (masked entries were set to 0).
stddev = da.sqrt(variance)
# Evaluation is forced only for timing; the result is discarded.
stddev.compute()
stddev

CPU times: user 240 ms, sys: 413 ms, total: 653 ms
Wall time: 1.73 s


Unnamed: 0,Array,Chunk
Bytes,160.00 MB,160.00 MB
Shape,"(2000, 10000)","(2000, 10000)"
Count,54 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 160.00 MB 160.00 MB Shape (2000, 10000) (2000, 10000) Count 54 Tasks 1 Chunks Type float64 numpy.ndarray",10000  2000,

Unnamed: 0,Array,Chunk
Bytes,160.00 MB,160.00 MB
Shape,"(2000, 10000)","(2000, 10000)"
Count,54 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [75]:
%%time
# Halved raw genotypes minus the masked fitted values. Note that this uses
# `g`, not `imputed_g`; entries flagged in missing_g_mask are part of the mask.
centered_af = g / 2 - mu_mask
# Masked entries contribute 0.0 to the products computed afterwards.
centered_af = da.ma.filled(centered_af, fill_value=0.0)
# Evaluation is forced only for timing; the result is discarded.
centered_af.compute()
centered_af

CPU times: user 297 ms, sys: 466 ms, total: 763 ms
Wall time: 2.7 s


Unnamed: 0,Array,Chunk
Bytes,160.00 MB,160.00 MB
Shape,"(2000, 10000)","(2000, 10000)"
Count,54 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 160.00 MB 160.00 MB Shape (2000, 10000) (2000, 10000) Count 54 Tasks 1 Chunks Type float64 numpy.ndarray",10000  2000,

Unnamed: 0,Array,Chunk
Bytes,160.00 MB,160.00 MB
Shape,"(2000, 10000)","(2000, 10000)"
Count,54 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [76]:
%%time
# gramian() is a project helper imported from pc_relate; judging by the
# square output shape below it presumably computes a.T @ a -- TODO confirm.
gram1 = gramian(centered_af)
# Evaluation is forced only for timing; the result is discarded.
gram1.compute()
gram1

CPU times: user 1.29 s, sys: 2.74 s, total: 4.03 s
Wall time: 17.6 s


Unnamed: 0,Array,Chunk
Bytes,800.00 MB,800.00 MB
Shape,"(10000, 10000)","(10000, 10000)"
Count,58 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 800.00 MB 800.00 MB Shape (10000, 10000) (10000, 10000) Count 58 Tasks 1 Chunks Type float64 numpy.ndarray",10000  10000,

Unnamed: 0,Array,Chunk
Bytes,800.00 MB,800.00 MB
Shape,"(10000, 10000)","(10000, 10000)"
Count,58 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [77]:
%%time
# Same project helper as for gram1, applied to the standard deviations.
gram2 = gramian(stddev)
# NOTE(review): the computed value is discarded and `stddev` is a lazy,
# non-persisted array, so this timing presumably includes re-evaluating the
# upstream graph as well -- confirm before comparing it with gram1.
gram2.compute()
gram2

CPU times: user 44.6 s, sys: 48.5 s, total: 1min 33s
Wall time: 2min 36s


Unnamed: 0,Array,Chunk
Bytes,800.00 MB,800.00 MB
Shape,"(10000, 10000)","(10000, 10000)"
Count,58 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 800.00 MB 800.00 MB Shape (10000, 10000) (10000, 10000) Count 58 Tasks 1 Chunks Type float64 numpy.ndarray",10000  10000,

Unnamed: 0,Array,Chunk
Bytes,800.00 MB,800.00 MB
Shape,"(10000, 10000)","(10000, 10000)"
Count,58 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [78]:
%%time
# Final step: element-wise ratio of the two Gramian matrices.
res = gram1 / gram2
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the
# recorded run was interrupted before this computation finished.
res.compute()
res

KeyboardInterrupt: 

