# Understanding the operations and the sizes of arrays in PC-Relate

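Based on _pc_relate_dask_benchmark.ipynb_. Each step of the PC-Relate computation is run in its own cell so that the shape, size and task count of every intermediate dask array can be inspected.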

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#%pip install --upgrade dask distributed dask_ml

In [3]:
import sys
from pathlib import Path

# Put the parent directory on the import path (presumably where the `lib`
# package imported in the following cells lives). Re-running the cell is a
# no-op once the entry is present.
parent_dir = Path("..").resolve()
module_path = parent_dir.as_posix()
if sys.path.count(module_path) == 0:
    print(f"Adding {module_path} to python path")
    sys.path.append(module_path)

Adding /home/tom/repos/gwas-analysis/notebooks/platform/xarray to python path


In [19]:
import lib.io.pysnptools_backend
from lib.method.pc_relate.pc_relate import pc_relate, impute_with_variant_mean, gramian

In [5]:
from lib import api
import pandas as pd
import numpy as np
from dask_ml.decomposition import PCA
import dask
import dask.array as da
import seaborn as sns
import time
from distributed import Client
import timeit
from dataclasses import dataclass
import gc

## Dask client

In [7]:
# Start a local distributed cluster: 2 worker processes with 4 threads each
# (reported as 8 cores in the summary rendered below).
c = Client(processes=True, n_workers=2, threads_per_worker=4)
# Bare expression so the notebook renders the client/cluster summary.
c

Perhaps you already have a cluster running?
Hosting the HTTP server on port 44871 instead
  http_address["port"], self.http_server.port


0,1
Client  Scheduler: tcp://127.0.0.1:43417  Dashboard: http://127.0.0.1:44871/status,Cluster  Workers: 2  Cores: 8  Memory: 12.83 GB


## Util functions

In [8]:
def compute_pca(g, display_pc_12: bool=True):
    """Fit an 8-component PCA on ``g`` and return its first two components.

    Args:
        g: Array handed directly to ``PCA.fit``.
        display_pc_12: When true, render a scatter plot of component 0
            against component 1 before returning.

    Returns:
        A dask array wrapping the first two rows of ``pca.components_``.
    """
    model = PCA(n_components=8, random_state=42)
    model.fit(g)
    components = model.components_
    if display_pc_12:
        first_pc, second_pc = components[0], components[1]
        display(sns.scatterplot(x=first_pc, y=second_pc))
    # Only the leading two components are used by the rest of the notebook.
    return da.from_array(components)[:2, :]

In [9]:
def get_plink(samples: int, variants: int, population: int) -> dask.array.Array:
    """Load a simulated PLINK dataset and return its call matrix as a dask array.

    The dataset is looked up under ``~/projects/tmp`` using the naming scheme
    ``mt_{samples}_{variants}_{population}.mt/data``.

    Args:
        samples: Number of samples encoded in the dataset name.
        variants: Number of variants encoded in the dataset name.
        population: Number of populations encoded in the dataset name.

    Returns:
        The underlying array of the dataset's ``data`` variable
        (``ds.data.data``), chunked automatically by ``read_plink``.
    """
    # This data was generated via hail using Balding-Nichols model
    # Alternative location used previously:
    #path = Path(f"{Path.home()}/data/tmp/mt_{samples}_{variants}_{population}.mt/data").as_posix()
    path = Path(f"{Path.home()}/projects/tmp/mt_{samples}_{variants}_{population}.mt/data").as_posix()
    ds = api.read_plink(path, chunks='auto', fam_sep=' ', bim_sep='\t')
    # `ds.data` is the dataset variable; the trailing `.data` unwraps it to the raw array.
    return ds.data.data

## PC-Relate

In [13]:
# Dimensions of the simulated dataset; they select which PLINK files get_plink loads.
samples = 1000
variants = 2000
population = 10
g = get_plink(samples=samples, variants=variants, population=population)
# Keep the loaded array on the cluster so later cells reuse it
# instead of re-reading the files.
g = g.persist()
# Rendered below: shape (2000, 1000), i.e. (variants, samples) for this run, dtype int8.
g

Unnamed: 0,Array,Chunk
Bytes,2.00 MB,2.00 MB
Shape,"(2000, 1000)","(2000, 1000)"
Count,1 Tasks,1 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 2.00 MB 2.00 MB Shape (2000, 1000) (2000, 1000) Count 1 Tasks 1 Chunks Type int8 numpy.ndarray",1000  2000,

Unnamed: 0,Array,Chunk
Bytes,2.00 MB,2.00 MB
Shape,"(2000, 1000)","(2000, 1000)"
Count,1 Tasks,1 Chunks
Type,int8,numpy.ndarray


In [14]:
# First two principal components of g, with the scatter plot disabled.
pcs = compute_pca(g, False)
# Persisted because pcs is reused when building the design matrix below.
pcs = pcs.persist()
# Rendered below: shape (2, 1000), float64.
pcs

Unnamed: 0,Array,Chunk
Bytes,16.00 kB,16.00 kB
Shape,"(2, 1000)","(2, 1000)"
Count,1 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 16.00 kB 16.00 kB Shape (2, 1000) (2, 1000) Count 1 Tasks 1 Chunks Type float64 numpy.ndarray",1000  2,

Unnamed: 0,Array,Chunk
Bytes,16.00 kB,16.00 kB
Shape,"(2, 1000)","(2, 1000)"
Count,1 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [21]:
# Threshold used further down: entries of `mu` that are <= maf or >= 1 - maf are masked out.
# (Presumably a minor-allele-frequency cutoff -- confirm against pc_relate.py.)
maf = 0.01

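The code in the following cells is taken from `pc_relate.py`, split into one operation per cell so that each intermediate array can be inspected.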

In [16]:
# Presumably fills missing calls with the per-variant mean and returns a mask of
# the positions that were missing (behaviour of the helper is not shown here).
missing_g_mask, imputed_g = impute_with_variant_mean(g)
# Prepend a row of ones to the PCs (the intercept column of the regression once
# transposed), then rechunk with dask's default chunking.
pcsi = da.concatenate(
        [da.from_array(np.ones((1, pcs.shape[1]))), pcs], axis=0
    ).rechunk()
# Rendered below: shape (3, 1000) = (1 + number of PCs, samples).
pcsi

Unnamed: 0,Array,Chunk
Bytes,24.00 kB,24.00 kB
Shape,"(3, 1000)","(3, 1000)"
Count,5 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 24.00 kB 24.00 kB Shape (3, 1000) (3, 1000) Count 5 Tasks 1 Chunks Type float64 numpy.ndarray",1000  3,

Unnamed: 0,Array,Chunk
Bytes,24.00 kB,24.00 kB
Shape,"(3, 1000)","(3, 1000)"
Count,5 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [17]:
# QR-decompose the design matrix pcsi.T, shape (1000, 3); both factors are reused
# in the following cells to solve the regression.
q, r = da.linalg.qr(pcsi.T)
# Rendered below: q has shape (1000, 3).
q

Unnamed: 0,Array,Chunk
Bytes,24.00 kB,24.00 kB
Shape,"(1000, 3)","(1000, 3)"
Count,10 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 24.00 kB 24.00 kB Shape (1000, 3) (1000, 3) Count 10 Tasks 1 Chunks Type float64 numpy.ndarray",3  1000,

Unnamed: 0,Array,Chunk
Bytes,24.00 kB,24.00 kB
Shape,"(1000, 3)","(1000, 3)"
Count,10 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [18]:
# The R factor of the QR decomposition above; rendered below with shape (3, 3).
r

Unnamed: 0,Array,Chunk
Bytes,72 B,72 B
Shape,"(3, 3)","(3, 3)"
Count,10 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 72 B 72 B Shape (3, 3) (3, 3) Count 10 Tasks 1 Chunks Type float64 numpy.ndarray",3  3,

Unnamed: 0,Array,Chunk
Bytes,72 B,72 B
Shape,"(3, 3)","(3, 3)"
Count,10 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [23]:
# Invert 2R (a 3x3 matrix here). Since pcsi.T = QR, inv(R) @ Q.T @ y would give the
# least-squares coefficients; inverting 2R instead halves them in the same step.
inv_2r = da.linalg.inv(2 * r)
inv_2r

Unnamed: 0,Array,Chunk
Bytes,72 B,72 B
Shape,"(3, 3)","(3, 3)"
Count,25 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 72 B 72 B Shape (3, 3) (3, 3) Count 25 Tasks 1 Chunks Type float64 numpy.ndarray",3  3,

Unnamed: 0,Array,Chunk
Bytes,72 B,72 B
Shape,"(3, 3)","(3, 3)"
Count,25 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [24]:
# (2R)^-1 @ Q.T: the projection that is applied to the genotypes in the next cell.
# Rendered below: shape (3, 1000).
inv_2r_dotqt = inv_2r.dot(q.T)
inv_2r_dotqt

Unnamed: 0,Array,Chunk
Bytes,24.00 kB,24.00 kB
Shape,"(3, 1000)","(3, 1000)"
Count,29 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 24.00 kB 24.00 kB Shape (3, 1000) (3, 1000) Count 29 Tasks 1 Chunks Type float64 numpy.ndarray",1000  3,

Unnamed: 0,Array,Chunk
Bytes,24.00 kB,24.00 kB
Shape,"(3, 1000)","(3, 1000)"
Count,29 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [27]:
# Half of the least-squares coefficients of the imputed genotypes regressed on
# [1, PCs]; one column per variant. Rendered below: shape (3, 2000).
half_beta = inv_2r_dotqt.dot(imputed_g.T)
half_beta

Unnamed: 0,Array,Chunk
Bytes,48.00 kB,48.00 kB
Shape,"(3, 2000)","(3, 2000)"
Count,42 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 48.00 kB 48.00 kB Shape (3, 2000) (3, 2000) Count 42 Tasks 1 Chunks Type float64 numpy.ndarray",2000  3,

Unnamed: 0,Array,Chunk
Bytes,48.00 kB,48.00 kB
Shape,"(3, 2000)","(3, 2000)"
Count,42 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [28]:
# Halved fitted values of the regression, transposed back to g's layout (2000, 1000).
# Presumably the per-sample expected allele frequency -- confirm against pc_relate.py.
# This is the first full-size float64 intermediate: 16 MB versus 2 MB for the int8 g.
mu = pcsi.T.dot(half_beta).T
mu

Unnamed: 0,Array,Chunk
Bytes,16.00 MB,16.00 MB
Shape,"(2000, 1000)","(2000, 1000)"
Count,46 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 16.00 MB 16.00 MB Shape (2000, 1000) (2000, 1000) Count 46 Tasks 1 Chunks Type float64 numpy.ndarray",1000  2000,

Unnamed: 0,Array,Chunk
Bytes,16.00 MB,16.00 MB
Shape,"(2000, 1000)","(2000, 1000)"
Count,46 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [30]:
# Mask entries where mu falls outside the open interval (maf, 1 - maf), plus every
# position flagged by missing_g_mask.
mask = (mu <= maf) | (mu >= 1.0 - maf) | missing_g_mask
# Wrap mu as a masked array so the masked entries are excluded from the maths below.
mu_mask = da.ma.masked_array(mu, mask=mask)
mu_mask

Unnamed: 0,Array,Chunk
Bytes,16.00 MB,16.00 MB
Shape,"(2000, 1000)","(2000, 1000)"
Count,51 Tasks,1 Chunks
Type,float64,numpy.MaskedArray
"Array Chunk Bytes 16.00 MB 16.00 MB Shape (2000, 1000) (2000, 1000) Count 51 Tasks 1 Chunks Type float64 numpy.MaskedArray",1000  2000,

Unnamed: 0,Array,Chunk
Bytes,16.00 MB,16.00 MB
Shape,"(2000, 1000)","(2000, 1000)"
Count,51 Tasks,1 Chunks
Type,float64,numpy.MaskedArray


In [32]:
# Per-entry mu * (1 - mu), computed block-wise on the masked array.
variance = mu_mask.map_blocks(lambda i: i * (1.0 - i))
# Convert back to a plain array, with masked entries set to 0.
variance = da.ma.filled(variance, fill_value=0.0)
variance

Unnamed: 0,Array,Chunk
Bytes,16.00 MB,16.00 MB
Shape,"(2000, 1000)","(2000, 1000)"
Count,53 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 16.00 MB 16.00 MB Shape (2000, 1000) (2000, 1000) Count 53 Tasks 1 Chunks Type float64 numpy.ndarray",1000  2000,

Unnamed: 0,Array,Chunk
Bytes,16.00 MB,16.00 MB
Shape,"(2000, 1000)","(2000, 1000)"
Count,53 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [33]:
# Element-wise square root of the variance; entries that were masked stay at 0.
stddev = da.sqrt(variance)
stddev

Unnamed: 0,Array,Chunk
Bytes,16.00 MB,16.00 MB
Shape,"(2000, 1000)","(2000, 1000)"
Count,54 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 16.00 MB 16.00 MB Shape (2000, 1000) (2000, 1000) Count 54 Tasks 1 Chunks Type float64 numpy.ndarray",1000  2000,

Unnamed: 0,Array,Chunk
Bytes,16.00 MB,16.00 MB
Shape,"(2000, 1000)","(2000, 1000)"
Count,54 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [35]:
# Centre g / 2 on mu. Note this uses the raw g rather than imputed_g; positions in
# missing_g_mask are part of mu_mask's mask, so they are zeroed by the fill below.
centered_af = g / 2 - mu_mask
# Masked entries are set to 0 here.
centered_af = da.ma.filled(centered_af, fill_value=0.0)
centered_af

Unnamed: 0,Array,Chunk
Bytes,16.00 MB,16.00 MB
Shape,"(2000, 1000)","(2000, 1000)"
Count,54 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 16.00 MB 16.00 MB Shape (2000, 1000) (2000, 1000) Count 54 Tasks 1 Chunks Type float64 numpy.ndarray",1000  2000,

Unnamed: 0,Array,Chunk
Bytes,16.00 MB,16.00 MB
Shape,"(2000, 1000)","(2000, 1000)"
Count,54 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [37]:
# Gramian of the centred values (helper imported from pc_relate.py; presumably X.T @ X).
# Rendered below: shape (1000, 1000), i.e. samples x samples for this run.
gram1 = gramian(centered_af)
gram1

Unnamed: 0,Array,Chunk
Bytes,8.00 MB,8.00 MB
Shape,"(1000, 1000)","(1000, 1000)"
Count,58 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 8.00 MB 8.00 MB Shape (1000, 1000) (1000, 1000) Count 58 Tasks 1 Chunks Type float64 numpy.ndarray",1000  1000,

Unnamed: 0,Array,Chunk
Bytes,8.00 MB,8.00 MB
Shape,"(1000, 1000)","(1000, 1000)"
Count,58 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [38]:
# Gramian of the standard deviations, same (1000, 1000) shape as gram1; it is the
# denominator of the ratio computed in the next cell.
gram2 = gramian(stddev)
gram2

Unnamed: 0,Array,Chunk
Bytes,8.00 MB,8.00 MB
Shape,"(1000, 1000)","(1000, 1000)"
Count,58 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 8.00 MB 8.00 MB Shape (1000, 1000) (1000, 1000) Count 58 Tasks 1 Chunks Type float64 numpy.ndarray",1000  1000,

Unnamed: 0,Array,Chunk
Bytes,8.00 MB,8.00 MB
Shape,"(1000, 1000)","(1000, 1000)"
Count,58 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [39]:
# Element-wise ratio of the two gramians (presumably the PC-Relate kinship estimate --
# confirm against pc_relate.py). Any entry where gram2 is 0 comes out as nan or inf.
res = gram1 / gram2
res

Unnamed: 0,Array,Chunk
Bytes,8.00 MB,8.00 MB
Shape,"(1000, 1000)","(1000, 1000)"
Count,66 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 8.00 MB 8.00 MB Shape (1000, 1000) (1000, 1000) Count 66 Tasks 1 Chunks Type float64 numpy.ndarray",1000  1000,

Unnamed: 0,Array,Chunk
Bytes,8.00 MB,8.00 MB
Shape,"(1000, 1000)","(1000, 1000)"
Count,66 Tasks,1 Chunks
Type,float64,numpy.ndarray
