### Hail BlockMatrix QC Tests

This experiment measures the performance of variant/sample call rate filtering on 1000 Genomes (1KG) data using the Hail BlockMatrix API rather than MatrixTable.

It is approximately 3x faster than MatrixTable for this one set of QC ops, but still nowhere near as fast as PLINK or Dask.

In [1]:
import hail as hl
import pandas as pd
import numpy as np
import plotnine as pn
import matplotlib.pyplot as plt
import os.path as osp
# Execute the shared paths script in this namespace
# (presumably defines PS1_1KG_RAW_FILE, which later cells use -- TODO confirm)
%run ../init/paths.py
# Directory holding the tutorial data, with "~" expanded to the user's home directory
data_dir = osp.expanduser('~/data/gwas/tutorial/2_PS_GWAS')
# Initialize Hail for this session
hl.init() 

Running on Apache Spark version 2.4.4
SparkUI available at http://d42c6af5a4e5:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.30-2ae07d872f43
LOGGING: writing to /home/eczech/repos/gwas-analysis/notebooks/tutorial/02-population-stratification/hail-20200206-2049-0.2.30-2ae07d872f43.log


In [29]:
# Alternative: a small simulated dataset for quick experiments (disabled)
#mt = hl.balding_nichols_model(3, 25, 50)
# Load the raw MatrixTable stored under data_dir
mt = hl.read_matrix_table(osp.join(data_dir, PS1_1KG_RAW_FILE + '.mt'))

In [31]:
# Table dimensions as (rows, columns)
mt.count()

(25488488, 629)

In [3]:
# Hail's default BlockMatrix block size, for reference when choosing block_size for the write
hl.linalg.BlockMatrix.default_block_size()

4096

In [2]:
# Location of the BlockMatrix holding the "genotype call is defined" indicator
path = osp.join(data_dir, PS1_1KG_RAW_FILE + '.is_defined.bm')
path

'/home/eczech/data/gwas/tutorial/2_PS_GWAS/ALL.2of4intersection.20100804.genotypes.is_defined.bm'

In [5]:
# Entry type of the "GT is defined" indicator expression
hl.is_defined(mt.GT).dtype

dtype('bool')

In [6]:
%%time
# Write the per-entry "GT is defined" indicator to `path` as a BlockMatrix,
# replacing any existing matrix there (overwrite=True).
# block_size is pinned to 4096 because larger block sizes failed:
#   - block size 16384 gives OOM
#   - block size 8192 gives java.lang.RuntimeException: error while applying lowering 'InterpretNonCompilable'
hl.linalg.BlockMatrix.write_from_entry_expr(hl.is_defined(mt.GT), path, overwrite=True, block_size=4096)

CPU times: user 212 ms, sys: 99.1 ms, total: 311 ms
Wall time: 22min 28s


2020-02-06 13:23:46 Hail: INFO: Wrote all 6223 blocks of 25488488 x 629 matrix with block size 4096.


In [3]:
# Load the BlockMatrix stored at `path`
bm = hl.linalg.BlockMatrix.read(path)

In [4]:
# Check whether the matrix is block-sparse
bm.is_sparse

False

In [5]:
# Element type of the loaded matrix (the indicator expression was boolean when written)
bm.element_type

dtype('float64')

In [6]:
%%time

def filter_by_variant_call_rate(bm, threshold):
    """Keep the rows (variants) of ``bm`` whose call rate is at least ``threshold``.

    The call rate of a row is its sum divided by the number of columns, so
    this assumes ``bm`` holds 1 where a call is defined and 0 otherwise.

    Args:
        bm: Matrix exposing ``sum``, ``shape``, ``to_numpy`` and ``filter_rows``
            (a Hail BlockMatrix in this notebook).
        threshold: Minimum fraction of defined calls a row needs to be kept.

    Returns:
        The result of ``bm.filter_rows`` applied to the passing row indices.
    """
    # ravel() always yields a 1-d vector, even when the matrix has a single row
    # (squeeze() would collapse that case to a 0-d scalar).
    call_rate = (bm.sum(axis=1) / bm.shape[1]).to_numpy().ravel()
    # flatnonzero() returns a 1-d index array, so tolist() is always a list --
    # argwhere(...).squeeze() degraded to a bare int when exactly one row passed.
    keep = np.flatnonzero(call_rate >= threshold)
    return bm.filter_rows(keep.tolist())

def filter_by_sample_call_rate(bm, threshold):
    """Keep the columns (samples) of ``bm`` whose call rate is at least ``threshold``.

    The call rate of a column is its sum divided by the number of rows, so
    this assumes ``bm`` holds 1 where a call is defined and 0 otherwise.

    Args:
        bm: Matrix exposing ``sum``, ``shape``, ``to_numpy`` and ``filter_cols``
            (a Hail BlockMatrix in this notebook).
        threshold: Minimum fraction of defined calls a column needs to be kept.

    Returns:
        The result of ``bm.filter_cols`` applied to the passing column indices.
    """
    # ravel() always yields a 1-d vector, even when the matrix has a single column
    # (squeeze() would collapse that case to a 0-d scalar).
    call_rate = (bm.sum(axis=0) / bm.shape[0]).to_numpy().ravel()
    # flatnonzero() returns a 1-d index array, so tolist() is always a list --
    # argwhere(...).squeeze() degraded to a bare int when exactly one column passed.
    keep = np.flatnonzero(call_rate >= threshold)
    return bm.filter_cols(keep.tolist())

# Two QC passes: a lenient one (0.8) followed by a strict one (0.98).
# Within each pass variants are filtered first, then samples.
bmf = bm
for call_rate_threshold in (.8, .98):
    bmf = filter_by_variant_call_rate(bmf, call_rate_threshold)
    bmf = filter_by_sample_call_rate(bmf, call_rate_threshold)
# Dimensions remaining after all four filters
bmf.shape

CPU times: user 20 s, sys: 2.13 s, total: 22.1 s
Wall time: 6min 56s


(8240745, 629)

In [27]:
%%time

# Try using matrix + vector multiplication instead of sums 

def filter_by_variant_call_rate(bm, threshold):
    """Keep the rows (variants) of ``bm`` whose call rate is at least ``threshold``.

    Row sums are computed as a product with a column vector of ones rather
    than with ``sum``; dividing by the column count gives the call rate. This
    assumes ``bm`` holds 1 where a call is defined and 0 otherwise.

    Args:
        bm: Matrix supporting ``@`` with a numpy array, plus ``shape``,
            ``to_numpy`` and ``filter_rows`` (a Hail BlockMatrix in this notebook).
        threshold: Minimum fraction of defined calls a row needs to be kept.

    Returns:
        The result of ``bm.filter_rows`` applied to the passing row indices.
    """
    n_cols = bm.shape[1]
    # ravel() always yields a 1-d vector, even when the matrix has a single row
    # (squeeze() would collapse that case to a 0-d scalar).
    cr = ((bm @ np.ones((n_cols, 1))) / n_cols).to_numpy().ravel()
    # flatnonzero() returns a 1-d index array, so tolist() is always a list --
    # argwhere(...).squeeze() degraded to a bare int when exactly one row passed.
    idx = np.flatnonzero(cr >= threshold)
    return bm.filter_rows(idx.tolist())

def filter_by_sample_call_rate(bm, threshold):
    """Keep the columns (samples) of ``bm`` whose call rate is at least ``threshold``.

    Column sums are computed as the transpose multiplied by a column vector of
    ones rather than with ``sum``; dividing by the row count gives the call
    rate. This assumes ``bm`` holds 1 where a call is defined and 0 otherwise.

    Args:
        bm: Matrix supporting ``.T`` and ``@`` with a numpy array, plus ``shape``,
            ``to_numpy`` and ``filter_cols`` (a Hail BlockMatrix in this notebook).
        threshold: Minimum fraction of defined calls a column needs to be kept.

    Returns:
        The result of ``bm.filter_cols`` applied to the passing column indices.
    """
    n_rows = bm.shape[0]
    # ravel() always yields a 1-d vector, even when the matrix has a single column
    # (squeeze() would collapse that case to a 0-d scalar).
    cr = ((bm.T @ np.ones((n_rows, 1))) / n_rows).to_numpy().ravel()
    # flatnonzero() returns a 1-d index array, so tolist() is always a list --
    # argwhere(...).squeeze() degraded to a bare int when exactly one column passed.
    idx = np.flatnonzero(cr >= threshold)
    return bm.filter_cols(idx.tolist())

# Ordered QC steps: a lenient pass (0.8) and then a strict pass (0.98),
# each filtering variants before samples.
qc_steps = (
    (filter_by_variant_call_rate, .8),
    (filter_by_sample_call_rate, .8),
    (filter_by_variant_call_rate, .98),
    (filter_by_sample_call_rate, .98),
)
bmf = bm
for apply_filter, min_call_rate in qc_steps:
    bmf = apply_filter(bmf, min_call_rate)
# Dimensions remaining after all four filters
bmf.shape

CPU times: user 2min 7s, sys: 3.12 s, total: 2min 10s
Wall time: 12min 31s


(8240745, 629)