In [1]:
import hail as hl
import pandas as pd
import numpy as np
import plotnine as pn
import matplotlib.pyplot as plt
# Initialize Hail with its default settings (no arguments are passed).
hl.init() 

Running on Apache Spark version 2.4.4
SparkUI available at http://1a8bbd6c6b2b:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.32-a5876a0a2853
LOGGING: writing to /home/rav/repos/gwas-analysis/notebooks/benchmark/method/pcrelate/hail-20200417-1505-0.2.32-a5876a0a2853.log


In [2]:
from pathlib import Path

def write_mt(path: Path, n_samples: int, n_variants: int, n_populations: int) -> None:
    """Simulate a Balding-Nichols dataset and store it as a Hail matrix table.

    Args:
        path: Destination of the matrix table; anything already there is overwritten.
        n_samples: Number of samples passed to the simulation.
        n_variants: Number of variants passed to the simulation.
        n_populations: Number of populations passed to the simulation.
    """
    simulated = hl.balding_nichols_model(
        n_populations=n_populations,
        n_samples=n_samples,
        n_variants=n_variants,
    )
    destination = path.as_posix()
    simulated.write(destination, overwrite=True)
    
def get_mt(
    n_samples: int,
    n_variants: int,
    n_populations: int,
    data_dir: Path = Path("/home/rav/data/tmp"),
) -> hl.MatrixTable:
    """Return the simulated matrix table for the given dimensions, generating it if needed.

    The table is cached on disk under ``data_dir`` with a name derived from the
    three size parameters, so repeated calls with the same sizes reuse the same
    files instead of re-running the simulation.

    Args:
        n_samples: Number of samples in the simulated dataset.
        n_variants: Number of variants in the simulated dataset.
        n_populations: Number of populations in the simulated dataset.
        data_dir: Directory holding the cached matrix tables. Defaults to the
            location this notebook has always used.

    Returns:
        The matrix table read back from the on-disk cache.
    """
    path_mt = Path(data_dir) / f"mt_{n_samples}_{n_variants}_{n_populations}.mt"
    # A bare directory-existence check would accept a table whose write was
    # interrupted (e.g. the kernel was stopped mid-run). Hail places a
    # `_SUCCESS` marker inside the table directory once a write has completed,
    # so its absence means the cache is missing or incomplete and must be
    # regenerated (write_mt overwrites whatever is there).
    if not (path_mt / "_SUCCESS").exists():
        write_mt(path_mt, n_samples, n_variants, n_populations)
    return hl.read_matrix_table(path_mt.as_posix())

In [3]:
# Export square (n samples x n variants) simulated datasets to PLINK format.
for n in range(2000, 10001, 2000):
    mt = get_mt(n_samples=n, n_variants=n, n_populations=5)
    # Drop the column key first so the key field can be rewritten.
    me = mt.key_cols_by()
    # Turn the integer sample index into a string with an "S" prefix; a plain
    # integer sample ID (such as 0) can break KING.
    me = me.transmute_cols(sample_idx="S" + hl.str(me.sample_idx)).key_cols_by("sample_idx")
    hl.export_plink(me, f"/home/rav/data/tmp/plink_{n}/data")

2020-04-17 15:06:39 Hail: INFO: merging 9 files totalling 976.6K...
2020-04-17 15:06:39 Hail: INFO: while writing:
    /home/rav/data/tmp/plink_2000/data.bed
  merge time: 19.608ms
2020-04-17 15:06:39 Hail: INFO: merging 8 files totalling 48.6K...
2020-04-17 15:06:39 Hail: INFO: while writing:
    /home/rav/data/tmp/plink_2000/data.bim
  merge time: 11.506ms
2020-04-17 15:06:39 Hail: INFO: merging 8 files totalling 32.1K...
2020-04-17 15:06:39 Hail: INFO: while writing:
    /home/rav/data/tmp/plink_2000/data.fam
  merge time: 11.383ms
2020-04-17 15:06:39 Hail: INFO: wrote 2000 variants and 2000 samples to '/home/rav/data/tmp/plink_2000/data'
2020-04-17 15:06:42 Hail: INFO: merging 9 files totalling 3.8M...
2020-04-17 15:06:42 Hail: INFO: while writing:
    /home/rav/data/tmp/plink_4000/data.bed
  merge time: 24.307ms
2020-04-17 15:06:42 Hail: INFO: merging 8 files totalling 99.4K...
2020-04-17 15:06:42 Hail: INFO: while writing:
    /home/rav/data/tmp/plink_4000/data.bim
  merge time: 

In [None]:
for n in range(20000, 1000001, 20000):
    mt = get_mt(n, n, 5)
    # here we assume that there is no recent relatedness in the sample, which should not
    # matter for the purpose of the benchmark 
    %timeit -n 1 -r 1 hl.methods.pc_relate(mt.GT, min_individual_maf=0.01, k=10)

2020-04-17 15:10:14 Hail: INFO: balding_nichols_model: generating genotypes for 5 populations, 20000 samples, and 20000 variants...
2020-04-17 15:10:17 Hail: INFO: Coerced sorted dataset
2020-04-17 15:11:14 Hail: INFO: wrote matrix table with 20000 rows and 20000 columns in 8 partitions to /home/rav/data/tmp/mt_20000_20000_5.mt
2020-04-17 15:11:23 Hail: INFO: hwe_normalized_pca: running PCA using 20000 variants.
2020-04-17 15:11:36 Hail: INFO: pca: running PCA with 10 components...
2020-04-17 15:14:27 Hail: INFO: Wrote all 25 blocks of 20000 x 20000 matrix with block size 4096.


3min 14s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


2020-04-17 15:14:29 Hail: INFO: balding_nichols_model: generating genotypes for 5 populations, 40000 samples, and 40000 variants...
2020-04-17 15:14:35 Hail: INFO: Coerced sorted dataset
2020-04-17 15:19:01 Hail: INFO: wrote matrix table with 40000 rows and 40000 columns in 11 partitions to /home/rav/data/tmp/mt_40000_40000_5.mt
2020-04-17 15:19:23 Hail: INFO: hwe_normalized_pca: running PCA using 40000 variants.
2020-04-17 15:20:01 Hail: INFO: pca: running PCA with 10 components...
