In [2]:
import hail as hl
import pandas as pd
import numpy as np
import plotnine as pn
import matplotlib.pyplot as plt
# Start Hail with its default settings; the cells below use this session.
hl.init() 

Running on Apache Spark version 2.4.4
SparkUI available at http://941e42963a7b:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.32-a5876a0a2853
LOGGING: writing to /home/rav/repos/gwas-analysis/notebooks/benchmark/method/pcrelate/hail-20200422-1602-0.2.32-a5876a0a2853.log


In [3]:
from pathlib import Path

def write_mt(path: Path, n_samples: int, n_variants: int, n_populations: int) -> None:
    """Simulate a Balding-Nichols dataset and store it as a Hail MatrixTable.

    Args:
        path: destination of the MatrixTable; anything already stored there
            is overwritten.
        n_samples: number of samples to simulate.
        n_variants: number of variants to simulate.
        n_populations: number of populations to simulate.
    """
    simulated = hl.balding_nichols_model(
        n_populations=n_populations,
        n_samples=n_samples,
        n_variants=n_variants,
    )
    destination = path.as_posix()
    simulated.write(destination, overwrite=True)
    
def get_mt(
    n_samples: int,
    n_variants: int,
    n_populations: int,
    data_dir: Path = Path("/home/rav/data/tmp"),
) -> hl.MatrixTable:
    """Return the simulated MatrixTable for the given dimensions, creating it on first use.

    The table is cached on disk under ``data_dir`` with a name derived from the
    three size parameters; if that path does not exist yet the dataset is
    simulated and written via ``write_mt`` before being read back.

    Args:
        n_samples: number of samples in the simulated dataset.
        n_variants: number of variants in the simulated dataset.
        n_populations: number of populations in the simulated dataset.
        data_dir: directory holding the cached tables. Defaults to the location
            the notebook has always used, so existing calls behave the same.

    Returns:
        The MatrixTable read from the cache path.
    """
    path_mt = Path(data_dir).joinpath(f"mt_{n_samples}_{n_variants}_{n_populations}.mt")
    # Only simulate when nothing is cached at this path yet.
    if not path_mt.exists():
        write_mt(path_mt, n_samples, n_variants, n_populations)
    return hl.read_matrix_table(path_mt.as_posix())

In [17]:
import time
import tempfile
from datetime import datetime
from pathlib import Path
from dataclasses import dataclass

@dataclass
class Measurment:
    """Timings recorded for a single benchmark run."""

    # number of samples in the benchmarked dataset
    samples: int
    # number of variants in the benchmarked dataset
    variants: int
    # elapsed time of the PCA step
    pca_time: float
    # elapsed time of the PC-Relate step
    pc_relate_time: float

    def to_csv(self) -> str:
        """Render the record as one CSV row (no newline); times use two decimals."""
        fields = (
            format(self.samples),
            format(self.variants),
            format(self.pca_time, ".2f"),
            format(self.pc_relate_time, ".2f"),
        )
        return ",".join(fields)
        
# Directory that collects the timing files written by this notebook.
measurments = Path("/home/rav/data/tmp/measurments")
# Path.mkdir() returns None, so create the directory first and only then bind
# `measurments_dir` to the actual directory path (it used to be None).
measurments.mkdir(parents=True, exist_ok=True)
measurments_dir = measurments
# One CSV file per execution of this cell, named after the current timestamp.
# The handle stays open so do_experiment can append a row after every run.
measure_fd = measurments_dir.joinpath(f"m_{datetime.now().strftime('%d_%m_%Y_%H_%M_%S')}").open("w")
measure_fd.write("samples,variants,pca,pc_relate\n")
measure_fd.flush()

# In-memory list of the measurements recorded by do_experiment.
m = []

def do_experiment(s: int, v: int, p: int) -> None:
    """Time Hail's PCA and PC-Relate on one simulated dataset and record the result.

    Args:
        s: number of samples.
        v: number of variants.
        p: number of populations.

    Side effects:
        Prints the PCA time, appends one CSV row to ``measure_fd`` (flushed
        immediately) and appends the ``Measurment`` to the list ``m``.
    """
    mt = get_mt(s, v, p)
    # here we assume that there is no recent relatedness in the sample, which should not
    # matter for the purpose of the benchmark
    # perf_counter() is a monotonic clock, so the intervals cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    _, scores, _ = hl.methods.hwe_normalized_pca(mt.GT, k=10, compute_loadings=False)
    pca_time = time.perf_counter() - start
    print(f"PCA took s:{s}, v:{v}, p:{p} : {pca_time:.2f} seconds")
    # The PC-Relate output is written only to be timed; nothing reads it back.
    # A TemporaryDirectory removes it afterwards instead of leaving one scratch
    # directory behind per experiment.
    with tempfile.TemporaryDirectory() as scratch_dir:
        start = time.perf_counter()
        pc_relate_result = hl.methods.pc_relate(mt.GT, scores_expr=scores[mt.col_key].scores, min_individual_maf=0.01)
        pc_relate_result.write(Path(scratch_dir).joinpath("pc_relate_result").as_posix())
        # Stop the clock before leaving the block so cleanup is not measured.
        pc_relate_time = time.perf_counter() - start
    tmp_m = Measurment(samples=s, variants=v, pca_time=pca_time, pc_relate_time=pc_relate_time)
    measure_fd.write(f"{tmp_m.to_csv()}\n")
    measure_fd.flush()
    m.append(tmp_m)

In [None]:
# plink data
# Export the simulated datasets (n samples, n variants, 5 populations) in PLINK
# format, one output prefix per size n = 2000, 4000, ..., 10000.
for n in range(2000, 10001, 2000):
    mt = get_mt(n, n, 5)
    # Clear the column key before rewriting sample_idx
    # (presumably because a key field cannot be replaced in place; TODO confirm).
    me = mt.key_cols_by()
    # Convert the sample index to a string with an "S" prefix; per the original
    # note, an int sample ID (0) can otherwise break KING.
    me = me.transmute_cols(sample_idx="S" + hl.str(me.sample_idx))
    # Key the columns again, now by the string sample_idx.
    me = me.key_cols_by(me.sample_idx)
    hl.export_plink(me, f"/home/rav/data/tmp/plink_{n}/data")

In [None]:
# square inputs
# Run the benchmark on datasets with as many variants as samples,
# for n = 5000, 10000, ..., 35000, always with 5 populations.
for n in range(5000, 35001, 5000):
    do_experiment(s=n, v=n, p=5)

In [None]:
# pure spark pca
from pyspark.mllib.linalg.distributed import RowMatrix
#from pyspark.mllib.random.RandomRDDs import RandomRDDs
from pyspark.mllib.random import RandomRDDs

# PCA through Spark MLlib alone, on random data, reusing the SparkContext
# that Hail is running on.
sc = hl.spark_context()
# Random uniform vectors requested as 5000 x 5000, spread over 200 partitions
# (argument order assumed to be rows then columns; TODO confirm).
mat = RowMatrix(RandomRDDs.uniformVectorRDD(sc, 5000, 5000, numPartitions=200))
# Ask for 10 principal components; the result is not used further in this cell.
pc = mat.computePrincipalComponents(10)
#projected = mat.multiply(pc)

In [None]:
# Both ranges currently yield only the value 5000, so this runs a single
# experiment (s=5000, v=5000, p=10); widen the ranges to sweep a grid.
for s in range(5000, 6000, 5000):
    for v in range(5000, 6000, 5000):
        #for p in range(10, 11, 2):
        # The number of populations is fixed while the loop above is disabled.
        p = 10
        print(f"Computing s: {s}, v: {v}, p: {p}")
        do_experiment(s=s, v=v, p=p)