## LSH Data Generator

Exports a single dataset (currently either simulated or sampled from HapMap) in PLINK, Parquet, and Zarr formats.

In [1]:
import os
import hail as hl
import numpy as np
from gwas_analysis.dask import io
from pysnptools import snpreader
import gwas_analysis.simulation.datasets as gsd
import dask.array as da
# Run the shared notebook setup script (located via the NB_DIR environment variable)
# and the benchmark helpers. Names used in this notebook without a visible definition
# (e.g. DATASET_HM, DATASET_CONFIG, dataset_path, hail_init, plink_files, osp) are
# presumably provided by these scripts -- TODO confirm.
%run {os.environ['NB_DIR']}/nb.py
%run $BENCHMARK_METHOD_DIR/common.py

# `sample_rate` is the probability passed to `sample_rows` when downsampling variants;
# `ds_name` selects which dataset is exported
sample_rate = .1
ds_name = DATASET_HM

# Alternative configuration: export the simulated dataset without downsampling
# sample_rate = 1
# ds_name = DATASET_SIM

# Look up the dataset's settings and build the path prefix used for every export below
ds_config = DATASET_CONFIG[ds_name]
ds_export_path = dataset_path(ds_name, sr=sample_rate)
hail_init()

Running on Apache Spark version 2.4.4
SparkUI available at http://2e4e0c6972f9:4043
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.32-a5876a0a2853
LOGGING: writing to /home/eczech/repos/gwas-analysis/notebooks/benchmark/method/ld_prune/lsh/hail-20200223-2350-0.2.32-a5876a0a2853.log


In [2]:
# Build the source MatrixTable for the configured dataset
if ds_name != DATASET_SIM:
    # Import the PLINK fileset found at the configured path
    plink_dir = osp.dirname(ds_config['path'])
    plink_prefix = osp.basename(ds_config['path'])
    mt = hl.import_plink(
        *plink_files(plink_dir, plink_prefix),
        skip_invalid_loci=False,
        reference_genome=ds_config['reference_genome']
    )
    # Restrict to variants on contig 1
    mt = mt.filter_rows(mt.locus.contig == '1')
else:
    # Simulate on a single contig so results can be compared to Hail's
    # (Hail returns 0s for variants on unequal contigs)
    mt = gsd.get_ldsim_dataset(n_variants=256, n_samples=6, n_contigs=1, seed=1)

# Downsample variants (rows) with a fixed seed so the export is repeatable
mt = mt.sample_rows(p=sample_rate, seed=1)
print('Shape before removing rows with no variance:', mt.count())

# Annotate each variant with the standard deviation of its alternate-allele
# counts and keep only variants where that value is positive
alt_allele_stats = hl.agg.stats(mt.GT.n_alt_alleles())
mt = mt.annotate_rows(stdev=alt_allele_stats.stdev)
mt = mt.filter_rows(mt.stdev > 0)

mt.count()

2020-02-23 23:51:03 Hail: INFO: Found 165 samples in fam file.
2020-02-23 23:51:03 Hail: INFO: Found 1457897 variants in bim file.
2020-02-23 23:51:11 Hail: INFO: Coerced sorted dataset
2020-02-23 23:51:11 Hail: INFO: reading 1 of 2 data partitions


Shape before removing rows with no variance: (12109, 165)


2020-02-23 23:51:18 Hail: INFO: Coerced sorted dataset
2020-02-23 23:51:18 Hail: INFO: reading 1 of 2 data partitions


(10451, 165)

### Export Plink

In [3]:
def export(mt, path):
    """Export `mt` in PLINK format under the path prefix `path`.

    Fields already present on `mt` are forwarded to `hl.export_plink`:
    `fam_id`, `pat_id`, `mat_id` and `is_female` keep their names, while
    `is_case` is passed as `pheno` and `rsid` as `varid`.

    Args:
        mt: Dataset exposing the fields listed above.
        path: Output path prefix handed to `hl.export_plink`.

    Returns:
        None.
    """
    plink_fields = dict(
        fam_id=mt.fam_id,
        pat_id=mt.pat_id,
        mat_id=mt.mat_id,
        is_female=mt.is_female,
        pheno=mt.is_case,
        varid=mt.rsid,
    )
    hl.export_plink(mt, path, **plink_fields)
# Write the PLINK export, then display the path prefix it was written under
export(mt, ds_export_path)
ds_export_path

2020-02-23 23:51:36 Hail: INFO: Coerced sorted dataset
2020-02-23 23:51:36 Hail: INFO: reading 1 of 2 data partitions
2020-02-23 23:51:41 Hail: INFO: Coerced sorted dataset
2020-02-23 23:51:41 Hail: INFO: reading 1 of 2 data partitions
2020-02-23 23:51:46 Hail: INFO: Coerced sorted dataset
2020-02-23 23:51:46 Hail: INFO: reading 1 of 2 data partitions
2020-02-23 23:51:51 Hail: INFO: Coerced sorted dataset
2020-02-23 23:51:51 Hail: INFO: reading 1 of 2 data partitions
2020-02-23 23:51:56 Hail: INFO: Coerced sorted dataset
2020-02-23 23:51:56 Hail: INFO: reading 1 of 2 data partitions
2020-02-23 23:52:04 Hail: INFO: merging 2 files totalling 428.7K...
2020-02-23 23:52:04 Hail: INFO: while writing:
    /home/eczech/data/gwas/benchmark/datasets/hapmap-sr=0.1.bed
  merge time: 23.359ms
2020-02-23 23:52:04 Hail: INFO: merging 1 files totalling 303.6K...
2020-02-23 23:52:04 Hail: INFO: while writing:
    /home/eczech/data/gwas/benchmark/datasets/hapmap-sr=0.1.bim
  merge time: 16.373ms
2020-0

'/home/eczech/data/gwas/benchmark/datasets/hapmap-sr=0.1'

### Export Parquet

In [4]:
# Use the alternate-allele count as the matrix entry, falling back to -1 when the call is missing
# Note: mean imputation might be useful here
n_alt_or_missing = hl.coalesce(mt.GT.n_alt_alleles(), -1)
bm = hl.linalg.BlockMatrix.from_entry_expr(n_alt_or_missing)

2020-02-23 23:52:10 Hail: INFO: Coerced sorted dataset
2020-02-23 23:52:10 Hail: INFO: reading 1 of 2 data partitions
2020-02-23 23:52:26 Hail: INFO: Wrote all 3 blocks of 10451 x 165 matrix with block size 4096.


In [5]:
# Convert the block matrix into a row-major table and print its schema
bt = bm.to_table_row_major()
bt.describe()

2020-02-23 23:52:26 Hail: INFO: wrote matrix with 10451 rows and 165 columns as 3 blocks of size 4096 to file:/tmp/hail.zOABc1UrD7Yv/xs06FhFey6


----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'row_idx': int64 
    'entries': array<float64> 
----------------------------------------
Key: ['row_idx']
----------------------------------------


In [6]:
# Write the row-major table to Parquet through Spark (mode='overwrite' replaces any
# existing output), then report the size on disk
path = ds_export_path + '.parquet'
bt.to_spark().write.parquet(path, mode='overwrite')
!du -ch $path

452K	/home/eczech/data/gwas/benchmark/datasets/hapmap-sr=0.1.parquet
452K	total


### Export Zarr

In [7]:
# Create a Dask client with 4 workers (helper is not defined in this notebook) and display it
client = get_dask_client(n_workers=4)
client

0,1
Client  Scheduler: tcp://127.0.0.1:36565  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 120.00 GB


In [8]:
# Open the exported PLINK data and wrap it as a Dask array
bed_reader = snpreader.Bed(ds_export_path, count_A1=True)
gt = da.from_array(io.BedArray(bed_reader), lock=False)
# Shift every code down by one: 0=missing, 1=homo ref, etc becomes -1=missing, 0=homo ref
gt = gt.astype(np.int8) - 1
gt

Unnamed: 0,Array,Chunk
Bytes,1.72 MB,1.72 MB
Shape,"(10451, 165)","(10451, 165)"
Count,4 Tasks,1 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 1.72 MB 1.72 MB Shape (10451, 165) (10451, 165) Count 4 Tasks 1 Chunks Type int8 numpy.ndarray",165  10451,

Unnamed: 0,Array,Chunk
Bytes,1.72 MB,1.72 MB
Shape,"(10451, 165)","(10451, 165)"
Count,4 Tasks,1 Chunks
Type,int8,numpy.ndarray


In [9]:
# Sanity check: compute the array and tally how often each genotype code occurs
np.unique(gt.compute(), return_counts=True)

(array([-1,  0,  1,  2], dtype=int8),
 array([   4195, 1066270,  528602,  125348]))

In [10]:
# Write the genotype array to a Zarr store (overwrite=True replaces any existing one),
# then report the size on disk
path = ds_export_path + '.zarr'
gt.to_zarr(path, overwrite=True)
!du -ch $path

828K	/home/eczech/data/gwas/benchmark/datasets/hapmap-sr=0.1.zarr
828K	total
