In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import hail as hl
import os.path as osp
import multiprocessing

# Benchmarking utilities: execute the shared helper script inside this kernel.
%run ../init/benchmark.py
# Register the timing magic under the 'hail' label
# (register_timeop_magic is presumably defined by benchmark.py -- confirm).
register_timeop_magic(get_ipython(), 'hail')
# Directory that the cells below read the PLINK inputs from and write derived outputs to.
data_dir = osp.expanduser('~/data/gwas/tutorial/2_PS_GWAS')

# Initialise Hail with default arguments; the Spark/Hail banner and log path follow as output.
hl.init() 

Running on Apache Spark version 2.4.4
SparkUI available at http://3d498b83ee57:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.30-2ae07d872f43
LOGGING: writing to /home/eczech/repos/gwas-analysis/notebooks/tutorial/02-population-stratification/hail-20200119-0238-0.2.30-2ae07d872f43.log


In [2]:
# Disabled: would set "spark.sql.shuffle.partitions" to 1 on Hail's Spark context.
#hl.spark_context()._conf.set("spark.sql.shuffle.partitions", 1)

In [2]:
# Disabled earlier variant of the PLINK import: it skipped invalid loci and had the
# custom 'hapmap3_hg18' reference commented out as well. Kept for reference only.
# # Load PLINK dataset using our custom reference
# PS1_FILE = "ALL.2of4intersection.20100804.genotypes_no_missing_IDs"
# mt = hl.import_plink(
#     osp.join(data_dir, PS1_FILE + '.bed'),
#     osp.join(data_dir, PS1_FILE + '.bim'),
#     osp.join(data_dir, PS1_FILE + '.fam'),
#     skip_invalid_loci=True,
#     #reference_genome='hapmap3_hg18'
# )

2020-01-16 13:33:15 Hail: INFO: Found 629 samples in fam file.
2020-01-16 13:33:15 Hail: INFO: Found 25488488 variants in bim file.


In [3]:
# Import the PLINK fileset (.bed/.bim/.fam) against the GRCh37 reference genome.
# Invalid loci are NOT skipped, so a locus that does not fit the reference raises an error.
PS1_FILE = "ALL.2of4intersection.20100804.genotypes_no_missing_IDs"
# Alternative input fileset, currently unused:
#PS1_FILE = "ALL.2of4intersection.20100804.genotypes"
plink_prefix = osp.join(data_dir, PS1_FILE)
mt = hl.import_plink(
    bed=plink_prefix + '.bed',
    bim=plink_prefix + '.bim',
    fam=plink_prefix + '.fam',
    reference_genome='GRCh37',
    skip_invalid_loci=False,
)

2020-01-16 15:39:45 Hail: INFO: Found 629 samples in fam file.
2020-01-16 15:39:45 Hail: INFO: Found 25488488 variants in bim file.


In [4]:
# Print the MatrixTable schema: global, column, row and entry fields plus the keys.
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'fam_id': str
    'pat_id': str
    'mat_id': str
    'is_female': bool
    'is_case': bool
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'cm_position': float64
----------------------------------------
Entry fields:
    'GT': call
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


In [5]:
# Persist the imported dataset as a native Hail MatrixTable, replacing any existing copy.
mt_path = osp.join(data_dir, 'ALL.2of4intersection.20100804.genotypes_no_missing_IDs.mt')
mt.write(mt_path, overwrite=True)

2020-01-16 15:41:44 Hail: INFO: Coerced sorted dataset
2020-01-16 15:46:06 Hail: INFO: wrote matrix table with 25488488 rows and 629 columns in 120 partitions to /home/eczech/data/gwas/tutorial/2_PS_GWAS/ALL.2of4intersection.20100804.genotypes_no_missing_IDs.mt


In [2]:
# Reload the dataset from the native MatrixTable stored under data_dir.
mt_path = osp.join(data_dir, 'ALL.2of4intersection.20100804.genotypes_no_missing_IDs.mt')
mt = hl.read_matrix_table(mt_path)

In [3]:
# (row count, column count) of the table, i.e. (variants, samples).
mt.count()

(25488488, 629)

In [4]:
# Count how many variant rows carry each rsid (one output row per distinct rsid).
rows = mt.rows()
rsid_groups = rows.group_by(rows.rsid)
cts = rsid_groups.aggregate(n=hl.agg.count())

In [7]:
# Tabulate rsid multiplicities: nn is the number of rsids that occur exactly n times.
# A single row with n == 1 means every rsid is unique.
multiplicity = cts.group_by(cts.n).aggregate(nn=hl.agg.count())
multiplicity.show()

2020-01-16 15:55:10 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-01-16 15:55:45 Hail: INFO: Coerced sorted dataset


n,nn
int64,int64
1,25488488


#### Write PLINK files for Glow

In [2]:
# Load the native MatrixTable from data_dir as the source for the Glow export.
mt_path = osp.join(data_dir, 'ALL.2of4intersection.20100804.genotypes_no_missing_IDs.mt')
mt = hl.read_matrix_table(mt_path)

In [7]:
# Glow's PLINKFileFormat overflows unless the result is less than 2G,
# so only the first 10M variants are exported.
glow_subset = mt.head(10_000_000)
glow_prefix = osp.join(data_dir, 'ALL.2of4intersection.20100804.genotypes_no_missing_IDs.glowmt')
hl.export_plink(glow_subset, glow_prefix)

2020-01-19 03:24:20 Hail: INFO: merging 49 files totalling 1.5G...
2020-01-19 03:24:21 Hail: INFO: while writing:
    /home/eczech/data/gwas/tutorial/2_PS_GWAS/ALL.2of4intersection.20100804.genotypes_no_missing_IDs.glowmt.bed
  merge time: 1.481s
2020-01-19 03:24:21 Hail: INFO: merging 48 files totalling 332.1M...
2020-01-19 03:24:22 Hail: INFO: while writing:
    /home/eczech/data/gwas/tutorial/2_PS_GWAS/ALL.2of4intersection.20100804.genotypes_no_missing_IDs.glowmt.bim
  merge time: 370.854ms
2020-01-19 03:24:22 Hail: INFO: merging 16 files totalling 11.7K...
2020-01-19 03:24:22 Hail: INFO: while writing:
    /home/eczech/data/gwas/tutorial/2_PS_GWAS/ALL.2of4intersection.20100804.genotypes_no_missing_IDs.glowmt.fam
  merge time: 5.362ms
2020-01-19 03:24:22 Hail: INFO: wrote 10000000 variants and 629 samples to '/home/eczech/data/gwas/tutorial/2_PS_GWAS/ALL.2of4intersection.20100804.genotypes_no_missing_IDs.glowmt'


```
# Replace every tab in the exported .fam with a single space
# (presumably required by Glow's PLINK reader -- confirm).
# The original file is only overwritten if sed succeeds.
FAM=ALL.2of4intersection.20100804.genotypes_no_missing_IDs.glowmt.fam
sed 's/\t/ /g' "$FAM" > "$FAM.tmp" && mv "$FAM.tmp" "$FAM"
```