In [None]:
import hail as hl
from hail.plot import show
from pprint import pprint

# Start Hail with GRCh38 as the default reference genome.
# The Spark settings are passed straight through to Spark: driver memory,
# the task retry limit, and a local master.
hl.init(
    default_reference="GRCh38",
    min_block_size=128,
    spark_conf={
        'spark.driver.memory': '40g',
        'spark.task.maxFailures': '20',
        'spark.master': 'local[20,20]',
    },
)

In [None]:
# Per-chromosome filtered checkpoints, loaded in the order chr1..chr22, chrX, chrY.
# The paths differ only in the contig label, so they are built from one template
# instead of 24 hand-copied lines.
CHECKPOINT_TEMPLATE = "~/WGS/BRAVA/checkpoint/chr{}.GT_multi.variant.filtered.mt"
CONTIGS = [str(autosome) for autosome in range(1, 23)] + ["X", "Y"]

all_datasets = [
    hl.read_matrix_table(CHECKPOINT_TEMPLATE.format(contig))
    for contig in CONTIGS
]

In [None]:
# Stack the rows of every per-chromosome matrix table into a single matrix table.
merged_filtered_exome = all_datasets[0].union_rows(*all_datasets[1:])

In [None]:
# Persist the merged matrix table, replacing any previous copy at this path.
merged_filtered_exome.write(
    "~/WGS/BRAVA/merged/CCPM_Exome_Freeze_Three.mt",
    overwrite=True,
)

In [None]:
# Write a second copy of the merged data, repartitioned to 2048 partitions.
# This is the copy the later export cells read back in.
(
    merged_filtered_exome
    .repartition(2048)
    .write("~/WGS/BRAVA/merged/CCPM_Exome_Freeze_Three.2048.mt", overwrite=True)
)

In [None]:
# Export a VCF for variant annotation with the sample columns removed.

FILTERED_MT = "~/WGS/BRAVA/merged/CCPM_Exome_Freeze_Three.2048.mt"
mt = hl.read_matrix_table(FILTERED_MT)

# The keep-set holds only the empty string, so only columns whose sample ID `s`
# equals '' survive the filter. Presumably no sample has an empty ID, which
# leaves the variant rows with no sample columns -- confirm against the data.
samples_to_keep = {''}
set_to_keep = hl.literal(samples_to_keep)
mt_annotation = mt.filter_cols(set_to_keep.contains(mt.s), keep=True)

hl.export_vcf(
    mt_annotation,
    '~/WGS/BRAVA/variants_for_annotation/CCPM_Exome_Freeze_Three.annotation.vcf.bgz',
)

In [None]:
# Sanity check: display the dimensions of the column-filtered matrix table.
# If the empty-ID filter removed every sample, the column count should be 0 -- verify.
mt_annotation.count()

In [None]:
# Export the filtered genotypes (all samples kept) in PLINK format, for PCA and
# major continental ancestry estimation downstream.
FILTERED_MT = "~/WGS/BRAVA/merged/CCPM_Exome_Freeze_Three.2048.mt"
mt = hl.read_matrix_table(FILTERED_MT)
hl.export_plink(
    mt,
    '~/WGS/BRAVA/bed/CCPM_Exome_Freeze_Three',
)


In [None]:
# Export one annotation VCF per chromosome, with the sample columns removed.

# The keep-set holds only the empty string, so only columns whose sample ID `s`
# equals '' survive the filter. Presumably no sample has an empty ID, which
# leaves the variant rows with no sample columns -- confirm against the data.
# The set does not depend on the chromosome, so it is built once outside the loop.
samples_to_keep = {''}
set_to_keep = hl.literal(samples_to_keep)

# The loop variable is named `chrom` rather than `chr`: a top-level loop variable
# stays in the notebook namespace after the loop, and `chr` would shadow the
# Python builtin for every later cell.
for chrom in ["21", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "10", "22", "1", "2",
              "3", "4", "5", "6", "7", "8", "9", "X"]:
    FILTERED_MT = 'gs://hail-brava/GT_and_variant_filtered/COLORADO_Freeze_Two.chr' + chrom + '_PAIR_ids_GT_and_variant_filtered.mt'
    FILTERED_VCF = 'gs://hail-brava/filtered_exome_for_annotation/COLORADO_Freeze_Two.chr' + chrom + '.BRaVa.annotation.vcf.bgz'

    mt = hl.read_matrix_table(FILTERED_MT)

    # Drop the sample columns, then write this chromosome's VCF.
    mt_annotation = mt.filter_cols(set_to_keep.contains(mt['s']))
    hl.export_vcf(mt_annotation, FILTERED_VCF)
    