In [1]:
import hail as hl
hl.init(default_reference='GRCh38')

Running on Apache Spark version 3.1.1
SparkUI available at http://dataproc-vlad-m.australia-southeast1-a.c.tob-wgs.internal:36165
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.73-2ab84582d24d
LOGGING: writing to /home/hail/hail-20210923-0916-0.2.73-2ab84582d24d.log


In [4]:
import hail as hl
import numpy as np
from ipywidgets import interact
import math
import pandas as pd
from collections import OrderedDict
import json
from os.path import join

import bokeh
from bokeh.layouts import gridplot, row, widgetbox
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook, push_notebook, export_png
from bokeh.models.widgets import Tabs, Panel
from bokeh.palettes import *
from bokeh.models import *
from typing import *
from bokeh.plotting.helpers import stack
from bokeh.transform import factor_cmap

from gnomad.variant_qc.evaluation import add_rank
from joint_calling import utils

overwrite = True
work_bucket = 'gs://cpg-tob-wgs-test-analysis/jupyter/vsavelyev'
TRUTH_SAMPLE = 'NA12878'

from bokeh.io import show, output_notebook, reset_output
from bokeh.layouts import gridplot
output_notebook()

gvcf_by_key = {
    'from_broad_gvcf': 'gs://cpg-thousand-genomes-test/gvcf/NA12878-from-broad-gvcf.g.vcf.gz',
    'from_cram'      : 'gs://cpg-thousand-genomes-test/gvcf/NA12878-from-cram.g.vcf.gz',
    'realign'        : 'gs://cpg-thousand-genomes-test/gvcf/NA12878-realign.g.vcf.gz',
    'hc_realign'     : 'gs://cpg-thousand-genomes-test/gvcf/NA12878-hc-from-bam.g.vcf.gz',
#     'hc_realign'     : 'gs://cpg-thousand-genomes-test/gvcf/NA12878-hc-realign.g.vcf.gz',
}

# fewgenomes_v2_mt_path = 'gs://cpg-fewgenomes-main/mt/v2-nonref.mt'
# fewgenomes_v2_raw_mt_path = 'gs://cpg-fewgenomes-main-tmp/joint-calling/v2/combiner/v2-raw.mt'
# meta_v2_ht_path = 'gs://cpg-fewgenomes-main-metadata/joint-calling/v2/meta.ht'
# vqsr_final_filter_ht_path = 'gs://cpg-fewgenomes-main-tmp/joint-calling/v2/variant_qc/vqsr/final-filter.ht'
highconf_ht_path = 'gs://cpg-reference/validation/giab/regions/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7_hc_regions.ht/'

def filter_highconf(mt):
    highconf_bed_ht = hl.read_table(highconf_ht_path)
    mt = mt.filter_rows(hl.is_defined(highconf_bed_ht[mt.locus]))
    return mt

def print_conc_summary(summary):
    print('Summary:')
    print(summary)
    tp = summary[2][2] + summary[3][3] + summary[4][4]
    total_discordant = sum([sum(s[2:]) for s in summary[2:]]) - tp
    called = sum([sum(s[:]) for s in summary[2:]])
    truth = sum([sum(s[2:]) for s in summary[:]])
    fp = called - tp
    fn = truth - tp
    precision = tp / called
    recall = tp / truth
    print(f'Left = called: {called}')
    print(f'Right = truth: {truth}')
    print(f'Unique to left = FP [precision]: {fp} [{precision:.2%}]')
    print(f'Unique to right = FN [recall]: {fn} [{recall:.2%}]')
    print(f'Concordant = TP [discordant]: {tp} [{total_discordant}]')

In [5]:
realign_mt_path = join(work_bucket, 'from_gvcfs', f'hc_realign.mt')
mt = hl.read_matrix_table(realign_mt_path)
mt.count()

(13986427, 1)

In [13]:
realign_mt_path = join(work_bucket, 'from_gvcfs', f'realign.mt')
mt = hl.read_matrix_table(realign_mt_path)
mt.show()

locus,alleles
locus<GRCh38>,array<str>
chr1:1,"[""N"",""<NON_REF>""]"
chr1:10013,"[""T"",""<NON_REF>""]"
chr1:10114,"[""T"",""<NON_REF>""]"
chr1:10116,"[""A"",""<NON_REF>""]"
chr1:10120,"[""T"",""<NON_REF>""]"
chr1:10121,"[""A"",""<NON_REF>""]"
chr1:10126,"[""T"",""<NON_REF>""]"
chr1:10127,"[""A"",""<NON_REF>""]"
chr1:10132,"[""T"",""<NON_REF>""]"
chr1:10133,"[""A"",""<NON_REF>""]"


In [15]:
mt_by_key = dict()
for k, gvcf_path in gvcf_by_key.items():
    mt_path = join(work_bucket, 'from_gvcfs', f'{k}.mt')
    if not utils.file_exists(mt_path):
        mt = hl.import_vcf(gvcf_path, force_bgz=True) 
        mt.write(mt_path, overwrite=True)
    mt = hl.read_matrix_table(mt_path)
    mt_by_key[k] = mt
    print(k)
    print(mt.count())

from_broad_gvcf
(20370777, 1)
from_cram
(20358543, 1)
realign
(19653459, 1)
hc_realign
(13986427, 1)


In [16]:
mt_by_key

{'from_broad_gvcf': <hail.matrixtable.MatrixTable at 0x7ff7f65bdf10>,
 'from_cram': <hail.matrixtable.MatrixTable at 0x7ff7f660e490>,
 'realign': <hail.matrixtable.MatrixTable at 0x7ff7f65ef8b0>,
 'hc_realign': <hail.matrixtable.MatrixTable at 0x7ff7f66f5af0>}

# Compare downloaded with recalled gvcf

In [17]:
mt_by_key = {
    k: hl.split_multi(mt)
    for k, mt in mt_by_key.items()
}

mt_by_key = {
    k: filter_highconf(mt) 
    for k, mt in mt_by_key.items()
}

In [None]:
{k: mt.show(1) for k, mt in mt_by_key.items()}

In [6]:
{k: mt.count() for k, mt in mt_by_key.items()}

2021-09-23 03:47:39 Hail: INFO: Coerced sorted dataset
2021-09-23 03:48:13 Hail: INFO: Coerced sorted dataset
2021-09-23 03:51:24 Hail: INFO: Coerced sorted dataset
2021-09-23 03:52:03 Hail: INFO: Coerced sorted dataset
2021-09-23 03:55:12 Hail: INFO: Coerced sorted dataset
2021-09-23 03:55:50 Hail: INFO: Coerced sorted dataset
2021-09-23 03:58:18 Hail: INFO: Coerced sorted dataset
2021-09-23 03:58:18 Hail: INFO: Coerced sorted dataset


{'from_broad_gvcf': (14668414, 1),
 'from_cram': (14668278, 1),
 'realign': (14425591, 1),
 'hc_realign': (0, 1)}

In [18]:
truth_mt_path = (
    'gs://cpg-reference/validation/giab/truth/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.mt/'
)
truth_mt = hl.read_matrix_table(truth_mt_path)
truth_mt = truth_mt.key_cols_by(s = truth_mt.s.replace('HG001', TRUTH_SAMPLE))
truth_mt = filter_highconf(truth_mt)
truth_mt = hl.split_multi(truth_mt)

In [29]:
truth_mt.cols().show()

str
"""NA12878"""


In [5]:
{k: mt.count() for k, mt in mt_by_key.items()}, truth_mt.count()

2021-09-23 03:32:52 Hail: INFO: Coerced sorted dataset
2021-09-23 03:33:06 Hail: INFO: Coerced sorted dataset
2021-09-23 03:34:20 Hail: INFO: Coerced sorted dataset
2021-09-23 03:34:36 Hail: INFO: Coerced sorted dataset
2021-09-23 03:35:50 Hail: INFO: Coerced sorted dataset
2021-09-23 03:36:04 Hail: INFO: Coerced sorted dataset
2021-09-23 03:37:01 Hail: INFO: Coerced sorted dataset
2021-09-23 03:37:01 Hail: INFO: Coerced sorted dataset


({'from_broad_gvcf': (14668414, 1),
  'from_cram': (14668278, 1),
  'realign': (14425591, 1),
  'hc_realign': (0, 1)},
 (3577097, 1))

In [None]:
hl.filter_intervals(truth_mt, [hl.parse_locus_interval('chr5:1917500-1917600')]).show()

In [30]:
summary, sample_conc_ht, sites_conc_ht = hl.concordance(mt_by_key['from_broad_gvcf'], truth_mt)
print_conc_summary(summary)

2021-09-23 05:35:31 Hail: INFO: concordance: including 1 shared samples (1 total on left, 1 total on right)
2021-09-23 05:35:31 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'locus' -> 'locus_1'
    'alleles' -> 'alleles_1'


Summary:
[[0, 0, 0, 9761, 7472], [230, 0, 0, 20, 2], [7439147, 0, 0, 0, 0], [2249523, 0, 0, 2128827, 5846], [1419650, 0, 0, 886, 1424283]]
Left = called: 14668162
Right = truth: 3577097
Unique to left = FP [precision]: 11115052 [24.22%]
Unique to right = FN [recall]: 23987 [99.33%]
Concordant = TP [discordant]: 3553110 [6732]


2021-09-23 06:01:07 Hail: INFO: concordance: total concordance 99.81%


In [31]:
summary_from_cram, sample_conc_from_cram_ht, sites_conc_from_cram_ht = hl.concordance(mt_by_key['from_cram'], truth_mt)
print_conc_summary(summary_from_cram)

2021-09-23 06:01:09 Hail: INFO: concordance: including 1 shared samples (1 total on left, 1 total on right)
2021-09-23 06:01:09 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'locus' -> 'locus_1'
    'alleles' -> 'alleles_1'


Summary:
[[0, 0, 0, 9760, 7473], [5, 0, 0, 0, 1], [7439117, 0, 0, 0, 0], [2249625, 0, 0, 2128847, 5826], [1419667, 0, 0, 887, 1424303]]
Left = called: 14668272
Right = truth: 3577097
Unique to left = FP [precision]: 11115122 [24.22%]
Unique to right = FN [recall]: 23947 [99.33%]
Concordant = TP [discordant]: 3553150 [6713]


2021-09-23 06:06:57 Hail: INFO: concordance: total concordance 99.81%


In [25]:
mt_by_key['hc_realign'].cols().show()
'NA12878-realign'

str
"""NA12878-hc-from-bam"""


'NA12878-realign'

In [24]:
mt_by_key['realign'] = mt_by_key['realign'].key_cols_by(s = mt_by_key['realign'].s.replace('NA12878-realign', TRUTH_SAMPLE))
summary_realign, sample_conc_realign_ht, sites_conc_realign_ht = hl.concordance(mt_by_key['realign'], truth_mt)
print_conc_summary(summary_realign)

2021-09-23 11:30:28 Hail: INFO: concordance: including 1 shared samples (1 total on left, 1 total on right)
2021-09-23 11:30:28 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'locus' -> 'locus_1'
    'alleles' -> 'alleles_1'


Summary:
[[0, 0, 0, 47400, 30632], [5, 0, 0, 0, 1], [7340257, 0, 0, 0, 0], [2188999, 0, 0, 2090667, 5656], [1397265, 0, 0, 1427, 1401314]]
Left = called: 14425585
Right = truth: 3577097
Unique to left = FP [precision]: 10933604 [24.21%]
Unique to right = FN [recall]: 85116 [97.62%]
Concordant = TP [discordant]: 3491981 [7083]


2021-09-23 11:38:53 Hail: INFO: concordance: total concordance 99.80%


In [26]:
mt_by_key['hc_realign'] = mt_by_key['hc_realign'].key_cols_by(s = mt_by_key['hc_realign'].s.replace(mt_by_key['hc_realign'].s.collect()[0], TRUTH_SAMPLE))
summary_realign_hc, sample_conc_realign_hc_ht, sites_conc_realign_hc_ht = hl.concordance(mt_by_key['hc_realign'], truth_mt)
print_conc_summary(summary_realign_hc)

2021-09-23 11:38:55 Hail: INFO: concordance: including 1 shared samples (1 total on left, 1 total on right)
2021-09-23 11:38:55 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'locus' -> 'locus_1'
    'alleles' -> 'alleles_1'


Summary:
[[0, 0, 0, 42762, 25937], [4, 0, 0, 0, 0], [4768219, 0, 0, 0, 0], [2161416, 0, 0, 2095253, 2043], [1392993, 0, 0, 1479, 1409623]]
Left = called: 11831026
Right = truth: 3577097
Unique to left = FP [precision]: 8326150 [29.62%]
Unique to right = FN [recall]: 72221 [97.98%]
Concordant = TP [discordant]: 3504876 [3522]


2021-09-23 11:41:30 Hail: INFO: concordance: total concordance 99.90%


In [20]:
def run_conc(key):
    mt = mt_by_key[key]
    print(f'{key}: running concordance')
    summary, sample_conc_ht, sites_conc_ht = hl.concordance(mt, truth_mt)
    print_conc_summary(summary)
    print(f'{key}: writing summary')
    with hl.hadoop_open(f'{work_bucket}/results/summary_{key}.pickle', 'w') as f:
        f.write(str(summary))
    print(f'{key}: writing sample_conc_ht')
    sample_conc_ht = sample_conc_ht.checkpoint(f'{work_bucket}/results/sample_conc_{key}.ht')
    print(f'{key}: writing sites_conc_ht')
    sites_conc_ht = sites_conc_ht.checkpoint(f'{work_bucket}/results/sites_conc_{key}.ht')

In [21]:
for k in mt_by_key:
    run_conc(k)
    
gnomad_mt = gnomad_mt.key_cols_by(s = gnomad_mt.s.replace('gnomad_sn', 'NA12878'))


from_broad_gvcf: running concordance


2021-09-23 04:54:30 Hail: INFO: concordance: including 0 shared samples (1 total on left, 1 total on right)
2021-09-23 04:54:30 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'locus' -> 'locus_1'
    'alleles' -> 'alleles_1'


Summary:
[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]


2021-09-23 04:56:56 Hail: INFO: concordance: total concordance nan%


ZeroDivisionError: division by zero

In [35]:
print(f'{work_bucket}/results')

gs://cpg-tob-wgs-test-analysis/jupyter/vsavelyev/results


# From fewgenomes joint-calling MT

### Reading the fewgenomes matrix table of 122 samples

In [27]:
mt = utils.get_mt(
    fewgenomes_v2_raw_mt_path, 
    split=True, 
    add_meta=True, 
    meta_ht=hl.read_table(meta_v2_ht_path)
)

### Subsetting to NA12878 - 5m variants

In [28]:
mt = mt.filter_cols(hl.literal([TRUTH_SAMPLE]).contains(mt['s']))
mt = mt.filter_rows((hl.len(mt.alleles) > 1) & (hl.agg.any(mt.GT.is_non_ref())))
mt.count()

(5139649, 1)

### Annotating with VQSR

In [30]:
vqsr_ht = hl.read_table(vqsr_final_filter_ht_path)
mt = mt.annotate_rows(**vqsr_ht[mt.row_key])
mt = mt.annotate_globals(**vqsr_ht.index_globals())

In [33]:
mt.entries().show()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,Unnamed: 21_level_0,Unnamed: 22_level_0,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0,Unnamed: 29_level_0,Unnamed: 30_level_0,Unnamed: 31_level_0,Unnamed: 32_level_0,Unnamed: 33_level_0,Unnamed: 34_level_0,Unnamed: 35_level_0,Unnamed: 36_level_0,Unnamed: 37_level_0,Unnamed: 38_level_0,Unnamed: 39_level_0,Unnamed: 40_level_0,Unnamed: 41_level_0,Unnamed: 42_level_0,Unnamed: 43_level_0,Unnamed: 44_level_0,Unnamed: 45_level_0,Unnamed: 46_level_0,Unnamed: 47_level_0,Unnamed: 48_level_0,Unnamed: 49_level_0,Unnamed: 50_level_0,Unnamed: 51_level_0,Unnamed: 52_level_0,Unnamed: 53_level_0,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,meta,Unnamed: 122_level_0,Unnamed: 123_level_0,Unnamed: 124_level_0,Unnamed: 125_level_0,Unnamed: 126_level_0,Unnamed: 127_level_0,Unnamed: 128_level_0,Unnamed: 129_level_0,Unnamed: 130_level_0,Unnamed: 131_level_0,Unnamed: 132_level_0,Unnamed: 133_level_0,Unnamed: 134_level_0,Unnamed: 135_level_0,Unnamed: 136_level_0,Unnamed: 137_level_0,Unnamed: 138_level_0,Unnamed: 139_level_0,Unnamed: 140_level_0,Unnamed: 141_level_0,Unnamed: 142_level_0,Unnamed: 143_level_0,Unnamed: 144_level_0,Unnamed: 145_level_0,Unnamed: 146_level_0,Unnamed: 147_level_0,Unnamed: 148_level_0,Unnamed: 149_level_0,Unnamed: 150_level_0,Unnamed: 151_level_0,Unnamed: 152_level_0,Unnamed: 153_level_0,Unnamed: 154_level_0,Unnamed: 155_level_0,Unnamed: 156_level_0,Unnamed: 157_level_0,Unnamed: 158_level_0,Unnamed: 159_level_0,Unnamed: 160_level_0,Unnamed: 161_level_0,Unnamed: 162_level_0,Unnamed: 163_level_0,Unnamed: 164_level_0,Unnamed: 165_level_0
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,vqsr,vqsr,vqsr,vqsr,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,impute_sex_stats,impute_sex_stats,impute_sex_stats,impute_sex_stats,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,gvcf_info,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1
locus,alleles,rsid,n_unsplit_alleles,mixed_site,a_index,was_split,qual,filters,allele_type,AS_MQRankSum,AS_pab_max,AS_QD,AS_ReadPosRankSum,AS_SOR,n_alt_alleles,variant_type,hapmap,omni,mills,kgp_phase1_hc,fail_hard_filters,transmitted_singleton,singleton,ac_raw,ac,ac_qc_samples_unrelated_raw,positive_train_site,negative_train_site,non_lcr,non_lcr_singleton_bin,non_lcr_biallelic_bin,bin,non_lcr_biallelic_singleton_bin,biallelic_bin,non_lcr_bin,singleton_bin,biallelic_singleton_bin,monoallelic,AS_VQSLOD,raw_non_lcr_singleton_bin,raw_non_lcr_biallelic_bin,raw_bin,raw_non_lcr_biallelic_singleton_bin,raw_biallelic_bin,raw_non_lcr_bin,raw_singleton_bin,raw_biallelic_singleton_bin,AS_VQSLOD,AS_culprit,NEGATIVE_TRAIN_SITE,POSITIVE_TRAIN_SITE,SOR,s,is_female,chr20_mean_dp,chrX_mean_dp,chrY_mean_dp,chrX_ploidy,chrY_ploidy,X_karyotype,Y_karyotype,sex_karyotype,f_stat,n_called,expected_homs,observed_homs,n_filtered,n_hom_ref,n_het,n_hom_var,n_non_ref,n_singleton,n_snp,n_insertion,n_deletion,n_transition,n_transversion,n_star,r_ti_tv,r_het_hom_var,r_insertion_deletion,nongnomad_snps,hard_filters,n_snp_residual,n_singleton_residual,r_ti_tv_residual,r_insertion_deletion_residual,n_insertion_residual,n_deletion_residual,r_het_hom_var_residual,n_het_residual,n_hom_var_residual,n_transition_residual,n_transversion_residual,fail_n_snp_residual,fail_n_singleton_residual,fail_r_ti_tv_residual,fail_r_insertion_deletion_residual,fail_n_insertion_residual,fail_n_deletion_residual,fail_r_het_hom_var_residual,fail_n_het_residual,fail_n_hom_var_residual,fail_n_transition_residual,fail_n_transversion_residual,qc_metrics_filters,training_pop,pca_scores,pop,prob_EUR,population,gvcf,r_contamination,r_chimera,r_duplication,median_insert_size,related_before_qc,related,release_filters,high_quality,release,RGQ,END,AC,AF,AN,AS_BaseQRankSum,AS_FS,AS_InbreedingCoeff,AS_MQ,AS_MQRankSum,AS_QD,AS_QUALapprox,AS_ReadPosRankSum,AS_SOR,AS_VarDP,BaseQRankSum,ClippingRankSum,ExcessHet,FS,InbreedingCoeff,MQ,MQ0,MQRankSum,MQ_DP,QD,QUALapprox,RAW_GT_COUNT,RAW_MQ,RAW_MQandDP,ReadPosRankSum,SOR,VarDP,VariantType,AB,DP,GQ,MIN_DP,MQ0,PID,SB,GT,PGT,AD,PL
locus<GRCh38>,array<str>,str,int32,bool,int32,bool,float64,set<str>,str,float64,float64,float32,float64,float64,int32,str,bool,bool,bool,bool,bool,bool,bool,int64,int64,int64,bool,bool,bool,int32,int32,int32,int32,int32,int32,int32,int32,bool,float64,int32,int32,int32,int32,int32,int32,int32,int32,float64,str,bool,bool,float64,str,bool,float32,float32,float32,float32,float32,str,str,str,float64,int64,float64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,float64,float64,float64,int64,set<str>,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,set<str>,str,array<float64>,str,float64,str,str,float64,float64,float64,float64,bool,bool,set<str>,bool,bool,int32,int32,array<int32>,array<float64>,int32,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<float64>,array<int32>,array<float64>,array<float64>,array<int32>,float64,float64,float64,float64,float64,float64,int32,float64,int32,float64,int32,array<int32>,float64,array<int32>,float64,float64,int32,str,float64,int32,int32,int32,int32,str,array<int32>,call,call,array<int32>,array<int32>
chr1:10327,"[""T"",""C""]",,2,False,1,False,-10.0,"{""VQSR""}","""snv""",1.07,1.0,8.68,-0.72,0.0884,1,"""snv""",,,,,False,,False,11,2,10,False,False,False,,,99,,99,,,,False,-49.7,,,99,,99,,,,-49.7,"""AS_MQ""",False,False,0.134,"""NA12878""",True,33.1,30.1,3.92,1.82,0.237,"""XX""","""""","""XX""",-1.0,148733,114000.0,80074,24061470,1098610,2464997,1452739,3917736,19500,4903989,198903,267583,3263007,1640982,0,1.99,1.7,0.743,6148559,{},19300.0,-55100.0,0.00481,0.000916,535.0,384.0,-0.0336,-21500.0,20900.0,15500.0,3790.0,False,True,False,False,False,False,False,False,False,False,False,"{""n_singleton_residual""}","""EUR""","[-8.47e-02,-2.01e-03,-1.02e-02,-1.20e-02,8.05e-04,-6.64e-03,3.47e-03,1.89e-03,-1.15e-03,1.63e-03,-1.31e-03,3.07e-04,-1.07e-03,6.55e-03,4.98e-03,-5.14e-04,-2.42e-03,-3.37e-03,5.60e-03,-2.88e-03,-1.34e-03,-3.60e-04,6.70e-03,-1.61e-03,4.44e-04,-5.94e-03,2.28e-02,7.88e-02,-3.30e-03,1.69e-02]","""EUR""",1.0,"""EUR""","""gs://cpg-fewgenomes-main-tmp/joint-calling/v2/combiner/gvcf/NA12878.g.vcf.gz""",,,,,False,False,"{""n_singleton_residual""}",True,False,50,,,,,,,,,,,"[NA,46]",,,"[2,2]",1.03,,,,,,,1.03,9,,46,"[0,1,0]",5780.0,"[5780,9]",0.736,,4,,0.5,4,16,,,,"[0,2,0,2]",0/1,,"[2,2]","[46,0,16]"
chr1:10439,"[""AC"",""A""]",,2,False,1,False,-10.0,"{""VQSR""}","""del""",-0.925,1.0,13.8,0.358,1.43,1,"""indel""",,,,,False,,False,35,5,33,False,False,False,,,87,,88,,,,False,-0.633,,,87,,87,,,,-0.633,"""AS_FS""",False,False,1.45,"""NA12878""",True,33.1,30.1,3.92,1.82,0.237,"""XX""","""""","""XX""",-1.0,148733,114000.0,80074,24061470,1098610,2464997,1452739,3917736,19500,4903989,198903,267583,3263007,1640982,0,1.99,1.7,0.743,6148559,{},19300.0,-55100.0,0.00481,0.000916,535.0,384.0,-0.0336,-21500.0,20900.0,15500.0,3790.0,False,True,False,False,False,False,False,False,False,False,False,"{""n_singleton_residual""}","""EUR""","[-8.47e-02,-2.01e-03,-1.02e-02,-1.20e-02,8.05e-04,-6.64e-03,3.47e-03,1.89e-03,-1.15e-03,1.63e-03,-1.31e-03,3.07e-04,-1.07e-03,6.55e-03,4.98e-03,-5.14e-04,-2.42e-03,-3.37e-03,5.60e-03,-2.88e-03,-1.34e-03,-3.60e-04,6.70e-03,-1.61e-03,4.44e-04,-5.94e-03,2.28e-02,7.88e-02,-3.30e-03,1.69e-02]","""EUR""",1.0,"""EUR""","""gs://cpg-fewgenomes-main-tmp/joint-calling/v2/combiner/gvcf/NA12878.g.vcf.gz""",,,,,False,False,"{""n_singleton_residual""}",True,False,94,,,,,,,,,,,"[NA,90]",,,"[2,4]",-1.38,,,,,,,0.72,9,,90,"[0,1,0]",11800.0,"[11804,9]",-1.38,,6,,,6,22,,,,"[1,1,1,3]",0/1,,"[2,4]","[90,0,22]"
chr1:10492,"[""C"",""T""]",,2,False,1,False,-10.0,"{""VQSR""}","""snv""",-0.185,1.0,9.7,0.212,0.642,1,"""snv""",,,,,False,,False,19,5,19,False,False,False,,,97,,97,,,,False,-24.3,,,97,,97,,,,-24.3,"""AS_MQ""",False,False,0.642,"""NA12878""",True,33.1,30.1,3.92,1.82,0.237,"""XX""","""""","""XX""",-1.0,148733,114000.0,80074,24061470,1098610,2464997,1452739,3917736,19500,4903989,198903,267583,3263007,1640982,0,1.99,1.7,0.743,6148559,{},19300.0,-55100.0,0.00481,0.000916,535.0,384.0,-0.0336,-21500.0,20900.0,15500.0,3790.0,False,True,False,False,False,False,False,False,False,False,False,"{""n_singleton_residual""}","""EUR""","[-8.47e-02,-2.01e-03,-1.02e-02,-1.20e-02,8.05e-04,-6.64e-03,3.47e-03,1.89e-03,-1.15e-03,1.63e-03,-1.31e-03,3.07e-04,-1.07e-03,6.55e-03,4.98e-03,-5.14e-04,-2.42e-03,-3.37e-03,5.60e-03,-2.88e-03,-1.34e-03,-3.60e-04,6.70e-03,-1.61e-03,4.44e-04,-5.94e-03,2.28e-02,7.88e-02,-3.30e-03,1.69e-02]","""EUR""",1.0,"""EUR""","""gs://cpg-fewgenomes-main-tmp/joint-calling/v2/combiner/gvcf/NA12878.g.vcf.gz""",,,,,False,False,"{""n_singleton_residual""}",True,False,107,,,,,,,,,,,"[NA,95]",,,"[4,4]",-1.64,,,,,,,-2.02,9,,95,"[0,1,0]",14000.0,"[13962,9]",2.02,,8,,0.5,8,95,,,,"[3,1,2,2]",0/1,,"[4,4]","[95,0,115]"
chr1:13813,"[""T"",""G""]",,2,False,1,False,-10.0,"{""VQSR""}","""snv""",-2.04,1.0,7.59,0.143,6.7,1,"""snv""",,,,,True,,False,60,16,60,False,False,False,,,100,,100,,,,False,-127.0,,,100,,100,,,,-127.0,"""AS_MQ""",False,False,6.7,"""NA12878""",True,33.1,30.1,3.92,1.82,0.237,"""XX""","""""","""XX""",-1.0,148733,114000.0,80074,24061470,1098610,2464997,1452739,3917736,19500,4903989,198903,267583,3263007,1640982,0,1.99,1.7,0.743,6148559,{},19300.0,-55100.0,0.00481,0.000916,535.0,384.0,-0.0336,-21500.0,20900.0,15500.0,3790.0,False,True,False,False,False,False,False,False,False,False,False,"{""n_singleton_residual""}","""EUR""","[-8.47e-02,-2.01e-03,-1.02e-02,-1.20e-02,8.05e-04,-6.64e-03,3.47e-03,1.89e-03,-1.15e-03,1.63e-03,-1.31e-03,3.07e-04,-1.07e-03,6.55e-03,4.98e-03,-5.14e-04,-2.42e-03,-3.37e-03,5.60e-03,-2.88e-03,-1.34e-03,-3.60e-04,6.70e-03,-1.61e-03,4.44e-04,-5.94e-03,2.28e-02,7.88e-02,-3.30e-03,1.69e-02]","""EUR""",1.0,"""EUR""","""gs://cpg-fewgenomes-main-tmp/joint-calling/v2/combiner/gvcf/NA12878.g.vcf.gz""",,,,,False,False,"{""n_singleton_residual""}",True,False,117,,,,,,,,,,,"[NA,117]",,,"[0,2]",,,,,,,,,2,,117,"[0,0,1]",1010.0,"[1013,2]",,,2,,,2,9,,,"""13813_T_G""","[0,0,0,2]",1/1,0|1,"[0,2]","[117,9,0]"
chr1:13838,"[""C"",""T""]",,2,False,1,False,-10.0,"{""InbreedingCoeff"",""VQSR""}","""snv""",-2.28,1.0,6.42,0.271,8.12,1,"""snv""",,,,,True,,False,69,23,69,False,False,False,,,100,,100,,,,False,-159.0,,,100,,100,,,,-159.0,"""AS_MQ""",False,False,8.12,"""NA12878""",True,33.1,30.1,3.92,1.82,0.237,"""XX""","""""","""XX""",-1.0,148733,114000.0,80074,24061470,1098610,2464997,1452739,3917736,19500,4903989,198903,267583,3263007,1640982,0,1.99,1.7,0.743,6148559,{},19300.0,-55100.0,0.00481,0.000916,535.0,384.0,-0.0336,-21500.0,20900.0,15500.0,3790.0,False,True,False,False,False,False,False,False,False,False,False,"{""n_singleton_residual""}","""EUR""","[-8.47e-02,-2.01e-03,-1.02e-02,-1.20e-02,8.05e-04,-6.64e-03,3.47e-03,1.89e-03,-1.15e-03,1.63e-03,-1.31e-03,3.07e-04,-1.07e-03,6.55e-03,4.98e-03,-5.14e-04,-2.42e-03,-3.37e-03,5.60e-03,-2.88e-03,-1.34e-03,-3.60e-04,6.70e-03,-1.61e-03,4.44e-04,-5.94e-03,2.28e-02,7.88e-02,-3.30e-03,1.69e-02]","""EUR""",1.0,"""EUR""","""gs://cpg-fewgenomes-main-tmp/joint-calling/v2/combiner/gvcf/NA12878.g.vcf.gz""",,,,,False,False,"{""n_singleton_residual""}",True,False,117,,,,,,,,,,,"[NA,117]",,,"[0,3]",,,,,,,,,3,,117,"[0,0,1]",1500.0,"[1497,3]",,,3,,,3,9,,,"""13813_T_G""","[0,0,0,3]",1/1,0|1,"[0,3]","[117,9,0]"
chr1:13912,"[""G"",""A""]",,2,False,1,False,-10.0,"{""VQSR""}","""snv""",-1.92,1.0,5.4,0.72,5.18,1,"""snv""",,,,,True,,False,22,2,22,False,False,False,,,100,,100,,,,False,-160.0,,,100,,100,,,,-160.0,"""AS_MQ""",False,False,5.18,"""NA12878""",True,33.1,30.1,3.92,1.82,0.237,"""XX""","""""","""XX""",-1.0,148733,114000.0,80074,24061470,1098610,2464997,1452739,3917736,19500,4903989,198903,267583,3263007,1640982,0,1.99,1.7,0.743,6148559,{},19300.0,-55100.0,0.00481,0.000916,535.0,384.0,-0.0336,-21500.0,20900.0,15500.0,3790.0,False,True,False,False,False,False,False,False,False,False,False,"{""n_singleton_residual""}","""EUR""","[-8.47e-02,-2.01e-03,-1.02e-02,-1.20e-02,8.05e-04,-6.64e-03,3.47e-03,1.89e-03,-1.15e-03,1.63e-03,-1.31e-03,3.07e-04,-1.07e-03,6.55e-03,4.98e-03,-5.14e-04,-2.42e-03,-3.37e-03,5.60e-03,-2.88e-03,-1.34e-03,-3.60e-04,6.70e-03,-1.61e-03,4.44e-04,-5.94e-03,2.28e-02,7.88e-02,-3.30e-03,1.69e-02]","""EUR""",1.0,"""EUR""","""gs://cpg-fewgenomes-main-tmp/joint-calling/v2/combiner/gvcf/NA12878.g.vcf.gz""",,,,,False,False,"{""n_singleton_residual""}",True,False,53,,,,,,,,,,,"[NA,53]",,,"[0,2]",,,,,,,,,2,,53,"[0,0,1]",968.0,"[968,2]",,,2,,,2,6,,,,"[0,0,0,2]",1/1,,"[0,2]","[53,6,0]"
chr1:14248,"[""T"",""G""]",,2,False,1,False,-10.0,"{""VQSR""}","""snv""",-0.674,1.0,4.04,0.406,6.42,1,"""snv""",,,,,True,,False,58,17,57,False,False,False,,,100,,100,,,,False,-144.0,,,100,,100,,,,-144.0,"""AS_MQ""",False,False,7.82,"""NA12878""",True,33.1,30.1,3.92,1.82,0.237,"""XX""","""""","""XX""",-1.0,148733,114000.0,80074,24061470,1098610,2464997,1452739,3917736,19500,4903989,198903,267583,3263007,1640982,0,1.99,1.7,0.743,6148559,{},19300.0,-55100.0,0.00481,0.000916,535.0,384.0,-0.0336,-21500.0,20900.0,15500.0,3790.0,False,True,False,False,False,False,False,False,False,False,False,"{""n_singleton_residual""}","""EUR""","[-8.47e-02,-2.01e-03,-1.02e-02,-1.20e-02,8.05e-04,-6.64e-03,3.47e-03,1.89e-03,-1.15e-03,1.63e-03,-1.31e-03,3.07e-04,-1.07e-03,6.55e-03,4.98e-03,-5.14e-04,-2.42e-03,-3.37e-03,5.60e-03,-2.88e-03,-1.34e-03,-3.60e-04,6.70e-03,-1.61e-03,4.44e-04,-5.94e-03,2.28e-02,7.88e-02,-3.30e-03,1.69e-02]","""EUR""",1.0,"""EUR""","""gs://cpg-fewgenomes-main-tmp/joint-calling/v2/combiner/gvcf/NA12878.g.vcf.gz""",,,,,False,False,"{""n_singleton_residual""}",True,False,77,,,,,,,,,,,"[NA,74]",,,"[1,3]",-0.727,,,,,,,0.727,4,,74,"[0,1,0]",2260.0,"[2263,4]",0.727,,4,,0.25,4,16,,,,"[1,0,3,0]",0/1,,"[1,3]","[74,0,16]"
chr1:14354,"[""C"",""A""]",,2,False,1,False,-10.0,"{""VQSR""}","""snv""",-0.922,1.0,2.98,0.778,6.94,1,"""snv""",,,,,True,,False,45,13,44,False,False,False,,,100,,100,,,,False,-170.0,,,100,,100,,,,-170.0,"""AS_MQ""",False,False,8.28,"""NA12878""",True,33.1,30.1,3.92,1.82,0.237,"""XX""","""""","""XX""",-1.0,148733,114000.0,80074,24061470,1098610,2464997,1452739,3917736,19500,4903989,198903,267583,3263007,1640982,0,1.99,1.7,0.743,6148559,{},19300.0,-55100.0,0.00481,0.000916,535.0,384.0,-0.0336,-21500.0,20900.0,15500.0,3790.0,False,True,False,False,False,False,False,False,False,False,False,"{""n_singleton_residual""}","""EUR""","[-8.47e-02,-2.01e-03,-1.02e-02,-1.20e-02,8.05e-04,-6.64e-03,3.47e-03,1.89e-03,-1.15e-03,1.63e-03,-1.31e-03,3.07e-04,-1.07e-03,6.55e-03,4.98e-03,-5.14e-04,-2.42e-03,-3.37e-03,5.60e-03,-2.88e-03,-1.34e-03,-3.60e-04,6.70e-03,-1.61e-03,4.44e-04,-5.94e-03,2.28e-02,7.88e-02,-3.30e-03,1.69e-02]","""EUR""",1.0,"""EUR""","""gs://cpg-fewgenomes-main-tmp/joint-calling/v2/combiner/gvcf/NA12878.g.vcf.gz""",,,,,False,False,"{""n_singleton_residual""}",True,False,59,,,,,,,,,,,"[NA,50]",,,"[3,3]",0.406,,,,,,,-0.406,6,,50,"[0,1,0]",4050.0,"[4051,6]",0.406,,6,,0.5,6,50,,,,"[2,1,3,0]",0/1,,"[3,3]","[50,0,78]"
chr1:14599,"[""T"",""A""]",,2,False,1,False,-10.0,"{""VQSR""}","""snv""",-2.49,0.687,4.28,0.72,2.2,1,"""snv""",,,,,True,,False,22,3,22,False,False,False,,,99,,100,,,,False,-79.6,,,99,,100,,,,-79.6,"""AS_MQ""",False,False,2.2,"""NA12878""",True,33.1,30.1,3.92,1.82,0.237,"""XX""","""""","""XX""",-1.0,148733,114000.0,80074,24061470,1098610,2464997,1452739,3917736,19500,4903989,198903,267583,3263007,1640982,0,1.99,1.7,0.743,6148559,{},19300.0,-55100.0,0.00481,0.000916,535.0,384.0,-0.0336,-21500.0,20900.0,15500.0,3790.0,False,True,False,False,False,False,False,False,False,False,False,"{""n_singleton_residual""}","""EUR""","[-8.47e-02,-2.01e-03,-1.02e-02,-1.20e-02,8.05e-04,-6.64e-03,3.47e-03,1.89e-03,-1.15e-03,1.63e-03,-1.31e-03,3.07e-04,-1.07e-03,6.55e-03,4.98e-03,-5.14e-04,-2.42e-03,-3.37e-03,5.60e-03,-2.88e-03,-1.34e-03,-3.60e-04,6.70e-03,-1.61e-03,4.44e-04,-5.94e-03,2.28e-02,7.88e-02,-3.30e-03,1.69e-02]","""EUR""",1.0,"""EUR""","""gs://cpg-fewgenomes-main-tmp/joint-calling/v2/combiner/gvcf/NA12878.g.vcf.gz""",,,,,False,False,"{""n_singleton_residual""}",True,False,84,,,,,,,,,,,"[NA,60]",,,"[8,2]",0.48,,,,,,,-1.73,10,,60,"[0,1,0]",5830.0,"[5827,10]",-1.27,,10,,0.8,10,60,,,"""14599_T_A""","[8,0,1,1]",0/1,0|1,"[8,2]","[60,0,330]"
chr1:14604,"[""A"",""G""]",,2,False,1,False,-10.0,"{""VQSR""}","""snv""",-2.69,0.687,4.73,0.404,1.86,1,"""snv""",,,,,True,,False,22,3,22,False,False,False,,,99,,100,,,,False,-77.8,,,99,,100,,,,-77.8,"""AS_MQ""",False,False,1.86,"""NA12878""",True,33.1,30.1,3.92,1.82,0.237,"""XX""","""""","""XX""",-1.0,148733,114000.0,80074,24061470,1098610,2464997,1452739,3917736,19500,4903989,198903,267583,3263007,1640982,0,1.99,1.7,0.743,6148559,{},19300.0,-55100.0,0.00481,0.000916,535.0,384.0,-0.0336,-21500.0,20900.0,15500.0,3790.0,False,True,False,False,False,False,False,False,False,False,False,"{""n_singleton_residual""}","""EUR""","[-8.47e-02,-2.01e-03,-1.02e-02,-1.20e-02,8.05e-04,-6.64e-03,3.47e-03,1.89e-03,-1.15e-03,1.63e-03,-1.31e-03,3.07e-04,-1.07e-03,6.55e-03,4.98e-03,-5.14e-04,-2.42e-03,-3.37e-03,5.60e-03,-2.88e-03,-1.34e-03,-3.60e-04,6.70e-03,-1.61e-03,4.44e-04,-5.94e-03,2.28e-02,7.88e-02,-3.30e-03,1.69e-02]","""EUR""",1.0,"""EUR""","""gs://cpg-fewgenomes-main-tmp/joint-calling/v2/combiner/gvcf/NA12878.g.vcf.gz""",,,,,False,False,"{""n_singleton_residual""}",True,False,84,,,,,,,,,,,"[NA,60]",,,"[8,2]",-0.48,,,,,,,-1.73,10,,60,"[0,1,0]",5830.0,"[5827,10]",-1.27,,10,,0.8,10,60,,,"""14599_T_A""","[8,0,1,1]",0/1,0|1,"[8,2]","[60,0,330]"


### Subsetting to passing variants (5.1m down to 4.3m)

In [48]:
mtf = mt.filter_rows(mt.filters.length() == 0)
mtf.rows().count() / mt.rows().count()

0.8599509421752342

In [35]:
mtf.rows().count()

(4419846, 1)

### Subsetting to chr21 for speed

In [20]:
mt21 = hl.filter_intervals(mt, [hl.parse_locus_interval('chr21')])
mt21f = mt21.filter_rows(mt21.filters.length() == 0)

In [50]:
mt21.rows().count(), mt21f.rows().count(), mt21f.rows().count() / mt21.rows().count()

((90102, 1), (62536, 1), 0.6940578455528179)

### Subsetting to the NA12878 high confidence regions

In [23]:
mt_hc = filter_highconf(mt)
mtf_hc = filter_highconf(mtf)
mt21_hc = filter_highconf(mt21)
mt21f_hc = filter_highconf(mt21f)

2021-07-26 03:14:27 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
2021-07-26 03:14:27 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
2021-07-26 03:14:28 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
2021-07-26 03:14:29 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)


### Saving to the disk to faster reruns

In [70]:
mt_path = join(work_bucket, 'mt.mt')
mtf_path = join(work_bucket, 'mtf.mt')
mt21_path = join(work_bucket, 'mt21.mt')
mt21f_path = join(work_bucket, 'mt21f.mt')
mt_hc_path = join(work_bucket, 'mt_hc.mt')
mtf_hc_path = join(work_bucket, 'mtf_hc.mt')
mt21_hc_path = join(work_bucket, 'mt21_hc.mt')
mt21f_hc_path = join(work_bucket, 'mt21f_hc.mt')
# mt.write(mt_path, overwrite=True)
# mtf.write(mtf_path, overwrite=True)
# mt21.write(mt21_path, overwrite=True)
# mt21f.write(mt21f_path, overwrite=True)
# mt_hc.write(mt_hc_path, overwrite=True)
# mtf_hc.write(mtf_hc_path, overwrite=True)
# mt21_hc.write(mt21_hc_path, overwrite=True)
# mt21f_hc.write(mt21f_hc_path, overwrite=True)
mt = hl.read_matrix_table(mt_path)
mtf = hl.read_matrix_table(mtf_path)
mt21 = hl.read_matrix_table(mt21_path)
mt21f = hl.read_matrix_table(mt21f_path)
mt_hc = hl.read_matrix_table(mt_hc_path)
mtf_hc = hl.read_matrix_table(mtf_hc_path)
mt21_hc = hl.read_matrix_table(mt21_hc_path)
mt21f_hc = hl.read_matrix_table(mt21f_hc_path)

### Exploring % of filtered and % of HC

There are ~30% of variants outside of the HC regions (5.1m > 3.6m), 
or ~22% (4.3m > 3.3m) for filtered variants only.

In [25]:
mt_hc.rows().count() / mt.rows().count(), \
mtf_hc.rows().count() / mtf.rows().count()

2021-07-26 03:18:16 Hail: INFO: Coerced sorted dataset
2021-07-26 03:18:46 Hail: INFO: Coerced sorted dataset


(0.7008891074079183, 0.7859941273971989)

Percentages are a bit difference for chr21:

In [24]:
mt21_hc.rows().count() / mt21.rows().count(), mt21f_hc.rows().count() / mt21f.rows().count()

2021-07-26 03:15:12 Hail: INFO: Coerced sorted dataset
2021-07-26 03:15:19 Hail: INFO: Coerced sorted dataset


(0.5759694568378061, 0.8136913138032493)

Plotting the filtered variants % per contig

In [31]:
ref = hl.get_reference('GRCh38')
contigs = [c for c in ref.contigs if len(c) <= 5]

In [32]:
total_per_contig = [hl.filter_intervals(mt_hc, [hl.parse_locus_interval(c)]).rows().count() for c in contigs]
filtered_per_contig = [hl.filter_intervals(mtf_hc, [hl.parse_locus_interval(c)]).rows().count() for c in contigs]
gn_total_per_contig = [hl.filter_intervals(gnomad_hc_mt, [hl.parse_locus_interval(c)]).rows().count() for c in contigs]
gn_filtered_per_contig = [hl.filter_intervals(gnomad_hc_mtf, [hl.parse_locus_interval(c)]).rows().count() for c in contigs]

In [59]:
p = figure(x_range=contigs)
p.vbar(x=contigs, top=[
    (float(f) / float(t) * total_per_contig[0]) if t else 0 
    for t, f in zip(total_per_contig, filtered_per_contig)], 
       width=1, color='black', fill_color='white')
p.vbar(x=contigs, top=total_per_contig, width=1, color='blue')
p.vbar(x=contigs, top=filtered_per_contig, width=1, color='red')
reset_output()
output_notebook()
show(p)

In [61]:
p = figure(x_range=contigs)
p.vbar(x=contigs, top=[
    (float(f) / float(t) * gn_total_per_contig[0]) if t else 0 
    for t, f in zip(gn_total_per_contig, gn_filtered_per_contig)], 
       width=1, color='black', fill_color='white')
p.vbar(x=contigs, top=gn_total_per_contig, width=1, color='blue')
p.vbar(x=contigs, top=gn_filtered_per_contig, width=1, color='red')
reset_output()
output_notebook()
show(p)

### Exploring AS_VQSLOD
AS_VQSLOD goes into very negative values

In [19]:
reset_output()
output_notebook()
show(hl.plot.histogram(mt21.aggregate_entries(hl.expr.aggregators.hist(mt21.AS_VQSLOD, -80, 20, 100))))

In [39]:
mt21.aggregate_rows(hl.agg.min(mt21.AS_VQSLOD))

-33583.5947

Now after removing the filtered variants. It's evident that filtering is performed on a certain threshold of the score derived from AS_VQSLOD:

In [20]:
show(hl.plot.histogram(mt21f.aggregate_entries(hl.expr.aggregators.hist(mt21f.AS_VQSLOD, -80, 20, 100))))

### Preprare the truth sample

In [9]:
truth_mt_path = (
    'gs://gnomad-public/resources/grch38/na12878/'
    'HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1'
    '-X_v.3.3.2_highconf_PGandRTGphasetransfer.mt'
)
truth_mt = hl.read_matrix_table(truth_mt_path)
# truth_mt = truth_mt.key_cols_by(s=hl.str(TRUTH_SAMPLE))
# truth_mt = hl.split_multi_hts(truth_mt, left_aligned=False)
# truth_mt = truth_mt.filter_rows(hl.agg.any(truth_mt.GT.is_non_ref()))
truth_mt.count()

(3659369, 1)

In [10]:
truth_mt21 = hl.filter_intervals(truth_mt, [hl.parse_locus_interval('chr21')])

The truth sample variants sitting almost entirely in the HC regions, however 2.5% are outside:

In [31]:
truth_mt_hc = filter_highconf(truth_mt)
truth_mt_hc.rows().count() / truth_mt.rows().count()

2021-07-26 03:48:36 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
2021-07-26 03:48:39 Hail: INFO: Coerced sorted dataset


0.9776527046056301

In [30]:
truth_mt21_hc = filter_highconf(truth_mt21)
truth_mt21_hc.rows().count() / truth_mt21.rows().count()

2021-07-26 03:48:04 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
2021-07-26 03:48:07 Hail: INFO: Coerced sorted dataset


0.9752576931049428

Saving on disk

In [4]:
truth_mt_path = join(work_bucket, 'truth_mt_hc.mt')
truth_mt21_path = join(work_bucket, 'truth_mt21_hc.mt')
# truth_mt.write(truth_mt_path)
# truth_mt21.write(truth_mt21_path)
truth_mt = hl.read_matrix_table(truth_mt_path)
truth_mt21 = hl.read_matrix_table(truth_mt21_path)

### Evaluation: concordance with truth

Our matrix table against the truth showed the concodrance of 99.98%, with a bit higher 2% FN rate:

In [6]:
summary21, sample_conc_21ht, sites_conc_21ht = hl.concordance(mt21, truth_mt21)
print_conc_summary(summary21)

2021-07-26 15:04:44 Hail: INFO: concordance: including 1 shared samples (1 total on left, 1 total on right)
2021-07-26 15:04:44 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'locus' -> 'locus_1'
    'alleles' -> 'alleles_1'


Summary:
[[0, 0, 0, 865, 533], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [194, 0, 0, 31564, 5], [33, 0, 0, 6, 20094]]
Left = called: 51896
Right = truth: 53067
Unique to left = FP [precision]: 238 [99.54%]
Unique to right = FN [recall]: 1409 [97.34%]
Concordant = TP [discordant]: 51658 [11]


2021-07-26 15:05:06 Hail: INFO: concordance: total concordance 99.98%
2021-07-26 15:05:06 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'


In [None]:
mt_vs_truthmt_summary, sample_concordance_ht, sites_concordance_ht = hl.concordance(mt, truth_mt)
print_conc_summary(mt_vs_truthmt_summary)

### Evaluation: concordnace after filtering to PASS variants

Higher precision, but even lower recall

In [34]:
summary21f, _, _ = hl.concordance(mt21f_hc, truth_mt21)
print_conc_summary(summary21f)

2021-07-26 11:35:55 Hail: INFO: concordance: including 1 shared samples (1 total on left, 1 total on right)
2021-07-26 11:35:55 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'locus' -> 'locus_1'
    'alleles' -> 'alleles_1'


Summary:
[[0, 0, 0, 1386, 837], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [16, 0, 0, 31044, 5], [25, 0, 0, 5, 19790]]
Left = called: 50885
Right = truth: 53067
Unique to left = FP [precision]: 51 [99.90%]
Unique to right = FN [recall]: 2233 [95.79%]
Concordant = TP [discordant]: 50834 [10]


2021-07-26 11:36:00 Hail: INFO: concordance: total concordance 99.98%


In [None]:
mt_vs_truthmt_summaryf, _, _ = hl.concordance(mtf_hc, truth_mt)
print_conc_summary(mt_vs_truthmt_summaryf)

### Compare with gnomad mt

In [32]:
gnomad_mt_path = 'gs://gcp-public-data--gnomad/release/3.1/mt/genomes/gnomad.genomes.v3.1.hgdp_1kg_subset_dense.mt/'
gnomad_mt = hl.read_matrix_table(gnomad_mt_path)
gnomad_mt.describe()

----------------------------------------
Global fields:
    'global_annotation_descriptions': struct {
        sex_imputation_ploidy_cutoffs: struct {
            Description: str
        }, 
        population_inference_pca_metrics: struct {
            Description: str
        }, 
        hard_filter_cutoffs: struct {
            Description: str
        }, 
        cohort_freq_meta: struct {
            Description: str
        }, 
        gnomad_freq_meta: struct {
            Description: str
        }, 
        cohort_freq_index_dict: struct {
            Description: str
        }, 
        gnomad_freq_index_dict: struct {
            Description: str
        }, 
        gnomad_faf_index_dict: struct {
            Description: str
        }, 
        gnomad_faf_meta: struct {
            Description: str
        }, 
        vep_version: struct {
            Description: str
        }, 
        vep_csq_header: struct {
            Description: str
        }, 
        dbsnp_versio

In [None]:
gnomad_sn = 'v3.1::NA12878'
gnomad_mt = gnomad_mt.key_cols_by(s = gnomad_mt.s.replace(gnomad_sn, TRUTH_SAMPLE))
gnomad_mt = gnomad_mt.filter_cols(hl.literal([TRUTH_SAMPLE]).contains(gnomad_mt['s']))
gnomad_mt = gnomad_mt.filter_rows((hl.len(gnomad_mt.alleles) > 1) & (hl.agg.any(gnomad_mt.GT.is_non_ref())))
gnomad_mt.count()

Subset to PASSing

In [5]:
gnomad_mtf = gnomad_mt.filter_rows(gnomad_mt.filters.length() == 0)
gnomad_mt21 = hl.filter_intervals(gnomad_mt, [hl.parse_locus_interval('chr21')])
gnomad_mt21f = gnomad_mt21.filter_rows(gnomad_mt21.filters.length() == 0)

Are there multiallelics? - no

In [None]:
#gnomad_mt.filter_rows(hl.len(gnomad_mt.alleles) > 2).rows().count()

### Subset to highconf regions

In [None]:
gnomad_hc_mt = filter_highconf(gnomad_mt)
gnomad_hc_mtf =  gnomad_hc_mt.filter_rows(gnomad_hc_mt.filters.length() == 0)
gnomad_hc_mt21 = hl.filter_intervals(gnomad_hc_mt, [hl.parse_locus_interval('chr21')])
gnomad_hc_mt21f = gnomad_mt21_hc.filter_rows(gnomad_hc_mt21.filters.length() == 0)

Saving on disk

In [20]:
gnomad_mt_path = join(work_bucket, 'gnomad.mt')
gnomad_mtf_path = join(work_bucket, 'gnomad.filt.mt')
gnomad_mt21_path = join(work_bucket, 'gnomad.chr21.mt')
gnomad_mt21f_path = join(work_bucket, 'gnomad.chr21.filt.mt')
gnomad_hc_mt_path = join(work_bucket, 'gnomad_hc.mt')
gnomad_hc_mtf_path = join(work_bucket, 'gnomad_hc.filt.mt')
gnomad_hc_mt21_path = join(work_bucket, 'gnomad_hc.chr21.mt')
gnomad_hc_mt21f_path = join(work_bucket, 'gnomad_hc.chr21.filt.mt')
# gnomad_mt.write(gnomad_mt_path, overwrite=True)
# gnomad_mtf.write(gnomad_mtf_path, overwrite=True)
# gnomad_mt21.write(gnomad_mt21_path, overwrite=True)
# gnomad_mt21f.write(gnomad_mt21f_path, overwrite=True)
# gnomad_mt_hc.write(gnomad_hc_mt_path, overwrite=True)
# gnomad_mtf_hc.write(gnomad_hc_mtf_path, overwrite=True)
# gnomad_mt21_hc.write(gnomad_hc_mt21_path, overwrite=True)
# gnomad_mt21f_hc.write(gnomad_hc_mt21f_path, overwrite=True)
gnomad_mt = hl.read_matrix_table(gnomad_mt_path)
gnomad_mtf = hl.read_matrix_table(gnomad_mtf_path)
gnomad_mt21 = hl.read_matrix_table(gnomad_mt21_path)
gnomad_mt21f = hl.read_matrix_table(gnomad_mt21f_path)
gnomad_hc_mt = hl.read_matrix_table(gnomad_hc_mt_path)
gnomad_hc_mtf = hl.read_matrix_table(gnomad_hc_mtf_path)
gnomad_hc_mt21 = hl.read_matrix_table(gnomad_hc_mt21_path)
gnomad_hc_mt21f = hl.read_matrix_table(gnomad_hc_mt21f_path)

92% of gnomAD variants are PASSing, with 99% in the HC regions.

In [25]:
gnomad_mtf.rows().count(), gnomad_mt.rows().count(), \
gnomad_mtf.rows().count() / gnomad_mt.rows().count(), \
gnomad_hc_mtf.rows().count(), gnomad_hc_mt.rows().count(), \
gnomad_hc_mtf.rows().count() / gnomad_hc_mt.rows().count()

(4640374, 4995085, 0.9289879951992809, 3559009, 3597730, 0.9892373802369827)

75% though in chr21:

In [28]:
gnomad_mt21f.rows().count(), gnomad_mt21.rows().count(), \
gnomad_mt21f.rows().count() / gnomad_mt21.rows().count(), \
gnomad_hc_mt21f.rows().count(), gnomad_hc_mt21.rows().count(), \
gnomad_hc_mt21f.rows().count() / gnomad_hc_mt21.rows().count()

(67677, 89533, 0.7558888901298962, 51537, 51852, 0.9939250173570933)

### Exploring AS_VQSLOD in Gnomad MT

In [36]:
reset_output()
output_notebook()
show(hl.plot.histogram(gnomad_mt21.aggregate_entries(hl.expr.aggregators.hist(gnomad_mt21.vqsr.AS_VQSLOD, -80, 30, 100))))

In [37]:
show(hl.plot.histogram(gnomad_mt21f.aggregate_entries(hl.expr.aggregators.hist(gnomad_mt21f.vqsr.AS_VQSLOD, -80, 30, 100))))

### Evaluation: gnomad MT vs truth MT

In [8]:
gn21_summary, gn_sample_conc_21ht, gn_sites_conc_21ht = hl.concordance(gnomad_hc_mt21, truth_mt21)
print_conc_summary(gn21_summary)

2021-07-26 15:06:16 Hail: INFO: concordance: including 1 shared samples (1 total on left, 1 total on right)
2021-07-26 15:06:16 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'locus' -> 'locus_1'
    'alleles' -> 'alleles_1'


Summary:
[[0, 0, 0, 870, 533], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [155, 0, 0, 31558, 5], [33, 0, 0, 7, 20094]]
Left = called: 51852
Right = truth: 53067
Unique to left = FP [precision]: 200 [99.61%]
Unique to right = FN [recall]: 1415 [97.33%]
Concordant = TP [discordant]: 51652 [12]


2021-07-26 15:06:26 Hail: INFO: concordance: total concordance 99.98%


In [None]:
gn_summary, gn_sample_concordance_ht, gn_sites_concordance_ht = hl.concordance(gnomad_hc_mt, truth_mt)
print_conc_summary(gn_summary)

In [41]:
gnomad_mt21f.cols().show()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,sex_imputation,sex_imputation,sex_imputation,sex_imputation,sex_imputation,sex_imputation,sex_imputation,sex_imputation,sex_imputation,sex_imputation,sex_imputation,sex_imputation,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0,Unnamed: 29_level_0,Unnamed: 30_level_0,Unnamed: 31_level_0,Unnamed: 32_level_0,Unnamed: 33_level_0,Unnamed: 34_level_0,Unnamed: 35_level_0,Unnamed: 36_level_0,Unnamed: 37_level_0,Unnamed: 38_level_0,Unnamed: 39_level_0,Unnamed: 40_level_0,Unnamed: 41_level_0,Unnamed: 42_level_0,Unnamed: 43_level_0,Unnamed: 44_level_0,Unnamed: 45_level_0,Unnamed: 46_level_0,Unnamed: 47_level_0,Unnamed: 48_level_0
Unnamed: 0_level_1,bam_metrics,bam_metrics,bam_metrics,bam_metrics,bam_metrics,bam_metrics,bam_metrics,bam_metrics,subsets,subsets,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,impute_sex_stats,impute_sex_stats,impute_sex_stats,impute_sex_stats,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,sample_qc,population_inference,population_inference,population_inference,population_inference,population_inference,population_inference,population_inference,population_inference,population_inference,population_inference,population_inference,population_inference,Unnamed: 47_level_1,Unnamed: 48_level_1
s,pct_bases_20x,pct_chimeras,freemix,mean_coverage,median_coverage,mean_insert_size,median_insert_size,pct_bases_10x,tgp,hgdp,chr20_mean_dp,chrX_mean_dp,chrY_mean_dp,chrX_ploidy,chrY_ploidy,X_karyotype,Y_karyotype,sex_karyotype,f_stat,n_called,expected_homs,observed_homs,n_hom_ref,n_het,n_hom_var,n_non_ref,n_snp,n_insertion,n_deletion,n_transition,n_transversion,r_ti_tv,r_het_hom_var,r_insertion_deletion,pca_scores,pop,prob_afr,prob_ami,prob_amr,prob_asj,prob_eas,prob_fin,prob_mid,prob_nfe,prob_oth,prob_sas,labeled_subpop,gnomad_release
str,float64,float64,float64,float64,float64,float64,float64,float64,bool,bool,float32,float32,float32,float32,float32,str,str,str,float64,int64,float64,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,float64,float64,float64,array<float64>,str,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,str,bool
"""v3.1::NA12878""",94.7,0.683,3e-06,31.4,32.0,442.0,433.0,97.1,True,False,32.5,29.8,0.812,1.83,0.0501,"""XX""","""""","""XX""",-1.23,27087,18800.0,8538,4981521,1372005,662914,2034919,2572901,27002,97930,1883215,689686,2.73,2.07,0.276,"[1.14e-01,-3.90e-02,9.31e-03,-2.04e-02,-1.73e-02,1.60e-02,-1.02e-03,-6.69e-03,6.42e-03,-1.33e-02,7.71e-03,-1.89e-03,5.57e-03,-2.39e-03,-3.40e-04,-8.80e-04]","""nfe""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,"""ceu""",False


Filtered gnomad MT vs truth

In [39]:
gnf21_summary, _, g_ = hl.concordance(gnomad_hc_mt21f, truth_mt21)
print_conc_summary(gnf21_summary)

2021-07-26 11:40:37 Hail: INFO: concordance: including 1 shared samples (1 total on left, 1 total on right)
2021-07-26 11:40:38 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'locus' -> 'locus_1'
    'alleles' -> 'alleles_1'


Summary:
[[0, 0, 0, 994, 586], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [18, 0, 0, 31435, 4], [32, 0, 0, 6, 20042]]
Left = called: 51537
Right = truth: 53067
Unique to left = FP [precision]: 60 [99.88%]
Unique to right = FN [recall]: 1590 [97.00%]
Concordant = TP [discordant]: 51477 [10]


2021-07-26 11:40:51 Hail: INFO: concordance: total concordance 99.98%


In [None]:
gnffull_truth_summary, _, _ = hl.concordance(gnomad_hc_mtf, truth_mt)
print_conc_summary(gnffull_truth_summary)

### Evaluation: filtered gnomad MT vs filtered MT

In [40]:
gnf_summary, _, _ = hl.concordance(gnomad_hc_mt21f, mt21f_hc)
print_conc_summary(gnf_summary)

2021-07-26 11:40:52 Hail: INFO: concordance: including 1 shared samples (1 total on left, 1 total on right)
2021-07-26 11:40:52 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'inbreeding_coeff_cutoff' -> 'inbreeding_coeff_cutoff_1'
    'filtering_model' -> 'filtering_model_1'
    'locus' -> 'locus_1'
    'alleles' -> 'alleles_1'


Summary:
[[0, 0, 0, 93, 30], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [486, 0, 0, 30971, 0], [289, 0, 0, 1, 19790]]
Left = called: 51537
Right = truth: 50885
Unique to left = FP [precision]: 776 [98.49%]
Unique to right = FN [recall]: 124 [99.76%]
Concordant = TP [discordant]: 50761 [1]


2021-07-26 11:41:04 Hail: INFO: concordance: total concordance 100.00%


In [None]:
gnffull_summary, _, _ = hl.concordance(gnomad_hc_mtf, mtf_hc)
print_conc_summary(gnffull_summary)

# Binned concordance

In [71]:
_, gn_sample_conc_ht, gn_sites_conc_ht = hl.concordance(gnomad_hc_mt, truth_mt)
_, samples_conc_ht, sites_conc_ht = hl.concordance(mt_hc, truth_mt)

2021-07-27 07:35:06 Hail: INFO: concordance: including 0 shared samples (1 total on left, 1 total on right)
2021-07-27 07:35:06 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'locus' -> 'locus_1'
    'alleles' -> 'alleles_1'
2021-07-27 07:37:28 Hail: INFO: concordance: total concordance nan%
2021-07-27 07:37:28 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2021-07-27 07:37:29 Hail: INFO: concordance: including 1 shared samples (1 total on left, 1 total on right)
2021-07-27 07:37:29 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'locus' -> 'locus_1'
    'alleles' -> 'alleles_1'
2021-07-27 07:37:44 Hail: INFO: concordance: total concordance 99.96%


2021-07-27 07:37:44 Hail: INFO: concordance: including 1 shared samples (1 total on left, 1 total on right)
2021-07-27 07:37:45 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'locus' -> 'locus_1'
    'alleles' -> 'alleles_1'
2021-07-27 07:38:04 Hail: INFO: concordance: total concordance 99.96%


In [73]:
sites_conc_ht_path = join(work_bucket, 'sites_concordance.ht')
samples_conc_ht_path = join(work_bucket, 'sample_concordance.ht')
gn_sites_conc_ht_path = join(work_bucket, 'gn_sites_concordance.ht')
gn_samples_conc_ht_path = join(work_bucket, 'gn_sample_concordance.ht')
sites_conc_ht.write(sites_conc_ht_path, overwrite=True)
sample_conc_ht.write(samples_conc_ht_path, overwrite=True)
gn_sites_conc_ht.write(gn_sites_conc_ht_path, overwrite=True)
gn_sample_conc_ht.write(gn_samples_conc_ht_path, overwrite=True)

2021-07-27 07:38:37 Hail: INFO: wrote table with 3687842 rows in 9718 partitions to gs://cpg-tob-wgs-test-tmp/concordance/tmp/sites_concordance.ht
    Total size: 61.72 MiB
    * Rows: 61.72 MiB
    * Globals: 235.00 B
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  1979 rows (31.00 KiB)


NameError: name 'sample_conc_ht' is not defined

In [None]:
sites_conc_ht = hl.read_table(sites_conc_ht_path)
sample_conc_ht = hl.read_table(sample_conc_ht_path)
gn_sites_conc_ht = hl.read_table(gn_sites_conc_ht_path)
gn_sample_conc_ht = hl.read_table(gn_sample_conc_ht_path)

### Rank variants (sort by the score)

In [80]:
gnomad_mt21.describe()

----------------------------------------
Global fields:
    'global_annotation_descriptions': struct {
        sex_imputation_ploidy_cutoffs: struct {
            Description: str
        }, 
        population_inference_pca_metrics: struct {
            Description: str
        }, 
        hard_filter_cutoffs: struct {
            Description: str
        }, 
        cohort_freq_meta: struct {
            Description: str
        }, 
        gnomad_freq_meta: struct {
            Description: str
        }, 
        cohort_freq_index_dict: struct {
            Description: str
        }, 
        gnomad_freq_index_dict: struct {
            Description: str
        }, 
        gnomad_faf_index_dict: struct {
            Description: str
        }, 
        gnomad_faf_meta: struct {
            Description: str
        }, 
        vep_version: struct {
            Description: str
        }, 
        vep_csq_header: struct {
            Description: str
        }, 
        dbsnp_versio

In [63]:
def rank_variants(
    mt,
    model='CPG-AS-VQSR', 
    truth_sample=TRUTH_SAMPLE,
    overwrite=True
):
    score_rank_ht_path = join(work_bucket, 'vqsr', model, truth_sample, 'score_rank.ht')
    if not overwrite and utils.file_exists(score_rank_ht_path):
        score_rank_ht = hl.read_table(score_rank_ht_path)
        return score_rank_ht
    print(f"Creating rank file for VQSR")
    ht = mt.rows()
    print('Filtering to high_quality samples and n_nonref==1...')
    if 'was_split' not in ht._fields:
        ht = ht.annotate(was_split = True)
    if 'singleton' not in ht._fields:
        ht = ht.annotate(singleton = True)
    if 'ac' not in ht._fields:
        ht = ht.annotate(ac = ht.gnomad_popmax.AC)
    ht = ht.select(
        was_split=ht.was_split,
        singleton=ht.singleton,
        ac=ht.ac,
        score=ht.vqsr.AS_VQSLOD, 
        negative_train_site=ht.vqsr.NEGATIVE_TRAIN_SITE,
        positive_train_site=ht.vqsr.POSITIVE_TRAIN_SITE,
        culprit=ht.vqsr.AS_culprit
    )

    ht = add_rank(
        ht,
        score_expr=-1 * ht.score,
        subrank_expr={
            'biallelic_rank': ~ht.was_split,
            'biallelic_singleton_rank': ~ht.was_split & ht.singleton,
            'adj_rank': ht.ac > 0,
            'adj_biallelic_rank': ~ht.was_split & (ht.ac > 0),
            'adj_singleton_rank': ht.singleton & (ht.ac > 0),
            'adj_biallelic_singleton_rank': ~ht.was_split & ht.singleton & (ht.ac > 0)
        }
    )
    ht.write(score_rank_ht_path, overwrite=True)
    score_rank_ht = hl.read_table(score_rank_ht_path)
    return score_rank_ht

In [90]:
cpg_score_rank_21ht = rank_variants(
    mt21,
    model='CPG-AS-VQSR', 
    truth_sample=TRUTH_SAMPLE,
    overwrite=True
)
gnomad_score_rank_21ht = rank_variants(
    gnomad_mt21,
    model='gnomAD-AS-VQSR', 
    truth_sample=TRUTH_SAMPLE,
    overwrite=True
)

Creating rank file for VQSR
Filtering to high_quality samples and n_nonref==1...


locus,alleles,ac
locus<GRCh38>,array<str>,int32
chr21:10270435,"[""G"",""A""]",3541
chr21:10270437,"[""C"",""G""]",3400
chr21:10270443,"[""T"",""A""]",3992
chr21:10270453,"[""C"",""A""]",3777
chr21:10270455,"[""A"",""C""]",3680
chr21:10270463,"[""T"",""C""]",15347
chr21:10270466,"[""T"",""G""]",3594
chr21:10270468,"[""C"",""T""]",3338
chr21:10270473,"[""G"",""A""]",3501
chr21:10270489,"[""C"",""T""]",3372


2021-07-26 15:58:53 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-07-26 15:58:56 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-07-26 15:59:23 Hail: INFO: wrote table with 51852 rows in 1461 partitions to gs://cpg-tob-wgs-test-tmp/concordance/tmp/vqsr/gnomAD-AS-VQSR/NA12878/score_rank.ht
    Total size: 1.99 MiB
    * Rows: 1.98 MiB
    * Globals: 7.12 KiB
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  179 rows (6.13 KiB)


In [64]:
cpg_score_rank_ht = rank_variants(
    mt,
    model='CPG-AS-VQSR', 
    truth_sample=TRUTH_SAMPLE,
    overwrite=True
)
gnomad_score_rank_ht = rank_variants(
    gnomad_mt,
    model='gnomAD-AS-VQSR', 
    truth_sample=TRUTH_SAMPLE,
    overwrite=True
)

Creating rank file for VQSR
Filtering to high_quality samples and n_nonref==1...


2021-07-27 06:46:39 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-07-27 06:47:01 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-07-27 06:47:50 Hail: INFO: wrote table with 3602324 rows in 9618 partitions to gs://cpg-tob-wgs-test-tmp/concordance/tmp/vqsr/CPG-AS-VQSR/NA12878/score_rank.ht
    Total size: 133.35 MiB
    * Rows: 133.35 MiB
    * Globals: 235.00 B
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  1963 rows (74.99 KiB)


Creating rank file for VQSR
Filtering to high_quality samples and n_nonref==1...


2021-07-27 06:48:33 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-07-27 06:50:31 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-07-27 06:55:15 Hail: INFO: wrote table with 4995085 rows in 115375 partitions to gs://cpg-tob-wgs-test-tmp/concordance/tmp/vqsr/gnomAD-AS-VQSR/NA12878/score_rank.ht
    Total size: 200.44 MiB
    * Rows: 200.44 MiB
    * Globals: 7.12 KiB
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  1620 rows (56.42 KiB)


### Create binned concordance

In [67]:
nbins = 100

In [68]:
def binned_concordance(
    score_rank_ht,
    sites_concordance_ht,
    model='CPG-AS-VQSR', 
    truth_sample=TRUTH_SAMPLE,
    overwrite=False,
):
    binned_concordance_ht_path = join(work_bucket, 'vqsr', model, truth_sample, 'binned_concordance.ht')
    if not overwrite and utils.file_exists(binned_concordance_ht_path):
        ht = hl.read_table(binned_concordance_ht_path)
        return ht

    ht = sites_concordance_ht  # union of mt and truth_mt sites
    metric_ht = score_rank_ht

    # Total number of SNPs and indels in the target matrix table
    metric_snvs, metrics_indels = metric_ht.aggregate([
        hl.agg.count_where(hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1])),
        hl.agg.count_where(~hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1]))
    ])

    # Total number of SNPs and indels in the union
    snvs, indels = ht.aggregate([
        hl.agg.count_where(hl.is_snp(ht.alleles[0], ht.alleles[1])),
        hl.agg.count_where(~hl.is_snp(ht.alleles[0], ht.alleles[1]))
    ])

    ht = ht.annotate_globals(
        global_counts=hl.struct(snvs=metric_snvs, indels=metrics_indels),
        counts=hl.struct(snvs=snvs, indels=indels)
    )
    
    # Annotating the union table with the target table annotations 
    # (so the variants unique to truth won't have those score annotations)
    ht = ht.annotate(
        snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
        score=metric_ht[ht.key].score,
        global_rank=metric_ht[ht.key].rank,
        # TP => allele is found in both data sets
        n_tp=ht.concordance[3][3] + ht.concordance[3][4] + ht.concordance[4][3] + ht.concordance[4][4],
        # FP => allele is found only in test data set
        n_fp=hl.sum(ht.concordance[3][:2]) + hl.sum(ht.concordance[4][:2]),
        # FN => allele is found only in truth data set
        n_fn=hl.sum(ht.concordance[:2].map(lambda x: x[3] + x[4]))
    )
    
    # Add ranks
    ht = add_rank(ht, -1.0*ht.score)
    ht = ht.annotate(rank=[
        hl.tuple([
            'global_rank', 
            (ht.global_rank + 1) / hl.cond(
                 ht.snv,
                 ht.globals.global_counts.snvs,
                 ht.globals.global_counts.indels
             )
        ]),
        hl.tuple([
            'truth_sample_rank', 
            (ht.rank + 1) / hl.cond(
                 ht.snv,
                 ht.globals.counts.snvs,
                 ht.globals.counts.indels
             )
        ])
    ])
    ht = ht.explode(ht.rank)
    
    ht = ht.annotate(
        rank_name=ht.rank[0],
        bin=hl.int(ht.rank[1] * nbins)
    )

    ht = ht.group_by(
        'rank_name',
        'snv',
        'bin'
    ).aggregate(
        # Look at site-level metrics -> tp > fp > fn -- only important for multi-sample comparisons
        tp=hl.agg.count_where(ht.n_tp > 0),
        fp=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp > 0)),
        fn=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp == 0) & (ht.n_fn > 0)),
        min_score=hl.agg.min(ht.score),
        max_score=hl.agg.max(ht.score),
        n_alleles=hl.agg.count()
    ).repartition(5)
    
    ht.show()

    ht.write(binned_concordance_ht_path, overwrite=True)    
    return ht

In [69]:
binned_concordance(
    cpg_score_rank_ht, 
    sites_conc_ht, 
    model='CPG-AS-VQSR', 
    truth_sample=TRUTH_SAMPLE,
    overwrite=True
)
binned_concordance(
    gnomad_score_rank_ht, 
    gn_sites_conc_ht, 
    model='gnomAD-AS-VQSR', 
    truth_sample=TRUTH_SAMPLE,
    overwrite=True
)

NameError: name 'sites_conc_ht' is not defined

In [78]:
ht.show(50)

2021-07-20 10:51:39 Hail: INFO: Ordering unsorted dataset with network shuffle


rank_name,snv,bin,tp,fp,fn,min_score,max_score,n_alleles
str,bool,int32,int64,int64,int64,float64,float64,int64
"""global_rank""",False,0.0,166542,40866,0,3.65,18.0,207408
"""global_rank""",False,1.0,107259,100150,0,-691.0,3.65,207409
"""global_rank""",False,2.0,99761,107648,0,,,207409
"""global_rank""",False,3.0,96278,111131,0,,,207409
"""global_rank""",False,4.0,97126,110283,0,,,207409
"""global_rank""",False,5.0,0,1,0,,,1
"""global_rank""",False,,0,0,4096,,,4096
"""global_rank""",True,0.0,745416,75104,0,5.88,6.85,820520
"""global_rank""",True,1.0,731242,89279,0,5.19,5.88,820521
"""global_rank""",True,2.0,717914,102607,0,4.0,5.19,820521


In [3]:
def make_binned_concordance_pd(
    work_bucket: str,
    model_names: List[str],
    truth_samples: List[str],
) -> pd.DataFrame:
    """
    Creates a pandas DF containing the binned concordance results for all given truth samples / models.
    :param list of str truth_samples: List of truth samples to include
    :param list of str or dict of str -> str models: 
        Models to include. Either a list of the model ids, 
        or a dict with model id -> model name for display
    :return: Pandas dataframe with binned concordance results
    :rtype: DataFrame
    """
    
    def get_binned_concordance_ht(model_name, truth_sample):
        ht = hl.read_table(join(work_bucket, 'vqsr', model_name, truth_sample, 'binned_concordance.ht'))
        try:
            ht = ht.drop('global_annotation_descriptions')
        except:
            pass
        ht = ht.annotate_globals(
            filtering_model = ht.filtering_model.annotate(
                snv_cutoff = ht.filtering_model.snv_cutoff.annotate(
                    bin = hl.float(ht.filtering_model.snv_cutoff.bin)
                ),
                indel_cutoff = ht.filtering_model.indel_cutoff.annotate(
                    bin = hl.float(ht.filtering_model.indel_cutoff.bin)
                ),
            )
        )    
        return ht
    
    # Combine binned concordance results for multiple truth samples and/or models into a single Table.
    hts = []
    for truth_sample in truth_samples:
        for model_name in model_names:
            ht = get_binned_concordance_ht(model_name, truth_sample)
            ht = ht.annotate(truth_sample=truth_sample, model=model_name)
            hts.append(ht)
    ht = hts[0].union(*hts[1:])

    def f_score(n, df):
        return (1 + n**2) * df['precision'] * df['recall'] / (n**2 * df['precision'] + df['recall'])

    def compute_cumul_metrics(df: pd.DataFrame) -> pd.DataFrame:
        """
        Computes cumulative metrics on a pandas DF.
        """
        df = df.sort_values(by=['bin'])
        df['cum_tp'] = df['tp'].cumsum()
        df['cum_fp'] = df['fp'].cumsum()
        total_true = df['tp'].sum() + df['fn'].sum()
        total_false = df['fp'].sum()
        df['cum_fn'] = total_true - df['cum_tp']
        df['cum_tn'] = total_false - df['cum_fp']
        df['precision'] = df['cum_tp'] / (df['cum_tp'] + df['cum_fp'])
        df['recall'] = df['cum_tp'] / (df['cum_tp'] + df['cum_fn'])
        df['cum_alleles'] = df['n_alleles'].cumsum()
        df['f1'] = f_score(2, df)
        df['f2'] = f_score(2, df)
        df['f3'] = f_score(3, df)
        df['f4'] = f_score(4, df)
        df['f10'] = f_score(10, df)
        return df[['bin', 'min_score', 'max_score', 'n_alleles', 'tp', 'fp', 'fn', 'cum_alleles', 
                   'cum_tp', 'cum_fp', 'cum_fn', 'cum_tn', 'precision', 'recall', 
                   'f1', 'f2', 'f3', 'f4', 'f10']]

    df = ht.to_pandas()
    df = df.groupby(['rank_name', 'truth_sample', 'model', 'snv']).apply(compute_cumul_metrics)
    return df.fillna(-1).groupby(['rank_name', 'truth_sample', 'model', 'snv'])

df = make_binned_concordance_pd(
    work_bucket,
    model_names=['CPG-AS-VQSR', 'gnomAD-AS-VQSR'],
    truth_samples=[TRUTH_SAMPLE],
)

In [29]:
ht.show(50)

rank_name,snv,bin,tp,fp,fn,min_score,max_score,n_alleles,truth_sample,model
str,bool,int32,int64,int64,int64,float64,float64,int64,str,str
"""global_rank""",False,0,68,0,0,18.0,18.0,68,"""NA12878""","""CPG-AS-VQSR"""
"""global_rank""",False,0,68,0,0,18.0,18.0,68,"""NA12878""","""gnomAD-AS-VQSR"""
"""global_rank""",False,1,80,0,0,17.9,18.0,80,"""NA12878""","""CPG-AS-VQSR"""
"""global_rank""",False,1,80,0,0,17.9,18.0,80,"""NA12878""","""gnomAD-AS-VQSR"""
"""global_rank""",False,2,97,0,0,17.9,17.9,97,"""NA12878""","""CPG-AS-VQSR"""
"""global_rank""",False,2,97,0,0,17.9,17.9,97,"""NA12878""","""gnomAD-AS-VQSR"""
"""global_rank""",False,3,74,0,0,17.8,17.9,74,"""NA12878""","""CPG-AS-VQSR"""
"""global_rank""",False,3,74,0,0,17.8,17.9,74,"""NA12878""","""gnomAD-AS-VQSR"""
"""global_rank""",False,4,79,0,0,17.8,17.8,79,"""NA12878""","""CPG-AS-VQSR"""
"""global_rank""",False,4,79,0,0,17.8,17.8,79,"""NA12878""","""gnomAD-AS-VQSR"""


In [None]:
df.head()

In [9]:
qc_plots_settings = {
    'mean_point_size': 4.0,
    'min_point_size': 1.0,
    'max_point_size': 16.0,
    'label_text_font_size': "14pt",
    'title.text_font_size': "16pt",
    'subtitle.text_font_size': "14pt",
    'axis.axis_label_text_font_size': "16pt",
    'axis.axis_label_text_font_style': "normal",
    'axis.major_label_text_font_size': "14pt"
}

def set_plots_defaults(p: Plot) -> None:
    p.legend.label_text_font_size = qc_plots_settings['label_text_font_size']
    p.title.text_font_size = qc_plots_settings['title.text_font_size']
    p.axis.axis_label_text_font_size = qc_plots_settings['axis.axis_label_text_font_size']
    p.axis.axis_label_text_font_style = qc_plots_settings['axis.axis_label_text_font_style']
    p.axis.major_label_text_font_size = qc_plots_settings['axis.major_label_text_font_size']

def get_point_size_col(data: pd.Series, size_prop: str) -> pd.Series:
    """
    Given a data Series, returns the corresponding point size either:
    - Constant to qc_plots_settings['mean_point_size'] if `size_prop` is None
    - Radius proportional to data, if `size_prop` is 'radius'
    - Area proportional to data, if `size_prop` is 'area'
    Mean, min and max point  sizes are extracted from qc_plots_settings
    :param Series data: Input data series
    :param str size_prop: One of None, 'radius' or 'area'
    :return: Series with corresponding point size for each data point
    :rtype: Series
    """
    if size_prop is None:
        return pd.Series(len(data) * [qc_plots_settings['mean_point_size']])
    else:
        mean_data = np.mean(data)
        if size_prop == 'radius':
            return data.apply(lambda x: max(qc_plots_settings['min_point_size'], 
                                            min(qc_plots_settings['max_point_size'], 
                                                qc_plots_settings['mean_point_size'] * (x / mean_data))))
        elif size_prop == 'area':
            return data.apply(
                lambda x: max(
                    qc_plots_settings['min_point_size'], 
                    min(
                        qc_plots_settings['max_point_size'], 
                        qc_plots_settings['mean_point_size'] * np.pi * (np.sqrt(x / mean_data) / np.pi)
                    )
                )
            )
        else:
            raise ValueError(f"{size_prop} is not a supported value for argument `size_prop`")
            
def plot_concordance_pr(
    pr_df: pd.DataFrame,
    snv: bool,
    colors: Dict[str, str] = None,
    size_prop: str = None,
    bins_to_label: List[int] = None
) -> Column:
    """
    Generates plots showing Precision/Recall curves for truth samples:
    Two tabs:
    - One displaying the PR curve with ranking computed on the entire data
    - One displaying the PR curve with ranking computed on the truth sample only

    Within each tab, a row of n_truth_samples.
    The input to this function should come out of the `get_binned_concordance_pd` function, 
    which creates  a DataFrame containing the necessary metris for PR plotting and is grouped 
    by 'rank_name', 'truth_sample', 'model' and 'snv'.
    :param DataFrame pr_df: 
           Input Dataframe
    :param bool snv: 
           Whether to plot SNVs or Indels
    :param dict of str -> str colors: 
           Optional colors to use (model name -> desired color)
    :param str size_prop: 
           Either 'radius' or 'area' can be specified. If either is specified, 
           the points will be sized proportionally to the amount of data in that point.
    :param list of int bins_to_label: 
           Bins to label
    :return Bokeh grid of plots
    :rtype Tabs
    """

    if colors is None:
        # Get a palette automatically
        from bokeh.palettes import d3
        models = sorted(list(set([g[2] for g in pr_df.groups])))
        palette = d3['Category10'][max(3, len(models))]
        colors = {model: palette[i] for i, model in enumerate(models)}

    tabs = []
    rank = 'global_rank'  # 'truth_sample_rank',
#         plot_row = []
#         for truth_sample in set([g[1] for g in pr_df.groups]):
#             hover = HoverTool(tooltips=[
#                     ("model", "@model"),
#                     ("bin", "@bin"),
#                     ("score (min, max)", "(@min_score, @max_score)"),
#                     ('n_alleles', '@n_alleles'),
#                     ('cum_alleles', '@cum_alleles'),
#                     ("data (x,y)", "($x, $y)")
#                 ],
#                 mode='vline'
#             )
    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom".split(',')

    hover = HoverTool(
        tooltips=[
            ("model", "@model"),
            ("bin", "@bin"),
            ("score (min, max)", "(@min_score, @max_score)"),
            ('n_alleles', '@n_alleles'),
            ('cum_alleles', '@cum_alleles'),
            ('recall', '@recall'),
            ('precision', '@precision'),
            ('cum_fp', '@cum_fp'),
            ('cum_tp', '@cum_tp'),
            ('cum_fn', '@cum_fn'),
            ('f1', '@f1'),
            ('f2', '@f2'),
        ],
        # display a tooltip whenever the cursor is vertically in line with a glyph
        mode='vline'
    )
    p = figure(
        title=TRUTH_SAMPLE,
        x_axis_label='Recall',
        y_axis_label='Precision',
        tools=[hover] + [tool for tool in TOOLS if tool != 'hover'],
#         x_range=(0, 1.05), 
#         y_range=(0, 1)
    )
    p.xaxis[0].formatter = NumeralTickFormatter(format="0%")
    p.yaxis[0].formatter = NumeralTickFormatter(format="0.0%")

    circles = []
    for model in set([g[2] for g in pr_df.groups]):
        data = pr_df.get_group((rank, TRUTH_SAMPLE, model, snv)).copy()
        data['model'] = [model] * len(data)
        data['size'] = get_point_size_col(data['n_alleles'], size_prop)
        data['x_offset'] = data['recall'] + 0.025
        data['y_offset'] = data['precision']
        data['f2'] = [str("{:.2f}".format(t)) for t in data['f2']]
        source = ColumnDataSource(data)
        if bins_to_label is not None:
            label_data = data.copy()
            label_data = label_data.loc[label_data.bin.isin(bins_to_label)].copy()
            label_data = ColumnDataSource(label_data)
            p.circle(
                'recall',
                'precision',
                color=colors[model], 
                source=label_data
            )
            p.add_layout(
                LabelSet(
                    x='x_offset',
                    y='precision',
                    text='f2',
                    text_font_size='6pt',
                    x_offset=-10, 
                    y_offset=1,
                    text_color=colors[model],
                    source=label_data
                )
            )
        p.line(
            'recall',
            'precision',
            source=source,
            line_width=2,
        )
#             p.tools = [hover] + [tool for tool in TOOLS if tool != 'hover']
#     set_plots_defaults(p)
    return p
#         tabs.append(Panel(child=Row(children=plot_row), title=rank))
#     return Tabs(tabs=tabs)

# nbins // 20
p = plot_concordance_pr(df, snv=True, bins_to_label=range(0, nbins + 1, 1))
show(p)

In [151]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,bin,min_score,max_score,n_alleles,tp,fp,fn,cum_alleles,cum_tp,cum_fp,cum_fn,cum_tn,precision,recall
rank_name,truth_sample,model,snv,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
global_rank,NA12878,VQSR,False,0,0.0,17.9027,18.0431,10370,8881,1489,0,10370,8881,1489,562181,468590,0.856413,0.015552
global_rank,NA12878,VQSR,False,1,1.0,17.8035,17.9027,10370,8917,1453,0,20740,17798,2942,553264,467137,0.858149,0.031166
global_rank,NA12878,VQSR,False,2,2.0,17.6876,17.8035,10371,8952,1419,0,31111,26750,4361,544312,465718,0.859824,0.046843
global_rank,NA12878,VQSR,False,3,3.0,17.5437,17.6876,10370,8918,1452,0,41481,35668,5813,535394,464266,0.859864,0.062459
global_rank,NA12878,VQSR,False,4,4.0,17.357,17.5436,10371,8945,1426,0,51852,44613,7239,526449,462840,0.860391,0.078123
global_rank,NA12878,VQSR,True,102,0.0,6.6075,6.8471,41026,37549,3477,0,41026,37549,3477,3050758,1013714,0.915249,0.012158
global_rank,NA12878,VQSR,True,103,1.0,6.5465,6.6075,41026,37506,3520,0,82052,75055,6997,3013252,1010194,0.914725,0.024303
global_rank,NA12878,VQSR,True,104,2.0,6.4959,6.5465,41026,37538,3488,0,123078,112593,10485,2975714,1006706,0.91481,0.036458
global_rank,NA12878,VQSR,True,105,3.0,6.4502,6.4959,41026,37469,3557,0,164104,150062,14042,2938245,1003149,0.914432,0.04859
global_rank,NA12878,VQSR,True,106,4.0,6.4069,6.4502,41026,37513,3513,0,205130,187575,17555,2900732,999636,0.91442,0.060737


In [32]:
bins_to_label=range(0, nbins + 1, nbins // 20)
data = df.get_group(('truth_sample_rank', truth_sample, 'VQSR', True)).copy()
a = data.loc[data.bin.isin(bins_to_label)]
print(a)

NameError: name 'truth_sample' is not defined