# Concordance analysis between a sparse and a dense MatrixTable

If we naively `densify` the sparse MT and then perform the concordance analysis, all the variants that are present only in the dense MT
will be reported as missing from the sparse MT.

This is because `densify` never adds (or removes) rows; it only fills in missing entries in existing rows.

To address this problem, we must first create 'empty' rows in the sparse MT corresponding to variants present only in the dense matrix
before densifying.

See: https://discuss.hail.is/t/concordance-with-sparse-matrixtable/2086/2

In [1]:
import hail as hl

# Every dataset in TOB-WGS uses GRCh38, so make it the default reference genome
hl.init(default_reference='GRCh38')

Running on Apache Spark version 3.1.2
SparkUI available at http://172.20.180.11:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.67-bafea6b18247
LOGGING: writing to /home/loithi/code/tob-deepdive/hail-20210625-2056-0.2.67-bafea6b18247.log


In [2]:
# Simulate a small random MatrixTable: 1 population, 2 samples, 20 variants
sparse = hl.balding_nichols_model(n_populations=1, n_samples=2, n_variants=20)

# Key the columns by a conventional sample name built from the sample index
sparse = sparse.key_cols_by(
    s=hl.format('sample_%d', sparse.sample_idx),
)

# Open up random gaps between the variants' loci: double each position and
# add a random integer offset derived from rand_unif(0, 2)
sparse = sparse.key_rows_by(
    locus=hl.locus(
        sparse.locus.contig,
        sparse.locus.position * 2 + hl.int32(hl.rand_unif(0, 2)),
    ),
    alleles=sparse.alleles,
)

# Turn the table into a sparse MatrixTable by attaching a random END field:
# each variant becomes a block of random length (around 2)
sparse = sparse.annotate_entries(
    END=sparse.locus.position + hl.int32(hl.rand_pois(1)),
)

sparse.show(n_rows=20, n_cols=2)

2021-06-25 20:56:25 Hail: INFO: balding_nichols_model: generating genotypes for 1 populations, 2 samples, and 20 variants...
2021-06-25 20:56:30 Hail: INFO: Coerced sorted dataset


Unnamed: 0_level_0,Unnamed: 1_level_0,'sample_0','sample_0','sample_1','sample_1'
locus,alleles,GT,END,GT,END
locus<GRCh38>,array<str>,call,int32,call,int32
chr1:2,"[""A"",""C""]",0/1,4,1/1,2
chr1:5,"[""A"",""C""]",0/1,6,0/1,6
chr1:7,"[""A"",""C""]",0/0,9,0/0,7
chr1:8,"[""A"",""C""]",1/1,8,0/1,9
chr1:11,"[""A"",""C""]",1/1,13,0/1,12
chr1:12,"[""A"",""C""]",0/1,13,0/0,13
chr1:15,"[""A"",""C""]",0/1,16,0/0,16
chr1:16,"[""A"",""C""]",0/0,16,0/0,16
chr1:19,"[""A"",""C""]",1/1,19,1/1,22
chr1:20,"[""A"",""C""]",1/1,20,1/1,21


In [3]:
# Simulate a second small MatrixTable (1 population, 2 samples, 20 variants);
# this one is left dense, i.e. it gets no END field
dense = hl.balding_nichols_model(n_populations=1, n_samples=2, n_variants=20)

# Key the columns by sample name, as done for the sparse table
dense = dense.key_cols_by(
    s=hl.format('sample_%d', dense.sample_idx),
)

# Open up random gaps between the variants' loci, as done for the sparse table
dense = dense.key_rows_by(
    locus=hl.locus(
        dense.locus.contig,
        dense.locus.position * 2 + hl.int32(hl.rand_unif(0, 2)),
    ),
    alleles=dense.alleles,
)

dense.show(n_rows=20, n_cols=2)

2021-06-25 20:56:30 Hail: INFO: balding_nichols_model: generating genotypes for 1 populations, 2 samples, and 20 variants...
2021-06-25 20:56:31 Hail: INFO: Coerced sorted dataset


Unnamed: 0_level_0,Unnamed: 1_level_0,'sample_0','sample_1'
locus,alleles,GT,GT
locus<GRCh38>,array<str>,call,call
chr1:3,"[""A"",""C""]",0/0,1/1
chr1:4,"[""A"",""C""]",0/1,0/1
chr1:6,"[""A"",""C""]",1/1,0/1
chr1:9,"[""A"",""C""]",1/1,0/1
chr1:11,"[""A"",""C""]",0/1,0/1
chr1:13,"[""A"",""C""]",0/0,0/0
chr1:14,"[""A"",""C""]",0/1,0/1
chr1:16,"[""A"",""C""]",0/0,0/1
chr1:18,"[""A"",""C""]",0/1,1/1
chr1:21,"[""A"",""C""]",0/1,1/1


In [4]:
# The intuitive approach: densify the sparse MatrixTable as it is,
# then run the concordance analysis against the dense MatrixTable

dense_naive = hl.experimental.densify(sparse)
global_conc, cols_conc, rows_conc = hl.concordance(left=dense_naive, right=dense)

# show the resulting global confusion matrix
global_conc

2021-06-25 20:56:33 Hail: INFO: concordance: including 2 shared samples (2 total on left, 2 total on right)
2021-06-25 20:56:33 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'alleles' -> 'alleles_1'
    'bn' -> 'bn_1'
    'locus' -> 'locus_1'
2021-06-25 20:56:34 Hail: INFO: Coerced sorted dataset
2021-06-25 20:56:35 Hail: INFO: Coerced sorted dataset
2021-06-25 20:56:36 Hail: INFO: concordance: total concordance 64.29%
2021-06-25 20:56:36 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'


[[0, 0, 6, 12, 8],
 [0, 0, 0, 0, 0],
 [7, 0, 2, 2, 0],
 [8, 0, 0, 3, 2],
 [11, 0, 0, 1, 4]]

In [5]:
# The sparse MatrixTable needs 'dummy' rows before it is densified:
# one for each variant of the dense MT that is not present in the sparse MT

# keep only the rows of the dense MatrixTable whose key is absent from the sparse MatrixTable
extra_rows = dense.anti_join_rows(sparse.rows())

# attach a missing END entry field so that the entry schemas of both tables match
extra_rows = extra_rows.annotate_entries(END=hl.missing(hl.tint32))

extra_rows.show()

2021-06-25 20:56:37 Hail: INFO: Coerced sorted dataset
2021-06-25 20:56:37 Hail: INFO: Coerced sorted dataset


Unnamed: 0_level_0,Unnamed: 1_level_0,'sample_0','sample_0','sample_1','sample_1'
locus,alleles,GT,END,GT,END
locus<GRCh38>,array<str>,call,int32,call,int32
chr1:3,"[""A"",""C""]",0/0,,1/1,
chr1:4,"[""A"",""C""]",0/1,,0/1,
chr1:6,"[""A"",""C""]",1/1,,0/1,
chr1:9,"[""A"",""C""]",1/1,,0/1,
chr1:13,"[""A"",""C""]",0/0,,0/0,
chr1:14,"[""A"",""C""]",0/1,,0/1,
chr1:18,"[""A"",""C""]",0/1,,1/1,
chr1:21,"[""A"",""C""]",0/1,,1/1,
chr1:23,"[""A"",""C""]",0/1,,0/0,
chr1:24,"[""A"",""C""]",0/0,,0/0,


In [6]:
# Add the dense-only rows to the sparse MatrixTable.
# The result is bound to a new name instead of overwriting `sparse`: this keeps the
# cell idempotent (re-running `sparse = sparse.union_rows(extra_rows)` would append
# the same rows a second time) and leaves the original sparse table available.
sparse_with_extra_rows = sparse.union_rows(extra_rows)

# Remove the entries of the added rows (they still hold the GT calls copied from
# the dense table), so that these entries can be filled in by densify
sparse_with_extra_rows = sparse_with_extra_rows.filter_entries(
    hl.is_defined(extra_rows.rows()[sparse_with_extra_rows.row_key]),
    keep=False,
)

# densify
densified = hl.experimental.densify(sparse_with_extra_rows)

densified.show()

2021-06-25 20:56:39 Hail: INFO: Coerced sorted dataset
2021-06-25 20:56:40 Hail: INFO: Coerced sorted dataset
2021-06-25 20:56:40 Hail: INFO: Coerced sorted dataset
2021-06-25 20:56:40 Hail: INFO: Coerced sorted dataset
2021-06-25 20:56:40 Hail: INFO: Coerced sorted dataset


Unnamed: 0_level_0,Unnamed: 1_level_0,'sample_0','sample_1'
locus,alleles,GT,GT
locus<GRCh38>,array<str>,call,call
chr1:2,"[""A"",""C""]",0/1,1/1
chr1:3,"[""A"",""C""]",0/1,
chr1:4,"[""A"",""C""]",0/1,
chr1:5,"[""A"",""C""]",0/1,0/1
chr1:6,"[""A"",""C""]",0/1,0/1
chr1:7,"[""A"",""C""]",0/0,0/0
chr1:8,"[""A"",""C""]",1/1,0/1
chr1:9,"[""A"",""C""]",,0/1
chr1:11,"[""A"",""C""]",1/1,0/1
chr1:12,"[""A"",""C""]",0/1,0/0


In [9]:
# Finally, run the concordance analysis on the MatrixTable that was densified
# after the dense-only rows had been added
global_conc_proper, cols_conc_proper, rows_conc_proper = hl.concordance(densified, dense)
# alias kept for the original (misspelled) variable name
rows_con_proper = rows_conc_proper

# For comparison, display the confusion matrix of the naive approach.
# It is reused rather than recomputed: `global_conc` was produced by the earlier
# naive densify + concordance step.
global_conc



2021-06-25 20:58:44 Hail: INFO: concordance: including 2 shared samples (2 total on left, 2 total on right)
2021-06-25 20:58:44 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'alleles' -> 'alleles_1'
    'bn' -> 'bn_1'
    'locus' -> 'locus_1'
2021-06-25 20:58:45 Hail: INFO: Coerced sorted dataset
2021-06-25 20:58:45 Hail: INFO: Coerced sorted dataset
2021-06-25 20:58:45 Hail: INFO: Coerced sorted dataset
2021-06-25 20:58:45 Hail: INFO: Coerced sorted dataset
2021-06-25 20:58:45 Hail: INFO: Coerced sorted dataset
2021-06-25 20:58:45 Hail: INFO: Coerced sorted dataset
2021-06-25 20:58:46 Hail: INFO: concordance: total concordance 61.54%


[[0, 0, 6, 12, 8],
 [0, 0, 0, 0, 0],
 [7, 0, 2, 2, 0],
 [8, 0, 0, 3, 2],
 [11, 0, 0, 1, 4]]

In [10]:
# display the confusion matrix returned by hl.concordance(densified, dense)
global_conc_proper

[[0, 0, 1, 8, 5],
 [0, 0, 0, 0, 0],
 [7, 0, 5, 2, 1],
 [8, 0, 2, 6, 3],
 [11, 0, 0, 2, 5]]

Note how 12 genotype calls that were previously classified as missing in the sparse MT (first row of the matrix: 6 + 12 + 8 = 26 before, 1 + 8 + 5 = 14 after) are now properly classified (in the bottom-right 3x3 sub-matrix).