# Concordance analysis between a sparse and a dense MatrixTable

If we naively `densify` the sparse MT and then perform the concordance analysis, all the variants present only in the dense MT
will be matched to missing data in the sparse MT, because the sparse MT has no row for them.

To address this problem, we must first create 'empty' rows in the sparse MT corresponding to variants present only in the dense matrix
before densifying.

See: https://discuss.hail.is/t/concordance-with-sparse-matrixtable/2086/2

In [None]:
import hail as hl;

# Every TOB-WGS dataset uses GRCh38, so make it the default reference for the session
hl.init(default_reference='GRCh38')

In [48]:
# Toy dataset: 1 population, 2 samples, 20 variants
sparse = hl.balding_nichols_model(n_populations=1, n_samples=2, n_variants=20)

# Key the columns by a sample-ID style string (sample_0, sample_1, ...)
# rather than by the bare integer index
sample_id = hl.format('sample_%d', sparse.sample_idx)
sparse = sparse.key_cols_by(s=sample_id)

# Spread the loci out: double every position and add a random 0-or-1 offset,
# which leaves random gaps between consecutive variants
position_jitter = hl.int32(hl.rand_unif(0, 2))
gapped_locus = hl.locus(
    sparse.locus.contig,
    sparse.locus.position * 2 + position_jitter,
)
sparse = sparse.key_rows_by(locus=gapped_locus, alleles=sparse.alleles)

# Mimic a sparse MatrixTable: give every entry an END coordinate so that each
# variant becomes a block starting at its own position, extended by a
# Poisson(1) number of bases (i.e. a block length of about 2)
block_extension = hl.int32(hl.rand_pois(1))
sparse = sparse.annotate_entries(END=sparse.locus.position + block_extension)

sparse.show(n_rows=20, n_cols=2)

2021-06-25 05:34:06 Hail: INFO: balding_nichols_model: generating genotypes for 1 populations, 2 samples, and 20 variants...
2021-06-25 05:34:07 Hail: INFO: Coerced sorted dataset


Unnamed: 0_level_0,Unnamed: 1_level_0,'sample_0','sample_0','sample_1','sample_1'
locus,alleles,GT,END,GT,END
locus<GRCh38>,array<str>,call,int32,call,int32
chr1:2,"[""A"",""C""]",1/1,2,0/1,3
chr1:5,"[""A"",""C""]",1/1,6,1/1,5
chr1:6,"[""A"",""C""]",0/1,9,0/1,8
chr1:8,"[""A"",""C""]",0/1,8,1/1,9
chr1:10,"[""A"",""C""]",0/0,11,0/0,10
chr1:12,"[""A"",""C""]",1/1,13,1/1,12
chr1:15,"[""A"",""C""]",1/1,16,1/1,16
chr1:17,"[""A"",""C""]",0/1,17,0/0,17
chr1:18,"[""A"",""C""]",1/1,19,1/1,18
chr1:21,"[""A"",""C""]",1/1,21,1/1,22


In [49]:
# Second toy dataset, kept dense (no END field): 1 population, 2 samples, 20 variants
dense = hl.balding_nichols_model(n_populations=1, n_samples=2, n_variants=20)

# Same sample naming as the sparse MatrixTable so that the columns match up
dense_sample_id = hl.format('sample_%d', dense.sample_idx)
dense = dense.key_cols_by(s=dense_sample_id)

# Same locus scheme as the sparse MatrixTable (position * 2 plus a random
# 0-or-1 offset), so the two tables share some loci but not all of them
dense_jitter = hl.int32(hl.rand_unif(0, 2))
dense_locus = hl.locus(
    dense.locus.contig,
    dense.locus.position * 2 + dense_jitter,
)
dense = dense.key_rows_by(locus=dense_locus, alleles=dense.alleles)

dense.show(n_rows=20, n_cols=2)

2021-06-25 05:34:51 Hail: INFO: balding_nichols_model: generating genotypes for 1 populations, 2 samples, and 20 variants...
2021-06-25 05:34:52 Hail: INFO: Coerced sorted dataset


Unnamed: 0_level_0,Unnamed: 1_level_0,'sample_0','sample_1'
locus,alleles,GT,GT
locus<GRCh38>,array<str>,call,call
chr1:2,"[""A"",""C""]",0/0,1/1
chr1:5,"[""A"",""C""]",0/0,0/1
chr1:7,"[""A"",""C""]",0/1,0/0
chr1:9,"[""A"",""C""]",1/1,1/1
chr1:11,"[""A"",""C""]",1/1,0/1
chr1:13,"[""A"",""C""]",0/0,0/0
chr1:14,"[""A"",""C""]",0/0,0/1
chr1:17,"[""A"",""C""]",1/1,0/0
chr1:19,"[""A"",""C""]",0/1,1/1
chr1:20,"[""A"",""C""]",0/1,1/1


In [50]:
# Before densifying the sparse MatrixTable we need 'dummy' rows that
# correspond to the variants found in the dense MT but absent from the sparse MT.

# Row keys (locus, alleles) already present in the sparse MatrixTable
sparse_variants = sparse.rows()

# Keep only the dense rows whose key is NOT among the sparse variants, and give
# their entries a missing END field so the entry schema matches the sparse MT
extra_rows = (
    dense
    .anti_join_rows(sparse_variants)
    .annotate_entries(END=hl.missing('tint32'))
)

extra_rows.show()

2021-06-25 05:35:00 Hail: INFO: Coerced sorted dataset
2021-06-25 05:35:00 Hail: INFO: Coerced sorted dataset


Unnamed: 0_level_0,Unnamed: 1_level_0,'sample_0','sample_0','sample_1','sample_1'
locus,alleles,GT,END,GT,END
locus<GRCh38>,array<str>,call,int32,call,int32
chr1:7,"[""A"",""C""]",0/1,,0/0,
chr1:9,"[""A"",""C""]",1/1,,1/1,
chr1:11,"[""A"",""C""]",1/1,,0/1,
chr1:13,"[""A"",""C""]",0/0,,0/0,
chr1:14,"[""A"",""C""]",0/0,,0/1,
chr1:19,"[""A"",""C""]",0/1,,1/1,
chr1:20,"[""A"",""C""]",0/1,,1/1,
chr1:24,"[""A"",""C""]",0/0,,0/1,
chr1:27,"[""A"",""C""]",0/0,,0/1,
chr1:33,"[""A"",""C""]",1/1,,1/1,


In [51]:
# Append the dense-only variants to the sparse MatrixTable
sparse = sparse.union_rows(extra_rows)

# The appended rows still carry the dense genotypes. Drop the entries of every
# row that came from `extra_rows` (keep=False removes the entries matching the
# predicate) so that densify can fill them in.
is_added_row = hl.is_defined(extra_rows.rows()[sparse.row_key])
sparse = sparse.filter_entries(is_added_row, keep=False)

# Densify the augmented sparse MatrixTable
densified = hl.experimental.densify(sparse)

densified.show()

2021-06-25 05:35:09 Hail: INFO: Coerced sorted dataset
2021-06-25 05:35:09 Hail: INFO: Coerced sorted dataset
2021-06-25 05:35:09 Hail: INFO: Coerced sorted dataset
2021-06-25 05:35:09 Hail: INFO: Coerced sorted dataset
2021-06-25 05:35:09 Hail: INFO: Coerced sorted dataset


Unnamed: 0_level_0,Unnamed: 1_level_0,'sample_0','sample_1'
locus,alleles,GT,GT
locus<GRCh38>,array<str>,call,call
chr1:2,"[""A"",""C""]",1/1,0/1
chr1:5,"[""A"",""C""]",1/1,1/1
chr1:6,"[""A"",""C""]",0/1,0/1
chr1:7,"[""A"",""C""]",0/1,0/1
chr1:8,"[""A"",""C""]",0/1,1/1
chr1:9,"[""A"",""C""]",,1/1
chr1:10,"[""A"",""C""]",0/0,0/0
chr1:11,"[""A"",""C""]",0/0,
chr1:12,"[""A"",""C""]",1/1,1/1
chr1:13,"[""A"",""C""]",1/1,


In [55]:
# Both MatrixTables now contain a row for every dense variant, so the
# concordance can be computed with the densified MT on the left and the
# dense MT on the right.
concordance_results = hl.concordance(left=densified, right=dense)

# The call returns three results, unpacked here as the global summary, the
# per-column table and the per-row table
global_conc, cols_conc, rows_conc = concordance_results

rows_conc.show()

2021-06-25 05:37:03 Hail: INFO: concordance: including 2 shared samples (2 total on left, 2 total on right)
2021-06-25 05:37:03 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'bn' -> 'bn_1'
    'locus' -> 'locus_1'
    'alleles' -> 'alleles_1'
2021-06-25 05:37:04 Hail: INFO: Coerced sorted dataset
2021-06-25 05:37:04 Hail: INFO: Coerced sorted dataset
2021-06-25 05:37:04 Hail: INFO: Coerced sorted dataset
2021-06-25 05:37:04 Hail: INFO: Coerced sorted dataset
2021-06-25 05:37:04 Hail: INFO: Coerced sorted dataset
2021-06-25 05:37:04 Hail: INFO: Coerced sorted dataset
2021-06-25 05:37:05 Hail: INFO: concordance: total concordance 40.62%
2021-06-25 05:37:06 Hail: INFO: Coerced sorted dataset
2021-06-25 05:37:06 Hail: INFO: Coerced sorted dataset
2021-06-25 05:37:06 Hail: INFO: Coerced sorted dataset
2021-06-25 05:37:06 Hail: INFO: Coerced sorted dataset
2021-06-25 05:37:06 Hail: INFO: Coerced sorted dataset
2021-06-25 05:37:07 Hail: INFO: C

locus,alleles,concordance,n_discordant
locus<GRCh38>,array<str>,array<array<int64>>,int64
chr1:2,"[""A"",""C""]","[[0,0,0,0,0],[0,0,0,0,0],[0,0,0,0,0],[0,0,0,0,1],[0,0,1,0,0]]",2
chr1:5,"[""A"",""C""]","[[0,0,0,0,0],[0,0,0,0,0],[0,0,0,0,0],[0,0,0,0,0],[0,0,1,1,0]]",2
chr1:6,"[""A"",""C""]","[[0,0,0,0,0],[0,0,0,0,0],[0,0,0,0,0],[2,0,0,0,0],[0,0,0,0,0]]",0
chr1:7,"[""A"",""C""]","[[0,0,0,0,0],[0,0,0,0,0],[0,0,0,0,0],[0,0,1,1,0],[0,0,0,0,0]]",1
chr1:8,"[""A"",""C""]","[[0,0,0,0,0],[0,0,0,0,0],[0,0,0,0,0],[1,0,0,0,0],[1,0,0,0,0]]",0
chr1:9,"[""A"",""C""]","[[0,0,0,0,1],[0,0,0,0,0],[0,0,0,0,0],[0,0,0,0,0],[0,0,0,0,1]]",0
chr1:10,"[""A"",""C""]","[[0,0,0,0,0],[0,0,0,0,0],[2,0,0,0,0],[0,0,0,0,0],[0,0,0,0,0]]",0
chr1:11,"[""A"",""C""]","[[0,0,0,1,0],[0,0,0,0,0],[0,0,0,0,1],[0,0,0,0,0],[0,0,0,0,0]]",1
chr1:12,"[""A"",""C""]","[[0,0,0,0,0],[0,0,0,0,0],[0,0,0,0,0],[0,0,0,0,0],[2,0,0,0,0]]",0
chr1:13,"[""A"",""C""]","[[0,0,1,0,0],[0,0,0,0,0],[0,0,0,0,0],[0,0,0,0,0],[0,0,1,0,0]]",1
