# Understanding sparse MatrixTables and `densify()`



In [None]:
import hail as hl

# Every TOB-WGS dataset is aligned to GRCh38, so make it the default reference
hl.init(default_reference='GRCh38')


In [23]:
# Simulate a tiny random MatrixTable with the Balding-Nichols model:
# 1 population, 2 samples, 20 variants.
# Note: show() only displays the first rows (10 in the output below).
ht1 = hl.balding_nichols_model(n_populations=1, n_samples=2, n_variants=20)
ht1.show()

2021-06-24 12:31:39 Hail: INFO: balding_nichols_model: generating genotypes for 1 populations, 2 samples, and 20 variants...
2021-06-24 12:31:40 Hail: INFO: Coerced sorted dataset


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
locus,alleles,GT,GT
locus<GRCh38>,array<str>,call,call
chr1:1,"[""A"",""C""]",0/1,1/1
chr1:2,"[""A"",""C""]",1/1,0/1
chr1:3,"[""A"",""C""]",1/1,0/1
chr1:4,"[""A"",""C""]",0/1,0/1
chr1:5,"[""A"",""C""]",1/1,0/1
chr1:6,"[""A"",""C""]",0/1,0/1
chr1:7,"[""A"",""C""]",1/1,0/1
chr1:8,"[""A"",""C""]",0/1,0/0
chr1:9,"[""A"",""C""]",0/1,0/0
chr1:10,"[""A"",""C""]",1/1,1/1


In [24]:
# Open up gaps between consecutive variants by doubling every position
# (chr1:1, chr1:2, ... become chr1:2, chr1:4, ...).
# The rows are re-keyed by the new locus only.
doubled_locus = hl.locus(ht1.locus.contig, ht1.locus.position * 2)
ht1 = ht1.key_rows_by(locus=doubled_locus)
ht1.show()

2021-06-24 12:31:41 Hail: INFO: Coerced sorted dataset


Unnamed: 0_level_0,0,1
locus,GT,GT
locus<GRCh38>,call,call
chr1:2,0/1,1/1
chr1:4,1/1,0/1
chr1:6,1/1,0/1
chr1:8,0/1,0/1
chr1:10,1/1,0/1
chr1:12,0/1,0/1
chr1:14,1/1,0/1
chr1:16,0/1,0/0
chr1:18,0/1,0/0
chr1:20,1/1,1/1


In [25]:
# Randomly drop entries: each entry is kept with probability 0.5
keep_entry = hl.rand_bool(0.5)
ht1 = ht1.filter_entries(keep_entry)

# Turn the table into a sparse MatrixTable by attaching a random END field:
# every remaining entry becomes a block running from its own position to
# END = position + a Poisson(1) draw, i.e. about 2 positions long on average.
block_extension = hl.int32(hl.rand_pois(1))
ht1 = ht1.annotate_entries(END=ht1.locus.position + block_extension)

ht1.show()

2021-06-24 12:31:42 Hail: INFO: Coerced sorted dataset


Unnamed: 0_level_0,0,0,1,1
locus,GT,END,GT,END
locus<GRCh38>,call,int32,call,int32
chr1:2,,,,
chr1:4,1/1,6.0,,
chr1:6,,,,
chr1:8,,,0/1,9.0
chr1:10,1/1,12.0,,
chr1:12,0/1,13.0,,
chr1:14,1/1,18.0,,
chr1:16,,,,
chr1:18,,,0/0,20.0
chr1:20,1/1,20.0,1/1,24.0


In [27]:
# Densify the sparse MatrixTable: missing entries that fall inside an earlier
# block (as given by that block's END) in the same column are filled in.
# The displayed result below only has the GT entry field.
ht2 = hl.experimental.densify(ht1)
ht2.show()

2021-06-24 12:33:02 Hail: INFO: Coerced sorted dataset


Unnamed: 0_level_0,0,1
locus,GT,GT
locus<GRCh38>,call,call
chr1:2,,
chr1:4,1/1,
chr1:6,1/1,
chr1:8,,0/1
chr1:10,1/1,
chr1:12,0/1,
chr1:14,1/1,
chr1:16,1/1,
chr1:18,1/1,0/0
chr1:20,1/1,1/1


In [28]:
# For every (row, column) of the densified table, look up the matching entry
# of the sparse table so both genotypes can be displayed side by side:
#   GTprior - genotype before densifying
#   GTpost  - genotype after densifying
#   END     - block end recorded in the sparse table
sparse_entry = ht1[ht2.row_key, ht2.col_key]
ht3 = ht2.select_entries(
    GTprior=sparse_entry.GT,
    GTpost=ht2.GT,
    END=sparse_entry.END,
)
ht3.show(20, 2)

2021-06-24 12:33:35 Hail: INFO: Coerced sorted dataset
2021-06-24 12:33:35 Hail: INFO: Coerced sorted dataset
2021-06-24 12:33:35 Hail: INFO: Coerced sorted dataset


Unnamed: 0_level_0,0,0,0,1,1,1
locus,GTprior,GTpost,END,GTprior,GTpost,END
locus<GRCh38>,call,call,int32,call,call,int32
chr1:2,,,,,,
chr1:4,1/1,1/1,6.0,,,
chr1:6,,1/1,,,,
chr1:8,,,,0/1,0/1,9.0
chr1:10,1/1,1/1,12.0,,,
chr1:12,0/1,0/1,13.0,,,
chr1:14,1/1,1/1,18.0,,,
chr1:16,,1/1,,,,
chr1:18,,1/1,,0/0,0/0,20.0
chr1:20,1/1,1/1,20.0,1/1,1/1,24.0


The `densify` algorithm proceeds as follows:

for each **missing** entry
* find the previous (in row_key order) non-missing entry in the same column for which END is defined
* if that entry's END is at or beyond the position of the current locus (i.e. its block covers the locus), replace the missing entry with that previous entry

For example
1. chr1:6 sample 0, the missing genotype is replaced by 1/1 of chr1:4 (END=6)
2. chr1:16 sample 0, the missing genotype is replaced by 1/1 of chr1:14 (END=18)
3. chr1:18 sample 0, the missing genotype is replaced by 1/1 of chr1:14 (END=18)
4. chr1:8 sample 0, the missing genotype is **not** replaced because the previous non-missing entry is chr1:4 (END=6), whose block does not reach position 8