In [4]:

"""
Formulas for QV estimation as stated in 

Rhie, A., Walenz, B.P., Koren, S. et al.
Merqury: reference-free quality, completeness, and phasing assessment for genome assemblies.
Genome Biol 21, 245 (2020). https://doi.org/10.1186/s13059-020-02134-9

Methods section "Consensus quality (QV) estimation"
"""

import math

def prob_base_correct(kmer_shared, kmer_total, kmer_size=21):
    
    return (kmer_shared / kmer_total) ** (1/kmer_size)


def base_error_rate(kmer_assembly_only, kmer_total, kmer_size=21):
       
    return 1 - (1 - kmer_assembly_only / kmer_total) ** (1/kmer_size)


def qv_estimate(error_rate):
    
    return -10 * math.log10(error_rate)

output = """
NA19650_hgsvc_pbsq2-clr_1000-flye.h1-un.arrow-p1        12000233        2850910652      36.9714 0.000200844
NA19650_hgsvc_pbsq2-clr_1000-flye.h2-un.arrow-p1        12062704        2850642654      36.9484 0.000201911
Both    24062937        5701553306      36.9599 0.000201377
"""

hap1_kmer_only = 12000233
hap2_kmer_only = 12062704

hap1_kmer_shared = 2850910652
hap2_kmer_shared = 2850642654

print('hap1 P(correct): {}'.format(prob_base_correct(hap1_kmer_shared, hap1_kmer_shared + hap1_kmer_only)))
print('hap2 P(correct): {}'.format(prob_base_correct(hap2_kmer_shared, hap1_kmer_shared + hap2_kmer_only)))
hap1_error = base_error_rate(hap1_kmer_only, hap1_kmer_only + hap1_kmer_shared)
hap2_error = base_error_rate(hap2_kmer_only, hap2_kmer_only + hap1_kmer_shared)
print('hap1 P(error): {}'.format(hap1_error))
print('hap2 P(error): {}'.format(hap2_error))
print('hap1 QV: {}'.format(qv_estimate(hap1_error)))
print('hap2 QV: {}'.format(qv_estimate(hap2_error)))

hap1 P(correct): 0.9997999995774884
hap2 P(correct): 0.9997944850135683
hap1 P(error): 0.00020000042251155925
hap2 P(error): 0.00020103928843706598
hap1 QV: 36.98969086864794
hap2 QV: 36.9671906156432
