In [51]:
"""
This notebook implements an alternative approach to the assembly gap analysis.
This approach was not used in the Science 2021 paper; it already includes T2T
as a reference assembly.
The statistics are computed from the input files generated in
"collect_assembly_breaks_exp".
"""

import os as os
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt

# Tab-separated gap tables used as input for this cell, listed in the order
# in which they are processed. The file names carry three tags: the reference
# (T2T or GRCh38), an "MQ" tag (MQ00 / MQ20) and an "H" tag (H64 / H70).
# NOTE(review): MQ presumably denotes a mapping-quality cutoff and H a
# haplotype/assembly count - confirm against "collect_assembly_breaks_exp".
gap_files = [
    # T2T reference
    '/home/local/work/data/hgsvc/contig_aln_bed_t2t/grt20/Gaps_T2T_MQ20_H64.tsv',
    '/home/local/work/data/hgsvc/contig_aln_bed_t2t/grt20/Gaps_T2T_MQ20_H70.tsv',
    '/home/local/work/data/hgsvc/contig_aln_bed_t2t/grt00/Gaps_T2T_MQ00_H64.tsv',
    '/home/local/work/data/hgsvc/contig_aln_bed_t2t/grt00/Gaps_T2T_MQ00_H70.tsv',
    # GRCh38 reference
    '/home/local/work/data/hgsvc/contig_aln_bed/unfiltered/grt00/Gaps_GRCh38_MQ00_H64.tsv',
    '/home/local/work/data/hgsvc/contig_aln_bed/unfiltered/grt00/Gaps_GRCh38_MQ00_H70.tsv',
    '/home/local/work/data/hgsvc/contig_aln_bed/unfiltered/grt20/Gaps_GRCh38_MQ20_H64.tsv',
    '/home/local/work/data/hgsvc/contig_aln_bed/unfiltered/grt20/Gaps_GRCh38_MQ20_H70.tsv'
]

def select_tech_only_breaks(df, min_length=0, most_threshold=0.7):
    """
    Split a gap table into CLR-only, HiFi-only and shared ("both") breaks.

    A row is flagged as a "most" break for a technology when its count is
    strictly greater than ``floor(max_count * most_threshold)``, where
    ``max_count`` is the largest value found in that technology's count
    column. The maxima are taken over the whole of ``df``, i.e. before the
    length filter is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'COUNT_CLR', 'COUNT_HIFI' and 'length'.
        NOTE(review): the counts presumably give the number of assemblies
        per technology that break in the region - confirm with the code
        that produces the input tables.
    min_length : int or float, default 0
        Only rows with ``length >= min_length`` are returned.
    most_threshold : float, default 0.7
        Fraction of the per-technology maximum count that a row must exceed
        to be flagged as a "most" break for that technology.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(clr_only_breaks, hifi_only_breaks, both_breaks)``:
        rows with a "most" CLR count and a HiFi count of zero, rows with a
        "most" HiFi count and a CLR count of zero, and rows with "most"
        counts for both technologies. The original row labels are kept.
    """
    clr_counts = df['COUNT_CLR']
    hifi_counts = df['COUNT_HIFI']

    most_clr_threshold = np.floor(clr_counts.max() * most_threshold)
    most_hifi_threshold = np.floor(hifi_counts.max() * most_threshold)

    most_clr_break = clr_counts > most_clr_threshold
    most_hifi_break = hifi_counts > most_hifi_threshold

    no_clr_break = clr_counts == 0
    no_hifi_break = hifi_counts == 0

    # Fold the length filter into the row masks and index `df` directly.
    # All masks share df's index, so the selection does not depend on pandas
    # aligning a full-length mask against an already filtered frame.
    long_enough = df['length'] >= min_length

    clr_only_breaks = df.loc[long_enough & most_clr_break & no_hifi_break, :]
    hifi_only_breaks = df.loc[long_enough & most_hifi_break & no_clr_break, :]
    both_breaks = df.loc[long_enough & most_hifi_break & most_clr_break, :]

    return clr_only_breaks, hifi_only_breaks, both_breaks


def compute_basic_stats(breaks):
    """
    Print summary statistics of the 'length' column of `breaks`.

    Reports, one value per line, the count, minimum, median, 75th percentile
    and maximum as computed by pandas' `describe()`, followed by a separator
    line. Nothing is returned.
    """
    summary = breaks['length'].describe()

    # (printed label, key in the describe() result), in output order
    report_layout = (
        ('count ', 'count'),
        ('shortest ', 'min'),
        ('median ', '50%'),
        ('75%ile ', '75%'),
        ('longest ', 'max'),
    )
    for label, key in report_layout:
        print(label, summary[key])

    print('============================')
    return None

# For every H64 / MQ20 gap table: split the breaks by technology, print the
# length statistics per set and dump each set as a BED-like file.
for gf in gap_files:
    fname = os.path.basename(gf)
    # both tags are looked up in the full path; all other inputs are skipped
    if not ('H64' in gf and 'MQ20' in gf):
        continue
    df = pd.read_csv(gf, sep='\t', header=0)
    clr_breaks, hifi_breaks, both_breaks = select_tech_only_breaks(df)

    # output suffix -> break set, in the order in which they are reported
    break_sets = (
        ('.clr-breaks.bed', clr_breaks),
        ('.hifi-breaks.bed', hifi_breaks),
        ('.both-breaks.bed', both_breaks),
    )
    written_paths = []
    for bed_suffix, break_set in break_sets:
        bed_name = fname.replace('.tsv', bed_suffix)
        print(bed_name)
        compute_basic_stats(break_set)
        bed_path = os.path.join('/home/local/work/data/hgsvc/t2tv1', bed_name)
        break_set[['#chrom', 'start', 'end', 'name']].to_csv(
            bed_path, sep='\t', header=True, index=False
        )
        written_paths.append(bed_path)

    # keep the per-technology output paths bound at cell scope
    out_clr, out_hifi, out_both = written_paths
    
    

Gaps_T2T_MQ20_H64.clr-breaks.bed
count  3.0
shortest  25217.0
median  91877.0
75%ile  100560.0
longest  109243.0
Gaps_T2T_MQ20_H64.hifi-breaks.bed
count  2.0
shortest  10989.0
median  26809.0
75%ile  34719.0
longest  42629.0
Gaps_T2T_MQ20_H64.both-breaks.bed
count  156.0
shortest  8.0
median  453688.5
75%ile  1633557.25
longest  65858861.0
Gaps_GRCh38_MQ20_H64.clr-breaks.bed
count  3.0
shortest  25184.0
median  91871.0
75%ile  100516.0
longest  109161.0
Gaps_GRCh38_MQ20_H64.hifi-breaks.bed
count  2.0
shortest  9387.0
median  26008.5
75%ile  34319.25
longest  42630.0
Gaps_GRCh38_MQ20_H64.both-breaks.bed
count  198.0
shortest  191.0
median  351296.0
75%ile  1286182.25
longest  65746119.0
