# In [37]:
"""
This is an alternative approach to the assembly gap/break analysis; it was not
used in the Science 2021 paper (this approach already includes T2T as a
reference assembly).
- This notebook aggregates data based on the contig-to-reference alignment BED
  files and produces filtered output for the corresponding eval notebook.
"""

import os
import pandas as pd

path = '/home/local/work/data/hgsvc/contig_aln_bed/unfiltered'
path = '/home/local/work/data/hgsvc/contig_aln_bed_t2t'
out_lenient = '/home/local/work/data/hgsvc/contig_aln_bed/unfiltered/grt00'
out_lenient = '/home/local/work/data/hgsvc/contig_aln_bed_t2t/grt00'
out_stringent = '/home/local/work/data/hgsvc/contig_aln_bed/unfiltered/grt20'
out_stringent = '/home/local/work/data/hgsvc/contig_aln_bed_t2t/grt20'
os.makedirs(out_lenient, exist_ok=True)
os.makedirs(out_stringent, exist_ok=True)

names = ['chrom', 'start', 'end', 'name', 'mapq', 'strand']
chrom_match = 'chr[0-9XY]+$'

for bedfile in os.listdir(path):
    if not bedfile.endswith('.bed'):
        continue
    file_path = os.path.join(path, bedfile)
    if 'h1-un' in bedfile:
        hap = 'H1'
    else:
        hap = 'H2'
    sample, _, platform = bedfile.split('_map-to_')[0].rsplit('_', 1)[0].split('_')
    if 'clr' in platform:
        platform = 'CLR'
    else:
        platform = 'HiFi'
    bed = pd.read_csv(file_path, sep='\t', header=None, names=names)
    bed = bed.loc[bed['chrom'].str.match(chrom_match), :].copy()
    bed['name'] = sample + '_' + platform + '_' + hap + '_' + bed['name']
    
    bed_lenient = bed.loc[bed['mapq'] > 0, :]
    file_lenient = bedfile.replace('.bed', '.mq-grt00.bed')
    path_lenient = os.path.join(out_lenient, file_lenient)
    bed_lenient.to_csv(path_lenient, sep='\t', header=False, index=False)
    
    bed_stringent = bed.loc[bed['mapq'] > 20, :]
    file_stringent = bedfile.replace('.bed', '.mq-grt20.bed')
    path_stringent = os.path.join(out_stringent, file_stringent)
    bed_stringent.to_csv(path_stringent, sep='\t', header=False, index=False)


def load_all_complements(folder):
    
    #children = ['HG00733', 'NA19240', 'HG00514']
    loaded = []
    bed_files = [os.path.join(folder, b) for b in os.listdir(folder) if b.endswith('complement.bed')]
    for b in bed_files:
        sample = os.path.basename(b).split('_')[0]
        #if sample in children:
        #    continue
        if 'h1-un' in b:
            hap = 'H1'
        else:
            hap = 'H2'
        if 'pbsq2-ccs' in b:
            platform = 'HiFi'
        else:
            platform = 'CLR'
        name = '_'.join([sample, platform, hap])
        df = pd.read_csv(b, sep='\t', header=None, names=['chrom', 'start', 'end'])
        df['name'] = name
        loaded.append(df)
    loaded = pd.concat(loaded, axis=0, ignore_index=False)
    loaded.sort_values(['chrom', 'start', 'end'], inplace=True)
    loaded.reset_index(drop=True, inplace=True)
    return loaded
    
mq00_files = load_all_complements(out_lenient)
mq00_merged_out = os.path.join(out_lenient, 'all_grt00_concat.tsv')
mq00_files.to_csv(mq00_merged_out, sep='\t', header=False, index=False)

mq20_files = load_all_complements(out_stringent)
mq20_merged_out = os.path.join(out_stringent, 'all_grt20_concat.tsv')
mq20_files.to_csv(mq20_merged_out, sep='\t', header=False, index=False)

def unflatten_merged_table(file_path, name_prefix):
    
    records = []
    current_chrom = None
    region_counter = 0
    with open(file_path, 'r') as table:
        for line in table:
            clr_count = 0
            hifi_count = 0
            chrom, start, end, haplotypes = line.strip().split('\t')
            if current_chrom != chrom:
                region_counter = 0
                current_chrom = chrom
            region_counter += 1
            entry = {
                'chrom': chrom,
                'start': int(start),
                'end': int(end),
                'name': '_'.join([name_prefix, str(region_counter), start]),
                'length': int(end) - int(start)
            }
            for h in haplotypes.split(','):
                if 'CLR' in h:
                    clr_count += 1
                else:
                    hifi_count += 1
                entry[h] = 1
            entry['COUNT_CLR'] = clr_count
            entry['COUNT_HIFI'] = hifi_count
            records.append(entry)
    df = pd.DataFrame.from_records(records)
    df.fillna(0, inplace=True)
    column_typer = dict((c, int) for c in df.columns if c not in ['chrom', 'name'])
    column_typer['chrom'] = str
    df = df.astype(column_typer)
    col_sort = ['chrom', 'start', 'end', 'name', 'length', 'COUNT_CLR', 'COUNT_HIFI']
    sorted_samples = sorted([c for c in df.columns if c not in col_sort])
    col_sort.extend(sorted_samples)
    df = df[col_sort]
    return df

file_paths = [
    '/home/local/work/data/hgsvc/contig_aln_bed/unfiltered/grt20/all_grt20_merged.tsv',
    '/home/local/work/data/hgsvc/contig_aln_bed/unfiltered/grt20/all_grt20_no-child_merged.tsv',
    '/home/local/work/data/hgsvc/contig_aln_bed/unfiltered/grt00/all_grt00_merged.tsv',
    '/home/local/work/data/hgsvc/contig_aln_bed/unfiltered/grt00/all_grt00_no-child_merged.tsv',
    '/home/local/work/data/hgsvc/contig_aln_bed_t2t/grt00/all_grt00_merged.tsv',
    '/home/local/work/data/hgsvc/contig_aln_bed_t2t/grt00/all_grt00_no-child_merged.tsv',
    '/home/local/work/data/hgsvc/contig_aln_bed_t2t/grt20/all_grt20_merged.tsv',
    '/home/local/work/data/hgsvc/contig_aln_bed_t2t/grt20/all_grt20_no-child_merged.tsv',
]

name_prefixes = [
    'GRCh38_MQ20_H70',
    'GRCh38_MQ20_H64',
    'GRCh38_MQ00_H70',
    'GRCh38_MQ00_H64',
    'T2T_MQ00_H70',
    'T2T_MQ00_H64',
    'T2T_MQ20_H70',
    'T2T_MQ20_H64',
]

for fp, np in zip(file_paths, name_prefixes):
    df = unflatten_merged_table(fp, np)
    out_file = os.path.join(os.path.dirname(fp), 'Gaps_' + np + '.tsv')
    with open(out_file, 'w') as dump:
        _ = dump.write('#')
        df.to_csv(dump, sep='\t', header=True, index=False)
    