In [33]:
import pandas as pd
import os

"""
What does this do?
Summarize "artificial" read alignment coverage over 500 kbp windows for unscaffolded sequences.
This is a preprocessing step before plotting the Supp. Fig. (ideogram).
"""


# Directory that is scanned for per-sample window tables (*win500k.tsv).
input_path = '/home/local/work/data/hgsvc/figSX_panels/bng_unsupported/v12'

# Directory the averaged coverage tables are written to.
out_path = '/home/local/work/code/github/project-diploid-assembly/annotation/grch38'

# Column names assigned to the window tables when reading them; only the
# window coordinates and 'fraction' are actually loaded.
columns = [
    'chrom',
    'start',
    'end',
    'overlaps',
    'bp_overlap',
    'length',
    'fraction'
]


def load_window_tables(table_dir, column_names):
    """Read all '*win500k.tsv' tables in *table_dir*, split by technology.

    Files whose name contains 'pbsq2-ccs' are collected as HiFi tables,
    files containing 'pbsq2-clr' as CLR tables. Files not ending in
    'win500k.tsv' are ignored.

    Args:
        table_dir: directory to scan (not recursive).
        column_names: names assigned to the columns of each table; must
            include 'chrom', 'start', 'end' and 'fraction'.

    Returns:
        Tuple ``(hifi_tables, clr_tables)`` of lists of DataFrames, each
        with a single 'fraction' column indexed by (chrom, start, end).

    Raises:
        ValueError: if a window table matches neither technology tag.
    """
    hifi_tables = []
    clr_tables = []
    # os.listdir() returns entries in arbitrary order; sort so that the
    # order in which tables are concatenated is reproducible across runs.
    for table in sorted(os.listdir(table_dir)):
        if not table.endswith('win500k.tsv'):
            continue
        # Classify before parsing so an unexpected file fails fast and the
        # error message names the offending file.
        if 'pbsq2-ccs' in table:
            target = hifi_tables
        elif 'pbsq2-clr' in table:
            target = clr_tables
        else:
            raise ValueError(
                'Cannot determine read type (pbsq2-ccs / pbsq2-clr) '
                'from file name: {}'.format(table)
            )
        df = pd.read_csv(
            os.path.join(table_dir, table),
            sep='\t',
            names=column_names,
            usecols=['chrom', 'start', 'end', 'fraction'],
            index_col=['chrom', 'start', 'end']
        )
        target.append(df)
    return hifi_tables, clr_tables


def average_coverage(tables):
    """Average the 'fraction' values of several tables per window.

    The tables are joined side by side on their (chrom, start, end) index.
    The average is the row sum divided by the number of tables, so a window
    that is absent from a table contributes zero for that table (it is NOT
    skipped as it would be with ``mean``). Values are rounded to three
    decimals and windows whose rounded average is not positive are dropped.

    Args:
        tables: non-empty list of single-column DataFrames sharing the same
            index levels.

    Returns:
        Series named 'avg_cov' with the retained windows.

    Raises:
        ValueError: if *tables* is empty.
    """
    if not tables:
        raise ValueError('No coverage tables to summarize')
    wide = pd.concat(tables, axis=1, ignore_index=False)
    # sum() ignores missing values, the divisor does not -> missing == 0.
    avg_cov = (wide.sum(axis=1) / wide.shape[1]).round(3).rename('avg_cov')
    # Filter after rounding: averages that round to 0.000 are dropped, too.
    return avg_cov[avg_cov > 0]


# Only touch the file system when executed (script or notebook cell), not
# when the definitions above are imported.
if __name__ == '__main__':
    hifi_data, clr_data = load_window_tables(input_path, columns)

    hifi = average_coverage(hifi_data)
    clr = average_coverage(clr_data)

    hifi.to_csv(
        os.path.join(out_path, 'hifi_unscf_avg_cov.bed'),
        sep='\t',
        header=True,
        index=True
    )

    clr.to_csv(
        os.path.join(out_path, 'clr_unscf_avg_cov.bed'),
        sep='\t',
        header=True,
        index=True
    )
