In [87]:
import os
import pandas as pd
import collections as col

"""
What does this do?
Dumps the supplementary table "MAPQ60 contig coverage", i.e., the contig coverage
relative to various subsets of the full GRCh38 reference assembly.
Requires the output of the "recomp_cov.py" script (line ~35 ff.).
"""

# Alignment store path; not referenced anywhere in the visible part of this file.
path2 = '/home/local/work/data/hgsvc/aln_summary/ps_ctgref_alnstore.h5'

# Sample IDs that extract_sample_info() labels as 'M'; every other sample is 'F'.
males = [
    'HG02011', 'HG03371', 'HG03065', 'NA19239',
    'NA19650', 'HG00731', 'NA18534', 'HG00512',
    'HG01596', 'NA24385', 'HG00096', 'HG01505',
    'NA20509', 'HG03009', 'HG03732', 'HG02492',
]

# Folder scanned for the per-sample '*.tsv' coverage tables ...
tsv_path = '/home/local/work/data/hgsvc/aln_summary/regions'
# ... and the HDF5 file in which the merged tables are cached.
cache_file = '/home/local/work/data/hgsvc/aln_summary/ctg_region_aln.cache.h5'

# File-name suffix (the part following 'cov-in_GRCh38_HGSVC2_noalt.') -> region set label.
# The insertion order is also the order in which region sets are summarized.
region_desc = dict([
    ('tsv', 'whole_genome'),
    ('giemsa.tsv', 'wg_noGiemPosVar'),
    ('giemsa.nogap.tsv', 'wg_noGiemPosVar_noGap'),
    ('giemsa.nogap.122XY.tsv', 'wg_noGiemPosVar_noGap_122XY'),
    ('giemsa.nogap.122X.tsv', 'wg_noGiemPosVar_noGap_122X'),
])

# Column names assigned to the headerless TSV tables when they are loaded.
table_columns = ['chrom', 'start', 'end', 'num_overlaps', 'coverage_bp', 'length', 'coverage_frac']

def extract_sample_info(file_path):
    """
    Derive sample annotation from the name of a coverage table file.

    Only the base name of `file_path` is inspected:
    - sample: everything before the first underscore
    - sex: 'M' if the sample is listed in `males`, otherwise 'F'
    - platform: 'CLR' if the name contains 'clr', otherwise 'HiFi'
    - hap: 'H1' if the name contains 'h1-un', otherwise 'H2'
    - region_id: `region_desc` entry for the text following the last
      'cov-in_GRCh38_HGSVC2_noalt.' marker (the whole name if the marker is absent)

    Returns the tuple (sample, sex, platform, hap, region_id).
    Raises KeyError if the region suffix is not a key of `region_desc`.
    """
    base_name = os.path.basename(file_path)
    sample = base_name.split('_')[0]

    sex = 'M' if sample in males else 'F'
    platform = 'CLR' if 'clr' in base_name else 'HiFi'
    hap = 'H1' if 'h1-un' in base_name else 'H2'

    region_suffix = base_name.split('cov-in_GRCh38_HGSVC2_noalt.')[-1]
    region_id = region_desc[region_suffix]
    return sample, sex, platform, hap, region_id


def cache_region_coverage():
    """
    Merge all coverage tables found in `tsv_path` and store them in `cache_file`.

    Every '*.tsv' file in `tsv_path` is loaded as a headerless, tab-separated
    table using `table_columns` as column names. Of each table only the
    'coverage_bp' column is kept; its rows are indexed by
    (region_set, chrom, start) and its single column is labelled with
    (sample, sex, platform, hap) as returned by extract_sample_info().
    Tables of the same region set are joined column-wise, and the per-region-set
    results are then stacked row-wise.

    The HDF5 file `cache_file` is (over)written with three entries:
    - 'cache': the merged coverage table
    - 'region_length': 'length' per (region_set, chrom, start)
    - 'region_count': distinct (region_set, number of table rows) pairs

    Returns None.
    """
    seen_lengths = set()
    seen_counts = set()
    tables_by_region = col.defaultdict(list)

    tsv_names = (name for name in os.listdir(tsv_path) if name.endswith('.tsv'))
    for tsv_name in tsv_names:
        sample, sex, platform, hap, region_set = extract_sample_info(tsv_name)
        coverage = pd.read_csv(
            os.path.join(tsv_path, tsv_name),
            sep='\t',
            header=None,
            names=table_columns
        )

        # Plain tuples in table_columns order: 0=chrom, 1=start, 5=length
        records = list(coverage.itertuples(index=False, name=None))

        seen_counts.add((region_set, coverage.shape[0]))
        seen_lengths.update(
            (region_set, rec[0], rec[1], rec[5]) for rec in records
        )

        coverage.index = pd.MultiIndex.from_tuples(
            [(region_set, rec[0], rec[1]) for rec in records],
            names=['region_set', 'chrom', 'start']
        )
        # Everything except 'coverage_bp' is discarded
        coverage = coverage.drop(
            columns=[
                'chrom',
                'start',
                'end',
                'num_overlaps',
                'length',
                'coverage_frac'
            ]
        )
        coverage.columns = pd.MultiIndex.from_tuples(
            [(sample, sex, platform, hap)],
            names=['sample', 'sex', 'platform', 'hap']
        )
        tables_by_region[region_set].append(coverage)

    # Samples side by side within a region set, region sets on top of each other
    merged = pd.concat(
        [
            pd.concat(region_tables, axis=1, ignore_index=False)
            for region_tables in tables_by_region.values()
        ],
        axis=0,
        ignore_index=False
    )

    count_table = pd.DataFrame(
        sorted(seen_counts),
        columns=['region_set', 'region_count']
    )

    key_columns = ['region_set', 'chrom', 'start']
    length_table = pd.DataFrame(
        sorted(seen_lengths),
        columns=key_columns + ['length']
    )
    length_table.index = pd.MultiIndex.from_tuples(
        length_table[key_columns].itertuples(index=False, name=None),
        names=['region_set', 'chrom', 'start']
    )
    length_table = length_table.drop(columns=key_columns)

    with pd.HDFStore(cache_file, 'w', complevel=5) as store:
        store.put('cache', merged, format='fixed')
        store.put('region_length', length_table, format='fixed')
        store.put('region_count', count_table, format='fixed')
        
        
# The cache is only built if it does not exist yet; an existing file is reused as-is.
if not os.path.isfile(cache_file):
    cache_region_coverage()

with pd.HDFStore(cache_file, 'r') as hdf:
    data = hdf['cache']
    lengths = hdf['region_length']
    counts = hdf['region_count']
print(counts)
row_indices = []
rows = []

# Two summary rows per region set: covered bp and covered percentage
for reg in region_desc.values():
    print(reg)
    # Column-wise sum = covered bp per (sample, sex, platform, hap) in this region set
    region_cov = data.xs(reg, level='region_set').sum(axis=0)
    rows.append(region_cov)
    row_indices.append((reg, 'bp_covered'))

    # Summing the length table yields a one-element Series. Pull the scalar out
    # with .iloc[0] before converting: int(Series) is deprecated in recent
    # pandas releases and is slated to raise a TypeError.
    genome_length = int(lengths.xs(reg, level='region_set').sum(axis=0).iloc[0])
    print(genome_length)
    region_cov_pct = ((region_cov / genome_length) * 100).round(2)
    rows.append(region_cov_pct)
    row_indices.append((reg, 'pct_covered'))

summary = pd.DataFrame(
    rows,
    index=pd.MultiIndex.from_tuples(
        row_indices,
        names=['region_set', 'statistic']
    )
)

# One row per (sample, sex, platform, hap), one column per (region_set, statistic)
summary = summary.transpose()

dump_tsv = '/home/local/work/data/hgsvc/aln_summary/genome_coverage.tsv'

summary.sort_index(level=['platform', 'sample', 'hap'], inplace=True)

summary.to_csv(
    dump_tsv,
    sep='\t',
    header=True,
    index=True
)

                    region_set  region_count
0              wg_noGiemPosVar           286
1        wg_noGiemPosVar_noGap           823
2   wg_noGiemPosVar_noGap_122X           508
3  wg_noGiemPosVar_noGap_122XY           561
4                 whole_genome           194
whole_genome
3099750718
wg_noGiemPosVar
2467823303
wg_noGiemPosVar_noGap
2432321108
wg_noGiemPosVar_noGap_122XY
2421332465
wg_noGiemPosVar_noGap_122X
2395484832
