# In [1]:
import os as os
import pandas as pd
import re as re

# Directory that is scanned for input tables (*.tsv). The summary file
# produced at the end of this script is written into the same directory.
path = '/home/pebert/work/data/kmer_tables'

# Input tables to summarise.
# - 'summary_stats.tsv' is this script's own output; it sits in the same
#   directory and also ends in '.tsv', so it is excluded here. Otherwise a
#   second run would treat it as an input table and fail while parsing its
#   file name.
# - os.listdir() returns entries in arbitrary order; sorting makes the
#   processing order (and thus the column order of the summary) reproducible.
tables = sorted(
    f for f in os.listdir(path)
    if f.endswith('.tsv') and f != 'summary_stats.tsv'
)

# Splits a query identifier of the form
#   <region>_<ENSR id>::<chrom>:<start>-<end>
# into the named groups region, ENSID, chrom, start and end.
split_query = r'(?P<region>[A-Z]+)_(?P<ENSID>ENSR[0-9]+)::(?P<chrom>[A-Za-z0-9_]+):(?P<start>[0-9]+)\-(?P<end>[0-9]+)'

def derive_label(row):
    """Classify one table row by which of its presence columns are set.

    ``row`` is indexed with the keys 'hg38', 'short1', 'short2', 'hap1' and
    'hap2'; every value is only tested for truthiness.

    Returns one of these labels (rules are checked top to bottom):

    - 'ref_unknown':  'hg38' is falsy
    - 'present':      both hap columns and at least one short column are set
    - 'non_illumina': at least one hap column is set, but fewer than two
                      short columns are
    - 'hap_present':  exactly one hap column and both short columns are set
    - 'missing':      no hap column is set, at least one short column is
    - 'ref_only':     neither a hap nor a short column is set
    """
    if not row['hg38']:
        return 'ref_unknown'

    # Reduce the four flags to two counts; the rules below only depend on
    # how many hap / short columns are set, not on which ones.
    n_short = int(bool(row['short1'])) + int(bool(row['short2']))
    n_hap = int(bool(row['hap1'])) + int(bool(row['hap2']))

    if n_hap == 2 and n_short >= 1:
        return 'present'
    # NOTE(review): a single hap column backed by only one short column ends
    # up here rather than in 'hap_present' -- confirm this is intended.
    if n_hap >= 1 and n_short < 2:
        return 'non_illumina'
    if n_hap == 1:
        # Only reachable with both short columns set.
        return 'hap_present'
    if n_short >= 1:
        # Only reachable with no hap column set.
        return 'missing'
    return 'ref_only'

# For every input table, collect two Series indexed by (region, label):
# the summed interval length in bp, and that sum as a percentage of the
# region's total length. They are merged column-wise after the loop.
summary_stats = []
for tsv in tables:
    # Both identifiers are taken from the file name: read_type is the last
    # '-'-separated piece of the 4th '_'-separated field, epsilon is the
    # '.'-separated field directly in front of the file extension.
    read_type = tsv.split('_')[3].split('-')[-1]
    epsilon = tsv.split('.')[-2]
    tsv_path = os.path.join(path, tsv)
    # header=0 combined with names= drops the file's own header line and
    # uses the column names given here instead.
    df = pd.read_csv(tsv_path, sep='\t', names=['query', 'short1', 'short2', 'hap1', 'hap2', 'hg38'],
                     header=0, skiprows=0)
    # Break the query identifier up into region / ENSID / chrom / start / end.
    location = df['query'].str.extract(split_query, expand=True)
    location['start'] = location['start'].astype(int)
    location['end'] = location['end'].astype(int)
    df = pd.concat([df, location], axis=1)
    df['length'] = df['end'] - df['start']
    # Total length per region; denominator for the percentages below.
    region_bp = df.groupby(['region'])['length'].sum()

    df['label'] = df[['short1', 'short2', 'hap1', 'hap2', 'hg38']].apply(derive_label, axis=1, raw=False)
    res_bp = df.groupby(['region', 'label'])['length'].sum()
    res_bp = res_bp.rename('length_bp_{}_{}'.format(read_type, epsilon))

    # res_bp is indexed by (region, label) while region_bp is indexed by
    # region only; level='region' states explicitly that the totals are
    # broadcast across the labels of each region.
    res_pct = (res_bp.divide(region_bp, level='region') * 100).round(3)
    res_pct = res_pct.rename('length_pct_{}_{}'.format(read_type, epsilon))

    summary_stats.extend([res_bp, res_pct])

# One column per collected Series, aligned on the (region, label) index;
# combinations that do not occur in a given table are left as NaN.
summary_stats = pd.concat(summary_stats, axis=1)

stats_path = os.path.join(path, 'summary_stats.tsv')

# pandas expects a file object handed to to_csv() to be opened with
# newline='' so that line endings are not translated a second time.
with open(stats_path, 'w', newline='') as dump:
    summary_stats.to_csv(dump, sep='\t')
