In [61]:
import os as os
import pandas as pd
import re as re

path = '/home/peter/work/temp/data/kmer'

tables = ['HG00733.1kg_il25k-125pe_short.hgsvc_pbsq2-clr_1000-flye.arrow-p1.ENSEMBLv98_RegBuild.99.tsv']

split_query = r'(?P<region>[A-Z]+)_(?P<ENSID>ENSR[0-9]+)::(?P<chrom>[A-Za-z0-9_]+):(?P<start>[0-9]+)\-(?P<end>[0-9]+)'

def derive_label(row):
    if not row['hg38']:
        return 'ref_unknown'
    if (row['short1'] or row['short2']) and (row['hap1'] and row['hap2']):
        return 'present'
    elif (row['hap1'] or row['hap2']) and not (row['short1'] and row['short2']):
        return 'non_illumina'
    elif row['hap1'] and not row['hap2'] and (row['short1'] or row['short2']):
        return 'hap_present'
    elif not row['hap1'] and row['hap2'] and (row['short1'] or row['short2']):
        return 'hap_present'
    elif not (row['hap1'] and row['hap2']) and (row['short1'] or row['short2']):
        return 'missing'
    else:
        return 'ref_only'

for tsv in tables:
    tsv_path = os.path.join(path, tsv)
    df = pd.read_csv(tsv_path, sep='\t', names=['query', 'short1', 'short2', 'hap1', 'hap2', 'hg38'],
                     header=0, skiprows=0)
    location = df['query'].str.extract(split_query, expand=True)
    location['start'] = location['start'].astype(int)
    location['end'] = location['end'].astype(int)
    df = pd.concat([df, location], axis=1)
    df['length'] = df['end'] - df['start']
    region_bp = df.groupby(['region'])['length'].sum()
    
    df['label'] = df[['short1', 'short2', 'hap1', 'hap2', 'hg38']].apply(derive_label, axis=1, raw=False)
    res_bp = df.groupby(['region', 'label'])['length'].sum()
    res_bp.names = ['length_bp']
    res_pct = (res_bp.divide(region_bp) * 100).round(3)
    res_pct.names = ['percent_bp']
    print(res_pct)
    res = pd.concat([res_pct, res_bp], axis=1)
    print(res)

region  label       
CTCF    hap_present      5.912
        missing          1.093
        non_illumina     0.449
        present         89.475
        ref_only         3.051
        ref_unknown      0.020
ENH     hap_present      5.279
        missing          0.641
        non_illumina     0.129
        present         91.158
        ref_only         2.788
        ref_unknown      0.005
OPEN    hap_present      4.110
        missing          0.813
        non_illumina     0.190
        present         92.950
        ref_only         1.928
        ref_unknown      0.009
PRFL    hap_present      5.870
        missing          0.803
        non_illumina     0.302
        present         90.297
        ref_only         2.718
        ref_unknown      0.010
PROM    hap_present      5.573
        missing          1.065
        non_illumina     2.020
        present         88.222
        ref_only         3.094
        ref_unknown      0.026
TFBS    hap_present      4.917
        missing   