In [3]:
import pandas as pd
import os as os
import collections as col

# Switches for the optional report sections at the bottom of this file:
# each `if print_PAV_...:` section only runs when its flag is True.
print_PAV_freebayes = False
print_PAV_hapcov = False

# HDF5 store opened (read-only) by prepare_pav_summary() to look up the
# freebayes calls per sample / platform / haplotype.
freebayes_calls = '/home/local/work/data/hgsvc/assembly_sv/freebayes_indels.h5'

# Columns of the PAV annotation table that load_discovery_class() keeps.
select_from_annotation = [
    'ID',
    'DISC_CLASS',
    'SVLEN',
    'SVTYPE',
    'HAP',
    'POS',
    'END',
    'MERGE_SAMPLES'
]

# HDF5 store holding the PAV annotation table (key 'PAV_v3'); used as the
# default input of load_discovery_class().
pav_annotation = '/home/local/work/data/hgsvc/assembly_sv/PAV_sv-insdel_v3.h5'
def load_discovery_class(hdfstore=pav_annotation):
    """Return the annotation rows whose 'quality' equals '0', indexed by ID.

    The table stored under key 'PAV_v3' in ``hdfstore`` is read, reduced to
    the rows with ``quality == '0'`` and to the columns listed in
    ``select_from_annotation``.  A 'sample' column is added that holds the
    leading run of upper-case letters / digits of 'MERGE_SAMPLES'.  The
    result is sorted by 'ID', and 'ID' becomes the index instead of a column.
    """
    with pd.HDFStore(hdfstore, 'r') as store:
        annotation = store['PAV_v3']

    quality_is_zero = annotation['quality'] == '0'
    selected = annotation.loc[quality_is_zero, select_from_annotation].copy()
    selected['sample'] = selected['MERGE_SAMPLES'].str.extract('(?P<sample>^[A-Z0-9]+)', expand=True)
    return selected.sort_values('ID').set_index('ID')


def extract_row_index(fname, basic=False):
    """Derive the labels encoded in a result file name.

    The sample is everything before the first underscore.  All other labels
    are detected by substring search in ``fname``:

    * platform: 'pbsq2-ccs' -> 'HiFi', 'pbsq2-clr' -> 'CLR'
    * hap:      'h1-un' -> 10, 'h2-un' -> 20
    * var_type: 'ins' -> 'INS', 'dels' -> 'DEL'             (only if not basic)
    * genotype: 'hom' -> 'HOM', 'het' -> 'HET'              (only if not basic)
    * location: 'out-hc' -> 'low_conf', 'in-hc' -> 'high_conf'  (only if not basic)

    Args:
        fname: file name (not a full path) to inspect.
        basic: if True, only 'sample', 'platform' and 'hap' are extracted.

    Returns:
        dict with the keys listed above, in that order.

    Raises:
        ValueError: if none of the expected tokens for a label occurs in
            ``fname``.
    """
    def pick(choices, what):
        # The first token found in the file name wins, so the order of
        # `choices` is the order in which the tokens are tested.
        for token, label in choices:
            if token in fname:
                return label
        expected = ', '.join(repr(token) for token, _ in choices)
        raise ValueError(
            'Cannot determine {} from file name {!r}: expected one of {}'.format(
                what, fname, expected
            )
        )

    d = {
        'sample': fname.split('_', 1)[0],
        'platform': pick((('pbsq2-ccs', 'HiFi'), ('pbsq2-clr', 'CLR')), 'platform'),
        'hap': pick((('h1-un', 10), ('h2-un', 20)), 'haplotype'),
    }
    if not basic:
        d['var_type'] = pick((('ins', 'INS'), ('dels', 'DEL')), 'variant type')
        d['genotype'] = pick((('hom', 'HOM'), ('het', 'HET')), 'genotype')
        d['location'] = pick((('out-hc', 'low_conf'), ('in-hc', 'high_conf')), 'location')
    return d
    

def parse_pav_overlaps(fpath):
    """Read a header-less, tab-separated overlap file and keep real overlaps.

    Only the file columns 3, 7 and 10 (0-based) are loaded; they are named
    'ID', 'contig' and 'overlap'.  Rows whose 'overlap' is not greater than
    zero are discarded.  The returned frame is an independent copy that keeps
    the original row labels.
    """
    table = pd.read_csv(
        fpath,
        sep='\t',
        header=None,
        usecols=[3, 7, 10],
        names=['ID', 'contig', 'overlap'],
    )
    has_overlap = table['overlap'] > 0
    return table[has_overlap].copy()


def prepare_pav_summary(bed_path, cache_path):
    """Summarise overlaps between freebayes calls and the selected PAV variants.

    Every file in ``bed_path`` is parsed with ``parse_pav_overlaps``; its name
    must carry all labels understood by ``extract_row_index``.  For each
    overlap whose PAV variant type equals the type encoded in the file name,
    one record is collected that describes whether the sample, the variant
    length and the overlap length agree.  All records are written as one
    table to ``cache_path`` under the HDF key 'PAV_v3_fb' (an existing file
    is overwritten).

    Args:
        bed_path: directory containing the overlap files.
        cache_path: path of the HDF5 file to create.

    Returns:
        None
    """
    dropped_sv = load_discovery_class()

    records = []
    with pd.HDFStore(freebayes_calls, 'r') as hdf:
        for bed_file in os.listdir(bed_path):
            h = extract_row_index(bed_file)
            # Read from the directory that was passed in, not from a
            # module-level variable that merely happens to have the same value.
            overlaps = parse_pav_overlaps(os.path.join(bed_path, bed_file))
            if overlaps.empty:
                # Nothing to report for this file, so skip the HDF lookup.
                continue

            # HDF5 keys are always '/'-separated, independent of the OS.
            fb_calls_key = '/'.join([h['sample'], h['platform'], 'HAP' + str(h['hap'])])
            fb_calls = hdf[fb_calls_key]
            # xs() expects a tuple (not a list) when several levels are selected.
            fb_calls = fb_calls.xs(
                (h['var_type'], h['genotype']),
                level=['var_type', 'genotype']
            )
            # Same "<sequence>_<start>_<stop>" name as in the 'contig' column
            # of the overlap file; kept as a separate Series so that the
            # stored calls are not modified.
            contig_names = (
                fb_calls['sequence'] + '_'
                + fb_calls['start'].astype(str) + '_'
                + fb_calls['stop'].astype(str)
            )

            for _, row in overlaps.iterrows():
                sv_data = dropped_sv.loc[row.ID]
                if sv_data['SVTYPE'] != h['var_type']:
                    continue
                if h['sample'] not in sv_data['MERGE_SAMPLES']:
                    sample_match = 'miss'
                else:
                    sample_match = 'match'
                # item() insists on exactly one matching freebayes call.
                src_var_len = int(
                    fb_calls.loc[contig_names == row['contig'], 'variant_length'].item()
                )
                pav_var_len = int(sv_data['SVLEN'])
                # NOTE(review): the original code had an additional
                # "lengths differ by exactly 1 -> raise ValueError" branch
                # that could never execute because it was only tested when
                # both lengths were equal.  It is dropped here, so results
                # are unchanged; re-introduce it deliberately if such cases
                # should abort the run.
                length_match = 'full' if src_var_len == pav_var_len else 'partial'
                if int(row.overlap) == pav_var_len:
                    ovl_match = 'full'
                else:
                    ovl_match = 'partial'
                record = dict(h)
                record['ID'] = row.ID
                record['sample_match'] = sample_match
                record['var_len_match'] = length_match
                record['ovl_len_match'] = ovl_match
                record['overlap'] = row.overlap
                record['fb_var_len'] = src_var_len
                record['pav_var_len'] = sv_data['SVLEN']
                record['pav_disc_class'] = sv_data['DISC_CLASS']
                records.append(record)
    df = pd.DataFrame.from_records(records)
    df.to_hdf(cache_path, key='PAV_v3_fb', mode='w', format='fixed', complevel=9)
    return None


def prepare_hapcov_summary(tab_path, cache_path):
    """Collect per-variant coverage values of all haplotype tables into one cache.

    Every file in ``tab_path`` ending in 'h1-un.tab' or 'h2-un.tab' is read
    (tab-separated, no header; file columns 0, 1, 2 and 4 are used as 'ID',
    'length', 'covered' and 'cov_dp').  Per file, two value columns are kept:

    * 'depth': the 'cov_dp' column as read from the file
    * 'ratio': 100 * covered / length, rounded to two decimals

    Rows are indexed by (ID, SVTYPE, sample) of the variant, columns by
    (sample, platform, hap, cov) of the file; the labels of the file are
    taken from the part of its name that follows '_AVG_'.  The column-wise
    concatenation of all files is written to ``cache_path`` under the HDF key
    'PAV_v3_cov' (an existing file is overwritten).

    Args:
        tab_path: directory containing the coverage tables.
        cache_path: path of the HDF5 file to create.

    Returns:
        None

    Raises:
        ValueError: if ``tab_path`` contains no matching table.
    """
    dropped_sv = load_discovery_class()
    sv_info = dropped_sv[['SVTYPE', 'sample']]

    frames = []
    for tab_file in os.listdir(tab_path):
        if not tab_file.endswith(('h1-un.tab', 'h2-un.tab')):
            continue
        header = extract_row_index(tab_file.split('_AVG_')[1], basic=True)
        df = pd.read_csv(
            os.path.join(tab_path, tab_file),
            sep='\t',
            names=['ID', 'length', 'covered', 'cov_dp'],
            usecols=[0, 1, 2, 4]
        ).set_index('ID')
        df['cov_bp_pct'] = ((df['covered'] / df['length']) * 100).round(2)

        # Outer join on the variant ID.  The ID is kept in the index, so
        # variants that are missing from this file still carry their real ID
        # (instead of NaN) in the row index built below.
        df = pd.concat([df, sv_info], axis=1, ignore_index=False)
        df.index.name = 'ID'
        df = df.set_index(['SVTYPE', 'sample'], append=True)

        # Select the value columns by name so that the positional relabelling
        # below cannot silently attach the wrong label.
        df = df[['cov_dp', 'cov_bp_pct']]
        df.columns = pd.MultiIndex.from_tuples([
            (header['sample'], header['platform'], header['hap'], 'depth'),
            (header['sample'], header['platform'], header['hap'], 'ratio'),
        ], names=['sample', 'platform', 'hap', 'cov'])
        frames.append(df)

    if not frames:
        raise ValueError(
            'No *h1-un.tab / *h2-un.tab files found in {}'.format(tab_path)
        )
    frames = pd.concat(frames, axis=1, ignore_index=False)
    frames.to_hdf(cache_path, key='PAV_v3_cov', mode='w', format='fixed', complevel=9)
    return None


def prepare_hapassm_summary(tsv_path, cache_path):
    """Collect per-variant assembly alignment overlaps into one cache.

    Every '*.tsv' file in ``tsv_path`` is read (tab-separated, no header;
    file columns 3 and 7 are used as 'ID' and 'overlap').  Per file and
    variant ID, three values are derived:

    * 'num_aln': number of rows with that ID (forced to 0 if 'bp_aln' is 0)
    * 'bp_aln':  sum of 'overlap' over those rows
    * 'ratio':   for SVTYPE 'INS' 100 if 'bp_aln' > 0, else 0;
                 for SVTYPE 'DEL' 100 * bp_aln / SVLEN, rounded to two decimals

    Rows are indexed by (ID, SVTYPE, sample) of the variant, columns by
    (sample, platform, hap, cov) of the file; the labels of the file are
    taken from the part of its name that follows '_OVL_'.  The column-wise
    concatenation of all files is written to ``cache_path`` under the HDF key
    'PAV_v3_assm' (an existing file is overwritten).

    Args:
        tsv_path: directory containing the overlap tables.
        cache_path: path of the HDF5 file to create.

    Returns:
        None

    Raises:
        ValueError: if ``tsv_path`` contains no '*.tsv' file.
    """
    dropped_sv = load_discovery_class()
    sv_info = dropped_sv[['SVTYPE', 'SVLEN', 'sample']]

    frames = []
    for tsv_file in os.listdir(tsv_path):
        if not tsv_file.endswith('.tsv'):
            continue
        header = extract_row_index(tsv_file.split('_OVL_')[1], basic=True)
        df = pd.read_csv(
            os.path.join(tsv_path, tsv_file),
            sep='\t',
            names=['ID', 'overlap'],
            usecols=[3, 7],
            dtype={'ID': str, 'overlap': int}
        )
        num_aln = df['ID'].value_counts()
        bp_aln = df.groupby('ID')['overlap'].sum()
        # One row per ID; the per-ID statistics are aligned on the ID index.
        df = df.drop_duplicates('ID').set_index('ID')
        df['num_aln'] = num_aln
        df['bp_aln'] = bp_aln
        df.loc[df['bp_aln'] == 0, 'num_aln'] = 0
        df = df.drop('overlap', axis=1)

        # Outer join on the variant ID.  The ID is kept in the index, so
        # variants that are missing from this file still carry their real ID
        # (instead of NaN) in the row index built below.
        df = pd.concat([df, sv_info], axis=1, ignore_index=False)
        df.index.name = 'ID'
        # Fails on purpose if the file contains an ID without annotation.
        df['SVLEN'] = df['SVLEN'].astype('int64')

        # Float from the start: fractional values are assigned below, and
        # writing them into an integer column is an incompatible-dtype
        # assignment that newer pandas versions reject.
        df['aln_bp_pct'] = 0.0
        select_ins = df['SVTYPE'] == 'INS'
        select_ovl = df['bp_aln'] > 0
        df.loc[(select_ins & select_ovl), 'aln_bp_pct'] = 1.0
        select_del = df['SVTYPE'] == 'DEL'
        # NOTE(review): presumably SVLEN is stored as a positive length for
        # deletions; verify against the annotation table.
        df.loc[select_del, 'aln_bp_pct'] = df.loc[select_del, 'bp_aln'] / df.loc[select_del, 'SVLEN']
        df['aln_bp_pct'] = (df['aln_bp_pct'] * 100).round(2)

        df = df.set_index(['SVTYPE', 'sample'], append=True)
        # Select the value columns by name so that the positional relabelling
        # below cannot silently attach the wrong label.
        df = df[['num_aln', 'bp_aln', 'aln_bp_pct']]
        df.columns = pd.MultiIndex.from_tuples([
            (header['sample'], header['platform'], header['hap'], 'num_aln'),
            (header['sample'], header['platform'], header['hap'], 'bp_aln'),
            (header['sample'], header['platform'], header['hap'], 'ratio'),
        ], names=['sample', 'platform', 'hap', 'cov'])
        frames.append(df)

    if not frames:
        raise ValueError('No *.tsv files found in {}'.format(tsv_path))
    frames = pd.concat(frames, axis=1, ignore_index=False)
    frames.to_hdf(cache_path, key='PAV_v3_assm', mode='w', format='fixed', complevel=9)
    return None


# Optional report: summary counts of the PAV / freebayes overlap table.
if print_PAV_freebayes:
    path = '/home/local/work/data/hgsvc/assembly_sv/65-PAV-GRCh38_HGSVC2_noalt'
    cache_path = '/home/local/work/data/hgsvc/assembly_sv/cache.PAV_sv-insdel_v3.freebayes.h5'
    # Build the cache only if it does not exist yet; otherwise reuse it.
    if not os.path.isfile(cache_path):
        prepare_pav_summary(path, cache_path)
    df = pd.read_hdf(cache_path, 'PAV_v3_fb')
    # Number of overlap records and number of distinct variant IDs.
    print(df.shape)
    print(df['ID'].unique().shape)

    # Same two numbers after keeping only records whose freebayes variant
    # length is greater than 1.
    df = df.loc[df['fb_var_len'] > 1, :].copy()
    print(df.shape)
    print(df['ID'].unique().shape)

    # Frequency tables of the categorical summary columns.
    print(df['pav_disc_class'].value_counts())
    print(df['sample_match'].value_counts())
    print(df['var_len_match'].value_counts())
    print(df['location'].value_counts())
    print(df['ovl_len_match'].value_counts())

# Optional report: count variants with low read coverage, per variant type
# and platform, using the table cached by prepare_hapcov_summary().
if print_PAV_hapcov:
    path = '/home/local/work/data/hgsvc/assembly_sv/hap_read_coverage'
    cache_path = '/home/local/work/data/hgsvc/assembly_sv/cache.PAV_sv-insdel_v3.hapcov.h5'
    # Build the cache only if it does not exist yet; otherwise reuse it.
    if not os.path.isfile(cache_path):
        prepare_hapcov_summary(path, cache_path)
    df = pd.read_hdf(cache_path, 'PAV_v3_cov')
    stats = col.Counter()
    for s in df.index.get_level_values('sample').unique():
        # Rows: variants annotated with sample s.
        rows = df.xs(s, level='sample', axis=0)
        try:
            # Columns: values computed from the files of the same sample s.
            cols = rows.xs(s, level='sample', axis=1)
        except KeyError:
            # No coverage columns exist for this sample.
            print('Skipping sample {}'.format(s))
            continue
        # Insertions are judged by the 'depth' values, deletions by the
        # 'ratio' values (percentage of the variant length that is covered).
        var_ins = cols.xs('INS', level='SVTYPE').xs('depth', level='cov', axis=1)
        var_del = cols.xs('DEL', level='SVTYPE').xs('ratio', level='cov', axis=1)
        # (platform, depth threshold) pairs applied to the insertions.
        for p, t in [('HiFi', 10), ('CLR', 30)]:
            try:
                tmp = var_ins.xs(p, level='platform', axis=1)
                # Count a variant if any remaining (haplotype) column is
                # below the threshold.
                stats[('INS', p, 'low_cov')] += (tmp < t).any(axis=1).sum()
            except KeyError:
                # This platform is not available for the sample.
                pass
            try:
                tmp = var_del.xs(p, level='platform', axis=1)
                # Deletions: less than 50 percent covered in any column.
                stats[('DEL', p, 'low_cov')] += (tmp < 50).any(axis=1).sum()
            except KeyError:
                # This platform is not available for the sample.
                pass
    print(stats)

# Report (always executed): count variants without or with only partial
# support in the haplotype assemblies, per variant type and platform, using
# the table cached by prepare_hapassm_summary().
if True:
    path = '/home/local/work/data/hgsvc/assembly_sv/hap_assm_coverage'
    cache_path = '/home/local/work/data/hgsvc/assembly_sv/cache.PAV_sv-insdel_v3.hapassm.h5'
    # Build the cache only if it does not exist yet; otherwise reuse it.
    if not os.path.isfile(cache_path):
        prepare_hapassm_summary(path, cache_path)
    df = pd.read_hdf(cache_path, 'PAV_v3_assm')
    stats = col.Counter()
    for s in df.index.get_level_values('sample').unique():
        # Rows: variants annotated with sample s.
        rows = df.xs(s, level='sample', axis=0)
        try:
            # Columns: values computed from the files of the same sample s.
            cols = rows.xs(s, level='sample', axis=1)
        except KeyError:
            # No assembly columns exist for this sample.
            print('Skipping sample {}'.format(s))
            continue
        var_ins = cols.xs('INS', level='SVTYPE').xs('ratio', level='cov', axis=1)
        for p in ['HiFi', 'CLR']:
            try:
                tmp = var_ins.xs(p, level='platform', axis=1)
            except KeyError:
                # This platform is not available for the sample.
                continue
            # missing: ratio is 0 in every remaining (haplotype) column;
            # partial: ratio is 0 in at least one, but not in all columns.
            missing = (tmp == 0).all(axis=1).sum()
            partial = (tmp == 0).any(axis=1).sum()
            partial -= missing
            # Counted per platform and, without the platform in the key,
            # summed over both platforms.
            stats[('INS', p, 'missing')] += missing
            stats[('INS', 'missing')] += missing
            stats[('INS', p, 'partial')] += partial
            stats[('INS', 'partial')] += partial

        var_del = cols.xs('DEL', level='SVTYPE').xs('ratio', level='cov', axis=1)
        for p in ['HiFi', 'CLR']:
            try:
                tmp = var_del.xs(p, level='platform', axis=1)
            except KeyError:
                # This platform is not available for the sample.
                continue
            # missing / partial as for the insertions above.
            missing = (tmp == 0).all(axis=1).sum()
            partial = (tmp == 0).any(axis=1).sum()
            partial -= missing
            # low_cov: ratio below 50 in every column, excluding the
            # variants already counted as missing.
            low_cov = (tmp < 50).all(axis=1).sum()
            low_cov -= missing
            stats[('DEL', p, 'missing')] += missing
            stats[('DEL', 'missing')] += missing
            stats[('DEL', p, 'partial')] += partial
            stats[('DEL', 'partial')] += partial
            stats[('DEL', p, 'low_cov')] += low_cov
            stats[('DEL', 'low_cov')] += low_cov
    # Print the counters in sorted key order.
    for k in sorted(stats.keys()):
        print(k, ': ', stats[k])


('DEL', 'CLR', 'low_cov') :  19
('DEL', 'CLR', 'missing') :  567
('DEL', 'CLR', 'partial') :  792
('DEL', 'HiFi', 'low_cov') :  17
('DEL', 'HiFi', 'missing') :  239
('DEL', 'HiFi', 'partial') :  524
('DEL', 'low_cov') :  36
('DEL', 'missing') :  806
('DEL', 'partial') :  1316
('INS', 'CLR', 'missing') :  731
('INS', 'CLR', 'partial') :  1145
('INS', 'HiFi', 'missing') :  267
('INS', 'HiFi', 'partial') :  363
('INS', 'missing') :  998
('INS', 'partial') :  1508
