In [130]:
import pandas as pd
import os
import collections as col

# Input table: directory and file name are kept separate because `name` is
# also written into the statistics file further down.
path = '/home/local/work/pipeline/run_folder/output/evaluation/break_analysis/tables'
name = 'GRCh38_HGSVC2_noalt.BNGuniqDEL-ro80.h5'
file_path = os.path.join(path, name)

df = pd.read_hdf(file_path)

# Three-part column keys of `df`; each selected column is compared against
# `threshold`.
# NOTE(review): the 'pct' level presumably means a fraction/percentage of the
# region covered by the annotation -- confirm the scale against the producer.
n_gaps = ('nucprof', 'pct', 'nucN')
dropped_sv = ('DELnobngV3', 'pct', 'coverage')
issues = ('issues', 'pct', 'coverage')
inversions = ('INVv3', 'pct', 'coverage')
breaks = ('H64breaks', 'pct', 'coverage')

# Order matters: a region claimed by an earlier selector is not counted
# again for a later one.
selectors = [n_gaps, issues, dropped_sv, inversions, breaks]

threshold = 0.1
total_regions = len(df.index)
print('total ', total_regions)
print('=============')

# selector -> set of index labels assigned to that selector (after priority
# filtering); counts are keyed by (selector, 'raw' | 'prio_filtered') plus a
# single 'total' entry.
affected_regions = col.defaultdict(set)
affected_count = col.Counter()

# Work on a copy so the indicator columns added below do not end up in `df`.
dump = df.copy()

for selector in selectors:
    # Rows whose value in the selected column exceeds the threshold.
    exceeds = df.loc[:, selector] > threshold
    flagged = df.loc[exceeds, selector]
    affected_count[(selector, 'raw')] += len(flagged)

    # Priority filter: remove every region that an earlier selector has
    # already claimed (no-op for the first selector).
    unclaimed = set(flagged.index.tolist()).difference(*affected_regions.values())
    n_unclaimed = len(unclaimed)
    affected_count[(selector, 'prio_filtered')] += n_unclaimed
    affected_regions[selector] = unclaimed

    # The 'H64breaks' selector is reported separately and therefore kept out
    # of the overall affected total.
    if 'H64breaks' not in selector:
        affected_count['total'] += n_unclaimed

    # Indicator column name follows the selector's first level, except that
    # 'nucprof' is published as 'nucN'.
    source_label = 'nucN' if selector[0] == 'nucprof' else selector[0]
    ovl_column = ('BNG', 'overlap', source_label)

    # 0/1 flag: 1 for regions assigned to this selector, 0 otherwise.
    dump[ovl_column] = 0
    assigned_rows = dump.index.isin(unclaimed)
    dump.loc[assigned_rows, ovl_column] = 1

# Reduce the copy to the overlap indicator columns: every column key that
# does not contain the label 'BNG' is dropped.
non_bng_columns = [c for c in dump.columns if 'BNG' not in c]
dump.drop(non_bng_columns, axis=1, inplace=True)
# Flatten the multi-part column keys into single underscore-joined names.
dump.columns = ['_'.join(c) for c in dump.columns]
# Move the index into regular column(s) so it is written with the table.
dump.reset_index(drop=False, inplace=True)

# Swap only the trailing file extension. A plain str.replace('.h5', ...)
# would also rewrite '.h5' inside directory names and, for an input without
# that extension, leave the path unchanged so the write below would
# overwrite the input table.
simple_table = os.path.splitext(file_path)[0] + '.reduced.tsv'
dump.to_csv(
    simple_table,
    sep='\t',
    header=True,
    index=False,
)

# Swap only the trailing file extension; str.replace('.h5', ...) would touch
# every '.h5' occurrence in the path and could leave the path unchanged
# (overwriting the input) when the extension differs.
dump_stats = os.path.splitext(file_path)[0] + '.filter-stats.txt'

# Tab-separated key/value summary: three header rows, then one section per
# selector (name line, raw count, priority-filtered count and percentage).
# The same numbers are echoed to stdout.
with open(dump_stats, 'w') as dump_file:
    dump_file.write('Region_set\t' + name + '\n')
    dump_file.write('Total_count\t{}\n'.format(dump.shape[0]))
    dump_file.write('Affected_count\t{}\n'.format(affected_count['total']))
    for selector in selectors:
        # The 'H64breaks' section is visually set apart with a '=== ' prefix.
        if selector[0] == 'H64breaks':
            dump_file.write('=== ' + selector[0] + '\n')
        else:
            dump_file.write(selector[0] + '\n')
        print(selector)

        raw = affected_count[(selector, 'raw')]
        print('Raw ', raw)
        dump_file.write('Raw_count\t{}\n'.format(raw))

        filtered = affected_count[(selector, 'prio_filtered')]
        dump_file.write('PrioFiltered_count\t{}\n'.format(filtered))
        print('Filtered ', filtered)

        # Share of all regions assigned to this selector; an empty table
        # yields 0.0 instead of raising ZeroDivisionError mid-write.
        if total_regions:
            pct = round(filtered / total_regions * 100, 2)
        else:
            pct = 0.0
        dump_file.write('PrioFiltered_pct\t{}\n'.format(pct))
        print(pct, '%')
        print('============')



total  421
('nucprof', 'pct', 'nucN')
Raw  87
Filtered  87
20.67 %
('issues', 'pct', 'coverage')
Raw  80
Filtered  47
11.16 %
('DELnobngV3', 'pct', 'coverage')
Raw  100
Filtered  64
15.2 %
('INVv3', 'pct', 'coverage')
Raw  137
Filtered  67
15.91 %
('H64breaks', 'pct', 'coverage')
Raw  377
Filtered  128
30.4 %
