In [3]:
import os as os
import pandas as pd

"""
What does this do?
Convert Bionano Supp. Tables to regular BED format for intersect operations
Is a simple prerequisite for cross-checking how many Bionano SV calls overlap
with Illumina SV calls (missed by PAV)

NB for all Bionano annotations: Bionano's cross-sample merging strategy leads to
the peculiarity that two clusters can have identical genomic coordinates, but are
considered disjoint by Bionano. Intersect operations with other region set annotations
have to take that into account when counting overlaps.
"""

# Folder holding the Bionano supplementary tables (TSV input, BED output).
path = '/home/local/work/data/hgsvc/bng_tables'

# Per-table list of columns (in output order) that are written to the BED file.
# All tables share the same leading columns; some append table-specific extras.
bed_columns = {
    table_name: [
        'Chr',
        'Start',
        'End',
        'ClusterID',
        'Type',
        'ClusterSVsize',
        'CleanNumofSamples',
        'SegDupOverlap',
    ] + extra_columns
    for table_name, extra_columns in (
        ('Table_S13-2_Bionano_high_confidence_regions_SVs_5kbp_clusters.tsv', []),
        ('Table_S13-4_Bionano_unique_clusters.tsv', []),
        (
            'Table_S13-3_Bionano_clusters_with_atleast1_PAVnonoverlapping_call.tsv',
            ['NClustAtRegion'],
        ),
    )
}

# Convert the selected Bionano supplementary tables (TSV) into BED-like files
# written next to the input (same name, '.bed' suffix). For every processed
# table a short summary (head, row count, SV type counts) is printed.
# Sorted so the printed log is reproducible across runs/file systems.
for table in sorted(os.listdir(path)):
    if not table.endswith('.tsv'):
        continue
    # Only tables S13-3 and S13-4 are handled in this cell.
    if not ('13-4' in table or '13-3' in table):
        continue
    print(table)
    table_path = os.path.join(path, table)

    # Input layout: '##' lines are free-text comments, the single '#' line
    # is the tab-separated column header, everything else is a data row.
    header = None
    rows = []
    with open(table_path, 'r') as table_file:
        for line in table_file:
            if not line.strip():
                # blank lines would otherwise end up as bogus one-field rows
                continue
            if line.startswith('##'):
                continue
            # Remove only the line terminator: a plain strip() would also eat
            # leading/trailing tabs and thereby drop empty first/last fields,
            # shifting or truncating the row relative to the header.
            record = line.rstrip('\r\n')
            if record.startswith('#'):
                # lstrip: a '#' at the end of the last column name is kept
                header = record.lstrip('#').split('\t')
            else:
                rows.append(record.split('\t'))
    if header is None:
        # Fail early and clearly instead of a KeyError on df['Type'] below.
        raise ValueError(
            'No header line (single leading "#") found in {}'.format(table_path)
        )

    # All values stay strings; no numeric conversion is needed for the dump.
    df = pd.DataFrame(
        rows,
        columns=header
    )
    print(df.head())
    print(df.shape[0])
    print(df['Type'].value_counts())
    print('========')

    # Tables without a configured BED column layout are summarized only.
    dump_columns = bed_columns.get(table)
    if dump_columns is None:
        continue
    out_path = os.path.join(path, table.replace('.tsv', '.bed'))
    # NOTE(review): if the tables encode X/Y numerically (e.g. 23/24), this
    # produces 'chr23'/'chr24' rather than 'chrX'/'chrY' - confirm against
    # the naming used by the call sets these files are intersected with.
    df['Chr'] = 'chr' + df['Chr']
    with open(out_path, 'w') as bed_file:
        # Prefix the header row with '#' so BED tools treat it as a comment.
        _ = bed_file.write('#')
        df[dump_columns].to_csv(bed_file, sep='\t', index=False, header=True)
    

Table_S13-4_Bionano_unique_clusters.tsv
  ClusterID       Type Chr    Start      End ClusterSVsize CleanNumofSamples  \
0       180   deletion   1    40859   692619        650347                 1   
1       137  insertion   1   390079   401006          5940                 2   
2       304   deletion   1   628728   635883          5921                 3   
3       274  insertion   1   857960  1001847          8510                 1   
4        78  insertion   1  1244412  1436832         11008                 1   

  InsCls DelCls NClustAtRegion InsMem DelMem NMemAtRegion SegDupOverlap  \
0      1      2              3      2      4            6        992797   
1      1      1              2      2      1            3         32781   
2      0      2              2      0      4            4          7155   
3      2      0              2      2      0            2         44948   
4      3      0              3     17      0           17             0   

                            