In [1]:

import os as os
import re as re
import gzip as gz

# Root directory holding the reference annotation files for this project.
folder = '/TL/deep/fhgfs/projects/pebert/thesis/projects/statediff/references'

# Input: gzip-compressed GFF below <folder>/raw.
# Output: plain-text BED written directly into <folder>.
inputfile = os.path.join(folder, 'raw', 'EnsRB_v78_hg38_multi.gff.gz')
outputfile = os.path.join(folder, 'EnsRB_v78_hg38_multi_1-22X.bed')

# Maps the feature type found in the GFF (3rd column) to the short label
# written to the last column of the BED output.
feat_map = {
    'TF_binding_site': 'tfbs',
    'enhancer': 'enhancer',
    'CTCF_binding_site': 'ctcf',
    'open_chromatin_region': 'open',
    'promoter': 'promoter',
    'promoter_flanking_region': 'flanking',
}

# Accept only sequence names of the form "chr" followed by one or more
# characters from [0-9X] (e.g. chr1 ... chr22, chrX); every other sequence
# name is dropped. Raw string: "\s" is not a valid str escape sequence.
chrom_re = re.compile(r'chr[0-9X]+(\s|$)')

# Collects one tuple of strings per kept feature:
# (chrom, start, end, id, length, strand placeholder, feature label)
buffer = []
with gz.open(inputfile, 'rt') as gff:
    for line in gff:
        # Lines yielded by a file iterator keep their newline and are never
        # falsy, so test the stripped text instead. Blank lines and "#"
        # lines (GFF directives/comments) have no feature columns to index.
        record = line.strip()
        if not record or record.startswith('#'):
            continue
        # Stripping first also keeps the trailing newline out of the
        # attribute column, which is everything after the 8th field.
        parts = record.split(maxsplit=8)
        chrom = parts[0]
        if chrom_re.match(chrom) is None:
            continue
        if len(parts) < 9:
            raise ValueError('Malformed GFF line: {}'.format(record))
        regtype, start, end = parts[2], parts[3], parts[4]
        # A feature type that is missing from feat_map raises KeyError.
        regtype = feat_map[regtype]
        for attr in parts[8].split(';'):
            if attr.startswith('ID='):
                # Split once only, so an ID containing '=' stays intact.
                regid = attr.split('=', 1)[1]
                break
        else:
            # No attribute started with "ID=".
            raise ValueError('No ID: {}'.format(record))
        # NOTE(review): start/end are copied unchanged from the GFF columns.
        # GFF coordinates are normally 1-based inclusive while BED is
        # 0-based half-open - confirm whether start should be shifted by -1
        # (which would also change the length column).
        buffer.append((chrom, start, end, regid, str(int(end) - int(start)), '.', regtype))

# Order by chromosome name (plain string comparison), then numerically by
# start and end coordinate.
buffer = sorted(buffer, key=lambda x: (x[0], int(x[1]), int(x[2])))

# Column names of the output table; the first one carries a leading '#'.
out_header = ['#chrom', 'start', 'end', 'name', 'length', 'strand', 'feature']
with open(outputfile, 'w') as dump:
    # Collapse each record tuple into one tab-separated row. The trailing
    # empty string makes the newline-join below end the file with '\n'.
    buffer = ['\t'.join(row) for row in buffer] + ['']
    # Assigning to "_" discards the character counts returned by write().
    _ = dump.write('\t'.join(out_header) + '\n')
    _ = dump.write('\n'.join(buffer))
