# In [10]:

import os as os
import re as re
import csv as csv

import intervaltree as ivt
import numpy as np
import pandas as pd

# What does this do?
# Clean the GeneHancer annotation:
# keep only enhancers on chromosomes 1-22 and X,
# and remove all links to genes that are not
# included in the Gencode annotation.
# Then generate the following files:
# - complete set: one entry for each enhancer/gene combination
#   (written as BED and additionally as HDF5)
# - intergenic: all enhancers not overlapping pc gene bodies
# - intragenic: all enhancers overlapping pc gene bodies
# - other: enhancers without any link to a gene in the Gencode annotation

# Base directory of the project data; all inputs/outputs live below it.
fhgfs_root = '/TL/deep/fhgfs/projects/pebert/thesis'

# Input: raw GeneHancer enhancer table (parsed further below as tab-separated).
raw_enhancers = os.path.join(
    fhgfs_root, 'refdata', 'enhancer', 'bed_format', 'hg38_enh_genehancer.bed'
)
# Input: Gencode gene table; its coordinates are gene bodies.
gencode_genes = os.path.join(
    fhgfs_root, 'projects', 'statediff', 'references', 'gencode_v21_pcg_bglist.bed'
)

genes = pd.read_csv(gencode_genes, sep='\t', header=0)

# Two-way lookup between the "name" column (presumably Ensembl gene IDs -
# confirm against the input file) and the gene symbol.
# For duplicated keys the last occurrence in the table wins.
known_genes = dict(zip(genes['name'], genes['symbol']))
known_symbols = dict(zip(genes['symbol'], genes['name']))

# Build one interval tree of gene bodies per chromosome.
# IntervalTree intervals are half-open (upper bound excluded),
# which matches the BED coordinate convention, so the gene
# coordinates can be inserted without any adjustment.
chrom_trees = {}
for chrom_name in genes['#chrom'].unique():
    on_chrom = genes['#chrom'] == chrom_name
    gene_bounds = zip(genes.loc[on_chrom, 'start'], genes.loc[on_chrom, 'end'])
    chrom_trees[chrom_name] = ivt.IntervalTree.from_tuples(gene_bounds)

# Read the raw enhancer table and expand it to one record per
# enhancer/gene link. A link goes to `known_enhancers` if its gene can be
# matched to the gene annotation (first by gene name/ID, then by symbol),
# otherwise to `unknown_enhancers` with name/symbol set to 'unknown'.

# Only chromosome names of the form "chr" + digits/"X" are processed.
known_chroms = re.compile('^chr[0-9X]+$')
known_enhancers = []
unknown_enhancers = []
with open(raw_enhancers, 'r', newline='') as table:
    for row in csv.DictReader(table, delimiter='\t'):
        enh_chrom = row['#chrom']
        if known_chroms.match(enh_chrom) is None:
            continue

        # An enhancer counts as intragenic if it overlaps at least
        # one gene body on the same chromosome.
        gene_tree = chrom_trees[enh_chrom]
        enh_start, enh_end = int(row['start']), int(row['end'])
        intragenic = 1 if gene_tree.overlaps(enh_start, enh_end) else 0

        # The per-gene columns hold comma-separated lists that are
        # aligned by position: one entry per associated gene.
        gene_links = zip(row['name'].split(','),
                         row['symbol'].split(','),
                         row['assoc_score'].split(','),
                         row['enh_gene_dist'].split(','))
        for gene_id, gene_symbol, score, dist in gene_links:
            if gene_id in known_genes:
                # gene ID is annotated: take the symbol from the annotation
                ensid, symbol = gene_id, known_genes[gene_id]
                target = known_enhancers
            elif gene_symbol in known_symbols:
                # fall back to matching by symbol
                ensid, symbol = known_symbols[gene_symbol], gene_symbol
                target = known_enhancers
            else:
                ensid, symbol = 'unknown', 'unknown'
                target = unknown_enhancers

            record = dict(row)
            record.update({
                'name': ensid,
                'symbol': symbol,
                'intragenic': intragenic,
                'assoc_score': float(score),
                'enh_gene_dist': int(dist),
            })
            target.append(record)
            
# Convert the list of enhancer/gene records into a table. All values that
# were copied verbatim from the input file are still strings at this
# point, so cast them to compact numeric types before sorting.
known_enhancers = pd.DataFrame(known_enhancers).astype({
    'start': np.int32,
    'end': np.int32,
    'enhancer_score': np.float32,
    'is_elite': np.int8,
    'intragenic': np.int8,
})
# enhancers are unstranded
known_enhancers['strand'] = '.'
known_enhancers['length'] = known_enhancers['end'] - known_enhancers['start']
known_enhancers = known_enhancers.sort_values(['#chrom', 'start', 'end', 'enhancer_score'])

# Columns available at this point:
# ['#chrom', 'GHid', 'assoc_score', 'cluster_id', 'end', 'enh_gene_dist',
#  'enhancer_score', 'intragenic', 'is_elite', 'name', 'start', 'symbol', 'length']
# Reorder them so that the first six columns follow the BED layout.
bed_column_order = [
    '#chrom', 'start', 'end', 'GHid', 'enhancer_score', 'strand',
    'name', 'symbol', 'assoc_score', 'is_elite', 'intragenic',
    'enh_gene_dist', 'cluster_id', 'length',
]
known_enhancers = known_enhancers[bed_column_order]

# Build the table of "other" enhancers: enhancers for which none of the
# associated genes could be matched to the gene annotation.
unknown_enhancers = pd.DataFrame.from_dict(unknown_enhancers)
# One row per enhancer is enough here. Note that drop_duplicates keeps the
# first record per GHid, so assoc_score / enh_gene_dist reflect only the
# first unmatched gene link of that enhancer.
unknown_enhancers = unknown_enhancers.drop_duplicates(['GHid'])
# An enhancer with at least one matched gene link belongs to the known set.
has_known_link = unknown_enhancers['GHid'].isin(known_enhancers['GHid'])
unknown_enhancers = unknown_enhancers.loc[~has_known_link, :].copy()
unknown_enhancers.drop(['name', 'symbol'], axis=1, inplace=True)

# The coordinates and the score were copied verbatim from csv.DictReader
# and are therefore still strings. Cast them (same dtypes as used for the
# known set) BEFORE sorting; otherwise sort_values orders them
# lexicographically (e.g., '100000' before '99') and the output file is
# not sorted by genomic coordinate.
unknown_enhancers['start'] = unknown_enhancers['start'].astype(np.int32)
unknown_enhancers['end'] = unknown_enhancers['end'].astype(np.int32)
unknown_enhancers['enhancer_score'] = unknown_enhancers['enhancer_score'].astype(np.float32)

# enhancers are unstranded
unknown_enhancers['strand'] = '.'
unknown_enhancers['length'] = unknown_enhancers['end'] - unknown_enhancers['start']
unknown_enhancers = unknown_enhancers[['#chrom', 'start', 'end', 'GHid', 'enhancer_score', 'strand',
                                       'assoc_score', 'is_elite', 'intragenic', 'enh_gene_dist',
                                       'cluster_id', 'length']]
unknown_enhancers = unknown_enhancers.sort_values(['#chrom', 'start', 'end', 'enhancer_score'])
# sanity check: the known and the "other" set must be disjoint
assert unknown_enhancers['GHid'].isin(known_enhancers['GHid']).sum() == 0, 'Overlap'

# dump the "other" set next to the gene annotation
outfile_other = os.path.join(os.path.dirname(gencode_genes), 'hg38_genehancer_other_complete.bed')
unknown_enhancers.to_csv(outfile_other, sep='\t', header=True, index=False)


# All output files are written next to the gene annotation.
output_dir = os.path.dirname(gencode_genes)

# dump complete set as tab-separated text ...
outfile_complete = os.path.join(output_dir, 'hg38_genehancer_gencodeV21_complete.bed')
known_enhancers.to_csv(outfile_complete, sep='\t', header=True, index=False)

# ... and the same table as HDF5 (same file name, different extension)
outfile_complete = outfile_complete.replace('.bed', '.h5')
with pd.HDFStore(outfile_complete, 'w') as hdf:
    hdf.put('genehancer/gencode_v21', known_enhancers, format='table')

# dump intragenic set (flag 1) and intergenic set (flag 0); the flag
# column is constant within each subset and thus removed before writing
outfile_intra = os.path.join(output_dir, 'hg38_genehancer_gencodeV21_intragenic.bed')
outfile_inter = os.path.join(output_dir, 'hg38_genehancer_gencodeV21_intergenic.bed')
for intragenic_flag, outfile_subset in ((1, outfile_intra), (0, outfile_inter)):
    selected = known_enhancers['intragenic'] == intragenic_flag
    subset = known_enhancers.loc[selected, :].drop(['intragenic'], axis=1)
    subset.to_csv(outfile_subset, sep='\t', header=True, index=False)
