In [1]:
# Root directory of constraint-tools; its `utilities` subdirectory is put on sys.path in a later cell.
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools'
# Root directory of the shared data; the `khurana/` files this notebook reads and writes live under it.
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'

In [2]:
# Common basename of the `khurana/` files used below; build- and format-specific
# suffixes (e.g. `.hg19.sorted.bed`, `.hg38.hg19.bed`) are appended to it.
FILENAME_ROOT = "all-enhancers-with-network-features"

## Load enhancers, in hg19 coordinates

In [3]:
import pandas as pd 
import json

# all enhancers from Xu et al
# https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1008663
def get_enhancer_network_features():
  """Load the hg19 enhancer table together with its network features.

  The column names are read from a JSON file and applied to the tab-separated
  BED file of the same basename; both live under
  `{CONSTRAINT_TOOLS_DATA}/khurana`.

  Returns:
    A DataFrame in which the 'chromosome', 'start' and 'end' columns have been
    renamed to 'chromosome_hg19', 'start_hg19' and 'end_hg19'.
  """
  # The JSON sidecar stores the column names to use for the BED file.
  with open(f'{CONSTRAINT_TOOLS_DATA}/khurana/{FILENAME_ROOT}.hg19.sorted.json', 'r') as names_file:
    column_names = json.load(names_file)

  enhancers = pd.read_csv(
    f'{CONSTRAINT_TOOLS_DATA}/khurana/{FILENAME_ROOT}.hg19.sorted.bed',
    names=column_names,
    sep='\t',
  )

  # Tag the coordinate columns with the genome build they refer to.
  generic_names = ('chromosome', 'start', 'end')
  hg19_names = ('chromosome_hg19', 'start_hg19', 'end_hg19')
  return enhancers.rename(columns=dict(zip(generic_names, hg19_names)))

# Show the loaded hg19 table as this cell's output.
get_enhancer_network_features()

Unnamed: 0,chromosome_hg19,start_hg19,end_hg19,enhancer_hg19,outDegree,indegreeAveGene,indegreeVarGene,indispenAveGene,indispenVarGene,numTissueAve,...,SG65_indegreeVarGene,SG68_outDegree,SG68_indegreeAveGene,SG68_indegreeVarGene,SG72_outDegree,SG72_indegreeAveGene,SG72_indegreeVarGene,conservation,numTissue,delState
0,chr1,14800,15200,chr1-14800-15200,1,4.0,0.0,0.499665,0.103495,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.248405,1,NotDeleted
1,chr1,16000,16400,chr1-16000-16400,1,4.0,0.0,0.164652,0.000000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002235,1,NotDeleted
2,chr1,20200,20400,chr1-20200-20400,1,4.0,0.0,0.650182,0.019668,3.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.108910,3,NotDeleted
3,chr1,79200,79800,chr1-79200-79800,1,4.0,0.0,0.194831,0.000000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125478,1,NotDeleted
4,chr1,534000,534400,chr1-534000-534400,2,7.0,36.0,0.084117,0.000000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.839952,0,NotDeleted
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246023,chrX,155238400,155239000,chrX-155238400-155239000,2,20.0,4.0,0.456613,0.010633,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006429,4,NotDeleted
246024,chrX,155240600,155241800,chrX-155240600-155241800,2,20.0,4.0,0.456613,0.010633,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032977,1,NotDeleted
246025,chrX,155254000,155255800,chrX-155254000-155255800,2,20.0,4.0,0.456613,0.010633,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159252,2,NotDeleted
246026,chrX,155256800,155257000,chrX-155256800-155257000,2,20.0,4.0,0.456613,0.010633,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069605,1,NotDeleted


## Convert hg19 coordinates to hg38 coordinates 

In [4]:
import sys
# Give the constraint-tools `utilities` directory top priority on the module search path.
# NOTE(review): no module from that directory is imported in the cells shown here — confirm this is still needed.
sys.path.insert(0, f'{CONSTRAINT_TOOLS}/utilities')  

In [5]:
def reorder_columns(df):
  """Return `df` with its last three columns moved to the front.

  The relative order within the moved columns and within the remaining
  columns is kept. The input frame itself is not modified.
  """
  names = df.columns.tolist()
  tail, head = names[-3:], names[:-3]
  return df[tail + head]

def hg19_to_hg38():
  """Replace the hg19 coordinates of each enhancer with hg38 coordinates.

  Reads `{FILENAME_ROOT}.hg38.hg19.bed` (tab-separated, no header row) and
  labels its four columns as the hg38 chromosome/start/end plus the hg19
  enhancer identifier. That table is right-joined on 'enhancer_hg19' to the
  frame returned by `get_enhancer_network_features()`, so every row of the
  mapping file is kept.

  Returns:
    A DataFrame without any of the hg19 columns, in which the last three
    columns of the merged frame (the hg38 coordinates) have been moved to the
    front by `reorder_columns`.
  """
  liftover = pd.read_csv(
    f'{CONSTRAINT_TOOLS_DATA}/khurana/{FILENAME_ROOT}.hg38.hg19.bed',
    header=None,
    sep='\t',
  )
  liftover.columns = ['chromosome_hg38', 'start_hg38', 'end_hg38', 'enhancer_hg19']

  features_hg19 = get_enhancer_network_features()

  # Right join: only enhancers that can be mapped to hg38 are kept.
  merged = features_hg19.merge(liftover, how='right', on=['enhancer_hg19'])

  # The hg19 identifiers have served their purpose as the join key.
  hg19_columns = ['chromosome_hg19', 'start_hg19', 'end_hg19', 'enhancer_hg19']
  return reorder_columns(merged.drop(columns=hg19_columns))

# Show the hg38-coordinate table as this cell's output.
hg19_to_hg38()

Unnamed: 0,chromosome_hg38,start_hg38,end_hg38,outDegree,indegreeAveGene,indegreeVarGene,indispenAveGene,indispenVarGene,numTissueAve,numTissueVar,...,SG65_indegreeVarGene,SG68_outDegree,SG68_indegreeAveGene,SG68_indegreeVarGene,SG72_outDegree,SG72_indegreeAveGene,SG72_indegreeVarGene,conservation,numTissue,delState
0,chr1,14800,15200,1,4.0,0.0,0.499665,0.103495,1.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.248405,1,NotDeleted
1,chr1,16000,16400,1,4.0,0.0,0.164652,0.000000,1.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002235,1,NotDeleted
2,chr1,20200,20400,1,4.0,0.0,0.650182,0.019668,3.0,0.00,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.108910,3,NotDeleted
3,chr1,79200,79800,1,4.0,0.0,0.194831,0.000000,1.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125478,1,NotDeleted
4,chr1,598620,599020,2,7.0,36.0,0.084117,0.000000,1.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.839952,0,NotDeleted
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245971,chrX,156008735,156009335,2,20.0,4.0,0.456613,0.010633,3.5,2.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006429,4,NotDeleted
245972,chrX,156010935,156012135,2,20.0,4.0,0.456613,0.010633,1.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032977,1,NotDeleted
245973,chrX,156024335,156026135,2,20.0,4.0,0.456613,0.010633,2.0,1.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159252,2,NotDeleted
245974,chrX,156027135,156027335,2,20.0,4.0,0.456613,0.010633,1.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069605,1,NotDeleted


## Save hg38 coordinates of enhancers, together with network features 

In [6]:
import subprocess
import os 

def save_and_sort():
  """Write the hg38 enhancer table as a sorted BED file plus a JSON column list.

  Steps:
    1. Dump the frame from `hg19_to_hg38()` to `{path_root}.bed`
       (tab-separated, no header, no index).
    2. Sort that file with the external `sort` command (version sort on
       column 1, then numeric on columns 2 and 3) into
       `{path_root}.sorted.bed`.
    3. Delete the unsorted file.
    4. Write the column names to `{path_root}.sorted.json`, since the BED file
       carries no header.

  Raises:
    subprocess.CalledProcessError: if `sort` exits with a non-zero status. In
      that case the unsorted file is left in place and no JSON file is
      written, so a failed sort cannot be mistaken for a finished export.
  """
  df = hg19_to_hg38()
  path_root = f'{CONSTRAINT_TOOLS_DATA}/khurana/{FILENAME_ROOT}.hg38'
  unsorted_path = path_root + '.bed'
  sorted_path = path_root + '.sorted.bed'

  df.to_csv(
    unsorted_path,
    sep='\t',
    index=False,
    header=False
  )

  # Open the output in a `with` block so the handle is flushed and closed once
  # `sort` has finished; check=True turns a failing `sort` into an exception
  # instead of silently continuing on to delete the unsorted input.
  with open(sorted_path, 'w') as sorted_file:
    subprocess.run(
      ["sort", "--version-sort", "-k1,1", "-k2,2n", "-k3,3n", unsorted_path],
      stdout=sorted_file,
      check=True
    )

  os.remove(unsorted_path)

  with open(f'{path_root}.sorted.json', 'w') as f:
    json.dump(list(df.columns), f)

# Run the export; it writes files under CONSTRAINT_TOOLS_DATA and returns nothing to display.
save_and_sort()