In [1]:
import os

# Both locations default to the original cluster paths. Set the CONSTRAINT_TOOLS
# or CONSTRAINT_TOOLS_DATA environment variable before starting the kernel to
# run the notebook elsewhere without editing this cell.

# Directory whose `utilities/` subdirectory is put on sys.path later in the notebook.
CONSTRAINT_TOOLS = os.environ.get(
  'CONSTRAINT_TOOLS',
  '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools'
)
# Data root; the BED files are read from and written to its `khurana/` subdirectory.
CONSTRAINT_TOOLS_DATA = os.environ.get(
  'CONSTRAINT_TOOLS_DATA',
  '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'
)

## Load regular enhancers, disease enhancers, and PacBio-deleted enhancers, in hg19 coordinates

In [2]:
import pandas as pd 

# Enhancers in hg19 coordinates with their Khurana scores. The BED file is read
# without a header row, so the column names are assigned after loading.
enhancers_and_khurana_scores_hg19 = pd.read_csv(
  f'{CONSTRAINT_TOOLS_DATA}/khurana/enhancers-and-khurana-scores.hg19.sorted.bed',
  header=None,
  sep='\t',
)
enhancers_and_khurana_scores_hg19.columns = (
  # genomic interval
  ['chromosome_hg19', 'start_hg19', 'end_hg19']
  # annotations, in file order; note that in this table the class column comes
  # before the probability column (the disease and PacBio tables use the reverse)
  + ['deletion_status', 'predicted_lof_class', 'predicted_probability_of_lof_tolerance']
)
enhancers_and_khurana_scores_hg19

Unnamed: 0,chromosome_hg19,start_hg19,end_hg19,deletion_status,predicted_lof_class,predicted_probability_of_lof_tolerance
0,chr1,14800,15200,NotDeleted,LoF-tolerant,0.673907
1,chr1,16000,16400,NotDeleted,LoF-tolerant,0.776159
2,chr1,20200,20400,NotDeleted,LoF-tolerant,0.689540
3,chr1,79200,79800,NotDeleted,LoF-tolerant,0.880216
4,chr1,534000,534400,NotDeleted,Low-LoF-tolerance,0.434242
...,...,...,...,...,...,...
245088,chrX,155238400,155239000,NotDeleted,LoF-tolerant,0.644715
245089,chrX,155240600,155241800,NotDeleted,LoF-tolerant,0.914595
245090,chrX,155254000,155255800,NotDeleted,LoF-tolerant,0.703746
245091,chrX,155256800,155257000,NotDeleted,LoF-tolerant,0.914595


In [3]:
# Disease enhancers in hg19 coordinates with their Khurana scores. The BED file
# is read without a header row, so the column names are assigned after loading.
disease_enhancers_and_khurana_scores_hg19 = pd.read_csv(
  f'{CONSTRAINT_TOOLS_DATA}/khurana/disease-enhancers-and-khurana-scores.hg19.sorted.bed',
  header=None,
  sep='\t',
)
disease_enhancers_and_khurana_scores_hg19.columns = (
  # genomic interval
  ['chromosome_hg19', 'start_hg19', 'end_hg19']
  # annotations, in file order (probability before class)
  + ['disease', 'predicted_probability_of_lof_tolerance', 'predicted_lof_class']
)
disease_enhancers_and_khurana_scores_hg19

Unnamed: 0,chromosome_hg19,start_hg19,end_hg19,disease,predicted_probability_of_lof_tolerance,predicted_lof_class
0,chr1,21672600,21674600,allDisease,0.319919,LoFinT
1,chr1,94375400,94376600,allDisease,0.492920,LoFinT
2,chr1,155263000,155263400,allDisease,0.207529,LoFinT
3,chr1,160002200,160003600,allDisease,0.423619,LoFinT
4,chr1,173884400,173884600,allDisease,0.925524,LoFT
...,...,...,...,...,...,...
85,chr22,39539400,39541000,allDisease,0.454700,LoFinT
86,chr22,40740400,40742000,allDisease,0.299276,LoFinT
87,chrX,70438600,70439000,allDisease,0.540452,LoFT
88,chrX,138612000,138612800,allDisease,0.831452,LoFT


In [4]:
# PacBio-deleted enhancers in hg19 coordinates with their Khurana scores. The BED
# file is read without a header row, so the column names are assigned after loading.
pacbio_deleted_enhancers_and_khurana_scores_hg19 = pd.read_csv(
  f'{CONSTRAINT_TOOLS_DATA}/khurana/pacbio-deleted-enhancers-and-khurana-scores.hg19.sorted.bed',
  header=None,
  sep='\t',
)
pacbio_deleted_enhancers_and_khurana_scores_hg19.columns = (
  # genomic interval
  ['chromosome_hg19', 'start_hg19', 'end_hg19']
  # annotations, in file order (probability before class)
  + ['family', 'predicted_probability_of_lof_tolerance', 'predicted_lof_class']
)
pacbio_deleted_enhancers_and_khurana_scores_hg19

Unnamed: 0,chromosome_hg19,start_hg19,end_hg19,family,predicted_probability_of_lof_tolerance,predicted_lof_class
0,chr1,16843400,16843600,LoFT_trio,0.802718,LoFT
1,chr1,16843800,16844200,LoFT_trio,0.867843,LoFT
2,chr1,16852000,16852600,LoFT_trio,0.702235,LoFT
3,chr2,5925600,5925800,LoFT_trio,0.95494,LoFT
4,chr3,121262600,121262800,LoFT_trio,0.710788,LoFT
5,chr4,70018400,70019400,LoFT_trio,0.695622,LoFT
6,chr4,82110000,82110400,LoFT_trio,0.909137,LoFT
7,chr9,69464400,69465400,LoFT_trio,0.863101,LoFT
8,chr11,3283800,3284200,LoFT_trio,0.881633,LoFT
9,chr11,3357200,3357600,LoFT_trio,0.719512,LoFT


## Convert hg19 coordinates to hg38 coordinates 

In [5]:
import sys

# Put `{CONSTRAINT_TOOLS}/utilities` first on the import path; the
# `from pack_unpack import unpack` cell that follows presumably resolves from there.
# The insert is skipped when the directory is already first, so re-running this
# cell does not pile up duplicate sys.path entries (import resolution is unchanged).
if sys.path[:1] != [f'{CONSTRAINT_TOOLS}/utilities']:
  sys.path.insert(0, f'{CONSTRAINT_TOOLS}/utilities')

In [6]:
from pack_unpack import unpack

def parse_row(row):
  """Return (chromosome, start, end) decoded from a row's `region_hg19` field.

  The packed region is split by `unpack`; the start is decremented by one and
  both coordinates are returned as ints.
  """
  chromosome, region_start, region_end = unpack(row['region_hg19'])
  # NOTE(review): the -1 presumably converts a 1-based start into the 0-based
  # start used by the hg19 BED tables — confirm against the packed-region format.
  start_hg19 = int(region_start) - 1
  end_hg19 = int(region_end)
  return chromosome, start_hg19, end_hg19

def reorder_columns(df):
  """Return `df` with its last three columns moved to the front.

  The relative order inside both groups is preserved, and the input frame is
  not modified (a new column selection is returned).
  """
  names = list(df.columns)
  trailing_three, everything_else = names[-3:], names[:-3]
  return df[trailing_three + everything_else]

def create_hg38_coordinates_and_khurana_scores(prefix): 
  """Attach hg38 coordinates to one of the hg19 enhancer tables.

  Reads `{prefix}-and-khurana-scores.hg38.hg19.sorted.coordinates-only.bed`,
  a headerless tab-separated file whose rows hold an hg38 interval, the packed
  hg19 region it corresponds to, and a fifth column of unknown meaning. The
  hg19 region is decoded with `parse_row` and used as the join key against the
  matching hg19 table loaded earlier in the notebook.

  Parameters
  ----------
  prefix : str
    One of 'enhancers', 'disease-enhancers' or 'pacbio-deleted-enhancers'.
    Selects both the coordinates file and the module-level hg19 table.

  Returns
  -------
  pandas.DataFrame
    Columns `chromosome_hg38`, `start_hg38`, `end_hg38` followed by the
    annotation columns of the hg19 table. The hg19 coordinates are dropped.
    Every row of the coordinates file is kept (right join); rows whose hg19
    interval is absent from the hg19 table carry NaN annotations.

  Raises
  ------
  KeyError
    If the coordinates file exists but `prefix` is not one of the known tables.
  """
  lifted_regions = pd.read_csv(
    f'{CONSTRAINT_TOOLS_DATA}/khurana/{prefix}-and-khurana-scores.hg38.hg19.sorted.coordinates-only.bed', 
    sep='\t', 
    header=None,
  )
  lifted_regions.columns = ['chromosome_hg38', 'start_hg38', 'end_hg38', 'region_hg19', 'unknown'] 

  # Decode the packed hg19 region of every row into a (chrom, start, end) tuple,
  # then build the three hg19 columns in a single DataFrame construction.
  # Expanding with `.apply(pd.Series)` would instead create one Series per row,
  # which is very slow on the full enhancer set.
  parsed_regions = lifted_regions.apply(parse_row, axis=1)
  hg19_intervals = pd.DataFrame(
    parsed_regions.tolist(),
    index=lifted_regions.index,
    columns=['chromosome_hg19', 'start_hg19', 'end_hg19'],
  )
  hg38_hg19_coordinates = pd.concat(
    [lifted_regions.drop(columns=['region_hg19', 'unknown']), hg19_intervals],
    axis=1,
  )

  # All three tables are looked up eagerly, so each of the hg19 loading cells
  # must have been run before this function is called.
  variables = { 
    'enhancers': enhancers_and_khurana_scores_hg19,
    'disease-enhancers': disease_enhancers_and_khurana_scores_hg19,
    'pacbio-deleted-enhancers': pacbio_deleted_enhancers_and_khurana_scores_hg19,
  }
  hg19_coordinates_and_scores = variables[prefix]

  # Right join: the coordinates file decides which rows appear in the result.
  hg38_coordinates_and_scores = (
    hg19_coordinates_and_scores
    .merge(
      hg38_hg19_coordinates, 
      on=['chromosome_hg19', 'start_hg19', 'end_hg19'],
      how='right'
    )
    .drop(columns=['chromosome_hg19', 'start_hg19', 'end_hg19'])
  )
  # After the merge the hg38 columns are last; move them to the front.
  return reorder_columns(hg38_coordinates_and_scores)

# Preview the disease-enhancer table in hg38 coordinates; the result is only
# displayed as this cell's output, it is not assigned or saved here.
create_hg38_coordinates_and_khurana_scores('disease-enhancers')

Unnamed: 0,chromosome_hg38,start_hg38,end_hg38,disease,predicted_probability_of_lof_tolerance,predicted_lof_class
0,chr1,21346107,21348107,allDisease,0.319919,LoFinT
1,chr1,93909844,93911044,allDisease,0.492920,LoFinT
2,chr1,155293209,155293609,allDisease,0.207529,LoFinT
3,chr1,160032410,160033810,allDisease,0.423619,LoFinT
4,chr1,173915262,173915462,allDisease,0.925524,LoFT
...,...,...,...,...,...,...
85,chr22,39143395,39144995,allDisease,0.454700,LoFinT
86,chr22,40344396,40345996,allDisease,0.299276,LoFinT
87,chrX,71218750,71219150,allDisease,0.540452,LoFT
88,chrX,139529841,139530641,allDisease,0.831452,LoFT


## Save hg38 coordinates of enhancers, together with Khurana scores 

In [7]:
import subprocess
import os 

def save_and_sort(df, filename_root): 
  """Write `df` as a sorted, headerless, tab-separated BED file.

  The frame is first written to
  `{CONSTRAINT_TOOLS_DATA}/khurana/{filename_root}.hg38.bed`, then sorted with
  the external `sort` command (version sort on column 1, numeric sort on
  columns 2 and 3) into `{filename_root}.hg38.sorted.bed`. The unsorted
  intermediate file is removed once sorting has succeeded.

  Parameters
  ----------
  df : pandas.DataFrame
    Table to write; neither index nor header is written.
  filename_root : str
    File name stem inside the `khurana/` data directory.

  Raises
  ------
  subprocess.CalledProcessError
    If `sort` exits with a non-zero status. The unsorted file is left in place
    in that case so the data is not lost.
  """
  path_root = f'{CONSTRAINT_TOOLS_DATA}/khurana/{filename_root}.hg38'
  unsorted_path = path_root + '.bed'
  sorted_path = path_root + '.sorted.bed'

  df.to_csv(
    unsorted_path,
    sep='\t', 
    index=False, 
    header=False
  )
  # The context manager closes (and flushes) the output handle, and check=True
  # turns a failed sort into an exception instead of silently leaving a
  # truncated sorted file behind.
  with open(sorted_path, "w") as sorted_file:
    subprocess.run(
      ["sort", "--version-sort", "-k1,1", "-k2,2n", "-k3,3n", unsorted_path], 
      stdout=sorted_file,
      check=True
    ) 
  # Only reached when sorting succeeded.
  os.remove(unsorted_path)

In [8]:
# Writes {CONSTRAINT_TOOLS_DATA}/khurana/enhancers-and-khurana-scores.hg38.sorted.bed
save_and_sort(
  create_hg38_coordinates_and_khurana_scores('enhancers'),
  "enhancers-and-khurana-scores",
)

In [9]:
# Writes {CONSTRAINT_TOOLS_DATA}/khurana/disease-enhancers-and-khurana-scores.hg38.sorted.bed
save_and_sort(
  create_hg38_coordinates_and_khurana_scores('disease-enhancers'),
  "disease-enhancers-and-khurana-scores",
)

In [10]:
# Writes {CONSTRAINT_TOOLS_DATA}/khurana/pacbio-deleted-enhancers-and-khurana-scores.hg38.sorted.bed
save_and_sort(
  create_hg38_coordinates_and_khurana_scores('pacbio-deleted-enhancers'),
  "pacbio-deleted-enhancers-and-khurana-scores",
)