In [1]:
# Directory whose `utilities` subfolder is later added to the import path.
CONSTRAINT_TOOLS = "/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools"
# Data directory; the khurana BED files read and written below live under it.
CONSTRAINT_TOOLS_DATA = "/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools"

In [2]:
import pandas as pd 

# Labels for the six columns of the header-less hg19 BED file, in file order.
hg19_column_names = [
  'chromosome_hg19',
  'start_hg19',
  'end_hg19',
  'deletion_status',
  'predicted_lof_class',
  'predicted_probability_of_lof_tolerance',
]

# The file is tab-separated and has no header row, so pandas assigns
# positional integer labels that are replaced right after loading.
hg19_coordinates_and_scores = pd.read_csv(
  f'{CONSTRAINT_TOOLS_DATA}/khurana/enhancers-and-khurana-scores.hg19.sorted.bed',
  header=None,
  sep='\t',
)
hg19_coordinates_and_scores.columns = hg19_column_names

# Display the loaded table.
hg19_coordinates_and_scores

Unnamed: 0,chromosome_hg19,start_hg19,end_hg19,deletion_status,predicted_lof_class,predicted_probability_of_lof_tolerance
0,chr1,14800,15200,NotDeleted,LoF-tolerant,0.673907
1,chr1,16000,16400,NotDeleted,LoF-tolerant,0.776159
2,chr1,20200,20400,NotDeleted,LoF-tolerant,0.689540
3,chr1,79200,79800,NotDeleted,LoF-tolerant,0.880216
4,chr1,534000,534400,NotDeleted,Low-LoF-tolerance,0.434242
...,...,...,...,...,...,...
245088,chrX,155238400,155239000,NotDeleted,LoF-tolerant,0.644715
245089,chrX,155240600,155241800,NotDeleted,LoF-tolerant,0.914595
245090,chrX,155254000,155255800,NotDeleted,LoF-tolerant,0.703746
245091,chrX,155256800,155257000,NotDeleted,LoF-tolerant,0.914595


In [3]:
import sys
# Put the utilities folder first on the import path so that its modules
# (e.g. `pack_unpack`, imported further down) can be found.
utilities_directory = f'{CONSTRAINT_TOOLS}/utilities'
sys.path.insert(0, utilities_directory)

In [4]:
# Labels for the five columns of the header-less hg38/hg19 coordinate file.
# The meaning of the last column is not known, hence its name.
hg38_hg19_column_names = [
  'chromosome_hg38',
  'start_hg38',
  'end_hg38',
  'region_hg19',
  'unknown',
]

# Tab-separated file without a header row; labels are attached after loading.
hg38_hg19_coordinates = pd.read_csv(
  f'{CONSTRAINT_TOOLS_DATA}/khurana/enhancers-and-khurana-scores.hg38.hg19.sorted.coordinates-only.bed',
  header=None,
  sep='\t',
)
hg38_hg19_coordinates.columns = hg38_hg19_column_names

# Display the loaded table.
hg38_hg19_coordinates

Unnamed: 0,chromosome_hg38,start_hg38,end_hg38,region_hg19,unknown
0,chr1,14800,15200,chr1:14801-15200,1
1,chr1,16000,16400,chr1:16001-16400,1
2,chr1,20200,20400,chr1:20201-20400,1
3,chr1,79200,79800,chr1:79201-79800,1
4,chr1,598620,599020,chr1:534001-534400,1
...,...,...,...,...,...
244980,chrX,156008735,156009335,chrX:155238401-155239000,1
244981,chrX,156010935,156012135,chrX:155240601-155241800,1
244982,chrX,156024335,156026135,chrX:155254001-155255800,1
244983,chrX,156027135,156027335,chrX:155256801-155257000,1


In [5]:
from pack_unpack import unpack

def parse_row(row):
  """Turn the `region_hg19` entry of `row` into (chromosome, start, end).

  The region string (e.g. 'chr1:14801-15200') is split by the project helper
  `unpack`. The start is converted to int and decremented by one, presumably
  to go from a 1-based start to the 0-based start used in BED files; the end
  is converted to int unchanged.
  """
  chromosome, region_start, region_end = unpack(row['region_hg19'])
  shifted_start = int(region_start) - 1
  return chromosome, shifted_start, int(region_end)

# NOTE: this cell rebinds `hg38_hg19_coordinates` and drops `region_hg19`,
# so it cannot be re-run on its own; re-run the loading cell first.

# Parse every `region_hg19` string into a (chromosome, start, end) tuple.
hg19_tuples = hg38_hg19_coordinates.apply(parse_row, axis=1)

# Expand the tuples into three columns with one DataFrame construction.
# This replaces `.apply(pd.Series)`, which builds a separate Series object
# for every row and is very slow on a table of this size.
hg19_columns = pd.DataFrame(
  hg19_tuples.tolist(),
  index=hg38_hg19_coordinates.index,
  columns=['chromosome_hg19', 'start_hg19', 'end_hg19'],
)

# Append the parsed hg19 coordinates and discard the columns that are no
# longer needed.
hg38_hg19_coordinates = (
  pd.concat([hg38_hg19_coordinates, hg19_columns], axis=1)
  .drop(columns=['region_hg19', 'unknown'])
)

hg38_hg19_coordinates

Unnamed: 0,chromosome_hg38,start_hg38,end_hg38,chromosome_hg19,start_hg19,end_hg19
0,chr1,14800,15200,chr1,14800,15200
1,chr1,16000,16400,chr1,16000,16400
2,chr1,20200,20400,chr1,20200,20400
3,chr1,79200,79800,chr1,79200,79800
4,chr1,598620,599020,chr1,534000,534400
...,...,...,...,...,...,...
244980,chrX,156008735,156009335,chrX,155238400,155239000
244981,chrX,156010935,156012135,chrX,155240600,155241800
244982,chrX,156024335,156026135,chrX,155254000,155255800
244983,chrX,156027135,156027335,chrX,155256800,155257000


In [7]:
# Columns shared by both tables; they identify a region in hg19 coordinates.
hg19_key_columns = ['chromosome_hg19', 'start_hg19', 'end_hg19']

# Right join: keep every row of the hg38/hg19 coordinate table and attach the
# scores whose hg19 coordinates match.
scores_with_both_builds = hg19_coordinates_and_scores.merge(
  hg38_hg19_coordinates,
  how='right',
  on=hg19_key_columns,
)

# Only the hg38 coordinates are wanted downstream, so drop the hg19 keys.
hg38_coordinates_and_scores = scores_with_both_builds.drop(columns=hg19_key_columns)

def reorder_columns(df):
  """Return `df` with its last three columns moved to the front.

  The relative order inside the moved group and inside the remaining group
  is kept. The input frame itself is not modified.
  """
  columns = df.columns.tolist()
  # Take the LAST three columns with `[-3:]`. The previous `[3:]` slice only
  # coincided with that for exactly six columns; for any other column count
  # it duplicated or dropped columns.
  columns = columns[-3:] + columns[:-3]
  return df[columns]

# Apply the column reordering defined above, then display the result.
hg38_coordinates_and_scores = hg38_coordinates_and_scores.pipe(reorder_columns)
hg38_coordinates_and_scores

Unnamed: 0,chromosome_hg38,start_hg38,end_hg38,deletion_status,predicted_lof_class,predicted_probability_of_lof_tolerance
0,chr1,14800,15200,NotDeleted,LoF-tolerant,0.673907
1,chr1,16000,16400,NotDeleted,LoF-tolerant,0.776159
2,chr1,20200,20400,NotDeleted,LoF-tolerant,0.689540
3,chr1,79200,79800,NotDeleted,LoF-tolerant,0.880216
4,chr1,598620,599020,NotDeleted,Low-LoF-tolerance,0.434242
...,...,...,...,...,...,...
244980,chrX,156008735,156009335,NotDeleted,LoF-tolerant,0.644715
244981,chrX,156010935,156012135,NotDeleted,LoF-tolerant,0.914595
244982,chrX,156024335,156026135,NotDeleted,LoF-tolerant,0.703746
244983,chrX,156027135,156027335,NotDeleted,LoF-tolerant,0.914595


In [8]:
import subprocess

def save_and_sort(df, filename_root): 
  """Write `df` as a header-less, tab-separated BED file and a sorted copy.

  Two files are created under `{CONSTRAINT_TOOLS_DATA}/khurana/`:
  `{filename_root}.hg38.bed` (rows in the frame's order) and
  `{filename_root}.hg38.sorted.bed` (the same rows ordered by the external
  `sort` command: column 1 by version sort, then columns 2 and 3 numerically).

  Raises:
    subprocess.CalledProcessError: if `sort` exits with a non-zero status.
  """
  path_root = f'{CONSTRAINT_TOOLS_DATA}/khurana/{filename_root}.hg38'
  unsorted_path = path_root + '.bed'
  sorted_path = path_root + '.sorted.bed'

  df.to_csv(
    unsorted_path,
    sep='\t', 
    index=False, 
    header=False
  )

  # The context manager flushes and closes the output file even if `sort`
  # fails; previously the handle was left open.
  with open(sorted_path, "w") as sorted_file:
    subprocess.run(
      ["sort", "--version-sort", "-k1,1", "-k2,2n", "-k3,3n", unsorted_path], 
      stdout=sorted_file,
      # Fail loudly instead of silently leaving an empty or partial file.
      check=True
    ) 

# Persist the hg38 table together with its coordinate-sorted copy.
save_and_sort(hg38_coordinates_and_scores, "enhancers-and-khurana-scores")