## Khurana enhancers

In [1]:
# Directory holding the constraint-tools code; its `utilities` subdirectory is added to sys.path later on.
# NOTE(review): both paths are absolute and machine-specific, so the notebook only runs where they exist.
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools'
# Directory holding the shared data; every Khurana input and output file in this notebook lives under it.
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'

In [2]:
# Khurana paper: 
# https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1008663

import pandas as pd

# Read in the Excel file, which contains multiple sheets.
# sheet_name=None makes read_excel return a dict that maps each sheet name to its DataFrame.
khurana_table_S2 = pd.read_excel(f'{CONSTRAINT_TOOLS_DATA}/khurana/table-S2.xlsx', sheet_name=None)

In [3]:
# Show the names of the sheets that were loaded from the workbook
khurana_table_S2.keys()

dict_keys(['SupplementaryTable3', 'LoF-tolerant enhancers', 'Low-LoF-tolerance enhancers', 'Feature importance'])

In [4]:
# Per-enhancer predictions; each enhancer is identified by a 'chrom-start-end' string.
# NOTE(review): the sheet is called 'SupplementaryTable3' although the file is table-S2.xlsx — confirm which table this is.
enhancers_and_khurana_scores = khurana_table_S2['SupplementaryTable3']
enhancers_and_khurana_scores

Unnamed: 0,enhancer,original,predict LoF-tolerant,prob LoF-tolerance
0,chr13-101408800-101409200,NotDeleted,Low-LoF-tolerance,0.248020
1,chr18-45880000-45880200,NotDeleted,LoF-tolerant,0.646635
2,chr5-174837800-174840000,NotDeleted,LoF-tolerant,0.881760
3,chr9-115430200-115432000,NotDeleted,LoF-tolerant,0.903623
4,chr11-134077600-134078200,NotDeleted,LoF-tolerant,0.729862
...,...,...,...,...
245088,chr22-51047400-51049200,NotDeleted,LoF-tolerant,0.592781
245089,chr5-55940400-55941400,NotDeleted,LoF-tolerant,0.647929
245090,chr10-61223600-61224000,NotDeleted,LoF-tolerant,0.516712
245091,chr14-63648400-63648600,NotDeleted,LoF-tolerant,0.665555


In [5]:
# Enhancers from the 'LoF-tolerant enhancers' sheet; coordinates are already split into chrom/start/end columns
lof_tolerant_enhancers = khurana_table_S2['LoF-tolerant enhancers']
lof_tolerant_enhancers

Unnamed: 0,chrom,start,end
0,chr1,766600,768200
1,chr1,1443600,1445200
2,chr1,8184200,8185000
3,chr1,8189600,8190000
4,chr1,9595800,9596000
...,...,...,...
881,chrX,146848600,146849600
882,chrX,150586000,150586200
883,chrX,150877400,150877600
884,chrX,150878000,150878200


In [6]:
# Enhancers from the 'Low-LoF-tolerance enhancers' sheet; coordinates are already split into chrom/start/end columns
low_lof_tolerance_enhancers = khurana_table_S2['Low-LoF-tolerance enhancers']
low_lof_tolerance_enhancers

Unnamed: 0,chrom,start,end
0,chr1,62053400,62055600
1,chr1,87821600,87822800
2,chr1,87821600,87822800
3,chr1,169910800,169912800
4,chr2,66296800,66298400
5,chr2,164661800,164662000
6,chr2,172936600,172938600
7,chr2,176940200,176940800
8,chr3,9469600,9471800
9,chr3,147029600,147031400


In [7]:
# Khurana scores for the disease enhancers; the CSV's unnamed first column holds the
# 'chrom-start-end' enhancer id, so it is given the name 'enhancer'.
disease_enhancers_and_khurana_scores = (
  pd.read_csv(f"{CONSTRAINT_TOOLS_DATA}/khurana/disease-enhancers.khurana-scores.hg19.csv", sep=',')
  .rename(columns={'Unnamed: 0': 'enhancer'})
)
disease_enhancers_and_khurana_scores

Unnamed: 0,enhancer,diseaseType,yPredict,predictedType
0,chr4-6299200-6301200,allDisease,0.000000,
1,chr8-31499200-31499600,allDisease,0.069214,LoFinT
2,chr3-52757800-52759200,allDisease,0.163410,LoFinT
3,chr13-72427800-72429200,allDisease,0.179379,LoFinT
4,chr1-155263000-155263400,allDisease,0.207529,LoFinT
...,...,...,...,...
85,chr9-22076800-22077000,allDisease,0.872610,LoFT
86,chrX-154247800-154248000,allDisease,0.904522,LoFT
87,chr19-19437800-19438000,allDisease,0.904585,LoFT
88,chr6-12892800-12895000,allDisease,0.910542,LoFT


In [8]:
# Khurana scores for the PacBio-deleted enhancers; the CSV's unnamed first column holds the
# 'chrom-start-end' enhancer id, so it is given the name 'enhancer'.
pacbio_deleted_enhancers_and_khurana_scores = (
  pd.read_csv(f"{CONSTRAINT_TOOLS_DATA}/khurana/pacbio-deleted-enhancers.khurana-scores.hg19.csv", sep=',')
  .rename(columns={'Unnamed: 0': 'enhancer'})
)
pacbio_deleted_enhancers_and_khurana_scores

Unnamed: 0,enhancer,diseaseType,yPredict,predictedType
0,chrX-17787000-17789200,LoFT_trio,0.512068,LoFT
1,chrX-153505400-153505800,LoFT_trio,0.656411,LoFT
2,chr19-8362600-8363400,LoFT_trio,0.663404,LoFT
3,chr17-6872400-6873400,LoFT_trio,0.676659,LoFT
4,chr4-70018400-70019400,LoFT_trio,0.695622,LoFT
5,chr1-16852000-16852600,LoFT_trio,0.702235,LoFT
6,chr3-121262600-121262800,LoFT_trio,0.710788,LoFT
7,chr11-3357200-3357600,LoFT_trio,0.719512,LoFT
8,chr19-8361800-8362200,LoFT_trio,0.75362,LoFT
9,chrX-152538800-152539000,LoFT_trio,0.78,LoFT


In [9]:
def get_enhancer_network_features():
  """Load khurana/allData_github.csv and name its unnamed first column 'enhancer'.

  Returns:
    A DataFrame with one row per enhancer: the 'enhancer' id column followed by
    the feature columns stored in the CSV.
  """
  # all enhancers from Xu et al
  # https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1008663
  csv_path = f'{CONSTRAINT_TOOLS_DATA}/khurana/allData_github.csv'
  return pd.read_csv(csv_path).rename(columns={'Unnamed: 0':'enhancer'})

# Load the full enhancer feature table once and display it
all_enhancers_with_network_features = get_enhancer_network_features()
all_enhancers_with_network_features

Unnamed: 0,enhancer,outDegree,indegreeAveGene,indegreeVarGene,indispenAveGene,indispenVarGene,numTissueAve,numTissueVar,closeAveGene,closeVarGene,...,SG65_indegreeVarGene,SG68_outDegree,SG68_indegreeAveGene,SG68_indegreeVarGene,SG72_outDegree,SG72_indegreeAveGene,SG72_indegreeVarGene,conservation,numTissue,delState
0,chr13-101408800-101409200,1,27.000000,0.000000,0.428024,0.000000,1.000000,0.000000,0.077996,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.851215,1,NotDeleted
1,chr18-45880000-45880200,1,49.000000,0.000000,0.218000,0.000000,2.000000,0.000000,0.000185,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.012755,2,NotDeleted
2,chr5-174837800-174840000,1,41.000000,0.000000,0.886126,0.000000,1.000000,0.000000,0.000705,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.047393,0,NotDeleted
3,chr9-115430200-115432000,1,44.000000,0.000000,0.834423,0.000000,1.000000,0.000000,0.000166,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.107515,0,NotDeleted
4,chr11-134077600-134078200,7,34.714286,12.775510,0.453433,0.096335,1.571429,0.244898,0.028487,0.001201,...,0.0,7.0,4.571429,1.959184,0.0,0.0,0.0,0.009782,2,NotDeleted
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246023,chr22-51047400-51049200,15,34.133333,9.182222,0.448805,0.061292,1.400000,0.240000,0.031838,0.001482,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.095386,1,NotDeleted
246024,chr5-55940400-55941400,2,39.500000,72.250000,0.542271,0.183098,1.500000,0.250000,0.030618,0.000930,...,0.0,1.0,19.000000,0.000000,0.0,0.0,0.0,0.018802,2,NotDeleted
246025,chr10-61223600-61224000,1,42.000000,0.000000,0.558700,0.000000,2.000000,0.000000,0.003260,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.154420,1,NotDeleted
246026,chr14-63648400-63648600,1,34.000000,0.000000,0.934630,0.000000,2.000000,0.000000,0.081406,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.002130,1,NotDeleted


## Khurana et al used hg19, not hg38

Khurana et al obtained enhancers from Cao et al [41], who used [human reference genome hg19](https://www.nature.com/articles/ng.3950#Sec9),
and obtained deletions from 1000 Genomes Phase 3 [43], who reported their results [relative to both hg19 and hg38](https://www.nature.com/articles/nature15394#accession-codes). 

To increase confidence that Khurana et al reported all coordinates relative to hg19, 
I noted that a sample of their low-LoF-tolerance enhancers had coordinates that were more similar to the hg19 coordinates 
than to the hg38 coordinates that I had previously obtained for VISTA enhancers (see the comparison in the cells below). 


In [10]:
import sys
# Put the constraint-tools utilities directory first on the import path so its modules can be imported below
sys.path.insert(0, f'{CONSTRAINT_TOOLS}/utilities')  

In [11]:
from pack_unpack import pack

# Format the first low-LoF-tolerance enhancer (chrom, start, end) as a single region string
pack(*low_lof_tolerance_enhancers.iloc[0])

'chr1:62053400-62055600'

In [12]:
# Look up one hand-picked VISTA element (row 33, column 3 of the headerless table) for comparison.
# NOTE(review): column 3 presumably holds the hg19 region of the element — confirm against the file layout.
pd.read_csv(f'{CONSTRAINT_TOOLS_DATA}/vista-elements/vista-elements.positive.hg38.hg19.tsv', sep='\t', header=None).iloc[33,3]

'chr1:62053434-62055908'

Finally, Duo Xu said "The paper was done on hg19" in a private communication. 

## Convert enhancer data to bed format (for later transformation to hg38)


In [16]:
def parse_row(row):
  """Split a 'chrom-start-end' string into (chrom, start, end).

  Args:
    row: enhancer id such as 'chr13-101408800-101409200'.

  Returns:
    A tuple (chrom, start, end) with start and end converted to int.

  Raises:
    ValueError: if the string does not have exactly three '-'-separated
      fields, or if start/end are not integers.
  """
  fields = row.split('-')
  contig, lower, upper = fields
  return (contig, int(lower), int(upper))

def convert_to_bed(df):
  """Prepend BED-style coordinate columns parsed from the 'enhancer' column.

  Args:
    df: DataFrame with an 'enhancer' column of 'chrom-start-end' strings.

  Returns:
    A new DataFrame whose first three columns are 'chromosome', 'start' and
    'end', followed by all columns of `df`, with 'enhancer' renamed to
    'enhancer_hg19'. `df` itself is not modified.

  Raises:
    ValueError: propagated from parse_row for a malformed enhancer string.
  """
  # Build the coordinate frame in one step from the parsed tuples. Expanding the
  # tuples with `.apply(pd.Series)` creates one Series per row, which is very slow
  # for tables with hundreds of thousands of enhancers.
  coordinates = pd.DataFrame(
    [parse_row(enhancer) for enhancer in df['enhancer']],
    index=df.index,  # keep the rows aligned with df for the column-wise concat
    columns=['chromosome', 'start', 'end'],
  )
  return (
    pd.concat([coordinates, df], axis=1)
    .rename(columns={'enhancer': 'enhancer_hg19'})
  )

import subprocess
import os 
import json

def sort_and_save(df, filename_root): 
  """Write `df` as a sorted, headerless BED file plus two companion files.

  With <root> = {CONSTRAINT_TOOLS_DATA}/khurana/{filename_root}.hg19, this writes:
    <root>.sorted.bed                   all columns, tab-separated, no header or index,
                                        sorted on column 1 (version sort), then columns 2 and 3 (numeric)
    <root>.sorted.json                  JSON list of the column names of `df`
    <root>.sorted.coordinates-only.bed  the first four fields of the sorted file
  The unsorted intermediate <root>.bed is always deleted.

  Args:
    df: DataFrame whose first three columns are chromosome, start and end.
    filename_root: base name (without extension) of the output files.

  Raises:
    subprocess.CalledProcessError: if `sort` or `cut` exits with a non-zero status.
  """
  path_root = f'{CONSTRAINT_TOOLS_DATA}/khurana/{filename_root}.hg19'
  unsorted_path = path_root + '.bed'
  sorted_path = path_root + '.sorted.bed'

  df.to_csv(
    unsorted_path,
    sep='\t', 
    index=False, 
    header=False
  )

  try:
    # The context manager closes (and flushes) the output handle; check=True turns a
    # failed sort into an exception instead of silently leaving a truncated file.
    with open(sorted_path, 'w') as sorted_file:
      subprocess.run(
        ["sort", "--version-sort", "-k1,1", "-k2,2n", "-k3,3n", unsorted_path], 
        stdout=sorted_file,
        check=True
      )
  finally:
    # The unsorted file is only an intermediate; remove it whether or not sort succeeded.
    os.remove(unsorted_path)

  # The BED file has no header, so the column names are stored alongside it.
  with open(f'{path_root}.sorted.json', 'w') as f:
    json.dump(list(df.columns), f)  

  with open(path_root + '.sorted.coordinates-only.bed', 'w') as coordinates_file:
    subprocess.run(
      ["cut", "-f1-4", sorted_path], 
      stdout=coordinates_file,
      check=True
    )

In [17]:
# Parse the 'enhancer' ids into BED columns, then write enhancers-and-khurana-scores.hg19.sorted.bed and its companion files
sort_and_save(
  df = convert_to_bed(enhancers_and_khurana_scores),
  filename_root = "enhancers-and-khurana-scores"
)

In [18]:
# This sheet already has chrom/start/end columns, so it is saved without convert_to_bed
sort_and_save(
  df = lof_tolerant_enhancers,
  filename_root = "lof-tolerant-enhancers",
)

In [19]:
# This sheet already has chrom/start/end columns, so it is saved without convert_to_bed
sort_and_save(
  df = low_lof_tolerance_enhancers,
  filename_root = "low-lof-tolerance-enhancers",
)

In [20]:
# Parse the 'enhancer' ids into BED columns, then write disease-enhancers-and-khurana-scores.hg19.sorted.bed and its companion files
sort_and_save(
  df = convert_to_bed(disease_enhancers_and_khurana_scores),
  filename_root = "disease-enhancers-and-khurana-scores"
)

In [21]:
# Parse the 'enhancer' ids into BED columns, then write pacbio-deleted-enhancers-and-khurana-scores.hg19.sorted.bed and its companion files
sort_and_save(
  df = convert_to_bed(pacbio_deleted_enhancers_and_khurana_scores),
  filename_root = "pacbio-deleted-enhancers-and-khurana-scores"
)

In [22]:
# Parse the 'enhancer' ids into BED columns, then write all-enhancers-with-network-features.hg19.sorted.bed and its companion files
sort_and_save(
  df = convert_to_bed(all_enhancers_with_network_features),
  filename_root = "all-enhancers-with-network-features"
)