## Probability of LoF tolerance for a set of enhancers 

In [1]:
import os

# Root locations used by the cells below: CONSTRAINT_TOOLS is the code checkout (its
# `utilities` directory is added to sys.path) and CONSTRAINT_TOOLS_DATA is the shared data directory.
# Each value can be overridden by exporting an environment variable of the same name before
# starting the kernel; when unset, the original cluster paths are used unchanged.
CONSTRAINT_TOOLS = os.environ.get(
  'CONSTRAINT_TOOLS',
  '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools',
)
CONSTRAINT_TOOLS_DATA = os.environ.get(
  'CONSTRAINT_TOOLS_DATA',
  '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools',
)

In [2]:
# Khurana paper: 
# https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1008663

import pandas as pd

# Load the whole workbook: sheet_name=None makes pandas return a dict
# that maps each sheet name to its own DataFrame
khurana_table_S2 = pd.read_excel(
  io=f'{CONSTRAINT_TOOLS_DATA}/khurana/table-S2.xlsx',
  sheet_name=None,
)

In [3]:
# Show which sheets were loaded (the dict is keyed by sheet name)
khurana_table_S2.keys()

dict_keys(['SupplementaryTable3', 'LoF-tolerant enhancers', 'Low-LoF-tolerance enhancers', 'Feature importance'])

In [4]:
# Pick the sheet that lists every enhancer together with its predicted class
# and its probability of LoF tolerance, then display it
enhancers_and_khurana_scores = khurana_table_S2['SupplementaryTable3']
enhancers_and_khurana_scores

Unnamed: 0,enhancer,original,predict LoF-tolerant,prob LoF-tolerance
0,chr13-101408800-101409200,NotDeleted,Low-LoF-tolerance,0.248020
1,chr18-45880000-45880200,NotDeleted,LoF-tolerant,0.646635
2,chr5-174837800-174840000,NotDeleted,LoF-tolerant,0.881760
3,chr9-115430200-115432000,NotDeleted,LoF-tolerant,0.903623
4,chr11-134077600-134078200,NotDeleted,LoF-tolerant,0.729862
...,...,...,...,...
245088,chr22-51047400-51049200,NotDeleted,LoF-tolerant,0.592781
245089,chr5-55940400-55941400,NotDeleted,LoF-tolerant,0.647929
245090,chr10-61223600-61224000,NotDeleted,LoF-tolerant,0.516712
245091,chr14-63648400-63648600,NotDeleted,LoF-tolerant,0.665555


## Khurana et al used hg19, not hg38

Khurana et al. obtained enhancers from Cao et al. [41], who used [human reference genome hg19](https://www.nature.com/articles/ng.3950#Sec9),
and obtained deletions from 1000 Genomes Phase 3 [43], which reported its results [relative to both hg19 and hg38](https://www.nature.com/articles/nature15394#accession-codes). 

To increase confidence that Khurana et al. reported all coordinates relative to hg19, 
I noted that a sample of their low-LoF-tolerance enhancers had coordinates that were more similar to the hg19 coordinates than to the hg38 coordinates 
that I had previously obtained for VISTA enhancers (one example is shown below). 


In [5]:
import sys
# Make the project's helper modules (e.g. pack_unpack) importable.
# Guarded so that re-running this cell does not keep prepending duplicate entries to sys.path.
utilities_path = f'{CONSTRAINT_TOOLS}/utilities'
if utilities_path not in sys.path:
  sys.path.insert(0, utilities_path)

In [6]:
from pack_unpack import pack

# Render the first low-LoF-tolerance enhancer as a single 'chrom:start-end' string;
# star-unpacking iterates the row's values directly (presumably chromosome, start, end — verify column order)
pack(*khurana_table_S2['Low-LoF-tolerance enhancers'].iloc[0])

'chr1:62053400-62055600'

In [7]:
# Row 59, column 3 of the headerless VISTA table: presumably the hg19 region of the VISTA
# enhancer that matches the Khurana enhancer printed above — TODO confirm the column layout
(
  pd.read_csv(
    f'{CONSTRAINT_TOOLS_DATA}/vista-enhancers/vista-enhancers.hg38.hg19.tsv',
    sep='\t',
    header=None,
  )
  .iloc[59, 3]
)

'chr1:62053434-62055908'

## Convert enhancer data to BED format (for later liftover to hg38)


In [None]:
# TODO 

def convert_to_bed(): 
  """Read pipe-delimited enhancer records from stdin and print BED lines to stdout.

  Each non-blank input line is split on '|'. Field 1 is taken as the region string
  and field 3 as the enhancer class. Records whose class is 'negative' are skipped;
  for all others the region is unpacked into (chromosome, start, end) and printed
  as one tab-separated line.

  NOTE(review): the Khurana sheet displayed earlier in this notebook is a DataFrame
  whose 'enhancer' values look like 'chr13-101408800-101409200', not pipe-delimited
  text, so this parser presumably still has to be adapted to that input (see the TODO).
  """
  # Only `pack` is imported elsewhere in this notebook, so the original body raised
  # NameError on `unpack`. Import it here so the function works on its own.
  # Assumes pack_unpack exports `unpack` alongside `pack` — TODO confirm.
  from pack_unpack import unpack

  for line in sys.stdin: 
    if not line.strip():
      # Blank lines (e.g. a trailing newline) have no fields; skip instead of raising IndexError
      continue
    fields = line.split('|')
    region = fields[1].strip()
    enhancer_class = fields[3].strip()
    if enhancer_class == 'negative':
      continue
    chromosome, start, end = unpack(region)
    print(f'{chromosome}\t{start}\t{end}')


In [None]:
# TODO 
# sort by chromosome and start position