20221123

sarahfong

### intersect nullomers with phastCons

The phastCons100way hg38 .bed file was downloaded from the UCSC Table Browser.


1. count the number of overlapping nullomers. 
    Remove duplicate nullomers (i.e. where one position can have two nullomers)
2. calculate expectation and intersect w/ phastCons

### preprocessing
    /wynton/home/ahituv/fongsl/nullomers/bin/GENCODE/0_download_format_gencode.ipynb

    /wynton/home/ahituv/fongsl/nullomers/bin/GENCODE/0_format_mutation_file.ipynb

    /wynton/home/ahituv/fongsl/nullomers/bin/GENCODE/1_separate_coding_non-coding.ipynb

In [1]:
import glob
import os
import pybedtools as pbt
import subprocess
import sys

In [2]:
# append path
# the directory is added to sys.path before the two helper imports below,
# presumably so that they resolve — TODO confirm they live under py_/
sys.path.append("/wynton/home/ahituv/fongsl/tools/py_/")

# import config reader
import config_readwrite as crw
import count_lines as cl  # NOTE(review): cl is not used in the visible cells

# the config file path is built as <parent of current working directory>/config
config_name = os.path.join(os.path.dirname(os.getcwd()), "config")

# read_config returns two values; `config` is indexed like a nested mapping below
config, configname = crw.read_config(config_name)

In [3]:
# select config variables

# GENCODE annotation .bed (read by pybedtools when flattening) and the
# merged/flattened output written from it
GENCODE = config["GENCODE"]["BED"]
FLAT_GENCODE = config["GENCODE"]["MERGED"]  # write

# nullomer intersections
# these are passed as the test .bed to phastcons_intersection / exp
GENCODE_OVERLAP = config["DATA"]["GENCODE_OVERLAP"]  
GENCODE_NOOVERLAP_REF = config["DATA"]["GENCODE_NOOVERLAP_REF"]


# phastCons 100-way elements; passed as phastcons_bed to phastcons_intersection
PHASTCONS = config["PHASTCONS"]["100WAY"]

In [4]:
def phastcons_intersection(phastcons_bed, test_bed):
    """
    Intersect a test .bed file with phastCons elements and count the result.

    input
        phastcons_bed (str) - path to phastcons.bed file
        test_bed (str) - path to test.bed file

    method
        1. load both bed files as pybedtools BedTool objects
        2. intersect the test intervals with the phastCons intervals
        3. count (and print) the number of lines in the intersection

    return
        test_int (pybedtool object) - intersection of test_bed with phastcons_bed
        count (int) - number of lines in the intersection

    """

    # 1
    conserved = pbt.BedTool(phastcons_bed)
    query = pbt.BedTool(test_bed)

    # 2
    test_int = query.intersect(conserved)

    # 3 - tally the intersection one line at a time
    count = 0
    for _ in test_int:
        count += 1
    print(count)

    return test_int, count   # return pbt.object, count

In [5]:
def shuffle_cmd(bed_file, incl):
    """
    Build and print the shuf_wynton.py command line for shuffling a bed file.

    input
        bed_file (str) - path to the .bed file to shuffle
        incl (str or None) - value for the script's -incl option;
            when None the option is left off the command

    method
        1. set the fixed run parameters (script path, 30 iterations, hg38)
        2. put the output in a "shuffle" directory next to bed_file
        3. assemble the command, with or without -incl
        4. print the command (it is NOT executed here - the subprocess
           call is commented out)

    return
        None
    """

    # 1
    script = "/wynton/home/ahituv/fongsl/tools/genome/shuf_wynton.py"
    iters = 30
    genome_build = 'hg38'

    # 2
    outdir = os.path.join(os.path.dirname(bed_file), "shuffle")

    # 3
    if incl is None:
        cmd = f"python {script} -b {bed_file} -i {iters} -g {genome_build} -o {outdir}"
    else:
        cmd = f"python {script} -b {bed_file} -i {iters} -g {genome_build} -incl {incl} -o {outdir}"

    # 4
    print(cmd)
    #subprocess.call(cmd, shell=True)

# Main

## non-coding x phastcons

In [None]:
# non-coding x phastcons intersection
# phastcons_intersection returns (BedTool, count); unpack so that
# test_int_noncoding holds the BedTool rather than the whole tuple
test_int_noncoding, count_noncoding = phastcons_intersection(PHASTCONS, GENCODE_NOOVERLAP_REF)

In [None]:
# NOTE(review): getUniqueBedIntervals is not defined or imported in the visible
# part of this notebook — confirm where it comes from before re-running
getUniqueBedIntervals(test_int_noncoding)

439/9423 non-coding loci are conserved (5%)

In [None]:
# fraction of non-coding loci that are conserved (conserved / total)
439/9423 

## coding x phastcons

In [None]:
# coding x phastcons intersection
# phastcons_intersection returns (BedTool, count); unpack so that
# test_int_coding holds the BedTool rather than the whole tuple
test_int_coding, count_coding = phastcons_intersection(PHASTCONS, GENCODE_OVERLAP)

# NOTE(review): getUniqueBedIntervals is not defined or imported in the visible
# part of this notebook — confirm where it comes from before re-running
getUniqueBedIntervals(test_int_coding)

2891/19807 coding loci are conserved in phastCons (15%)

In [None]:
# fraction of coding loci that are conserved in phastCons (conserved / total)
2891/19807

# expectation

## coding
- shuffle in coding background (GENCODE)
- FIRST, need to flatten GENCODE coordinates. 

### flatten GENCODE coordinates
- We want to shuffle into a non-redundant background. This reduces bias from genes with many exons (and therefore many lines in the bed file).

In [None]:
# Flatten GENCODE only once: skip the merge when the output file is already there.
if os.path.exists(FLAT_GENCODE):
    print("you flattened this already")

else:
    gencode = pbt.BedTool(GENCODE)

    # merge all elements that overlap 1bp; the result is written to FLAT_GENCODE
    merged = gencode.merge(output=FLAT_GENCODE)


### shuffle coding nullomers into GENCODE background

# expectation 

In [7]:
def exp(bed, incl):
    """
    Expected phastCons overlap: count phastCons intersections for each
    shuffled copy of a nullomer bed file.

    input
        bed (str) - path to the observed .bed file (passed to shuffle_cmd)
        incl (str or None) - background passed to shuffle_cmd as -incl.
            It also selects which shuffled files are read:
            None -> the "no-overlap_ref" shuffles, otherwise the "overlap" shuffles

    method
        1. print the shuffle command for this bed file
        2. collect the matching shuffled .bed files
        3. intersect each shuffled file with PHASTCONS and keep its count

    return
        counts (list of int) - one intersection count per shuffled file
    """

    # 1
    shuffle_cmd(bed, incl)

    # 2
    if incl is None:
        shufs = glob.glob("/wynton/home/ahituv/fongsl/nullomers/data/shuffle/shuf-mutationsGENCODE_no-overlap_ref-*.bed")
    else:
        shufs = glob.glob("/wynton/home/ahituv/fongsl/nullomers/data/shuffle/shuf-mutationsGENCODE_overlap-*.bed")

    # 3 - keep the count, not the file path: the counts are the expectation.
    # The list is named `counts` so it does not shadow the function name.
    counts = []

    for shuf_bed in shufs:
        _, count = phastcons_intersection(PHASTCONS, shuf_bed)
        counts.append(count)

    return counts

In [10]:
# expectation for coding nullomers: FLAT_GENCODE is passed as incl, so the
# shuffle command gets -incl and the "overlap" shuffle files are intersected
exp_overlap = exp(GENCODE_OVERLAP, FLAT_GENCODE)

python /wynton/home/ahituv/fongsl/tools/genome/shuf_wynton.py -b /wynton/home/ahituv/fongsl/nullomers/data/mutationsGENCODE_overlap.bed -i 30 -g hg38 -incl /wynton/home/ahituv/fongsl/nullomers/data/Gencode/gencode.v42.basic.annotation_merged.bed -o /wynton/home/ahituv/fongsl/nullomers/data/shuffle
1304
1348
1302
1285
1261
1304
1286
1324
1255
1326
1273
1268
1252
1293
1257
1307
1331
1334
1322
1292
1312
1264
1276
1311
1356
1273
1248
1316
1330
1263
1322


In [13]:
# expectation for non-coding nullomers: incl is None, so the shuffle command
# has no -incl and the "no-overlap_ref" shuffle files are intersected
exp_noOverlap = exp(GENCODE_NOOVERLAP_REF, None)

python /wynton/home/ahituv/fongsl/tools/genome/shuf_wynton.py -b /wynton/home/ahituv/fongsl/nullomers/data/mutationsGENCODE_no-overlap_ref.bed -i 30 -g hg38 -o /wynton/home/ahituv/fongsl/nullomers/data/shuffle
349
333
352
334
366
348
323
372
333
334
360
346
362
341
334
311
356
366
320
359
343
349
324
358
332
311
338
349
339
328
349
