20221123

sarahfong

### intersect nullomers with phastCons

phastCons100way hg38 .bed file was downloaded from the UCSC genome table browser


1. count the number of overlapping nullomers. 
    Remove duplicate nullomers (i.e. where one position can have two nullomers)
2. calculate expectation and intersect w/ phastCons

### preprocessing
    /wynton/home/ahituv/fongsl/nullomers/bin/GENCODE/0_download_format_gencode.ipynb

    /wynton/home/ahituv/fongsl/nullomers/bin/GENCODE/0_format_mutation_file.ipynb

    /wynton/home/ahituv/fongsl/nullomers/bin/GENCODE/1_separate_coding_non-coding.ipynb

In [1]:
import glob
from joblib import Parallel, delayed
import os
import pybedtools as pbt
import subprocess
import sys

import numpy as np

import seaborn as sns

import datetime

In [2]:
# append path
# must run before the two imports below; presumably config_readwrite and
# count_lines live in this directory — TODO confirm
sys.path.append("/wynton/home/ahituv/fongsl/tools/py_/")

# import config reader
import config_readwrite as crw
import count_lines as cl  # NOTE(review): cl is not referenced in the visible cells

# config path = "config" inside the parent of the current working directory
config_name = os.path.join(os.path.dirname(os.getcwd()), "config")

# read_config returns two values: the config object indexed below and a second
# value stored as configname
config, configname = crw.read_config(config_name)

In [3]:
# select config variables
# ANNOT picks the config keys/sections used below (f"{ANNOT}_BED", f"DATAx{ANNOT}")
ANNOT = "EXON"
GENCODE = config["GENCODE"][f"{ANNOT}_BED"]  # bed file merged in the flatten step
FLAT_GENCODE = config[f"DATAx{ANNOT}"]["MERGED"]  # write

# nullomer intersections
OVERLAP = config[f"DATAx{ANNOT}"]["OVERLAP"]  # bed used for the coding x phastcons intersection
NOOVERLAP_REF = config[f"DATAx{ANNOT}"]["NOOVERLAP_REF"]  # bed used for the non-coding x phastcons intersection


PHASTCONS = config["PHASTCONS"]["100WAY"]  # phastcons bed passed to phastcons_intersection
EX_EXP = config["PHASTCONS"]["EXON_EXP"]  # write
NOEX_EXP = config["PHASTCONS"]["NOEXON_EXP"]  # write

In [4]:
def phastcons_intersection(phastcons_bed, test_bed):
    """
    count the test bed entries that overlap at least one phastcons element
    
    input
        phastcons_bed (str) - path to phastcons.bed file
        test_bed (str) - path to test.bed file
        
    method
        1. turn bed files into pybedtool objects
        2. intersect test bed w/ phastcons, reporting each test entry at most once
        3. count lines in the intersection and print the count
        
    return
        count (int) - n test bed entries overlapping >= 1 phastcons element
        
    """
    
    #1
    phast = pbt.BedTool(phastcons_bed)
    test = pbt.BedTool(test_bed)
    
    #2
    # u=True (bedtools -u) writes each test entry once if it has any overlap.
    # The default writes one line per overlapping pair, so an entry touching
    # two phastcons elements would be counted twice and the count could not be
    # read as "n loci conserved" out of the lines in test_bed.
    test_int = test.intersect(phast, u=True)
    
    #3
    count = sum(1 for _ in test_int)
    print(count)
    
    return count

In [5]:
def shuffle_cmd(bed_file, incl):
    """
    print the shuf_wynton.py command for shuffling a bed file
    
    input
        bed_file (str) - path to the .bed file to shuffle
        incl (str or None) - path handed to the script as -incl; 
            None leaves the -incl option out of the command
    
    intended use (per the original notes)
        coding - shuffle inside the gencode exon background (incl given)
        non-coding - shuffle in the rest of the genome (incl is None)
        
    return
        None. The command is only printed, it is not executed here. 
    
    """
    
    shuffle_script = "/wynton/home/ahituv/fongsl/tools/genome/shuf_wynton.py"
    n_iters = 30
    build = 'hg38'

    # output goes to a "shuffle" directory next to the input bed file
    shuffle_dir = os.path.join(os.path.dirname(bed_file), "shuffle")
    
    # optional background restriction, empty when no incl file is given
    incl_opt = "" if incl is None else f" -incl {incl}"
    
    print(f"python {shuffle_script} -b {bed_file} -i {n_iters} -g {build}{incl_opt} -o {shuffle_dir}")

In [9]:
def parallel_intersections(shuffle_list, phastcons):
    """
    run phastcons_intersection on every shuffled bed file w/ joblib threads
    
    input
        shuffle_list (list of str) - paths to shuffled .bed files
        phastcons (str) - path to phastcons.bed file
        
    return
        list of phastcons_intersection results, one per shuffled file
        
    """

    n_workers = 16  # fixed number of joblib workers
    print("number of cores", n_workers)

    # one delayed phastcons_intersection call per shuffled bed file
    jobs = (
        delayed(phastcons_intersection)(phastcons, shuf_bed)
        for shuf_bed in shuffle_list
    )

    runner = Parallel(n_jobs=n_workers, verbose=100, prefer="threads")
    
    return runner(jobs)

In [10]:
def exp(bed, incl, annot, phastcons):
    """
    intersect the existing shuffle files w/ phastcons to get expected counts
    
    input
        bed (str) - not used in the body; kept so existing calls still work
        incl (str or None) - None selects the "no-overlap" shuffle files, 
            anything else selects the "overlap" shuffle files
        annot (str) - annotation label inside the shuffle file names (e.g. "EXON")
        phastcons (str) - path to phastcons.bed file
        
    method
        1. glob the shuffle files for this annotation + overlap class
        2. sort the paths. glob returns them in arbitrary order, and the 
            result list is written to file / sliced by position downstream
        3. intersect every shuffle file w/ phastcons in parallel
        
    return
        list of phastcons_intersection results, one per shuffle file
        
    """

    shuf_path = "/wynton/home/ahituv/fongsl/nullomers/data/shuffle/"
    
    #1
    overlap_tag = "no-overlap" if incl is None else "overlap"
    pattern = os.path.join(
        shuf_path, f"shuf-mutations_uniq_lociGENCODE-{annot}_{overlap_tag}-*.bed"
    )
    
    #2
    shufs = sorted(glob.glob(pattern))
    print( "n shuffles to intersect", len(shufs))
        
    #3 parallel process here.
    return parallel_intersections(shufs, phastcons)

In [11]:
def write_expectation(outfile, exp_list):
    """
    write expected values to a file, one value per line
    
    input
        outfile (str) - path to write; an existing file is overwritten
        exp_list (iterable) - values to write, each formatted w/ an f-string
        
    return
        None
        
    """

    # the with-block closes the file; no explicit close() is needed
    with open(outfile, "w") as results:
        results.writelines(f"{i}\n" for i in exp_list)

# Main

## non-coding x phastcons

In [6]:
# non-coding x phastcons intersection
# holds the count returned by phastcons_intersection (not a pybedtools object)
test_int_noncoding = phastcons_intersection(PHASTCONS, NOOVERLAP_REF)

# total n lines in the non-coding bed; the with-block closes the file handle
with open(NOOVERLAP_REF, "r") as noncoding_bed:
    print(sum(1 for _ in noncoding_bed))

1178
24546


How many NON-EXON nullomer loci overlap a phastCons element?
    
    1178/24546 non-coding loci are conserved (4.8%) (ANY GENCODE EXON)
    
    OLD - 439/9423 non-coding loci are conserved (5%) (ANY GENCODE)

## coding x phastcons

In [7]:
# coding x phastcons intersection
# holds the count returned by phastcons_intersection (not a pybedtools object)
test_int_coding = phastcons_intersection(PHASTCONS, OVERLAP)

# total n lines in the coding bed; the with-block closes the file handle
with open(OVERLAP, "r") as coding_bed:
    print(sum(1 for _ in coding_bed))

2150
4593


How many EXON nullomer loci overlap a phastCons element?

    2150/4593 exon loci are conserved in phastCons (46.8%) (ANY GENCODE EXON)
    
    OLD - 2891/19807 coding loci are conserved in phastCons (15%) (ANY GENCODE)

# empirical expectation

## coding
- shuffle in coding background (GENCODE)
- FIRST, need to flatten GENCODE coordinates. 

### flatten GENCODE coordinates
- want to shuffle into non-redundant background. Reduces bias from genes with many exons (therefore many lines in the bed file)

In [8]:
# only build the flattened GENCODE background once
if not os.path.exists(FLAT_GENCODE):
    gencode = pbt.BedTool(GENCODE)

    # merge overlapping elements into single non-redundant intervals.
    # bedtools merge expects input sorted by chromosome, then start, 
    # so sort before merging.
    merged = gencode.sort().merge(output=FLAT_GENCODE)
    
else:
    print("you flattened this already")


you flattened this already


# expectation 

## exonic

### shuffle coding nullomers into GENCODE background

In [12]:
# print the shuffle command for the exon loci (shuffle_cmd prints it, does not run it)
shuffle_cmd(OVERLAP, FLAT_GENCODE)
# intersect the "overlap" shuffle files w/ phastcons -> list of expected counts
exp_Overlap = exp(OVERLAP, FLAT_GENCODE, ANNOT, PHASTCONS)
# write the expected counts to EX_EXP, one per line
write_expectation(EX_EXP, exp_Overlap)

python /wynton/home/ahituv/fongsl/tools/genome/shuf_wynton.py -b /wynton/home/ahituv/fongsl/nullomers/data/mutations_uniq_lociGENCODE-EXON_overlap.bed -i 30 -g hg38 -incl /wynton/home/ahituv/fongsl/nullomers/data/Gencode/gencode.v42.basic.annotation-exons_merged.bed -o /wynton/home/ahituv/fongsl/nullomers/data/shuffle
n shuffles to intersect 101
number of cores 16
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
1469
[Parallel(n_jobs=16)]: Done   1 tasks      | elapsed:   22.3s
1499
[Parallel(n_jobs=16)]: Done   2 tasks      | elapsed:   22.4s
1548
[Parallel(n_jobs=16)]: Done   3 tasks      | elapsed:   22.5s
1488
[Parallel(n_jobs=16)]: Done   4 tasks      | elapsed:   22.7s
1473
[Parallel(n_jobs=16)]: Done   5 tasks      | elapsed:   22.7s
1505
[Parallel(n_jobs=16)]: Done   6 tasks      | elapsed:   22.9s
1518
[Parallel(n_jobs=16)]: Done   7 tasks      | elapsed:   23.0s
1495
[Parallel(n_jobs=16)]: Done   8 tasks      | elapsed:   23.1s
1490
[Parallel(

In [46]:
# observed n exon loci overlapping phastcons. Reuse the count computed from
# OVERLAP instead of a hand-copied number, so it cannot go stale.
obs = test_int_coding
# NOTE(review): [:-1] drops the last expected count, presumably because 101
# shuffle files matched the glob instead of 100 — confirm which file is extra
results = calculateEmpiricalP(obs, exp_Overlap[:-1])

results

([2150,
  1502.0,
  34.21820568060225,
  1.4311377245508983,
  0.009900990099009901,
  '2022-11-29 22:44:46.539446'],
 [1.3832797427652732,
  1.4142011834319526,
  1.434,
  1.3886378308586185,
  1.4445936870382807,
  1.4885813148788927,
  1.4282868525896415,
  1.4885813148788927,
  1.451417004048583,
  1.4330446369087275,
  1.463265306122449,
  1.4592944369063772,
  1.4378342245989304,
  1.442655935613682,
  1.4160631994733377,
  1.4642614023144997,
  1.4132720105124836,
  1.4179301252471985,
  1.4814049586776858,
  1.442655935613682,
  1.3886378308586185,
  1.494788047255038,
  1.4216787838730998,
  1.4763212079615649,
  1.394941634241245,
  1.5041958041958041,
  1.3477443609022557,
  1.4311377245508983,
  1.4484848484848485,
  1.4226190476190477,
  1.4712722298221614,
  1.442655935613682,
  1.463265306122449,
  1.4927133934767522,
  1.4282868525896415,
  1.4407233757535165,
  1.3985695708712613,
  1.4397590361445782,
  1.4523970290344361,
  1.41699604743083,
  1.4553450608930987,
  1

## non-exonic

In [14]:
# print the shuffle command for the non-exon loci, no -incl background 
# (shuffle_cmd prints it, does not run it)
shuffle_cmd(NOOVERLAP_REF, None)
# intersect the "no-overlap" shuffle files w/ phastcons -> list of expected counts
exp_noOverlap = exp(NOOVERLAP_REF, None, ANNOT, PHASTCONS)
# write the expected counts to NOEX_EXP, one per line
write_expectation(NOEX_EXP, exp_noOverlap)

python /wynton/home/ahituv/fongsl/tools/genome/shuf_wynton.py -b /wynton/home/ahituv/fongsl/nullomers/data/mutations_uniq_lociGENCODE-EXON_no-overlap.bed -i 30 -g hg38 -o /wynton/home/ahituv/fongsl/nullomers/data/shuffle
n shuffles to intersect 100
number of cores 16
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
1073
[Parallel(n_jobs=16)]: Done   1 tasks      | elapsed:   23.1s
1054
[Parallel(n_jobs=16)]: Done   2 tasks      | elapsed:   23.2s
989
[Parallel(n_jobs=16)]: Done   3 tasks      | elapsed:   23.2s
990
[Parallel(n_jobs=16)]: Done   4 tasks      | elapsed:   23.3s
989
[Parallel(n_jobs=16)]: Done   5 tasks      | elapsed:   23.3s
1054
[Parallel(n_jobs=16)]: Done   6 tasks      | elapsed:   23.3s
1023
[Parallel(n_jobs=16)]: Done   7 tasks      | elapsed:   23.4s
1027
[Parallel(n_jobs=16)]: Done   8 tasks      | elapsed:   23.4s
1036
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:   23.5s
959
[Parallel(n_jobs=16)]: Done  10 tasks      | e

In [37]:
def calculateEmpiricalP(obs, exp_sum_list):
    """
    compare an observed count to a list of expected (shuffled) counts
    
    input
        obs (int) - observed overlap count
        exp_sum_list (list of ints) - expected overlap counts
        
    method
        1. median and standard deviation of the expected counts
        2. distance of the observed count from the expected median
        3. count expected values at least that far from the median. 
            Absolute distances are used, so both tails are counted. 
        4. fold change of observed over median expected, pseudo count of 1
        5. fold change of observed over each expected value, pseudo count of 1
        6. p-value = (n as-or-more extreme + 1) / (n expected + 1)
    
    return
        (1) info - list w/ 
                n_obs, 
                median_exp, 
                std, 
                fold-change (from the median of expected), 
                p_val, 
                timestamp (str) of when the result was computed
                
        (2) fold_changes - list of obs/expected fold changes, one per expected 
            value (to calculate confidence interval)
    
    """
    #1
    median_exp = np.median(exp_sum_list)
    std_exp = np.std(exp_sum_list)
    
    #2
    obs_dist = abs(obs - median_exp)
    
    #3
    n_extreme = 0
    for exp_count in exp_sum_list:
        if abs(exp_count - median_exp) >= obs_dist:
            n_extreme += 1
    
    #4
    fold_change = (obs + 1.0) / (median_exp + 1.0)
    
    #5
    fold_changes = [(obs + 1.0) / (exp_count + 1.0) for exp_count in exp_sum_list]
    
    #6
    p_val = (n_extreme + 1.0) / (len(exp_sum_list) + 1.0)
    
    timestamp = str(datetime.datetime.now())
    info = [obs, median_exp, std_exp, fold_change, p_val, timestamp]
    
    return info, fold_changes


In [48]:
# observed n non-exon loci overlapping phastcons. Reuse the count computed
# from NOOVERLAP_REF instead of a hand-copied number, so it cannot go stale.
obs = test_int_noncoding
results = calculateEmpiricalP(obs, exp_noOverlap)

results

([1178,
  1023.5,
  32.152971868864626,
  1.150805270863836,
  0.009900990099009901,
  '2022-11-29 22:51:36.334687'],
 [1.0977653631284916,
  1.1513671875,
  1.190909090909091,
  1.190909090909091,
  1.1446601941747572,
  1.1446601941747572,
  1.1468871595330739,
  1.1502439024390243,
  1.228125,
  1.0967441860465117,
  1.1175355450236968,
  1.211716341212744,
  1.1175355450236968,
  1.1993896236012207,
  1.1369334619093538,
  1.18970736629667,
  1.1638696939782824,
  1.18970736629667,
  1.1282296650717702,
  1.1837349397590362,
  1.1684836471754212,
  1.1993896236012207,
  1.1239275500476644,
  1.1981707317073171,
  1.114366729678639,
  1.1921132457027301,
  1.1413359148112294,
  1.114366729678639,
  1.1604330708661417,
  1.1638696939782824,
  1.21671826625387,
  1.1604330708661417,
  1.2006109979633401,
  1.1358381502890174,
  1.18970736629667,
  1.1627218934911243,
  1.2230290456431536,
  1.1570166830225712,
  1.2384453781512605,
  1.1260744985673352,
  1.1336538461538461,
  1.13913