In [None]:
#!/usr/bin/env python
# coding: utf-8

# 20221129
# 
# sarahfong
# 
# ### intersect nullomers, empirical shuffle with phylop 100way bigWig
# 
# split by exonic/non-exonic
# 
# 
# use bigWigSummary executable from UCSC to get phylop 
# 
# 
# compare nullomers v. empirical background

# In[ ]:
—

In [1]:
import glob
from joblib import Parallel, delayed
import os
import pybedtools as pbt
import subprocess
import sys


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import statsmodels as sm

# append path
sys.path.append("/wynton/home/ahituv/fongsl/tools/py_/")

# import config reader
import config_readwrite as crw
import zippery



config_tag = "config-exon.ini"

# append path
sys.path.append("/wynton/home/ahituv/fongsl/tools/py_/")

# import config reader
import config_readwrite as crw
import count_lines as cl
import plot_params as pp

config_name = os.path.join(os.getcwd(), config_tag)

config, configname = crw.read_config(config_name)


# Analysis

In [2]:
DATASET = "PHYLOP"

DATA_PATH = config[DATASET]["PATH"]
RE_PATH = config["RESULTS"]["PATH"] 

ANNOT = config["GENCODE"]["ANNOT"] 

RE = os.path.join(RE_PATH, DATASET)

if os.path.exists(RE) is False:
    os.mkdir(RE)
    config["RESULTS"][DATASET] = RE 

## functions

In [3]:
def plot_lines(df, out_pdf,datatype, xrange):
    fig, ax = plt.subplots(figsize=(6,6))
    sns.lineplot(df, x="pos", y= '50%', hue = "label" )
    sns.lineplot(df, x="pos", y= '25%', hue = "label")
    sns.lineplot(df, x="pos", y= '75%', hue = "label")

    ax.axvline(499, color="grey", ls = "--") # center line
    
    ax.set(xlim = (xrange[0], xrange[1]),
           ylabel = "phylop score - 100way",
            title = f"{datatype}:{xrange[0]}-{xrange[1]}")
    
    ax.legend(loc="upper right")
    
    plt.savefig(out_pdf, bbox_inches="tight")
    plt.show()

    
def plot_line(y, df, out_pdf,datatype, xrange):
    fig, ax = plt.subplots(figsize=(6,6))
    sns.lineplot(df, x="pos", y=y, hue="label")
    
    ax.axvline(499, color="grey", ls = "--") # center line
    
    ax.set(xlim = (xrange[0], xrange[1]),
           ylabel = f"phylop score - 100way-{y}",
            title = f"{datatype}:{xrange[0]}-{xrange[1]}")
    
    ax.legend(loc="upper right")
    
    plt.savefig(out_pdf, bbox_inches="tight")
    plt.show()
    
    
def calculateEmpiricalP(obs, exp_sum_list):
    
    import datetime
    """
    return two lists
        (1) info - vector w/  
                n_obs, 
                median_exp, 
                std, 
                fold-change  # calculated from the median of expected shuffle 
                p_val
                
        (2) fold_changes- vector expected fold changes (to calculate confidence interval)
        
    input
        observed overlap count (int)
        list of expected overlap counts (list of ints)
    
    method
        1. get median of expected overlap counts
        2. get standard deviation of expected overlap counts
        3. center expected overlap counts at median
        4. Sum the number of centered expected counts greater than observed centered count
            This is two tailed because it evaluates both sides of the distribution (w/ abs value). 
        5. calculate fold change as observed/ median expected w/ pseudo count
        6. calculate fold change of each "obs"/ expected w/ pseudo count
        7. calculate the p-value as count of equal or more extreme values than observed value
        8. return list of empirical info + fold changes
        
        
    
    """
    #1
    mu = np.median(exp_sum_list)  # median of exp.dist
    
    #2
    sigma = np.std(exp_sum_list)  # std
    
    #3
    dist_from_mu = [exp - mu for exp in exp_sum_list] # center the distribution 
    
    #4
    p_sum = sum(1 for exp_dist in dist_from_mu if abs(exp_dist) >= abs(obs - mu)) # count values >= centered obs

    #5
    fold_change = (obs + 1.0) / (mu + 1.0) # fold change obs from median expected w pseudo count
    
    #6
    fold_changes = list((obs + 1.0) / (m + 1.0) for m in exp_sum_list) # fold change obs from /each exp w pseudo count
    
    #7
    p_val = (p_sum + 1.0) / (len(exp_sum_list) + 1.0)  # probability of observing obs-like value equal or more extreme in expected distribution
    
    #8
    info = [
            obs, 
            mu, 
            sigma, 
            fold_change, 
            p_val, 
            str(datetime.datetime.now())
            ]
    
    return info, fold_changes

def exp(nullo, shuf, summary_stat_name, pos):
    """
    return PER POSITION foldchange, empirical P between observed and expected (shuffled) data
    
    input
        nullo (df) - pandas dataframe of nullomer summary stats in long form (each row is one base in dist)
        shuf (df) - pandas dataframe of N shuffled summary stats in long form (each row is one base in dist)
        summary_stat_name (str) - summary_stat measurement name (this should be a column in the dataframes)
        pos (int) - relative position of the base to estimate fold change between. 
        
    method
        1. prepare position phylop vectors for obs, exp
        2. calculate empirical P
    
    
    """
    obs = nullo.loc[nullo["pos"]== pos, summary_stat_name].iloc[0]
    exp_list =  shuf.loc[shuf["pos"]== pos, summary_stat_name].to_list()
    
    info, fold_changes = calculateEmpiricalP(obs, exp_list)
    
    print(info)
    
    return info, fold_changes

In [4]:
def formatDf(datatype, query):
    """
    glob all query files for datatype and turn these data into a single dataframe
    split dataframe based on nullomer v shuffle
    
    input 
        datatype (str) - label for dataset (e.g. exon-overlpa, no exon-overlap)
        query (str) - fragment of str to glob files on. 
        
    method
        1. glob all files
        2. make a list to collect pandas dataframes
        3. assign LABEL - NULLOMER/SHUF  for each dataset. 
        4. assign ID - str of the file name
        5. open the file as a pandas dataframe, add label, id columns, append to collection list
        6. concatenate all dataframes together
        7. split dataframes on NULLOMER/SHUF label
        
    return 
        df (pd dataframe) - all data
        nullo (pd dataframe) - data for just nullomers
        shuf (pd dataframe) - data for just matched shuffles
        
    """
    #1
    file_list = glob.glob(os.path.join(DATA_PATH, query))
    print(len(file_list))
    
    #2
    df_list = []
    
    #3
    for f in file_list:
    
        # assign static label
        if "shuf" in f:
            LABEL = f"SHUF-{datatype}"
        else:
            LABEL = datatype
        #4
        ID = f.split("/")[-1]
        
        #5
        df = pd.read_csv(f, sep='\t')
        
        df["label"], df["id"] = LABEL, ID

        df_list.append(df)    
    #6
    df = pd.concat(df_list)
    
    #7
    nullo = df.loc[df["label"] == datatype]
    shuf = df.loc[df["label"] != datatype]

    return df, nullo, shuf

In [5]:
def center_stats(nullo, shuf):
    """
    print obs v. exp at center position stats w/ median values
    """
    
    # MIDPOINT - median summary stats 
    i, f = exp(nullo, shuf, "50%", 500)

    # MIDPOINT-100 - median summary stats 
    i, f = exp(nullo, shuf, "50%", 400)

    # MIDPOINT+100 - median summary stats 
    i, f = exp(nullo, shuf, "50%", 600)

In [6]:
SETS = [('EXON-OVERLAP', "*-exon_overlap*SUMMARY_STATS.txt"), 
       ("EXON-NO-OVERLAP", "*-exon_no-overlap*SUMMARY_STATS.txt")]

lines = []  # for collecting stats
for DATATYPE, QUERY in SETS:
    print(DATATYPE)
    df, nullo, shuf = formatDf(DATATYPE, QUERY)
    break
    ## plot each percentile + xranges 

    ys = ["25%", "50%", "75%"]
    ranges = [("1kb", [0,1000]), ("0.2kb", [400, 600]),  ("0.1kb", [450,550])]
    
    for y in ys:
        for name, xrange in ranges:
            out = os.path.join(RE, f"phylop-{name}_{ANNOT}_{DATATYPE}-{y}.pdf")
            plot_line(y, df, out, DATATYPE, xrange)

    ## plot counts of phylop values

    fig, ax = plt.subplots(figsize=(6,6))
    sns.lineplot(df, x="pos", y= 'count', hue = "label")
    ax.set(title = DATATYPE)
    out = os.path.join(RE, f"{DATATYPE}-count.pdf")
    plt.savefig(out)

    ## plot everything together

    for name, xrange in ranges:
        out = os.path.join(RE, f"phylop-{name}_{ANNOT}_{DATATYPE}.pdf")

        plot_lines(df, out, DATATYPE, xrange)
        
    ### empirical P for median phylop
    # MIDPOINT - median summary stats 

    center_stats(nullo, shuf)
    for pos in np.arange(0,1000):
        """
        collect obs v exp stats
        """
        i, f = exp(nullo, shuf, "50%", pos)

        i.extend([pos,f"{ANNOT}-{DATATYPE}", "50%\n"])
        i = [str(item) for item in i]
        line = "\t".join(i)
        lines.append(line)


EXON-OVERLAP
501


# write all the stats to a file

In [33]:
 out_stat=os.path.join(RE, f"{DATASET}_{ANNOT}_empirical-stats.txt")

with open(out_stat, "w") as writer:
    for line in lines:
        writer.write(line)
    writer.close()