# objective 
20230816
sarahfong

learn the features of the US MPRA dataset

do some basic transformations and comparisons 

## transformations
- per replicate 
    
    - log2 normalize activity values
    - explore standard scaling log2 values (rep ctrl 2 has wider variance than other replicates) 

- across replicates
    - compute median, mean, sd of log2 values, standard scaling values 

## questions
- what is the variance between replicates? 
- How well do replicates correlate with one another?
    - pearson between replicates? 
    - spearman between replicates? 
- what is the correlation across replicate means, medians within group?

- is there a difference between log2 ratio and standard scaling activity? 

In [1]:
import config_readwrite as crw

import matplotlib.pyplot as plt
import numpy as np
import os, sys

import pandas as pd

import seaborn as sns
from scipy import stats
from statsmodels.stats.multitest import fdrcorrection, multipletests
from sklearn.preprocessing import RobustScaler, StandardScaler

from plot_params import fonts

# Read, write to config

In [2]:
# locate and load the project config, which sits one directory above the cwd
config_file = os.path.join(os.path.dirname(os.getcwd()), "config.ini")
config, cfn = crw.read(config_file)

# root directory of the data files
DATA_PATH = config["local_path"]["data"]

# config-key suffix -> file-name suffix; the same four files exist per cell line
file_suffixes = {
    "": ".csv",
    ".clean": ".clean.csv",
    ".clean.transformed": ".clean.transformed.csv",
    ".clean.trans.scaled": ".clean.transformed.standard.scaled.csv",
}

# key -> path dictionary to write to config (HEPG2 entries, BJ entries, FASTA)
config_dict = {}
for cell_line in ("HEPG2", "BJ"):
    stem = f"full_{cell_line.lower()}_ultrasound_MPRA"
    for key_suffix, file_suffix in file_suffixes.items():
        config_dict[cell_line + key_suffix] = os.path.join(DATA_PATH, stem + file_suffix)
config_dict["FASTA"] = os.path.join(DATA_PATH, "ultrasound_final_no_adapter.fasta")

# make sure the "data" section is present in the config
section = "data"
crw.check(config, section)

# copy every path into that section
for key, value in config_dict.items():
    config[section][key] = value

# persist the updated config
crw.write(config, cfn)

# functions

In [3]:
def cohensd(test, ctrl):
    """Return Cohen's d effect size of `test` relative to `ctrl`.

    The difference of the two means is divided by the square root of the
    average of the two squared standard deviations (np.std with its default
    arguments).
    """
    mean_diff = np.mean(test) - np.mean(ctrl)
    avg_sq_sd = (np.std(test) ** 2 + np.std(ctrl) ** 2) / 2
    return mean_diff / np.sqrt(avg_sq_sd)

## mean, median, std

In [4]:
def computeStats(df, l2_ratios_list):
    """Add per-row median, mean and std of the ctrl and us replicate columns.

    The first half of `l2_ratios_list` is treated as the control columns and
    the second half as the ultrasound columns. Six columns are added to `df`
    in place (`l2.ratio.{med,mean,std}.{ctrl,us}`) and `df` is returned.
    """
    n_ctrl = len(l2_ratios_list) // 2
    ctrl_cols, us_cols = l2_ratios_list[:n_ctrl], l2_ratios_list[n_ctrl:]

    print("ctrl cols:", ctrl_cols)
    print("us cols:", us_cols)

    for group, cols in (("ctrl", ctrl_cols), ("us", us_cols)):
        replicates = df[cols]
        df[f"l2.ratio.med.{group}"] = replicates.median(axis=1)
        df[f"l2.ratio.mean.{group}"] = replicates.mean(axis=1)
        df[f"l2.ratio.std.{group}"] = replicates.std(axis=1)

    return df

## l2 transform (RNA/DNA) 

In [5]:
def log2Transform(df, ratios_list):
    """Add an `l2.<column>` column holding np.log2 of each listed column.

    `df` is modified in place and returned.
    """
    for col in ratios_list:
        log_col = f"l2.{col}"
        df[log_col] = np.log2(df[col])

    return df

## get coordinates

In [6]:
def getCoordinates(df):
    """Add a `coor` column with the genomic coordinates parsed from `name`.

    For every row whose `name` contains "chr", `coor` is "chr" plus the text
    that follows the first "chr" in the name (up to the next "chr", if any).
    Rows without "chr" in the name, or with a missing name, get NaN.

    The coordinates are computed row by row. The earlier implementation
    merged an annotated subset back onto `df` using every shared column as
    key, which multiplied rows that were exact duplicates of one another.

    Parameters
    ----------
    df : pandas.DataFrame
        must contain a string column `name`. It is not modified.

    Returns
    -------
    pandas.DataFrame
        copy of `df` with a fresh RangeIndex (as a merge would return) and
        the additional `coor` column; row order and row count match `df`.
    """
    annotated = df.reset_index(drop=True)
    names = annotated["name"]

    # literal substring test; missing names count as "no coordinates"
    has_chr = names.str.contains("chr", regex=False, na=False)

    # blank out names without coordinates, then parse only the remaining ones
    annotated["coor"] = names.where(has_chr).map(
        lambda x: "chr" + x.split("chr")[1], na_action="ignore")

    return annotated

## compute fold change of medians (delta of log2 values)

In [7]:
def computeDelta(df):
    """Add `delta.med` and `delta.mean`: ultrasound minus control log2 values.

    Reads the `l2.ratio.{med,mean}.{us,ctrl}` columns, writes the two delta
    columns into `df` in place and returns `df`.
    """
    for stat in ("med", "mean"):
        us_col, ctrl_col = f"l2.ratio.{stat}.us", f"l2.ratio.{stat}.ctrl"
        df[f"delta.{stat}"] = df[us_col] - df[ctrl_col]

    return df

In [8]:
def clOrigin(df, constants_list):
    """Annotate the cell line whose identifier appears in each sequence name.

    `cl.origin` starts as None and is set to every entry of `constants_list`
    found in `name`, in list order, so a later entry overwrites an earlier
    match. `df` is modified in place and returned.
    """
    df["cl.origin"] = None
    names = df["name"]

    for cell_line in constants_list:
        is_match = names.str.contains(cell_line)
        df.loc[is_match, "cl.origin"] = cell_line

    return df

In [9]:
def ctrlAnnot(df, constants_list):
    """Annotate the sequence type found in each row's `label`.

    `type` starts as the string "None" and is set to every entry of
    `constants_list` contained in `label`, in list order, so a later entry
    overwrites an earlier match. `df` is modified in place and returned.
    """
    df["type"] = "None"
    labels = df["label"]

    for seq_type in constants_list:
        is_match = labels.str.contains(seq_type)
        df.loc[is_match, "type"] = seq_type

    return df

## significance testing

In [10]:
def computeTTestPval(df, constants_list):
    """Per-sequence t-test of control vs. ultrasound replicates.

    The first half of `constants_list` names the control replicate columns,
    the second half the ultrasound replicate columns. Each row is tested
    with an independent two-sample t-test that does not assume equal
    variances (equal_var=False). All rows are tested in a single vectorized
    call along axis 1 rather than one call per row.

    No multiple-testing correction is done here; see `fdrcorr` for that.

    Parameters
    ----------
    df : pandas.DataFrame
        holds the numeric replicate columns; modified in place.
    constants_list : list of str
        replicate column names, control columns first.

    Returns
    -------
    pandas.DataFrame
        `df` with a float `pval` column (overwritten if already present).
    """
    half_lst = len(constants_list) // 2
    ctrl_cols, us_cols = constants_list[:half_lst], constants_list[half_lst:]

    # nothing to test: keep the column but make it an empty float column
    if df.empty:
        df["pval"] = np.array([], dtype=float)
        return df

    # rows = sequences, columns = replicates
    ctrls = df[ctrl_cols].to_numpy(dtype=float)
    uss = df[us_cols].to_numpy(dtype=float)

    # one test per row, no equal variance assumed
    _, pvals = stats.ttest_ind(ctrls, uss, axis=1, equal_var=False)

    df["pval"] = pvals

    return df

def computeRepeatedMeasurePval(df, constants_list, peak_dict):
    """Paired (related-samples) t-test of control vs. ultrasound per enhancer.

    For every enhancer in `peak_dict` (enhancer id -> tile names) the rows of
    `df` whose `name` is one of its tiles are collected. Their replicate
    values are melted to long form and split by the last "."-separated part
    of the column name: "ctrl" against everything else. The two value
    vectors go into `stats.ttest_rel`, and the p-value is written to every
    tile row of that enhancer as `pval.rep`, next to the enhancer id in `enh`.

    Enhancers with no tile in `df` are skipped. Returns the per-enhancer
    tables concatenated in `peak_dict` order.
    """
    measures = df[constants_list + ["name", "type"]]

    per_enhancer = {}

    for enh_id, tile_names in peak_dict.items():

        tiles = measures.loc[measures["name"].isin(tile_names)].copy()
        if len(tiles) == 0:
            continue

        # long form: one row per (tile, replicate column)
        long_form = tiles.melt(id_vars=['name', "type"])
        long_form["group"] = long_form["variable"].apply(lambda x: x.split(".")[-1])

        is_ctrl = long_form["group"] == "ctrl"
        _, pvalue = stats.ttest_rel(long_form.loc[is_ctrl, "value"],
                                    long_form.loc[~is_ctrl, "value"])

        tiles["pval.rep"] = pvalue
        tiles["enh"] = enh_id
        per_enhancer[enh_id] = tiles

    return pd.concat(per_enhancer.values())

def fdrcorr(df, pvalcol, alpha):
    """Benjamini-Hochberg and Bonferroni correction within each `type` group.

    If `df` has an `enh` column the correction is enhancer-wise: the unique
    (enh, p-value) pairs of a group are corrected and then left-merged onto
    the group's unique (enh, name) pairs; the new columns carry a ".rep"
    suffix. Otherwise the correction is tile-wise over all rows of the group.

    New columns are named `fdr.<a>.bool`, `fdr.<a>.pval`, `bonf.<a>.bool` and
    `bonf.<a>.pval` (plus the suffix), where <a> is `alpha` with the decimal
    point removed (0.1 -> "01"). Returns the per-group tables concatenated.
    """
    enhancer_wise = "enh" in list(df)
    suffix = ".rep" if enhancer_wise else ""
    a_str = str(alpha).replace(".", "")

    corrected = {}

    # p-value correction within each dataset type
    for dataset in set(df["type"]):
        in_dataset = df["type"] == dataset

        if enhancer_wise:
            pvals = df.loc[in_dataset, ["enh", pvalcol]].drop_duplicates().copy()
            tile_names = df.loc[in_dataset, ["enh", "name"]].drop_duplicates().copy()
        else:
            pvals = df.loc[in_dataset].copy()

        fdr_reject, fdr_pvals, _, _ = multipletests(
            pvals[pvalcol], alpha=alpha, method="fdr_bh")
        bonf_reject, bonf_pvals, _, _ = multipletests(
            pvals[pvalcol], alpha=alpha, method="bonferroni")

        pvals[f"fdr.{a_str}.bool{suffix}"] = fdr_reject
        pvals[f"fdr.{a_str}.pval{suffix}"] = fdr_pvals
        pvals[f"bonf.{a_str}.bool{suffix}"] = bonf_reject
        pvals[f"bonf.{a_str}.pval{suffix}"] = bonf_pvals

        if enhancer_wise:
            # give every tile of an enhancer that enhancer's corrected values
            corrected[dataset] = pd.merge(tile_names, pvals, how="left")
        else:
            corrected[dataset] = pvals

    return pd.concat(corrected.values())

In [11]:
def bootstrap(data_list, size, stat, nboot=10000, seed=None):

    """
    return the discrete and relative 95% confidence intervals of a statistic of data_list

    input
        data_list (list-like, continuous values) - 1-D data to resample.
        size (int or None) - number of values drawn per bootstrap sample.
            If None, the length of data_list is used.
        stat (number in 0-1, or "mean") - quantile to bootstrap, or "mean".
            Any int/float (python or numpy) is accepted as a quantile.
        nboot (int) - number of bootstrap resamples, default 10000.
        seed (int or None) - if given, resampling uses its own
            np.random.RandomState(seed) and is reproducible; if None, the
            global np.random state is used.

    returns
        ci_discrete (numpy array) - [observed - high, observed - low]
        ci_relative (list) - [high, low], the 0.975 and 0.025 quantiles of
            (bootstrapped stat - observed stat)

    raises
        ValueError - data_list is empty, or stat is neither a number nor "mean"

    method
        1. If size is None, get the length of the data
        2. get the observed stat of the data (mean or quantile)
        3. set bootstrap parameters
        4. per iteration, randomly choose elements from the data w replacement
        5. collect the stat of every resample
        6. calculate the deltas from the observed stat. This centers the data.
        7. get the 0.025 and 0.975 quantile values of the centered stat distribution.
        8. calculate relative confidence intervals and actual confidence
           interval values (observed stat - quantile values)
    """

    # convert once so every resample works on the same array
    data = np.asarray(data_list)
    if data.size == 0:
        raise ValueError("data_list must contain at least one value")

    use_mean = isinstance(stat, str) and stat == "mean"
    use_quantile = (isinstance(stat, (int, float, np.integer, np.floating))
                    and not isinstance(stat, bool))
    if not (use_mean or use_quantile):
        raise ValueError(f'stat must be a quantile between 0 and 1 or "mean", got {stat!r}')

    def _stat_of(values):
        """statistic selected by `stat` for one set of values"""
        return np.mean(values) if use_mean else np.quantile(values, stat)

    #1
    if size is None:
        size = len(data)  # size of distribution to bootstrap

    #2
    obs_stat = _stat_of(data)  # get observed stat

    #3 global random state unless a seed was requested
    rng = np.random if seed is None else np.random.RandomState(seed)

    #4, 5
    bs_stats = [_stat_of(rng.choice(data, replace=True, size=size))
                for _ in range(nboot)]

    #6 center the stat distribution
    deltas = pd.Series(bs_stats, dtype=float) - obs_stat

    #7 get discrete 95th CI
    low = deltas.quantile(0.025)
    high = deltas.quantile(0.975)
    ci_relative = [high, low]  # assume obs value is centered at zero

    #8 return ci relative to observed stat
    ci_discrete = obs_stat - np.array([high, low])  # assume obs value is center

    print(f"measure CI of {stat} quantile| mean estimate\n observed {stat} value:",
          obs_stat,
          "\ndiscrete diff from observed:", ci_discrete,
          "\nrelative diff from observed:", ci_relative)
    return ci_discrete, ci_relative

In [12]:
def callActive(df):
    """Label sequences whose median activity lies outside the shuffled range.

    Rows whose `name` contains "shuf" form the background. For control and
    ultrasound separately, a row is labelled True in `label.ctrl` /
    `label.us` when its `l2.ratio.med.*` value is above the 97.5% quantile or
    below the 2.5% quantile of the background, and False otherwise. The four
    thresholds are printed. `df` is modified in place and returned.
    """
    ctrl_score, us_score = "l2.ratio.med.ctrl", "l2.ratio.med.us"

    # background distribution: shuffled sequences
    shuffled = df.loc[df["name"].str.contains("shuf")].copy()

    # upper and lower thresholds per treatment
    ctrl_975 = shuffled[ctrl_score].quantile(0.975)
    us_975 = shuffled[us_score].quantile(0.975)
    ctrl_025 = shuffled[ctrl_score].quantile(0.025)
    us_025 = shuffled[us_score].quantile(0.025)
    print(ctrl_975,ctrl_025, us_975, us_025)

    # True when more extreme than either threshold
    df["label.ctrl"] = (df[ctrl_score] > ctrl_975) | (df[ctrl_score] < ctrl_025)
    df["label.us"] = (df[us_score] > us_975) | (df[us_score] < us_025)

    return df

In [13]:
def callResponse(df):
    """Flag rows that respond to ultrasound in a boolean `response` column.

    A row is True when `pval` is below 0.005 and its `label.ctrl` differs
    from its `label.us`. `df` is modified in place and returned.
    """
    significant = df["pval"] < 0.005
    label_changed = df["label.ctrl"] != df["label.us"]

    df["response"] = significant & label_changed

    return df

# get enhancer info

In [14]:
# tile-to-enhancer table (tab-separated; `enh_id` and `name` columns are read)
PEAKS = os.path.join(DATA_PATH, "tiles.x.atac_k27ac_diff.bed")
p = pd.read_csv(PEAKS, sep='\t')

# group tile names by enhancer id, in file order: {enh_id: [name, ...]}
enh = {}
for enh_id, name in zip(p["enh_id"], p["name"]):
    enh.setdefault(enh_id, []).append(name)

# test cell line 

**notes about this dataset**
- Activity score - median score was assigned per US|CTRL for each sequence
- Active | inactive = test sequence score > 95% shuffled score. Both these scores are median score (above)
- logFC
- P-value

## Params

In [58]:

# switches for the optional sections of the per-cell-line analysis loop
COMPUTE_TTEST = False  # per-tile t-test (computeTTestPval) and its correction
COMPUTE_REP_TTEST = True  # per-enhancer repeated-measures t-test
COMPUTE_BS = True  # bootstrap of the delta.med quantiles


### constants dict

In [59]:
def constantsDict(n_reps):
    """Return the column-name constants used by the analysis.

    With `n_reps == 2` the "L2RATIOS" list leaves out replicate 2 (replicates
    1 and 3 only); any other value gives all three replicates. The other
    entries do not depend on `n_reps`:
    "NAMES" (input column names), "CTRL_LIST" (sequence types), "RATIOS"
    (raw ratio columns) and "CL_LIST" (cell lines).
    """
    groups = ("ctrl", "us")
    all_reps = (1, 2, 3)
    used_reps = (1, 3) if n_reps == 2 else all_reps

    # control columns first, then ultrasound, replicates ascending within each
    l2ratios = [f"l2.ratio.{rep}.{group}" for group in groups for rep in used_reps]
    ratios = [f"ratio.{rep}.{group}" for group in groups for rep in all_reps]

    names = [
        "name", "label",
        "ratio.med.ctrl", "label.ctrl", "ratio.1.ctrl", "ratio.2.ctrl", "ratio.3.ctrl",
        "ratio.med.us", "label.us", "ratio.1.us", "ratio.2.us", "ratio.3.us",
        "pval", "logFC", "response",
    ]

    return {
        "NAMES": names,
        "CTRL_LIST": ["neg", "pos", "shuffle", "synthetic", "k27ac", "atac", "DEG"],
        "RATIOS": ratios,
        "L2RATIOS": l2ratios,
        "CL_LIST": ["k562", "hob", "bj", "hepg2"],
    }

### input data

multiple collinearities - do group-wise difference

In [63]:
# Run the whole analysis for each cell line and for the 2- and 3-replicate
# column sets. results maps "<CL>-<N_REPS>" -> (keep, dr_filter, df).
results = {}
for CL in ["BJ", "HEPG2"]:
    DATA = config_dict[CL]
    # NOTE(review): CLEAN and SCALED are assigned but not used inside this loop
    CLEAN = config_dict[f"{CL}.clean.transformed"]
    SCALED =config_dict[f'{CL}.clean.trans.scaled']

    for N_REPS in [2,3]:
        constants = constantsDict(N_REPS)


        ## LOAD

        # load .csv data, rename columns, skip row 1 (old column names)
        # the file is re-read for every N_REPS because the steps below add columns
        df_ = pd.read_csv(DATA, skiprows=1, names=constants["NAMES"], low_memory=False)

        # annotate cell line origin
        df = clOrigin(df_, constants["CL_LIST"])

        # annotate controls
        df = ctrlAnnot(df,constants["CTRL_LIST"])

        # coordinates
        df = getCoordinates(df)

        # log2 transform ratios - increase sensitivity for ratios <1
        # (all three replicates are transformed; N_REPS only selects which
        # log2 columns enter the statistics below)
        df = log2Transform(df, constants["RATIOS"])

        # compute stats
        df = computeStats(df, constants["L2RATIOS"])

        # compute us - ctrl difference of the log2 medians and means
        df = computeDelta(df)

        # NOTE(review): expression statement inside a loop; it displays nothing
        df.head()

        ## compute T-test per tile


        if COMPUTE_TTEST is True:
            df = computeTTestPval(df, constants["L2RATIOS"]) # slow 1:43

            # fdr correction and bonferroni correction
            pvalcol="pval"
            alpha=0.1
            df = fdrcorr(df, pvalcol, alpha)

            # the "01" in these column names is alpha=0.1 with the decimal
            # point removed (naming done in fdrcorr); change both together
            keep_cols = ["name", 'label', 
                         'cl.origin',
                         'type',
                         'coor',
                         "pval", 
                         'fdr.01.bool',
                         'fdr.01.pval',
                         'bonf.01.bool',
                         'bonf.01.pval',
                         'delta.med',
                        ]

            keep = df[keep_cols].drop_duplicates()

            out = os.path.join(DATA_PATH, f"{CL}.sig.tiles.{N_REPS}rep.tsv")
            keep['CL'] = CL
            keep.sort_values(by='fdr.01.pval').to_csv(out, sep='\t', index=False)

        ## repeated measures

        if COMPUTE_REP_TTEST is True:

            out_filter = os.path.join(DATA_PATH, f"{CL}.sig.rep.enh.{N_REPS}rep.p.lessthan5.tsv")
            out = os.path.join(DATA_PATH, f"{CL}.sig.rep.enh.{N_REPS}rep.p.all.tsv")

            # results are cached on disk: recompute only when the "all" table is missing
            if os.path.exists(out) is False:
                # do this only for k27ac and atac. Remove any shuffled sequences from the stats
                rep_df = computeRepeatedMeasurePval(df.loc[df["type"].isin(["k27ac", "atac"])], 
                                                    constants["L2RATIOS"], enh) # 1minute, 32 seconds

                # NOTE(review): expression statement inside a loop; it displays nothing
                rep_df.head()

                ## FDR correction of repeated measures

                pvalcol="pval.rep"
                alpha=0.1
                rep_df = fdrcorr(rep_df, pvalcol, alpha)

                r = rep_df[["name", "enh", "pval.rep", "fdr.01.pval.rep"]].drop_duplicates()
                d = df[[ "name", "delta.med",
                        'l2.ratio.med.ctrl', 'l2.ratio.med.us','label.ctrl','label.us']].drop_duplicates()

                # left merge keeps every tile of df, also those without an enhancer p-value
                dr = pd.merge(d,r,  how = "left")  # more tiles
                dr.to_csv(out, sep='\t', index=False)

                # filter for activity increases
                # presumably label.ctrl / label.us hold the "active"/"inactive"
                # strings loaded from the input csv - TODO confirm

                dr_filter = dr.loc[(dr["pval.rep"] < 0.05) &
                                   (dr["label.ctrl"]=="inactive")&
                                   (dr["label.us"]=="active")&
                                   (dr["delta.med"] > 1)
                                   ].sort_values(by=["delta.med", "l2.ratio.med.ctrl"], ascending=False)


                dr_filter['CL'] = CL
                dr_filter.to_csv(out_filter, sep='\t', index=False)
            else:
                # NOTE(review): the cache test above looks at `out`, but this
                # reads `out_filter`; it fails if only `out` exists
                dr_filter = pd.read_csv(out_filter, sep='\t')

        ## bootstrap the 0.975 and 0.025 quantiles of delta.med (all rows of df)

        if COMPUTE_BS is True:
            out = os.path.join(DATA_PATH, f"{CL}.sig.bs.95ci.{N_REPS}rep.tsv")
            if os.path.exists(out) is False:
                # flag rows above the upper bound of the bootstrapped 0.975-quantile interval
                disc975, rel = bootstrap(df["delta.med"], None, 0.975)
                df["ci.975.bs"] = df["delta.med"].apply(lambda x: True if x>max(disc975) else False)
                # flag rows below the lower bound of the bootstrapped 0.025-quantile interval
                disc025, rel = bootstrap(df["delta.med"], None, 0.025)
                df["ci.025.bs"] = df["delta.med"].apply(lambda x: True if x<min(disc025) else False)

                keep_cols = ["name", 'label', 
                             'cl.origin',
                             'type',
                             'coor',
                             'delta.med',
                             'ci.975.bs',
                             'ci.025.bs',
                             'label.ctrl','label.us'
                            ]

                keep = df[keep_cols].drop_duplicates()

                keep['CL'] = CL
                # NOTE(review): only the flagged rows are written, while `keep`
                # itself stays unfiltered; a later run that loads the file in
                # the else-branch gets the filtered rows instead
                keep.loc[(keep['ci.975.bs'] == True)|
                         (keep['ci.025.bs'] == True)].to_csv(out, sep='\t', index=False)
            else:
                keep = pd.read_csv(out, sep='\t')
        # NOTE(review): keep and dr_filter only exist when the corresponding
        # COMPUTE_* switches are True
        results[f"{CL}-{N_REPS}"] = (keep, dr_filter, df)

ctrl cols: ['l2.ratio.1.ctrl', 'l2.ratio.3.ctrl']
us cols: ['l2.ratio.1.us', 'l2.ratio.3.us']
ctrl cols: ['l2.ratio.1.ctrl', 'l2.ratio.2.ctrl', 'l2.ratio.3.ctrl']
us cols: ['l2.ratio.1.us', 'l2.ratio.2.us', 'l2.ratio.3.us']
ctrl cols: ['l2.ratio.1.ctrl', 'l2.ratio.3.ctrl']
us cols: ['l2.ratio.1.us', 'l2.ratio.3.us']
ctrl cols: ['l2.ratio.1.ctrl', 'l2.ratio.2.ctrl', 'l2.ratio.3.ctrl']
us cols: ['l2.ratio.1.us', 'l2.ratio.2.us', 'l2.ratio.3.us']


In [64]:
# unpack the (keep, dr_filter, df) tuples stored for HEPG2 with 2 and 3 replicates
h2k, h2d, h2f = results["HEPG2-2"]
h3k, h3d, h3f = results["HEPG2-3"]

In [66]:
# number of unique names in the 2-rep and 3-rep filtered enhancer tables,
# followed by the names present in both
print(len(set(h2d["name"])), len(set(h3d["name"])))
set(h2d["name"]).intersection(set(h3d["name"]))

15 21


{'hob_atac_up_chr14:24366690-24366959',
 'hob_k27ac_down_chr16:87951064-87951333',
 'hob_k27ac_down_chr1:81802226-81802495',
 'hob_k27ac_down_chr20:64062550-64062819',
 'hob_k27ac_down_chr5:322521-322790',
 'hob_k27ac_down_chr9:14322306-14322575'}

In [71]:

CL, N_REPS= "HEPG2", 3
# take the activity labels from the frame computed for this cell line and
# replicate set (h3f) rather than from whatever `df` the loop left behind;
# de-duplicate first so the inner merge cannot multiply rows of h3k
h3k = pd.merge(h3k, h3f[["name", 'label.ctrl','label.us']].drop_duplicates())
h3k.to_csv(os.path.join(DATA_PATH, f"{CL}.sig.bs.95ci.{N_REPS}rep.tsv"), sep='\t', index=False)

In [73]:

CL, N_REPS= "HEPG2", 2
# take the activity labels from the 2-replicate HEPG2 frame (h2f); the global
# `df` left by the loop is the 3-replicate frame. De-duplicate first so the
# inner merge cannot multiply rows of h2k
h2k = pd.merge(h2k, h2f[["name", 'label.ctrl','label.us']].drop_duplicates())
h2k.to_csv(os.path.join(DATA_PATH, f"{CL}.sig.bs.95ci.{N_REPS}rep.tsv"), sep='\t', index=False)

In [74]:
# display the cell line currently held in CL
CL

'HEPG2'