Log
- 20230816 
- 20231115 - revised  after Chengyu cleaned up data. 
- 20240520 - revised to standardize data transformations
- sarahfong

# objective 
learn the features of the US MPRA dataset

do some basic transformations and comparisons 
## clean up and label
- label cl.origin, control sequences
- label 'active': in ctrl and US, if the MEDIAN is above the 97.5th percentile (label 1) or below the 2.5th percentile (label -1) of the shuffled sequences' medians
- label 'response': if ctrl != us active label and uncorrected pval<0.05
- label 'direction':
    - US_UP = US median is positive, CTRL median is negative
    - US_DOWN = US median is neg, CTRL median is pos
    - US_MORE_SILENT = US med is more neg than CTRL med
    - US_LESS_SILENT = US med is less neg than CTRL med
    - US_MORE_ACTIVE = US med is more pos than CTRL pos
    - US_LESS_ACTIVE = US med is less pos than CTRL pos

## transformations
- per replicate 
    
    - log2 normalize RNA/DNA
    - standard scale log2 RNA/DNA

- across replicates
    - compute median, mean, sd of log2 values, standard scaling values

    - delta
        - compute as the median/mean difference l2(US/control) or l2US - l2control

- Significance among tiles, across peaks
    -  per tile: Welch's t-test (unequal variance; also ran BH 5% FDR, but none were significant)
    -  per peak: paired (repeated-measures) t-test (also ran BH 5% FDR, but none were significant)

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os, sys

import pandas as pd

import seaborn as sns
from scipy import stats
from statsmodels.stats.multitest import fdrcorrection
from sklearn.preprocessing import RobustScaler, StandardScaler

import sys

# Read, write to config

In [None]:
# Toggle between a local checkout of the data and the cluster copy.
LOCAL = False

# The cluster location is the default. The local branch additionally extends
# sys.path before the two helper modules below are imported.
DATA_PATH = "/wynton/group/ahituv/fongsl/projects/US/data"
if LOCAL is True:
    sys.path.append("/Users/sarahfong/tools/py_")
    DATA_PATH = "/Users/sarahfong/Desktop/local_data/EMF/US"

import config_readwrite as crw
import plot_params as pp

# apply the plotting font settings from plot_params
pp.fonts()

In [None]:
# read the config.ini that sits one directory above the working directory
parent_dir = os.path.dirname(os.getcwd())
config, cfn = crw.read(os.path.join(parent_dir, "config.ini"))

# path
#DATA_PATH = config["local_path"]["data"]

# file name of every MPRA input/output, keyed by its config entry
mpra_files = {
    "HEPG2": "full_hepg2_MPRA.csv",
    "HEPG2.clean.transformed": "hepg2_MPRA.clean.transformed.tsv",
    "HEPG2.clean.trans.peaks": "hepg2.MPRA.clean.transformed.peaks.tsv",
    "HEPG2.clean.trans.scaled": "hepg2_MPRA.clean.transformed.standard.scaled.tsv",
    "HEPG2.clean.trans.scaled.peaks": "hepg2.MPRA.clean.transformed.standard.scaled.peaks.tsv",

    "BJ": "full_bj_MPRA.csv",
    "BJ.clean.transformed": "bj_MPRA.clean.transformed.csv",
    "BJ.clean.trans.peaks": "bj.MPRA.clean.transformed.peaks.tsv",
    "BJ.clean.trans.scaled": "bj_MPRA.clean.transformed.standard.scaled.csv",
    "BJ.clean.trans.scaled.peaks": "bj.MPRA.clean.transformed.standard.scaled.peaks.tsv",
}

# dictionary of full paths to write to config
config_dict = {
    entry: os.path.join(DATA_PATH, filename)
    for entry, filename in mpra_files.items()
}

# make sure the "mpra" section is present in the config
section = "mpra"
crw.check(config, section)

# copy every path into that section
for key, value in config_dict.items():
    config[section][key] = value

# write the updated config back out
crw.write(config, cfn)

# functions

In [None]:
# mean, median, std

def computeStats(df, l2_ratios_list):
    """Add per-sequence median, mean and std of the control and US replicates.

    The first three entries of `l2_ratios_list` are treated as the control
    replicate columns and all remaining entries as the ultrasound (US)
    replicate columns. The six summary columns
    `l2.ratio.{med,mean,std}.{ctrl,us}` are written into `df` in place and
    the same dataframe is returned.
    """
    groups = (("ctrl", l2_ratios_list[:3]), ("us", l2_ratios_list[3:]))

    # echo which columns are being used as the control replicates
    print(groups[0][1])

    for condition, replicate_cols in groups:
        df[f"l2.ratio.med.{condition}"] = df[replicate_cols].median(axis=1)
        df[f"l2.ratio.mean.{condition}"] = df[replicate_cols].mean(axis=1)
        df[f"l2.ratio.std.{condition}"] = df[replicate_cols].std(axis=1)

    return df

# l2 transform (RNA/DNA)

def log2Transform(df, ratios_list):
    """Add a log2-transformed copy of every column named in `ratios_list`.

    Each new column is named `l2.<original column name>` and is written into
    `df` in place; the same dataframe is returned.
    """
    for source_col in list(ratios_list):
        l2_col = f"l2.{source_col}"
        df[l2_col] = np.log2(df[source_col])

    return df

# compute delta: difference of the log2 medians and means (US - control)


def computeDelta(df):
    """Add `delta.med` and `delta.mean`: the US summary minus the control summary.

    Reads the `l2.ratio.{med,mean}.{us,ctrl}` columns, writes the two delta
    columns into `df` in place and returns the same dataframe.
    """
    for stat in ("med", "mean"):
        us_values = df[f"l2.ratio.{stat}.us"]
        ctrl_values = df[f"l2.ratio.{stat}.ctrl"]
        df[f"delta.{stat}"] = us_values - ctrl_values

    return df


def clOrigin(df):
    """Annotate the cell line (cl) each sequence was designed from.

    The origin is the lower-cased text before the first underscore of the
    `name` column; the value "synthetic:" is normalised to "synthetic".
    The result is written to `df["cl.origin"]` in place and `df` is returned.
    """

    def origin_of(name):
        prefix = name.split("_")[0].lower()
        return "synthetic" if prefix == "synthetic:" else prefix

    df["cl.origin"] = df["name"].map(origin_of)

    return df


def ctrlAnnot(df, constants_list):
    """Annotate each sequence with the category found in its `label`.

    Every entry of `constants_list` is searched for (as a pattern, via
    `str.contains`) in the `label` column, and matching rows get that entry
    as their `type`. Entries are applied in list order, so when a label
    matches several entries the LAST matching entry wins. Rows that match
    nothing, including rows with a missing label, keep the string "None".

    The `type` column is written into `df` in place and `df` is returned.
    """

    df["type"] = "None"
    labels = df["label"]

    for ctrl in constants_list:
        # na=False: a missing label is "no match"; without it the mask holds
        # NaN and boolean indexing with .loc raises.
        is_match = labels.str.contains(ctrl, na=False)
        df.loc[is_match, "type"] = ctrl

    return df


def computePval(df, constants_list):
    """Per-sequence Welch's t-test of control vs. ultrasound replicates.

    The first three entries of `constants_list` name the control replicate
    columns and the remaining entries the US replicate columns. For each row
    an independent two-sample t-test without the equal-variance assumption
    (`equal_var=False`) is run between the two sets of replicates.

    The p-values are stored as floats in `df["pval"]` (in place) and `df` is
    returned. Rows are handled by position, so a non-unique index does not
    mix up results between rows.
    """
    ctrl_cols = list(constants_list[:3])
    us_cols = list(constants_list[3:])

    if df.empty:
        # nothing to test; still create the column callers expect
        df["pval"] = pd.Series(index=df.index, dtype=float)
        return df

    # replicates as (n_sequences, n_replicates) float arrays
    ctrls = df[ctrl_cols].to_numpy(dtype=float)
    uss = df[us_cols].to_numpy(dtype=float)

    # one vectorised call tests every row (axis=1 = across replicates)
    _, pvals = stats.ttest_ind(ctrls, uss, axis=1, equal_var=False)

    df["pval"] = np.asarray(pvals, dtype=float)

    return df


def callActive(df):
    """Label each sequence as active (1), silent (-1) or neither (0).

    Thresholds are taken from the rows whose `name` contains "shuf": the
    97.5% and 2.5% quantiles of their median score, computed separately for
    control and US. A sequence is labelled 1 when its median is above the
    97.5% threshold and -1 when it is below the 2.5% threshold.

    `label.ctrl` and `label.us` are (over)written in `df` in place and `df`
    is returned. The four thresholds are printed.
    """
    # get shuffles
    shuffled = df.loc[df["name"].str.contains("shuf")].copy()

    # (upper, lower) thresholds per condition
    thresholds = {}
    for condition in ("ctrl", "us"):
        shuffled_scores = shuffled[f"l2.ratio.med.{condition}"]
        thresholds[condition] = (shuffled_scores.quantile(0.975),
                                 shuffled_scores.quantile(0.025))

    print(*thresholds["ctrl"], *thresholds["us"])

    for condition, (upper, lower) in thresholds.items():
        med = df[f"l2.ratio.med.{condition}"]

        labels = np.zeros(len(df), dtype="int64")
        # more activity than 97.5% of shuffles
        labels[(med > upper).to_numpy()] = 1
        # less activity than 2.5% of shuffles
        labels[(med < lower).to_numpy()] = -1

        df[f"label.{condition}"] = labels

    return df


def callResponse(df):
    """Label (1) US response and (2) direction of the activity change.

    response: True when `pval` < 0.05 and the categorical labels
        `label.ctrl` and `label.us` differ. No l2fc criterion is applied.

    direction: compares the control and US medians. Rules are applied in the
        order listed below and a later rule overwrites an earlier one, so a
        US median of exactly 0 ends up as US_LESS_SILENT (ctrl < 0) or
        US_LESS_ACTIVE (ctrl > 0). Rows with a control median of exactly 0,
        or with equal medians of the same sign, keep direction None.

    Both columns are written into `df` in place and `df` is returned.
    """
    ctrl_med = df["l2.ratio.med.ctrl"]
    us_med = df["l2.ratio.med.us"]

    # (1) response
    is_significant = df["pval"] < 0.05
    label_changed = df['label.ctrl'] != df['label.us']

    df["response"] = False
    df.loc[is_significant & label_changed, "response"] = True

    # (2) direction
    ctrl_neg, ctrl_pos = ctrl_med < 0, ctrl_med > 0
    us_not_neg, us_not_pos = us_med >= 0, us_med <= 0
    us_higher, us_lower = us_med > ctrl_med, us_med < ctrl_med

    direction_rules = [
        # sign flips between control and US
        (ctrl_neg & us_not_neg, "US_UP"),
        (ctrl_pos & us_not_pos, "US_DOWN"),
        # both at or below zero
        (ctrl_neg & us_not_pos & us_higher, "US_LESS_SILENT"),
        (ctrl_neg & us_not_pos & us_lower, "US_MORE_SILENT"),
        # both at or above zero
        (ctrl_pos & us_not_neg & us_lower, "US_LESS_ACTIVE"),
        (ctrl_pos & us_not_neg & us_higher, "US_MORE_ACTIVE"),
    ]

    df["direction"] = None
    for rule_mask, direction in direction_rules:
        df.loc[rule_mask, "direction"] = direction

    return df

In [None]:
def fdrCorr(df, id_col, p_val_col):
    """FDR-correct p-values separately per (cl.origin, type) hypothesis group.

    Groups are every combination of
        cl.origin: hepg2, k562, hob, bj
        type:      Synthetic, H3K27ac, ATAC
    Within a group the unique (id, p-value) rows with a non-missing p-value
    are corrected with `fdrcorrection` (its default alpha), but only when the
    group has more than 30 such rows.

    Parameters
    ----------
    df : dataframe with `id_col`, `p_val_col`, "cl.origin" and "type" columns
    id_col : name of the column identifying one tested hypothesis
    p_val_col : name of the column holding the uncorrected p-values

    Returns
    -------
    A new dataframe: `df` plus "FDR_bool", f"FDR_{p_val_col}" and
    "FDR_-log10p". Rows that were not part of any corrected group have
    missing values in those three columns. `df` itself is not modified.
    """
    origins = ("hepg2", "k562", "hob", "bj")
    datatypes = ("Synthetic", "H3K27ac", "ATAC")
    fdr_col = f"FDR_{p_val_col}"

    # The group columns are part of the merge key so that a result is only
    # attached to rows of the group it was computed for.
    keys = list(dict.fromkeys([id_col, p_val_col, "cl.origin", "type"]))

    corrected = []
    for origin in origins:
        for datatype in datatypes:

            # subset dataframe by hypothesis
            in_group = (df["cl.origin"] == origin) & (df["type"] == datatype)
            test = (df.loc[in_group, keys]
                    .dropna(subset=[p_val_col])  # exclude missing p-values from the correction
                    .drop_duplicates()
                    .copy())

            # perform fdr correction only if more than 30 observations
            if test.shape[0] > 30:
                print("fdr correction", origin, datatype)

                test["FDR_bool"], test[fdr_col] = fdrcorrection(test[p_val_col])

                # compute -log10 p
                test["FDR_-log10p"] = np.log10(test[fdr_col]) * -1
                corrected.append(test)

    if not corrected:
        # No group was large enough: return the same columns as the normal
        # path, all missing, instead of failing on an empty concat.
        out = df.copy()
        for col in ("FDR_bool", fdr_col, "FDR_-log10p"):
            out[col] = np.nan
        return out

    return pd.merge(df, pd.concat(corrected), how="left", on=keys)

## peak-wise function

In [None]:
def peakwiseStats(enh_id, df):
    """Paired t-test of control vs. US over all tiles of one enhancer peak.

    All rows of `df` whose `enh.name` equals `enh_id` are collected. Their
    three control replicate values and three US replicate values are each
    flattened (replicate 1 of every tile, then replicate 2, then replicate 3)
    and compared with a related-samples t-test, i.e. the tiles are treated as
    repeated measures under the two conditions.

    Returns
    -------
    (s, p, enh): the test statistic, the p-value, and a copy of the peak's
    rows (a fixed subset of columns) with `peak_p` and `peak_stat` added.
    """
    ctrl_reps = ['l2.ratio.1.ctrl', 'l2.ratio.2.ctrl', 'l2.ratio.3.ctrl']
    us_reps = ['l2.ratio.1.us', 'l2.ratio.2.us', 'l2.ratio.3.us']

    keep = ["name",
            'l2.ratio.med.ctrl', 'label.ctrl', *ctrl_reps,
            'l2.ratio.med.us', 'label.us', *us_reps,
            'delta.med',
            "direction", "enh.coor", "enh.name"]

    # dataframe of this enhancer's tiles only
    in_peak = df["enh.name"] == enh_id
    enh = df.loc[in_peak, keep].copy()

    # column-major flattening keeps control and US values paired by
    # (tile, replicate)
    ctrls = enh[ctrl_reps].to_numpy().ravel(order="F").tolist()
    us = enh[us_reps].to_numpy().ravel(order="F").tolist()

    # two related samples t-test,
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html
    s, p = stats.ttest_rel(ctrls, us)

    enh["peak_p"] = p
    enh["peak_stat"] = s

    return s, p, enh

# test CL 

## notes about this dataset
- Activity score - median score was assigned per US|CTRL for each sequence
- Active | inactive = test sequence score above the 97.5th percentile (or below the 2.5th percentile) of the shuffled scores. Both of these scores are median scores (above)
- logFC
- P-value

### constants dict

In [None]:
constants = {
    # column names given to the raw MPRA table when it is read
    "NAMES": [
        'name', 'label',
        'ratio.med.ctrl', 'label.ctrl',
        'ratio.1.ctrl', 'ratio.2.ctrl', 'ratio.3.ctrl',
        'ratio.med.us', 'label.us',
        'ratio.1.us', 'ratio.2.us', 'ratio.3.us',
        'pval', "p_adj", 'logFC', 'response',
    ],

    # categories searched for in the `label` column
    "CTRL_LIST": ["Neg", "PosCt", "Shuffle", "DEG", "Non-diff",
                  "Synthetic", 'H3K27ac', "ATAC"],

    # replicate ratio columns: three control replicates, then three US replicates
    "RATIOS": [f"ratio.{rep}.{condition}"
               for condition in ("ctrl", "us") for rep in (1, 2, 3)],

    # the same replicate columns after log2 transformation
    "L2RATIOS": [f"l2.ratio.{rep}.{condition}"
                 for condition in ("ctrl", "us") for rep in (1, 2, 3)],

    # per-condition standard deviation columns
    "L2STD": [f"l2.ratio.std.{condition}" for condition in ("ctrl", "us")],

    "CL_LIST": ["k562", "hob", "bj", "hepg2"],

    # directions that mark a tile as a candidate in a significant peak
    "ONOFF": ["US_UP", 'US_MORE_ACTIVE', "US_DOWN", "US_MORE_SILENT"],
}

## clean, transform data, compute summary stats

## peaks

In [None]:
## open peak annotation
# Tab-separated table linking tile names to enhancer peaks; its path comes
# from the config (e.g. "/Users/sarahfong/Desktop/local_data/EMF/US/tilenames.x.enh.id.tsv").
PEAK_ANNOT = config["data"]["TILE_x_ENH"]
peaks = pd.read_csv(PEAK_ANNOT, delimiter="\t")
peaks.head()

## process MPRA

In [None]:
### input data

# cell lines to process; each one is looked up as a key of config_dict
CLS = ["HEPG2", "BJ"]

# collects all the results, one entry per cell line
results = dict()



In [None]:
# Per cell line: clean and annotate the MPRA table, standard-scale the log2
# replicates, run the peak-wise tests, and write one file per step.
for CL in CLS:
    results[CL] = {}  # make dictionary of dictionaries for results
    cl_dict = results[CL]

    # read
    DATA = config_dict[CL]

    # write
    CLEAN = config_dict[f"{CL}.clean.transformed"]
    SCALED = config_dict[f'{CL}.clean.trans.scaled']
    CLEAN_PEAKS = config_dict[f"{CL}.clean.trans.peaks"]
    CLEAN_SCALED_PEAKS = config_dict[f"{CL}.clean.trans.scaled.peaks"]

    # load CD's data
    # .csv data, rename columns, skip row 1 (old column names)
    df_ = pd.read_csv(DATA, skiprows=1,
                      names=constants["NAMES"], low_memory=False)
    cl_dict["cd_df"] = df_  # add cd's dataframe to the results dictionary

    ###
    # clean up MPRA data - add annotations, l2 transform, get replicate means, medians
    ###

    # annotate cell line origin
    # Work on a copy: the helpers below modify their input in place and
    # overwrite label.ctrl / label.us / pval / response, which would
    # otherwise also change the dataframe stored as "cd_df".
    df = clOrigin(df_.copy())

    # annotate controls
    df = ctrlAnnot(df, constants["CTRL_LIST"])

    # log2 transform ratios - increase sensitivity for ratios <1
    df = log2Transform(df, constants["RATIOS"])

    # compute descriptive stats
    df = computeStats(df, constants["L2RATIOS"])

    # compute difference between log2 us v. control
    df = computeDelta(df)

    # recall active
    df = callActive(df)

    # recompute per-sequence p-values (t-test, unequal variance)
    df = computePval(df, constants["L2RATIOS"])

    # label response based on label.ctrl != label.us and p-val < 0.05
    df = callResponse(df)

    # format pvalues
    df["pval"] = df["pval"].astype(float)

    # fdr correction - per hypothesis
    df = fdrCorr(df, id_col="name", p_val_col="pval")

    # write the file
    df.to_csv(CLEAN, sep='\t', index=False)
    print('wrote clean')

    cl_dict["clean_df"] = df  # add the cleaned dataframe to the results dictionary

    ###
    # Standard scale replicates
    ###

    ratios = constants["L2RATIOS"]  # load ratios
    X = df[ratios]

    transformer = StandardScaler()

    t = pd.DataFrame(transformer.fit_transform(X[ratios]))

    t.columns = ratios  # rename columns

    # mean, median of standardized values
    t = computeStats(t, ratios)

    # compute delta
    t = computeDelta(t)

    # compute p value on standardized scale
    t = computePval(t, ratios)

    # add back names (t and df are matched by row index)
    t = pd.merge(df['name'], t, left_index=True, right_index=True)

    # recall active
    t = callActive(t)

    # recall response
    t = callResponse(t)

    # add type information back into transformed dataframe
    # NOTE(review): this joins on `name`; if a name occurs more than once in
    # df the join multiplies rows - confirm names are unique.
    t = pd.merge(t, df[["name",
                        "type", "cl.origin"]])

    # format p values
    t["pval"] = t["pval"].astype(float)

    # fdr correction - per hypothesis
    t = fdrCorr(t, id_col="name", p_val_col="pval")

    # write
    t.to_csv(SCALED, sep='\t', index=False)

    cl_dict["scaled_df"] = t  # add the scaled dataframe to the results dictionary

    ###
    # Peakwise analysis
    ###

    # add enhancer information to dataframes
    t = pd.merge(t, peaks[["name", "enh.coor", "enh.name"]], how="left")
    df = pd.merge(df, peaks[["name", "enh.coor", "enh.name"]], how="left")

    for merged_df, peakfile in [(df, CLEAN_PEAKS), (t, CLEAN_SCALED_PEAKS)]:

        # start each pass (log2, then scaled) from an empty collection
        all_sig_peaks = {}  # requires sig peak, DOES NOT REQUIRE categorical differential activity

        # per enhancer compute peaks
        for enh_id in peaks["enh.name"].unique():

            # perform repeated values t-test
            s, p, enh = peakwiseStats(enh_id, merged_df)

            # annotate candidate peaks - (1) significant (2) turns ON/OFF
            enh["candidate"] = False

            if p < 0.05:  # evaluate significant peaks only.

                # annotate elements where US turns enhancer activity ON, OFF
                enh.loc[enh["direction"].isin(
                    constants["ONOFF"]), "candidate"] = True

            all_sig_peaks[enh_id] = enh  # add peak enhancer dictionary

        sig_all = pd.concat(all_sig_peaks.values()).drop_duplicates()

        # The per-peak tables do not carry the hypothesis labels that
        # fdrCorr groups by, so attach cl.origin and type per tile first.
        sig_all = pd.merge(sig_all, merged_df[["name", "cl.origin", "type"]],
                           how="left").drop_duplicates()

        # fdr correction
        # keep only enh.name and p values, so no fdr inflation from multiple tiles in enhancer
        fdr_df = sig_all[["enh.name", "cl.origin", "type",  "peak_p"]].drop_duplicates().copy()

        # fdr correction - per hypothesis
        fdr_df = fdrCorr(fdr_df, id_col="enh.name", p_val_col="peak_p")

        # merge back fdr information
        sig_all = pd.merge(sig_all, fdr_df, how="left").drop_duplicates()

        # format log10; fdrCorr names its corrected column FDR_<p_val_col>
        sig_all["-log10p_peak"] = -1*np.log10(sig_all["FDR_peak_p"])

        sig_all.to_csv(peakfile, sep='\t', index=False)  # all tiles

        print(sig_all.groupby(["candidate", "FDR_bool"])["direction"].count())

In [None]:
# (rows, columns) of the peak table left over from the last pass of the loop above
sig_all.shape

In [None]:
# rebuild the per-tile peak table from whatever all_sig_peaks currently holds
sig_all = pd.concat(all_sig_peaks.values()).drop_duplicates()

In [None]:
# Manual redo of the peak-level FDR step on the current sig_all.
# Uses the notebook's current `df` and `peakfile`, i.e. whatever the loop
# above assigned last.
# attach cl.origin and type (the hypothesis labels fdrCorr groups by) to every tile
# keep only enh.name and p values, so no fdr inflation from multiple tiles in enhancer
sig_all = pd.merge(sig_all, df[["name", "cl.origin", "type"]], how="left").drop_duplicates()
fdr_df = sig_all[["enh.name", "cl.origin", "type",  "peak_p"]].drop_duplicates().copy()

# fdr correction - per hypothesis
fdr_df = fdrCorr(fdr_df, id_col="enh.name", p_val_col="peak_p")

# merge back fdr information
sig_all = pd.merge(sig_all, fdr_df, how="left").drop_duplicates()


# overwrites the file at the current `peakfile` path
sig_all.to_csv(peakfile, sep='\t', index=False)  # all tiles

# tile counts per (candidate, FDR_bool) combination
print(sig_all.groupby(["candidate", "FDR_bool"])["direction"].count())

In [None]:
# rows of fdr_df per enhancer (non-missing cl.origin), 20 largest counts first
fdr_df.groupby("enh.name")["cl.origin"].count().reset_index().sort_values(by="cl.origin", ascending=False).head(20)

In [None]:
# NOTE(review): checkCoor is not defined in the visible part of this notebook -
# confirm it is defined or imported before this cell is run.
checkCoor(df_, df, "pval")  # calling activity in control

In [None]:
# show the de-duplicated rows of df assigned to enhancer "enh.298"
df.loc[df["enh.name"]=="enh.298"].drop_duplicates()