In [1]:
from Bio.SeqIO.FastaIO import SimpleFastaParser as sfp
import os, sys
import numpy as np
import pandas as pd

import config_readwrite as crw

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

import warnings
warnings.filterwarnings("ignore")
import plot_params as pp

pp.fonts()

('sans-serif', 'Arial', 18)

# config

In [2]:
# read
config, cfn = crw.read(os.path.join(os.path.dirname(os.getcwd()), "config.ini"))

In [5]:
# Input tables for both cell lines. The keys are spelled out so that CL is
# assigned exactly once and always names the cell line analysed below.
DATAHEPG2 = config["mpra"]["HEPG2.clean.transformed"]
DATABJ = config["mpra"]["BJ.clean.transformed"]

# Cell line analysed in this notebook; selects the results sub-directory.
CL = "BJ"

FASTA = config["mpra"]["fasta"]

MEME = config["meme"]["src"]
FIMO = config["fimo"]["src"]
RESULTS = config["path"]["results"]
BED = config["mpra"]["tile.bed"]

RE = os.path.join(RESULTS, "jaspar")
RE_CL = os.path.join(RE, CL)

# make the results directories (RE is created as a parent of RE_CL);
# exist_ok makes the cell safe to re-run
os.makedirs(RE_CL, exist_ok=True)

# append meme path
# NOTE(review): sys.path only affects Python imports; if MEME is meant to
# expose command-line tools, PATH would be the variable to extend - confirm.
sys.path.append(MEME)

# change directory: every relative path used later (e.g. "./fimo") resolves
# against RE_CL from here on
os.chdir(RE_CL)

In [6]:
# write

# Record the JASPAR motif file and the result directories under the
# "JASPAR" section, then write the config back to disk.
JASPAR_MOTIFS = "/wynton/group/ahituv/tfbs_motif/jaspar/JASPAR2022_CORE_non-redundant_pfms_meme.txt"

section = "JASPAR"
crw.check(config, section)  # presumably ensures the section exists - TODO confirm

jaspar_entries = {
    'meme_file': JASPAR_MOTIFS,
    'results': RE,
    CL: RE_CL,
}
for key, value in jaspar_entries.items():
    config[section][key] = value

crw.write(config, cfn)

# load data

In [5]:
def openAndFilter(file_name):
    """Load a tab-separated table and drop synthetic and positive-control rows.

    Rows whose "label" matches "Synthetic" are removed first, then rows whose
    "label" matches "Pos". The frame shape is printed before filtering and
    after each step.

    Returns the filtered dataframe with a fresh 0..n-1 index.
    """
    table = pd.read_csv(file_name, sep='\t')
    print("before filtering", file_name,  table.shape)

    is_synthetic = table["label"].str.contains("Synthetic")
    table = table.loc[~is_synthetic].reset_index(drop=True)
    print("filter out synthetics", table.shape)

    is_positive_ctrl = table["label"].str.contains("Pos")
    table = table.loc[~is_positive_ctrl].reset_index(drop=True)
    print("filter out positive controls", table.shape, '\n\n')

    return table

In [6]:
#hepg2 = openAndFilter(DATAHEPG2)
bj = openAndFilter(DATABJ)

before filtering /wynton/group/ahituv/fongsl/projects/US/data/full_bj_MPRA.clean.transformed.csv (58333, 38)
filter out synthetics (26803, 38)
filter out positive controls (26659, 38) 




In [7]:
bj.shape

(26659, 38)

In [8]:
enh = pd.read_csv(BED, sep='\t')
enh.head()

Unnamed: 0,name,enh.id,enh.name
0,hob_k27ac_down_chr1:826978-827247,chr1:826978-827704,enh.0
1,hob_k27ac_down_chr1:827110-827379,chr1:826978-827704,enh.0
2,hob_k27ac_down_chr1:827377-827646,chr1:826978-827704,enh.0
3,hob_k27ac_down_chr1:827435-827704,chr1:826978-827704,enh.0
4,hob_k27ac_down_chr1:827891-828160,chr1:827891-828160,enh.1


In [9]:
# Attach the enhancer annotation to every tile, keeping all rows of bj.
# No `on=` is given, so pandas joins on every column name the two frames share.
bj = bj.merge(enh, how="left")

# add direction 

In [10]:
def labelDirection(df):
    """Label each row with the direction of change between US and control.

    Reads the columns "l2.ratio.mean.us" (US), "l2.ratio.mean.ctrl" (CTRL)
    and "delta.mean", and writes a "direction" column on `df` in place
    (the same frame is also returned).

    Labels (all three conditions must hold):

        Pos.US_UP      US > 0, CTRL > 0, delta > 0  (both active, US higher)
        Pos.CTRL_UP    US > 0, CTRL > 0, delta < 0  (both active, CTRL higher)
        Neg.CTRL_DOWN  US < 0, CTRL < 0, delta > 0  (both negative, CTRL lower)
        Neg.US_DOWN    US < 0, CTRL < 0, delta < 0  (both negative, US lower)
        US_UP          US > 0, CTRL < 0, delta > 0  (crosses zero, US positive)
        CTRL_UP        US < 0, CTRL > 0, delta < 0  (crosses zero, CTRL positive)

    Rows that match none of the rules (e.g. an exact zero or a missing
    value) keep direction None.

    A random sample of at most 20 labelled rows is printed as a sanity check.
    """
    us = df["l2.ratio.mean.us"]
    ctrl = df["l2.ratio.mean.ctrl"]
    delta = df["delta.mean"]

    # The rules are mutually exclusive, so the assignment order is irrelevant.
    rules = [
        # Positive values only
        ("Pos.US_UP", (us > 0) & (ctrl > 0) & (delta > 0)),
        ("Pos.CTRL_UP", (us > 0) & (ctrl > 0) & (delta < 0)),
        # Negative values only
        ("Neg.CTRL_DOWN", (us < 0) & (ctrl < 0) & (delta > 0)),
        ("Neg.US_DOWN", (us < 0) & (ctrl < 0) & (delta < 0)),
        # crossing zero values only
        ("US_UP", (us > 0) & (ctrl < 0) & (delta > 0)),
        ("CTRL_UP", (us < 0) & (ctrl > 0) & (delta < 0)),
    ]

    df["direction"] = None
    for label, mask in rules:
        df.loc[mask, "direction"] = label

    # `!= None` compares element-wise and is True for every row, so it never
    # filtered anything; notna() really selects the labelled rows.
    labelled = df.loc[df["direction"].notna(), ["l2.ratio.mean.ctrl",
                      "l2.ratio.mean.us", 'delta.mean', "direction"]]

    # Cap the sample size so small frames do not raise in sample().
    if len(labelled) > 0:
        print(labelled.sample(n=min(20, len(labelled))))
    else:
        print(labelled)

    return df


In [11]:
bj = labelDirection(bj)

       l2.ratio.mean.ctrl  l2.ratio.mean.us  delta.mean      direction
10863            0.234363          0.306728    0.072364      Pos.US_UP
1061             0.184807          0.284308    0.099501      Pos.US_UP
19748           -0.843745         -0.874760   -0.031015    Neg.US_DOWN
18972            0.007036         -0.003327   -0.010363        CTRL_UP
13732            0.352636          0.376663    0.024027      Pos.US_UP
22758            0.353077          0.332844   -0.020233    Pos.CTRL_UP
13457            0.067166          0.200141    0.132976      Pos.US_UP
24522           -0.401542         -0.415592   -0.014050    Neg.US_DOWN
26020            0.415838          0.412063   -0.003775    Pos.CTRL_UP
19299           -0.281141         -0.263704    0.017436  Neg.CTRL_DOWN
17194            0.352811          0.376170    0.023359      Pos.US_UP
9865             0.182454          0.265320    0.082866      Pos.US_UP
7312             0.421308          0.455944    0.034636      Pos.US_UP
18214 

In [12]:
set(bj["label"])

{'Differential_ATAC',
 'Differential_H3K27ac',
 'NegCtl_Ilias_MPRA',
 'NegCtl_Vikram_MPRA',
 'Non-differential',
 'Promoter_DEG',
 'Shuffle'}

In [None]:
# 
#fig,ax = plt.subplots(figsize=(6,6))
hue_order = ['US_UP',
             'CTRL_UP',
             'Neg.US_DOWN',
             'Neg.CTRL_DOWN',
             'Pos.US_UP',
             'Pos.CTRL_UP',
             ]

g = sns.jointplot(x="l2.ratio.mean.ctrl", y="l2.ratio.mean.us",
                  data=bj, hue="direction", hue_order=hue_order,
                  palette='tab20'
                 )
g.ax_joint.legend(frameon=False, bbox_to_anchor=(2, 1))

# US up - BJ

## functions

In [None]:
def getDfDirectionSub(df, direction):
    """Return the rows of `df` labelled `direction`, highest US activity first.

    Only the name, label, activity, delta and direction columns are kept;
    the result is an independent copy sorted by 'l2.ratio.mean.us' descending.
    """
    keep_cols = ["name", "label", "l2.ratio.mean.ctrl",
                 'l2.ratio.mean.us', "delta.mean", "direction"]

    matches_direction = df["direction"] == direction
    subset = df.loc[matches_direction, keep_cols]
    ranked = subset.sort_values(by='l2.ratio.mean.us', ascending=False)

    return ranked.copy()


def getQuantileDelta(df, top_quantile, base_df, shuf_bkgd=False):
    """Split `df` into its top and bottom delta.mean quantiles.

    pos: rows with delta.mean >= the `top_quantile` quantile.
    neg: rows with delta.mean <= the `1-top_quantile` quantile; when
         `shuf_bkgd` is True, the de-duplicated "Shuffle"-labelled rows of
         `base_df` are appended to it.

    Returns (pos, pos_names, neg, neg_names), where the name lists hold the
    "name" column of the corresponding frame.
    """
    delta = df["delta.mean"]
    upper_cut = delta.quantile(top_quantile)
    lower_cut = delta.quantile(1-top_quantile)

    pos = df.loc[delta >= upper_cut].copy()
    neg = df.loc[delta <= lower_cut].copy()

    if shuf_bkgd is True:
        is_shuffle = base_df['label'] == "Shuffle"
        shuffled = base_df.loc[is_shuffle].copy().drop_duplicates()
        neg = pd.concat([neg, shuffled])

    return pos, list(pos["name"]), neg, list(neg["name"])


def makeQuantileFasta(pos_names, neg_names, direction, re, top_quantile, exp_name, fa):
    """Write one FASTA per quantile set and return both file paths.

    Sequences named in `pos_names` / `neg_names` are taken from the source
    FASTA `fa` and written into directory `re` as
    "<exp_name>.<direction>.pos.<top_quantile>.fa" and
    "<exp_name>.<direction>.neg.<1-top_quantile, rounded to 1 decimal>.fa".

    Returns (FA_POS, FA_NEG).
    """
    bottom_quantile = round(1-top_quantile, 1)

    # both files share the experiment/direction prefix
    prefix = f"{exp_name}.{direction}"
    FA_POS = os.path.join(re, f"{prefix}.pos.{top_quantile}.fa")
    FA_NEG = os.path.join(re, f"{prefix}.neg.{bottom_quantile}.fa")

    # positive set first, then the negative/background set
    filterWriteFasta(FA_POS, fa, pos_names)
    filterWriteFasta(FA_NEG, fa, neg_names)

    return FA_POS, FA_NEG


def filterWriteFasta(fa_write_file, source_fa, seq_id_list):
    """Copy the records of `source_fa` named in `seq_id_list` to `fa_write_file`.

    Parameters
    ----------
    fa_write_file : path of the FASTA file to (over)write
    source_fa     : path of the full source FASTA
    seq_id_list   : iterable of record titles to keep

    Records are written in source-file order, one header line and one
    sequence line each. A record is kept when the title yielded by the
    parser is in `seq_id_list`. Returns None.
    """
    # set membership is O(1) per record; the list scan was O(len(list))
    wanted = set(seq_id_list)

    # both handles are closed even if parsing or writing raises
    with open(source_fa, "r") as reader, open(fa_write_file, "w") as fa_writer:
        for seq_id, seq in sfp(reader):
            if seq_id in wanted:  # filter for sequences in pos/neg name list
                fa_writer.write(f'>{seq_id}\n{seq}\n')

def getEnhSet(df, name_set):
    """Find the enhancers hit by `name_set` and every tile of those enhancers.

    Returns (enh_names, linkedtiles): the unique "enh.name" values of the rows
    whose "name" is in `name_set`, and the unique "name" values of all rows
    sharing one of those enhancers (used to prevent data leakage).
    """
    in_name_set = df["name"].isin(name_set)
    enh_names = df.loc[in_name_set, "enh.name"].unique()

    same_enhancer = df["enh.name"].isin(enh_names)
    linkedtiles = df.loc[same_enhancer, "name"].unique()

    return enh_names, linkedtiles

def runFimo(fimo, meme_txt, fa, direction, quantile):
    """Build the FIMO command for one FASTA and prepare its output directory.

    Parameters
    ----------
    fimo      : FIMO executable (path or command name)
    meme_txt  : motif file in MEME format
    fa        : FASTA file to scan
    direction : label used in the output directory name
    quantile  : label used in the output directory name

    The command is printed, not executed: the os.system call below is
    disabled, so the printed command has to be run separately to produce the
    FIMO output. The output directory "./fimo/<direction>.<quantile>" is
    relative to the current working directory.

    Returns the output directory path.
    """
    OUTDIR = f"./fimo/{direction}.{quantile}"
    cmd = " ".join([fimo,
                    #"--skip-matched-sequence",
                    "--best-site",
                    "--oc",
                    OUTDIR,
                    meme_txt,
                    fa,
                    ])
    print(cmd)

    # Only a missing output directory triggers the setup (and, if re-enabled,
    # the FIMO run), so existing results are not recomputed.
    if not os.path.isdir(OUTDIR):
        # creates "./fimo" as needed and tolerates concurrent creation
        os.makedirs(OUTDIR, exist_ok=True)
        #os.system(cmd)

    return OUTDIR

## Experiment 1: FIMO motif finding on the top 10% of US-increasing sequences v. the bottom 10% of US-increasing sequences + shuffled sequences + the strongest CTRL_UP sequences. 
    - Hypothesis: the top 10% reflects true signal, the bottom 10% is noise, the shuffles are noise, and the repressed sequences are a different set of elements. Motifs enriched in the top 10% of US-sensitive elements that are not found in the background set are likely important for US-dependent TF binding. 
    - Data universe: BJ MPRA, all of the sequences where the control MPRA value is negative and the US MPRA value is positive

In [None]:
EXP_NAME ="exp1"
DIRECTION = "US_UP"
NEG_DIRECTION="CTRL_UP"
df = bj
TOP_QUANTILE = 0.9
BOTTOM_QUANTILE = round((1-TOP_QUANTILE),1)

### US-increasing sequences only

In [None]:
### subset df by direction. 

df_dir = getDfDirectionSub(df, DIRECTION)
neg_df_dir = getDfDirectionSub(df, NEG_DIRECTION)

### top and bottom 10% of direction changes. 

# top / bottom delta.mean quantiles of the DIRECTION set; the shuffled
# controls are appended to the background (neg) here
pos, pos_names, neg, neg_names = getQuantileDelta(df_dir, TOP_QUANTILE, df, True)
print(len(neg_names))

# bottom delta.mean quantile of the NEG_DIRECTION set, to add to the background.
# shuf_bkgd=False: the shuffles are already in `neg`; requesting them again
# duplicated every shuffled row in `neg` and `neg_names`.
neg_pos, neg_pos_names, neg_neg, neg_neg_names = getQuantileDelta(neg_df_dir, TOP_QUANTILE, df, False)

# add the NEG_DIRECTION names to the negative list. 
neg_names.extend(neg_neg_names)
neg = pd.concat([neg, neg_neg])
print(len(neg_names))

# make fasta files from quantiles. 
FA_POS, FA_NEG = makeQuantileFasta(pos_names, neg_names, DIRECTION, RE_CL, TOP_QUANTILE, EXP_NAME, FASTA)

# add to config
# NOTE(review): these keys are only set in memory; no crw.write(config, cfn)
# follows in this cell - confirm whether they should be persisted.
config[section][f'{CL}.{DIRECTION}.{TOP_QUANTILE}.fa'] = FA_POS
config[section][f'{CL}.{DIRECTION}.{BOTTOM_QUANTILE}.fa'] = FA_NEG

exp1up, exp1down = pos_names, neg_names

In [None]:
neg = neg.loc[~neg["name"].isna()]
neg.loc[neg["name"].str.contains("shuf")]

In [None]:
plot = pd.concat([pos, neg])
# fig,ax = plt.subplots(figsize=(6,6))
hue_order = ['US_UP',
             'CTRL_UP',
             'Neg.US_DOWN',
             'Neg.CTRL_DOWN',
             'Pos.US_UP',
             'Pos.CTRL_UP',
             ]

g = sns.jointplot(x="l2.ratio.mean.ctrl", y="l2.ratio.mean.us",
                  data=plot, hue="direction", hue_order=hue_order,
                  palette='tab20'
                 )
g.ax_joint.legend(frameon=False, bbox_to_anchor=(2, 1))

### Fimo on full dataset

        bottom 0.1 US_UP bkgd discovered motifs
        CAGRCTCTCS
        CCTGKRTGWGW
        GAGRAGVMAGCMK
        
        bottom0.1 US_UP and shuffle bkgd
        AGGAGGMAGC
        AGRSCCCAGC
        CARRCTCTCCT

In [None]:
#OUTDIR_ALL =  runFimo(FIMO, JASPAR_MOTIFS, FASTA, "ALL", 'ALL')

### FIMO on the top 10% of US_UP (delta.mean ≥ 0.9 quantile)

In [None]:
if os.path.exists("./fimo") is False:
    os.mkdir("./fimo")

OUTDIR_TOP =  runFimo(FIMO, JASPAR_MOTIFS, FA_POS, DIRECTION, TOP_QUANTILE)

### FIMO on the background set: bottom 10% of US_UP + shuffles + strongest 10% of CTRL_UP

In [None]:
OUTDIR_BOTTOM =  runFimo(FIMO, JASPAR_MOTIFS, FA_NEG, DIRECTION, BOTTOM_QUANTILE)

In [None]:
# FIMO result tables for the foreground (top) and background (bottom) runs
top_pred = os.path.join(OUTDIR_TOP, "fimo.tsv")
bottom_pred = os.path.join(OUTDIR_BOTTOM, "fimo.tsv")

# foreground hits, tagged with their set label and -log10(q-value)
tdf = pd.read_csv(top_pred, sep='\t').assign(**{
    "quantile": 'top0.9',
    "-log10q": lambda hits: -1*np.log10(hits["q-value"]),
})
print(tdf.shape)


# background hits, same annotation
bdf = pd.read_csv(bottom_pred, sep='\t').assign(**{
    "quantile": 'shuf',
    "-log10q": lambda hits: -1*np.log10(hits["q-value"]),
})
print(bdf.shape)

In [None]:
# concatenate the top and bottom values
plot = pd.concat([tdf, bdf])

plot = plot.loc[~plot["motif_alt_id"].isna()]

### filter for significance

In [None]:
QTHRESH =0.01
sigt, sigb=tdf.loc[tdf["q-value"]<=QTHRESH].copy(), bdf.loc[bdf["q-value"]<=QTHRESH].copy()
len(set(sigt["motif_alt_id"])), len(set(sigb["motif_alt_id"]))

In [None]:
sigt

In [None]:
sigb[["sequence_name", 'motif_alt_id']].drop_duplicates().sort_values(by="sequence_name")#.iloc[0][0]

### Motifs found in the top 10% of US_UP (delta.mean ≥ 0.9 quantile)

In [None]:
sigt.groupby(["motif_alt_id"])["motif_id"].count().reset_index().sort_values(by="motif_id", ascending=False)

### Motifs found in the background set (bottom 10% of US_UP + shuffles + CTRL_UP)

In [None]:
sigb.groupby(["motif_alt_id"])["motif_id"].count().reset_index().sort_values(by="motif_id", ascending=False)

## Shared motifs between the top 10% of US_UP and the background set

In [None]:
# overlap between positive and negative sets
print(len(set(sigt["motif_alt_id"]).intersection(set(sigb["motif_alt_id"]))))
set(sigt["motif_alt_id"]).intersection(set(sigb["motif_alt_id"]))

## Motifs unique to the top 10% of US_UP

In [None]:
# motifs in positive, not in negative. 
us_up_motifs = set(sigt["motif_alt_id"]).difference(set(sigb["motif_alt_id"]))
print(len(us_up_motifs))
us_up_motifs

In [None]:
sigt.loc[sigt["motif_alt_id"].isin(us_up_motifs), ['sequence_name', "motif_alt_id"]].drop_duplicates().groupby("motif_alt_id")["sequence_name"].count().reset_index().sort_values(by="sequence_name", ascending=False)

## positive, us-sensitive TF motifs 
- from Yang 2023 - https://www.nature.com/articles/s42255-023-00804-z#Fig5

In [None]:
POSITIVE_CONTROL=["FOS", 'DUSP1', "NR4A1", "EGR1", 
                  "Plagl1" # not positive control, but found in BJ_U9
                 ]
motif_counts = sigt.groupby(["motif_alt_id"])["motif_id"].count().reset_index().sort_values(by="motif_id", ascending=False)
motif_counts.loc[motif_counts["motif_alt_id"].isin(POSITIVE_CONTROL)]

In [None]:
sigt.loc[sigt['motif_alt_id']=="Plagl1"]

In [None]:
sigb.loc[sigb['motif_alt_id']=="Plagl1"]

## Motifs unique to the background set (bottom 10% of US_UP + shuffles + CTRL_UP)

In [None]:
# motifs in negative (background) set not in positive set
# (the set was previously differenced with itself, which is always empty)
len(set(sigb["motif_alt_id"]).difference(set(sigt["motif_alt_id"])))

# Luciferase data 

## load experiment data 

In [None]:
LUC="/wynton/group/ahituv/fongsl/projects/US/data/validation_luc/Candidate_ID_table_luc.txt"

luc = pd.read_csv(LUC, sep='\t')

luc_names = list(luc["insert"])  # list of tiles tested in luciferase


# successful results where luciferase direction matches 
success = ["BJ_D2", "BJ_D3", "BJ_D4", "BJ_D7", "BJ_D8", 'BJ_D9', "BJ_U9"]

# annotate dataframe with sucesses
luc["success"] = False
luc.loc[luc["Sample name"].isin(success), "success"] = True
#luc.loc[~luc["Sample name"].str.contains(CL), "success"] = "other-cl"

luc.rename(columns = {"insert":"sequence_name", 
                     "Sample name": 'label_id'}, inplace=True)
luc

## make fasta

In [None]:
# str.strip(".txt") removes any of the characters ".", "t", "x" from BOTH ends
# of the path rather than the ".txt" suffix; splitext drops only the extension.
LUC_FA = os.path.splitext(LUC)[0] + '.fa'
filterWriteFasta(LUC_FA, FASTA, luc_names)

## fimo

In [None]:
OUTDIR_LUC =  runFimo(FIMO, JASPAR_MOTIFS, LUC_FA, "Luc", "all")

In [None]:
pred = os.path.join(OUTDIR_LUC, "fimo.tsv")

tdf = pd.read_csv(pred, sep='\t')
tdf["quantile"] = 'LUC.matches'
tdf["-log10q"]  = -1*np.log10(tdf["q-value"])
print(tdf.shape)
tdf = tdf.loc[~tdf["motif_alt_id"].isna()]
tdf

In [None]:
# filter by q-vales
luc_fimo = pd.merge(luc[["label_id", "sequence_name", "success"]],
                    tdf.loc[tdf["q-value"]<=QTHRESH], how="right").sort_values(by='success', ascending=False)
luc_fimo['label_id_success'] = luc_fimo["label_id"] + ".success." + luc_fimo['success'].map(str)

luc_fimo.loc[luc_fimo['success']==True].sort_values(by='label_id')

# investigate luc elements with motifs q<=0.01

In [None]:
# One figure per BJ luciferase construct: every significant motif hit is drawn
# as a horizontal bar from start to stop. Bars are stacked by q-value rank
# (0 = smallest q-value); minus-strand hits are mirrored below the zero line.
for ID in luc_fimo.sort_values(by="success", ascending=False)["label_id_success"].unique():
    print(ID)

    # skip missing IDs (NaN is not a string, so `"BJ" in ID` would raise)
    # and constructs that are not BJ
    if not isinstance(ID, str) or "BJ" not in ID:
        continue

    hits = (luc_fimo.loc[luc_fimo["label_id_success"] == ID]
            .copy()
            .sort_values(by="q-value")
            .reset_index(drop=True))

    fig, ax = plt.subplots()
    for rank, row in hits.iterrows():
        # put negative strand ranks underneath the position axis
        y = -rank if row["strand"] == "-" else rank

        ax.plot([row["start"], row["stop"]], [y, y], lw=10)
        ax.text(row["start"], y + 0.01, row["motif_alt_id"])

    ax.axhline(0, ls="--", color="grey")
    ax.set_ylabel("rank")
    ax.set_xlabel("pos")
    # title comes from the loop ID, not from the last row of the inner loop
    ax.set_title(ID)
    ax.set_xlim(0, 270)  # presumably the tile length in bp - TODO confirm
    plt.show()



In [None]:
data=luc_fimo.loc[luc_fimo["success"]==True].sort_values(by='success', ascending=False)
table = data.groupby(["motif_alt_id", "label_id", "success"])['-log10q'].max().reset_index(
).pivot(index="motif_alt_id", columns=[ "label_id",], values="-log10q")#.fillna(0)

In [None]:
fig, ax =plt.subplots(figsize=(6,6))
sns.heatmap(table, mask=table<np.log10(0.05)*-1,
            square=True, 
            cmap="bwr", 
            #annot=True, 
            #center = np.log10(0.05)*-1,
            cbar_kws={"label":"-log10q"},

           )