# 20240313
sarah fong

parse the synthetic motifs fasta file. 

In [1]:
from Bio.SeqIO.FastaIO import SimpleFastaParser as sfp
from Bio.Seq import reverse_complement
import numpy as np
import os, sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
import config_readwrite as crw

In [28]:
LOCAL = False

if LOCAL is True:
    DATA_PATH = "/Users/sarahfong/Desktop/local_data/US/MPRA"
    RE = "/Users/sarahfong/Desktop/local_data/US/results/mpra/eda/synthetics"
    DESIGN = "/Users/sarahfong/Desktop/local_data/US/results/CD_candidates/motifs_synthetic_noms.csv"
    sys.path.append("/Users/sarahfong/tools/py_/")

else:
    DATA_PATH = '/wynton/group/ahituv/fongsl/projects/US/data/'  # need to check
    RE = "/wynton/group/ahituv/fongsl/projects/US/results/mpra/synthetics"
    DESIGN = "/wynton/group/ahituv/fongsl/projects/US/results/CD_candidates/motifs_synthetic_noms.csv"


os.chdir(DATA_PATH)
if os.path.exists(RE) is False:
    os.mkdir(RE)
    
CLS = ["hepg2", "bj"]
FA = "./ultrasound_final_no_adapter.fa"
CLEAN_SYN_LIB = "./clean_syn_mpra_lib.tsv"
CLEAN_DESIGN = "./clean_motifs_synthetic_noms.tsv"

HEPG2_MPRA_SYN = f"./{CLS[0]}_MPRA.clean.transformed.standard.scaled.synthetics.only.tsv"
HEPG2_MPRA_SYN_BS = f"./{CLS[0]}.pos.exp.bs.tsv"

BJ_MPRA_SYN = f"./{CLS[1]}_MPRA.clean.transformed.standard.scaled.synthetics.only.tsv"
BJ_MPRA_SYN_BS = f"./{CLS[1]}.pos.exp.bs.tsv"

import bootstrap_dist as bd

In [29]:
config, cfn = crw.read(os.path.join("/wynton/home/ahituv/fongsl/EMF/US/", "config.ini"))

config_dict = {
    "CLEAN_SYN_LIB":CLEAN_SYN_LIB, 
    "CLEAN_DESIGN":CLEAN_DESIGN, 
    "HEPG2_MPRA_SYN":HEPG2_MPRA_SYN, 
    "BJ_MPRA_SYN":BJ_MPRA_SYN,
    "HEPG2_MPRA_SYN_BS":HEPG2_MPRA_SYN_BS, 
    "BJ_MPRA_SYN_BS":BJ_MPRA_SYN_BS, 
    
}

# make data section of config
section = "SYNTHETICS"
crw.check(config, section)

# add dictionary to config
for key, value in config_dict.items():
    config[section][key] = value
    
# write to config    
crw.write(config, cfn)

# functions for motif parsing
    - parse library fasta motif names

## PROBLEM: multiple motif names

    - SYNTHETIC:_The_motif_CCAGGAACAG_was_added_in_Pos:1
    - Neg_shuffle_158_SYNTHETIC:_The_motif_TAATT_was_added_in_Pos:85_and_the_motif_CAGGAAGG_was_added_in_Pos:185
    -SYNTHETIC:_Added_GCCCGGGGG,AGCAACCA,GGCCGTCTGG,GTATCAAAGT,GGATGAGTCATCG,CAAGTCAGCAATTTT,AATTA,TGTAACAA_at_positions_91,213,52,251,193,121,84,240

In [4]:
def get_motifs(seq_id):
    pos_motif ={} # collect motifs
    next_motif, next_pos = -1,-1
    
    # str split motif ugly string
    for n, i in enumerate(seq_id.split("_")):
        
        if i == "motif" or i == "Added":  # prepare to read the motif info, SOME IDS READ 'THE_MOTIF_XXX_WAS_ADDED_IN_POS:YYY', OTHERS READ "ADDED_XXX_AT__POSITIONS_Y1,Y2,Y3". WTF.
            next_motif=n+1
                
        elif n == next_motif:
            #print("motif", i)
            motif = i # motif 
            next_motif = 0 # reset next_motif 
            
        if i == "in" or i=="positions": # prepare to read the position info
            next_pos = n+1
            
        elif n == next_pos:  # store position(s) and motif(s)
            if 'Pos' in i:
                pos = i.split(':')[1] # position (could be multiple)
            else:
                pos = i
            if "," in motif:  # handle heterotypic insertion positions and motifs
                for motif_, pos_ in zip(motif.split(","), pos.split(",")):
                    #print('pos', pos_, "motif_", motif_)
                    pos_motif[pos_] = motif_
            else:
                for pos_ in pos.split(","):  # split on comma if there are multiple insertions
                
                    pos_motif[pos_] = motif # add the position and motif to the dictionary
                
            next_motif, next_pos = -1,-1  # reset counters
            motif, pos='',''  # reset motif and position values. 
    return pos_motif

## parse motifs

In [5]:
fa_dict={}
with open(FA, 'r') as fasta:
    for value in sfp(fasta):
        
        seq_id, seq = value
       
        if "SYN" in seq_id:
            
            motif_dict = get_motifs(seq_id)
        
            fa_dict[seq_id] = motif_dict

len(fa_dict.keys())

39180

# per sequence motifs to a dataframe

In [6]:
# parse and make dataframe of seq results
results = {}
for key, value in fa_dict.items():

    """annotate homotypic, heterotypic, or single insertions, 

    annotate shuffled sequences
    """
    d = pd.DataFrame(value.items(), columns=["pos", "motif"])

    d["shuf"], d["seq"] = False, key # columns to reannotate

    # str split synthetic sequences
    # synthetic
    if d.shape[0] < 2:
        d["insert_type"] = "single"

    # heterotypic
    elif "_and_the_motif" in key:
        d["insert_type"] = "het"
    # homotypic    
    elif "The" in key:
        d["insert_type"] = "homo"
        
    elif "_Added_" in key:
        d["insert_type"] = "het"

    # shuffled
    if "Neg_shuffle" in key:
        d["insert_type"] = "shuf-whole"
        d['shuf'] = True
    elif 'The_shuffled_version' in key:

        # need to avoid annotations like this - "SYNTHETIC:_The_shuffled_version_of_the_motif_GGGGCGGGG_as_GGGGCGGGG_was_added_in_Pos:85"
        original_motif = (key.split("_as_")[0]).split(
            "The_shuffled_version_of_the_motif_")[1]
        
        shuffled_motif = (key.split("_as_")[1]).split("_was_added_in_")[0]

        if original_motif == shuffled_motif:
            print(original_motif, shuffled_motif)
            d["insert_type"] = "single"
        else:
            d["insert_type"] = "shuf-motif"
            d['shuf'] = True

    results[key] = d

GGGGCGGGG GGGGCGGGG
CCCCGCCCC CCCCGCCCC
CCCCCCCC CCCCCCCC
GGGGGGGG GGGGGGGG
CCCCCCCCCCC CCCCCCCCCCC
GGGGGGGGGGG GGGGGGGGGGG
TAATT TAATT
AATTA AATTA


## add library

In [7]:
# make dataframe combining all sequences
lib = pd.concat(results.values()).reset_index(drop=True).drop_duplicates()
lib["pos"] = lib["pos"].astype(int)
print(lib.shape)
lib.head()

(196597, 5)


Unnamed: 0,pos,motif,shuf,seq,insert_type
0,1,CCAGGAACAG,False,SYNTHETIC:_The_motif_CCAGGAACAG_was_added_in_P...,single
1,1,CTGTTCCTGG,False,SYNTHETIC:_The_motif_CTGTTCCTGG_was_added_in_P...,single
2,9,CCAGGAACAG,False,SYNTHETIC:_The_motif_CCAGGAACAG_was_added_in_P...,single
3,9,CTGTTCCTGG,False,SYNTHETIC:_The_motif_CTGTTCCTGG_was_added_in_P...,single
4,17,CCAGGAACAG,False,SYNTHETIC:_The_motif_CCAGGAACAG_was_added_in_P...,single


In [8]:
#
print("e.g. single", lib.loc[lib['insert_type']=="single"]["seq"].iloc[0])
print("e.g. homotypic", lib.loc[lib['insert_type']=="homo"]["seq"].iloc[0])
print("e.g. heterotypic", lib.loc[lib['insert_type']=="het"]["seq"].iloc[0])
print("e.g. shuf-whole", lib.loc[lib['insert_type']=="shuf-whole"]["seq"].iloc[0])
print("e.g. shuf-motif", lib.loc[lib['insert_type']=="shuf-motif"]["seq"].iloc[0])


e.g. single SYNTHETIC:_The_motif_CCAGGAACAG_was_added_in_Pos:1
e.g. homotypic SYNTHETIC:_The_motif_CCAGGAACAG_was_added_in_Pos:125,145
e.g. heterotypic SYNTHETIC:_The_motif_CCAGGAACAG_was_added_in_Pos:85_and_the_motif_ACACACACAC_was_added_in_Pos:185
e.g. shuf-whole Neg_shuffle_1_SYNTHETIC:_The_motif_ACTTTGATAC_was_added_in_Pos:41
e.g. shuf-motif SYNTHETIC:_The_shuffled_version_of_the_motif_CCAGGAACAG_as_GAAGCACACG_was_added_in_Pos:85


## compute features of fasta library

In [9]:
# count the number of motifs inserted
inserts = lib.groupby(["seq", 'insert_type'])['pos'].count().reset_index().copy().sort_values(by="pos")
inserts.rename(columns={"pos": "n_inserts"}, inplace=True)  # rename the count column

# add n_inserts into lib dataframe
lib = pd.merge(lib, inserts)  

# measure motif length
lib["motif_len"] = lib["motif"].apply(lambda x: len(x))

In [10]:
lib.head()

Unnamed: 0,pos,motif,shuf,seq,insert_type,n_inserts,motif_len
0,1,CCAGGAACAG,False,SYNTHETIC:_The_motif_CCAGGAACAG_was_added_in_P...,single,1,10
1,1,CTGTTCCTGG,False,SYNTHETIC:_The_motif_CTGTTCCTGG_was_added_in_P...,single,1,10
2,9,CCAGGAACAG,False,SYNTHETIC:_The_motif_CCAGGAACAG_was_added_in_P...,single,1,10
3,9,CTGTTCCTGG,False,SYNTHETIC:_The_motif_CTGTTCCTGG_was_added_in_P...,single,1,10
4,17,CCAGGAACAG,False,SYNTHETIC:_The_motif_CCAGGAACAG_was_added_in_P...,single,1,10


## write cleaned synthetic library to tsv

In [11]:
lib.rename(columns={"seq":"name"}).to_csv(CLEAN_SYN_LIB, sep='\t', index=False)

# design file

In [12]:
design = pd.read_csv(DESIGN)
design.columns=['design_cl', "design_assay", "design_direction", 
"design_motif", "motif", "motif_name", "p", "perc_target_w_motif", "perc_bkgd_w_motif", "quadrant"]

design.head()

Unnamed: 0,design_cl,design_assay,design_direction,design_motif,motif,motif_name,p,perc_target_w_motif,perc_bkgd_w_motif,quadrant
0,bj,k27ac,up,known,CCAGGAACAG,AR-halfsite(NR)/LNCaP-AR-ChIP-Seq(GSE27824)/Homer,1e-50,82.0,47.0,1
1,bj,k27ac,up,de novo,ACACACACAC,KLF9/MA1107.2/Jaspar(0.745),1e-55,84.0,47.0,1
2,bj,k27ac,down,known,GGCCGTCTGG,Smad4(MAD)/ESC-SMAD4-ChIP-Seq(GSE29422)/Homer,1e-06,76.0,24.0,1
3,bj,k27ac,down,de novo,AAGGCTGGGACC,POL011.1_XCPE1/Jaspar(0.642),1e-08,14.0,0.01,1
4,hepg2,k27ac,up,known,AGGCCTGG,ZFX(Zf)/mES-Zfx-ChIP-Seq(GSE11431)/Homer,1e-07,77.0,37.0,1


## reverse complement motifs

In [13]:
if "rev" not in design.columns:
    design["rev"] = 0  # annotate reverse complement in column
    
    rev = design.copy().drop_duplicates()  # copy design file
    
    rev["rev"] = 1  # re-annotate reverse complement (make binary column)
    rev["motif"] = rev["motif"].apply(lambda x: reverse_complement(x))  # re-annotate reverse complement in motif column 
    
    print(design.shape)
    design = pd.concat([design, rev]) # combine forward and reverse complement motifs
    print(design.shape)

(64, 11)
(128, 11)


## write clean design with reverse complement information

In [14]:
design.to_csv(CLEAN_DESIGN, sep='\t', index=False)

# MPRA 

## stupid function

In [22]:
"""
## PROBLEM: MOTIF names are annotated differently 
same sequence in library and mpra dataframe
    
    lib.loc[lib["seq"].str.contains("Neg_shuffle_156_SYNTHETIC:_"), "seq"].iloc[0]

    mpra.loc[mpra["name"].str.contains("Neg_shuffle_156_SYNTHETIC:_"), "name"].iloc[0]
    
library 
    
    Neg_shuffle_156_SYNTHETIC:_Added_ATGCTGAC,TTTTATAA,TGTAACAA,AATTA,GTATCAAAGT,CCCCGGCGCCCCCTGGTGGC_at_positions_6,146,87,257,187,230
mpra dataframe 

    Neg_shuffle_156_SYNTHETIC:_Added_ATGCTGAC_TTTTATAA_TGTAACAA_AATTA_GTATCAAAGT_CCCCGGCGCCCCCTGGTGGC_at_positions_6_146_87_257_187_230
"""

def stupidMPRAStrSplitID(seq_id):
    """position and motifs are not separated by commas, but underscores in the MPRA library file. 
    This is stupid. It took me a solid 30 minutes to figure this out and clean this up! F. 
    It does not match the fasta library file. 
    I have to very carefully change these annotations to match the library file and I think it is stupid. 
    In the future, there will be a better way of matching these ids with AN INDEX FOR EACH SEQUENCE ID. 
    WE DON"T HAVE TO LIVE UNDER ROCKS
    """
    full_id = []
    next_motif, next_pos = -1,-1
    motif, pos = [], []
    end_motif, end_pos = -1, -1
    for n, i in enumerate(seq_id.split("_")):
        
        if i == "Added":
            next_motif = n
            full_id.append(i) # add to str
        
        elif n > next_motif and next_motif >0 and i!="at":
            motif.append(i)  # add to str
        
        elif i=="at":
            full_id.append(",".join(motif)) # add to str
            full_id.append(i) # add to str
            
            next_motif =-1
            next_pos = n
            
        elif n >next_pos and next_pos>0 and i!="positions":
            
            pos.append(i)
        else:
            full_id.append(i) # add to str
    full_id.append(",".join(pos))
    lib_matched_id = "_".join(full_id)

    return lib_matched_id

def analyzeBootstrap(list, quantile):
    """process the bootstrapping with a list of values and a quantile, 
       be conservative—return the most extreme of the 2.5% estimate or the max 97.5% estimate
    """
    discrete, relative = bd.bootstrap(list, None, quantile)
   
    return discrete

## clean and merge MPRA

In [30]:
for CL in CLS:
    print(CL)

    bs_expectations = {}  # collect bootstrapped expectations

    # read input data
    MPRA = config["mpra"][f"{CL}.clean.trans.scaled"]

    # write
    MPRA_SYN = config_dict[f"{CL.upper()}_MPRA_SYN"]
    MPRA_SYN_BS = config_dict[f"{CL.upper()}_MPRA_SYN_BS"]

    # load mpra data
    mpra = pd.read_csv(MPRA, sep='\t')
    
    # keep only synthetics
    mpra = mpra.loc[mpra["name"].str.contains("SYNTHETIC:")]
    print(mpra.shape)

    # rectify stupid annotations not in the MPRA.
    # separate the good mpra annotations that match the fasta library (n=10721)
    fine = mpra.loc[mpra["name"].isin(lib["seq"])].copy()

    # from the bad mpra annotations that do not match the fasta library (n=28459)
    needs_resolve = mpra.loc[~mpra["name"].isin(lib["seq"])].copy()

    # resolve annotation issue with str splitting function.
    needs_resolve["name"] = needs_resolve["name"].apply(
        lambda x: stupidMPRAStrSplitID(x))

    resolved = pd.concat([fine, needs_resolve])  # combine the good annotations
    print(resolved.shape, mpra.shape)

    # merge lib with resolved mpra on seq name
    df = pd.merge(lib, resolved, how="left", left_on="seq", right_on="name")

    # drop nas
    df = df.loc[~df["delta.mean"].isna()]

    print("dropped nas", df.shape)
    
    # write 
    df.to_csv(MPRA_SYN, sep='\t', index=False)

    # bootstrap by position
    # - first check if there is a correlation between motif length and ctrl MPRA activity.
    # - Spot checking position 1, 60, 227 - there is no correlation between motif leng and activity.

    bs_positions = {}

    # bootstrap CI of activity per position, per insertion type
    for insertion in ["single", "het", 'shuf-motif']:
        print("bs", insertion)
        for pos in set(df["pos"]):
            # df of position and insertion 
            posdf = df.loc[(df["pos"] == pos) &
                           (df["insert_type"] == insertion)
                           ].copy()  # subset the dataframe

            # bs if observations are more than 30 (central limit theorem heuristic)
            if posdf.shape[0]>30:
    
                data_list = [pos]
                for n, activity_measure in enumerate(["l2.ratio.med.ctrl", "l2.ratio.med.us"]):
    
                    # get vector of activity values to bootstrap from
                    activity_vector = list(posdf[["seq", activity_measure]
                                                 ].drop_duplicates()[activity_measure])
    
                    # bootstrap the vector fo confidence intervals
                    bs025_lo, bs025_hi = analyzeBootstrap(activity_vector, 0.025)
                    bs975_lo, bs975_hi = analyzeBootstrap(activity_vector, 0.975)
    
                    # add datalist to dictionary for each activity measure
                    bs_positions[f"{insertion}.{pos}.{n}"] = data_list + \
                        [insertion, activity_measure, len(activity_vector),
                         bs025_lo, bs025_hi, bs975_lo,  bs975_hi]

    # concat results across positions
    bs_pos_results = pd.DataFrame(np.vstack(bs_positions.values()))
    bs_pos_results.columns = [
        "insert_type", "pos", "activity_measure", "vector_size", "bs025_lo", 'bs025_hi', "bs_975_lo", "bs_975_hi"]

    # write!
    bs_pos_results.to_csv(MPRA_SYN_BS, sep='\t', index=False)

hepg2
(31805, 25)
(31805, 25) (31805, 25)
dropped nas (148583, 32)
bs single
bs het
bs shuf-motif


  bs_pos_results = pd.DataFrame(np.vstack(bs_positions.values()))


bj
(31714, 24)
(31714, 24) (31714, 24)
dropped nas (147947, 31)
bs single
bs het
bs shuf-motif


  bs_pos_results = pd.DataFrame(np.vstack(bs_positions.values()))


In [33]:
bs_pos_results.groupby(["pos", ])["vector_size"].count()

pos
het           526
shuf-motif      2
single         68
Name: vector_size, dtype: int64

## annotate shuf-motif dist

In [None]:

###
# annotate bootstrapped 95% CI from shuffled motifs - most appropriate for single insertion model. But bad control because limited to one positional insert.
###


# isolate the shuffled distribution
shuf = df.loc[df["insert_type"] == "'shuf-motif'"].drop_duplicates().copy()

bs_shuf_exp = {}
for activity_measure in ["l2.ratio.med.ctrl", "l2.ratio.med.us", "delta.mean"]:

    # get vector of activity values to bootstrap from
    activity_vector = list(shuf[["seq", activity_measure]
                                ].drop_duplicates()[activity_measure])

    # bootstrap the vector fo confidence intervals
    bs025_lo, bs025_hi = analyzeBootstrap(activity_vector, 0.025)
    bs975_lo, bs975_hi = analyzeBootstrap(activity_vector, 0.975)

    bs_shuf_exp[activity_measure] = [
        bs025_lo, bs025_hi, bs975_lo,  bs975_hi]

# annotate elements that exceed the shuffle bootstrap
df['exceeds_shuf.delta'] = None
df.loc[df["delta.mean"] > bs_shuf_exp["delta.mean"]
       [-1], "exceeds_shuf.delta"] = "UPPER"
df.loc[df["delta.mean"] < bs_shuf_exp["delta.mean"]
       [0], "exceeds_shuf.delta"] = "LOWER"

# annotate elements that exceed bootstraps in either environment.
for col, activity_measure in zip(["outside_shuf95.ctrl", "outside_shuf95.us"],
                                 ["l2.ratio.med.ctrl", "l2.ratio.med.us", ]):
    df[col] = 0

    # mark sequences that exceed the bootstrapped CIs
    df.loc[(df[activity_measure] > bs_shuf_exp[activity_measure][-1]) |
           (df[activity_measure] < bs_shuf_exp[activity_measure][0]), col] = 1

    if col == "outside_shuf95.ctrl":
        df[col] = df[col]*-1
# if exceeds any ctrl or us difference, sum the bs exceeding labels -
# if exceeds both us and ctrl, mark as zero because we're interested in gains or loss due to US sensitivity.
df["outside_shuf95.dif"] = df["outside_shuf95.ctrl"]+df["outside_shuf95.us"]



# write bootstrapped dataframe to file.
pd.DataFrame(bs_shuf_exp.items()).to_csv(
    f"{CL}.shuf.bs.tsv", sep='\t', index=False)