# Process the MPRA library FASTA
- Outputs:
    - metadata.tsv
    - tiles.bed
    - full.bed  # merged enhancers

In [1]:
# Run-environment switch: True selects the /Users/... paths, False the /wynton/... data path
# (see the DATA_PATH selection that follows the imports).
LOCAL = False

from Bio.SeqIO.FastaIO import SimpleFastaParser

import matplotlib.pyplot as plt
import numpy as np
import os, sys

import pandas as pd
import pybedtools as pbt

import seaborn as sns
from scipy import stats
from sklearn.preprocessing import RobustScaler, StandardScaler

import sys

In [2]:
# Choose the data directory for this run. Only the local setup also puts the
# helper-module directory on sys.path before the helper imports below.
if LOCAL is not True:
    DATA_PATH = "/wynton/group/ahituv/fongsl/projects/US/data"
else:
    DATA_PATH = "/Users/sarahfong/Desktop/local_data/EMF/US"
    sys.path.append("/Users/sarahfong/tools/py_")

# project helper modules: config read/write and plot parameters
import config_readwrite as crw
import plot_params as pp

# set up plot fonts (the returned tuple is what the cell displays)
pp.fonts()

('sans-serif', 'Arial', 18)

# config

In [3]:
# load the project config that sits one directory above the working directory
config, cfn = crw.read(os.path.join(os.path.dirname(os.getcwd()), "config.ini"))

# write the config straight back out before anything is added to it
crw.write(config, cfn)

# input/output paths to register in the config, keyed by option name;
# every path lives under DATA_PATH
config_dict = {
    key: os.path.join(DATA_PATH, *parts)
    for key, parts in (
        ("FASTA", ("ultrasound_final_no_adapter.fa",)),  # read
        ("META", ("ultrasound_final_no_adapter.metadata.tsv",)),  # write
        ("TILE", ("genome_loci", "tiles.bed")),  # write
        ("ENH", ("genome_loci", "full.bed")),  # write
        ("TILE_x_ENH", ("genome_loci", "tiles.x.full.bed")),  # write
        ("ORDERED_TILES", ("genome_loci", "tiles.x.full.ordered.bed")),  # write
    )
}

# "data" section of the config (crw.check presumably creates it when missing - confirm in config_readwrite)
section = "data"
crw.check(config, section)

# store every path under its lowercased option name
for option, path in config_dict.items():
    config[section][option.lower()] = path

# persist the updated config
crw.write(config, cfn)

# functions

In [4]:
def makeCoorLabel(df, col_list, id_tag):
    """Sort a dataframe and add a "chr:start-end" coordinate label column.

    input
        df (dataframe) - df to label (not modified; a sorted copy is returned)
        col_list (list) - columns to sort by; the first three must be ordered
            chr, start, end and are the ones combined into the label
        id_tag (str) - prefix of the new column, which is named
            "<id_tag>.coor" (e.g. "enh" -> "enh.coor", "tile" -> "tile.coor")

    returns
        dataframe sorted by col_list with the extra "<id_tag>.coor" column
    """
    labeled = df.sort_values(by=col_list)

    chrom = labeled[col_list[0]]
    start = labeled[col_list[1]].map(str)
    end = labeled[col_list[2]].map(str)

    labeled[f"{id_tag}.coor"] = chrom + ":" + start + "-" + end

    return labeled

def makeCoorIndex(df, col_list, id_tag, cl=None):
    """Sort a dataframe and name each row by its rank after sorting.

    Rows are sorted by col_list and re-indexed from 0. The new column
    "<id_tag>.name" holds "<id_tag>.<rank>" (e.g. 'enh.0', 'enh.1'), or
    "<id_tag>.<cl>.<rank>" when cl is given (e.g. 'enh.bj.0').
    """
    ranked = df.sort_values(by=col_list).reset_index(drop=True)

    # label prefix: the tag alone, or the tag plus the cell line
    prefix = id_tag if cl is None else id_tag + "." + cl

    ranked[f"{id_tag}.name"] = prefix + "." + ranked.index.map(str)

    return ranked

def orderTilesInEnh(df, cols):
    """Number the tiles inside each enhancer by their sorted tile columns.

    The frame is reduced to the unique rows of `cols`. Within every enhancer
    the tiles are sorted by the tile columns and numbered 0, 1, 2, ... in a
    new "tile.order" column.

    inputs
        df (dataframe) - dataframe holding the enhancer id and tile columns
        cols (list) - columns to use. ORDER must be the enhancer id first
            (e.g. enh.name), followed by the tile sort keys
            (e.g. #chr_tile, start_tile, end_tile)

    returns
        dataframe with the columns in `cols` plus "tile.order". The index
        restarts at 0 for every enhancer. An input without rows gives an
        empty frame that still carries the "tile.order" column.
    """
    enh_col, tile_cols = cols[0], cols[1:]

    # Use the frame that was passed in (not a same-named notebook global),
    # keep only the unique enhancer/tile rows, and sort by tile columns.
    enh_tilecoor = df[cols].drop_duplicates().sort_values(by=tile_cols)

    ordered_enh = []

    # sort=False keeps enhancers in order of first appearance in the
    # tile-sorted frame, so the output row order is reproducible.
    for _, enh_tiles in enh_tilecoor.groupby(enh_col, sort=False):
        enh_tiles = enh_tiles.sort_values(by=tile_cols).reset_index(drop=True)
        enh_tiles["tile.order"] = enh_tiles.index  # position within the enhancer
        ordered_enh.append(enh_tiles)

    # pd.concat raises on an empty list; return an empty, correctly shaped frame
    if not ordered_enh:
        empty = enh_tilecoor.copy()
        empty["tile.order"] = pd.Series(dtype="int64")
        return empty

    return pd.concat(ordered_enh)  # combine the per-enhancer frames



def writeBed(df, out_bed):
    """Write a 5-column, tab-separated bed file: chr, start, end, name, cl origin.

    inputs
        df (dataframe) - needs the columns "name", "coor" ("chr:start-end")
            and "cl.origin"
        out_bed (str) - path of the bed file to write (overwritten if present)

    Rows whose end field contains "hg19" are skipped, so only hg38
    coordinates are written.
    """
    # the context manager closes the file even when a row fails to parse
    with open(out_bed, "w") as bedwriter:

        for name, coor, cl in zip(df["name"], df["coor"], df["cl.origin"]):

            # split "chr:start-end" into its three fields
            locus = coor.split(":")
            chr_ = locus[0]
            span = locus[1].split("-")
            start, end = span[0], span[1]

            # write hg38 coordinates only
            if "hg19" not in end:
                bedwriter.write(f"{chr_}\t{start}\t{end}\t{name}\t{cl}\n")


# Input/output files

In [5]:
# Pull the registered paths into module-level names.
# FASTA is read; META, TILE, ENH, TILExENH and ORDERED_TILES are written.
FASTA, META, TILE, ENH, TILExENH, ORDERED_TILES = (
    config_dict[key]
    for key in ("FASTA", "META", "TILE", "ENH", "TILE_x_ENH", "ORDERED_TILES")
)

## read Fasta

In [6]:
# parse the FASTA into {sequence id: sequence}; a repeated id keeps the last sequence seen
fasta_dict = {}
with open(FASTA, "r") as reader:
    for seq_id, seq in SimpleFastaParser(reader):
        fasta_dict[seq_id] = seq
print("n oligos", len(fasta_dict))

n oligos 82427


## make fasta dataframe and format columns

In [7]:
# two-column frame from the parsed FASTA: oligo name and its sequence
df = pd.DataFrame(fasta_dict.items())
df.columns = ["name", "seq"]

###
# fields encoded in the oligo name
###

# first "_"-delimited field, lowercased: cl used to design the element
df["cl.origin"] = df["name"].map(lambda oligo: oligo.split("_")[0].lower())

# second "_"-delimited field, lowercased: the experiment to test
df["exp"] = df["name"].map(lambda oligo: oligo.split("_")[1].lower())

# coordinate: "chr" plus the lowercased text that follows the first "chr" in
# the name (up to any later "chr"); None when the name has no "chr"
df["coor"] = df["name"].map(
    lambda oligo: "chr" + oligo.split("chr")[1].lower() if "chr" in oligo else None
)

# direction label: entries later in the list overwrite earlier matches;
# "Pos" is only applied to oligos whose cl.origin is "pos"
df["direction"] = None
for d in ['up', 'down', "Pos", "Neg"]:
    labeled = df["name"].str.contains(d)
    if d == "Pos":
        labeled = labeled & (df['cl.origin'] == "pos")
    df.loc[labeled, "direction"] = d

###
# rename some values for clarity
###

# cl.origin: drop the trailing colon
df.loc[df["cl.origin"] == "synthetic:", "cl.origin"] = "synthetic"

# experiment values, applied in this order
for old_exp, new_exp in (
    ("bj", "luc"),
    ("the", "synthetic"),  # not het
    ("added", "synthetic"),  # het
):
    df.loc[df["exp"] == old_exp, "exp"] = new_exp

# write bed of genomic coordinates

In [8]:
# keep oligos that have a genomic coordinate and are not shuffles,
# reduced to the unique (name, coor, cl.origin) rows
genomic = df.loc[
    df["coor"].notna() & ~df["name"].str.contains("shuf"),
    ["name", "coor", "cl.origin"],
].copy().drop_duplicates()
print(genomic.shape)

(42847, 3)


### write all tiles

In [9]:
# Write every genomic tile (all cell lines) to TILE. The all-loci merge cell
# further down opens TILE with pbt.BedTool, so this cell has to be run first.
writeBed(genomic, TILE)

### write cl-specific tiles

In [35]:
# (cl, tile bed, enhancer bed, tile-x-enhancer bed) per cell line, used by the per-cell-line merges
TILES = []

for cl in genomic["cl.origin"].unique():

    # Per-cell-line file names: insert ".{cl}" before the extension.
    # os.path.splitext removes only the final extension, whereas
    # str.strip(".bed") strips any run of the characters ".", "b", "e", "d"
    # from both ends and only gives the right stem by coincidence.
    CL_TILE = os.path.splitext(TILE)[0] + f".{cl}.bed"
    CL_ENH = os.path.splitext(ENH)[0] + f".{cl}.bed"
    CL_TILExENH = os.path.splitext(TILExENH)[0] + f".{cl}.bed"

    TILES.append((cl, CL_TILE, CL_ENH, CL_TILExENH)) # append to list for separate merges

    print(cl, CL_TILE)

    # subset the tiles designed from this cell line
    genomic_cl = genomic.loc[genomic["cl.origin"]==cl].copy().drop_duplicates()

    # write cl-specific tile.bed
    writeBed(genomic_cl, CL_TILE)
     

bj /wynton/group/ahituv/fongsl/projects/US/data/genome_loci/tiles.bj.bed
hepg2 /wynton/group/ahituv/fongsl/projects/US/data/genome_loci/tiles.hepg2.bed
hob /wynton/group/ahituv/fongsl/projects/US/data/genome_loci/tiles.hob.bed
k562 /wynton/group/ahituv/fongsl/projects/US/data/genome_loci/tiles.k562.bed
neg /wynton/group/ahituv/fongsl/projects/US/data/genome_loci/tiles.neg.bed
pos /wynton/group/ahituv/fongsl/projects/US/data/genome_loci/tiles.pos.bed


# merge enhancer loci

In [36]:
# all loci, regardless of cl origin
# NOTE: pbt.BedTool(TILE) needs the TILE file on disk; the recorded output of
# this cell is a FileNotFoundError for tiles.bed, so run the "write all tiles"
# cell before this one.
tiles = pbt.BedTool(TILE).sort()

# merge the sorted tiles and write the result to ENH. c=5, o='distinct' are
# passed through to bedtools merge (presumably: report the distinct column-5,
# i.e. cl.origin, values of each merged locus - confirm against bedtools docs).
merged = tiles.merge(output=ENH, c=5, o='distinct').sort()

# intersect enhancer data back.
tiles.intersect(merged, wa=True, wb=True, output=TILExENH)

FileNotFoundError: File "/wynton/group/ahituv/fongsl/projects/US/data/genome_loci/tiles.bed" does not exist

In [37]:
# per cell line: merge tiles into enhancers, intersect tiles with enhancers,
# then label/format the three resulting tables and collect them by cell line
enh_df, tile_df, ordered = {}, {}, {}
for cl, tile_fn, enh_fn, out_fn in TILES:
    print(cl)

    #pbtool load tiles
    tiles = pbt.BedTool(tile_fn).sort()

    #merge tile loci to make enhancer
    merged = tiles.merge(output=enh_fn).sort()

    # intersect enhancer data back with tile information
    tiles.intersect(merged, wa=True, wb=True, output=out_fn)

    # enhancer data formatting
    tag = "enh"
    cols = [f"#chr_{tag}", f"start_{tag}", f"end_{tag}"]
    file = enh_fn

    # A file counts as already formatted when its first line is the column
    # header. Compare against the real first column name: the bare string
    # "#chr" can never equal a column name, so that test was always true.
    if cols[0] not in list(pd.read_csv(file, sep='\t', nrows=1)):
        enh = pd.read_csv(file, sep='\t', header=None)
        enh.columns = cols

        enh = makeCoorLabel(enh, cols, tag)       # adds enh.coor
        enh = makeCoorIndex(enh, cols, tag, cl=cl)  # adds enh.name
        enh.to_csv(file, sep='\t', index=False)
    else:
        enh = pd.read_csv(file, sep='\t')
    enh_df[cl] = enh

    # tile data formatting
    tag = "tile"
    cols = [f"#chr_{tag}", f"start_{tag}", f"end_{tag}", "name", "cl.origin"]
    file = tile_fn

    if cols[0] not in list(pd.read_csv(file, sep='\t', nrows=1)):
        tile = pd.read_csv(file, sep='\t', header=None)
        tile.columns = cols

        tile = makeCoorLabel(tile, cols, tag)       # adds tile.coor
        tile = makeCoorIndex(tile, cols, tag, cl=cl)  # adds tile.name
        tile.to_csv(file, sep='\t', index=False)
    else:
        tile = pd.read_csv(file, sep='\t')
    tile_df[cl] = tile

    # tile and enhancer data together
    tag1, tag2 = "tile", "enh"
    cols = [f"#chr_{tag1}", f"start_{tag1}", f"end_{tag1}", "name", "cl.origin",
            f"#chr_{tag2}", f"start_{tag2}", f"end_{tag2}"]
    file = out_fn

    if cols[0] not in list(pd.read_csv(file, sep='\t', nrows=1)):
        txe = pd.read_csv(file, sep='\t', header=None)
        print(txe.shape)
        txe.columns = cols # rename columns

        # coordinate labels for the tile (first three) and enhancer (last three) columns
        txe = makeCoorLabel(txe, cols[:3], tag1)
        txe = makeCoorLabel(txe, cols[-3:], tag2)

        # add enh.name (joined on the shared enh.coor column)
        txe = pd.merge(txe, enh[["enh.coor", "enh.name"]]).drop_duplicates()

        # add tile.name (joined on the shared tile.coor column)
        txe = pd.merge(txe, tile[["tile.coor", "tile.name"]]).drop_duplicates()

        # order tiles within each enhancer
        enh_order = orderTilesInEnh(txe, ['enh.name', "#chr_tile", "start_tile", "end_tile", "name"])

        # add tile.order
        txe = pd.merge(txe, enh_order)

        # write files
        txe.to_csv(file, sep='\t', index=False)

        # os.path.splitext drops only the ".bed" extension. The previous
        # ORDERED_TILES.strip(".bed") also removed the trailing "ed" of
        # "ordered", so files were written as "tiles.x.full.order.<cl>.bed";
        # they are now named "tiles.x.full.ordered.<cl>.bed".
        out_ordered = os.path.splitext(ORDERED_TILES)[0] + f'.{cl}.bed'
        enh_order.to_csv(out_ordered, sep='\t', index=False)
    else:
        txe = pd.read_csv(file, sep='\t')
    ordered[cl] = txe

bj
(3648, 8)
hepg2
(6292, 8)
hob
(29340, 8)
k562
(3189, 8)
neg
(160, 8)
pos
(18, 8)


In [39]:
# stack the per-cell-line tables into one enhancer, one tile and one tile-x-enhancer frame
enh, tile, txe = (
    pd.concat(per_cl.values()) for per_cl in (enh_df, tile_df, ordered)
)

# write combined cl information

In [53]:
# write each combined table, sorted by its first three columns, to its
# all-cell-line path (tab-separated, with header, no index)
for table, out_path in ((tile, TILE), (enh, ENH), (txe, TILExENH)):
    coor_cols = list(table.columns[:3])
    table.sort_values(by=coor_cols).to_csv(out_path, sep='\t', index=False)

In [50]:
# spot-check: show every tile row assigned to one enhancer (enh.bj.298)
txe.loc[txe["enh.name"]=="enh.bj.298"].sort_values(by="cl.origin")

Unnamed: 0,#chr_tile,start_tile,end_tile,name,cl.origin,#chr_enh,start_enh,end_enh,tile.coor,enh.coor,enh.name,tile.name,tile.order
1134,chr14,69288047,69288316,bj_atac_down_chr14:69288047-69288316,bj,chr14,69288047,69288448,chr14:69288047-69288316,chr14:69288047-69288448,enh.bj.298,tile.bj.1134,0
1135,chr14,69288114,69288383,bj_atac_down_chr14:69288114-69288383,bj,chr14,69288047,69288448,chr14:69288114-69288383,chr14:69288047-69288448,enh.bj.298,tile.bj.1135,1
1136,chr14,69288179,69288448,bj_atac_down_chr14:69288179-69288448,bj,chr14,69288047,69288448,chr14:69288179-69288448,chr14:69288047-69288448,enh.bj.298,tile.bj.1136,2


## write META

In [51]:
# attach the tile/enhancer annotations to every oligo: a left join on the
# columns the two frames share, so oligos without a tile keep NaN there
df = df.merge(txe, how="left")

# order rows by the first three txe columns
df = df.sort_values(by=list(txe.columns[:3]))

# save the metadata table
df.to_csv(META, sep='\t', index=False)

In [52]:
# preview the first rows of the merged metadata table
df.head()

Unnamed: 0,name,seq,cl.origin,exp,coor,direction,#chr_tile,start_tile,end_tile,#chr_enh,start_enh,end_enh,tile.coor,enh.coor,enh.name,tile.name,tile.order
9687,hob_k27ac_down_chr1:826978-827247,TTGTTACAGGATCGGGCAGGTCCCCTACCCCAGTCTCGGACTCAGG...,hob,k27ac,chr1:826978-827247,down,chr1,826978.0,827247.0,chr1,826978.0,827704.0,chr1:826978-827247,chr1:826978-827704,enh.hob.0,tile.hob.0,0.0
9689,hob_k27ac_down_chr1:827110-827379,AGGACTGGCGTCTGCCGAATCCCAGGGCTGCCCTGAGGGGCCAAGA...,hob,k27ac,chr1:827110-827379,down,chr1,827110.0,827379.0,chr1,826978.0,827704.0,chr1:827110-827379,chr1:826978-827704,enh.hob.0,tile.hob.1,1.0
9690,hob_k27ac_down_chr1:827377-827646,GCCGAGCCTGTGACATCCGCGGAGACCAGCAGACCCCGGGTGTGGA...,hob,k27ac,chr1:827377-827646,down,chr1,827377.0,827646.0,chr1,826978.0,827704.0,chr1:827377-827646,chr1:826978-827704,enh.hob.0,tile.hob.2,2.0
9686,hob_k27ac_down_chr1:827435-827704,GAGGGGACTGCGTGGCTGGGTTTGGCCACAAAAAGCGGAGGGCACT...,hob,k27ac,chr1:827435-827704,down,chr1,827435.0,827704.0,chr1,826978.0,827704.0,chr1:827435-827704,chr1:826978-827704,enh.hob.0,tile.hob.3,3.0
9688,hob_k27ac_down_chr1:827891-828160,GAGCCCGCACTCCGCCTCTGGGTAGCAGCCTCTTCGGCCCCACACG...,hob,k27ac,chr1:827891-828160,down,chr1,827891.0,828160.0,chr1,827891.0,828160.0,chr1:827891-828160,chr1:827891-828160,enh.hob.1,tile.hob.4,0.0
