# sarahfong
# 20240515

- Goal: order the tiles for each enhancer element in the library
- (1) make enhancers by merging tile library (excluding synthetic elements) together
- (2) number tile order for consecutive tiles. 

- note: MPRA will not test all of these tiles. 

In [1]:
LOCAL = False

from Bio.SeqIO.FastaIO import SimpleFastaParser
import numpy as np
import os, sys
import pandas as pd
import pybedtools as pb
import matplotlib.pyplot as plt
import seaborn as sns


# Pick the input/output locations depending on whether LOCAL is set.
if LOCAL is not True:
    PATH = "/wynton/group/ahituv/fongsl/projects/biomarin/data"
    RE = "/wynton/group/ahituv/fongsl/projects/biomarin/results"
    FASTA = os.path.join(PATH, "biomarin-lib2-hg38-final.fa")
else:
    # make the personal tool directory importable before the imports below
    sys.path.append("/Users/sarahfong/tools/py_")
    PATH = "/Users/sarahfong/Desktop/local_data/Biomarin/"
    RE = os.path.join(PATH, "results")
    FASTA = os.path.join(
        PATH, "library_2", "Design", "biomarin-lib2-hg38-final.fasta"
    )

# read: library metadata table
META_DATA = os.path.join(PATH, "lib2.meta_data.tsv")

# write: tile-level bed and tile x merged-enhancer bed
TILE_BED = os.path.join(PATH, "lib2.processed.bed")
ENH_BED = os.path.join(PATH, 'lib2.processed.merged.bed')

# project-local helpers (imported after the sys.path tweak above)
import config_readwrite as crw
import plot_params as pp
pp.fonts()

('sans-serif', 'Arial', 18)

# make tile bed

In [2]:
# load the tab-separated library metadata table
df = pd.read_csv(META_DATA, sep="\t")

# list the distinct chromosome labels found in the table
df.loc[:, "#chr"].unique()

array(['chr1', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
       'chr16', 'chr17', 'chr18', 'chr19', 'chr2', 'chr20', 'chr21',
       'chr22', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9',
       'chr:', 'chrX', 'chrY'], dtype=object)

In [3]:
# rows whose chromosome label stringifies to "nan"
chr_is_nan_text = df["#chr"].map(str) == "nan"
df.loc[chr_is_nan_text]

Unnamed: 0,name,sequence,bkgd,72h,ctrl,cl,coor,top_bottom,strand,#chr,start,end,enh.id,enh.name,tile.order


In [4]:
# sequences that don't have a chr
df["#chr"] = df["#chr"].fillna("0")

bed = df.loc[df["#chr"].str.contains("chr")]
print(bed.shape)
bedcols=["#chr", "start", "end", "name"]

print(bed.loc[bed["start"]>0].shape[0])

# remove anu of the synthetic sequences (where start = -1)
bed.loc[bed["start"]>0, bedcols].drop_duplicates().sort_values(by=bedcols).to_csv(TILE_BED, sep='\t', index=False)

(44044, 15)
43370


# merge tiles w/ loci 

## (1) merge tiles and enhancer loci
- enhancer loci = tiles sorted, then merged together
- merge = tiles.intersect(enh), which pairs every tile with the enhancer locus it falls in

In [5]:
# tile loci
tile = pb.BedTool(TILE_BED)

# enhancer loci - sort the tiles, then merge them together
tile_sorted = tile.sort()
enh_merge = tile_sorted.merge()

# intersect tiles with enhancers, keeping both records; result is written to ENH_BED
tile_enh_merge = tile.intersect(enh_merge, wa=True, wb=True, output=ENH_BED)

# add enhancer id and tile order

In [6]:
def assignTileOrder(df):
    """Number the tiles inside each enhancer by genomic position.

    Rows are grouped by "enh.id" (groups keep their order of first
    appearance), sorted within each group by "#chr", "start.tile" and
    "end.tile", and given a zero-based rank stored as a string in a new
    "tile.order" column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "enh.id", "#chr", "start.tile" and
        "end.tile".

    Returns
    -------
    pandas.DataFrame
        The rows that have a non-missing "enh.id", with the added
        "tile.order" column. The index restarts at 0 for every enhancer.
        When no such rows exist, an empty frame carrying the same columns
        plus "tile.order" is returned instead of raising.
    """
    sort_cols = ["#chr", "start.tile", "end.tile"]
    ordered_groups = []

    # sort=False keeps enhancers in order of first appearance; a single
    # groupby pass avoids rescanning the whole frame for every enhancer.
    for _, tiles in df.groupby("enh.id", sort=False):
        # drop=True discards the old index directly, so a pre-existing
        # "index" column in df neither collides nor gets deleted.
        tiles = tiles.sort_values(by=sort_cols).reset_index(drop=True)

        # position within the enhancer, stored as a string
        tiles["tile.order"] = tiles.index.map(str)
        ordered_groups.append(tiles)

    if not ordered_groups:
        # pd.concat refuses an empty list; hand back an empty, well-formed frame
        empty = df.iloc[0:0].copy()
        empty["tile.order"] = pd.Series(dtype=object)
        return empty

    return pd.concat(ordered_groups)

In [7]:
def labelEnhID(merge):
    """Attach an enhancer id and enhancer name to every tile row.

    The distinct ("#chr.enh", "start.enh", "end.enh") combinations are
    collected in order of first appearance. Each one receives an "enh.id"
    of the form "<chr>:<start>-<end>" and an "enh.name" of the form
    "enh.<n>", where n counts the distinct enhancers from 0. The labels
    are then left-joined back onto the input frame.
    """
    enh_cols = ["#chr.enh", "start.enh", "end.enh"]

    # one row per distinct enhancer locus, renumbered from 0
    loci = merge[enh_cols].drop_duplicates().reset_index(drop=True)

    # coordinate-style identifier
    start_txt = loci["start.enh"].map(str)
    end_txt = loci["end.enh"].map(str)
    loci["enh.id"] = loci["#chr.enh"] + ":" + start_txt + "-" + end_txt

    # running-number identifier
    loci["enh.name"] = "enh." + loci.index.map(str)

    # bring the enhancer labels back to the tile rows (joins on the shared columns)
    return pd.merge(merge, loci, how="left")

## (2) add tile order, write tile + enh + tile.order dataframe

In [8]:
# Peek at the first line of ENH_BED: a "#chr" header means the annotated
# table has already been written; otherwise the file has no header row yet.
enh_bed_header = list(pd.read_csv(ENH_BED, sep='\t', nrows=1))

if "#chr" in enh_bed_header:
    # already annotated - just load it
    merge = pd.read_csv(ENH_BED, sep='\t')
else:
    raw_cols = [
        "#chr", "start.tile", "end.tile", "coor",
        "#chr.enh", "start.enh", "end.enh",
    ]
    merge = pd.read_csv(ENH_BED, sep='\t', header=None)
    merge.columns = raw_cols

    # label enhancer ids, then number the tiles within each enhancer
    merge = assignTileOrder(labelEnhID(merge))

    # add original library back in, recover some lost synthetic tiles
    library_names = df[["name", "coor"]]
    merge = pd.merge(merge, library_names, how='right')

    # save, replacing the headerless file with the annotated table
    merge.to_csv(ENH_BED, sep='\t', index=False)

merge.head()

Unnamed: 0,#chr,start.tile,end.tile,coor,#chr.enh,start.enh,end.enh,enh.id,enh.name,tile.order,name
0,chr1,10057.0,10327.0,chr1:10057-10327,chr1,10057.0,10563.0,chr1:10057-10563,enh.0,0,chr1:10057-10327
1,chr1,10077.0,10347.0,chr1:10077-10347,chr1,10057.0,10563.0,chr1:10057-10563,enh.0,1,chr1:10077-10347
2,chr1,10097.0,10367.0,chr1:10097-10367,chr1,10057.0,10563.0,chr1:10057-10563,enh.0,2,chr1:10097-10367
3,chr1,10117.0,10387.0,chr1:10117-10387,chr1,10057.0,10563.0,chr1:10057-10563,enh.0,3,chr1:10117-10387
4,chr1,10133.0,10403.0,chr1:10133-10403,chr1,10057.0,10563.0,chr1:10057-10563,enh.0,4,chr1:10133-10403


# make SHARPR file

In [None]:
merg