In [1]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import numpy as np
import os, sys
import pandas as pd
import pybedtools as pb
import matplotlib.pyplot as plt
import seaborn as sns

# Switch for running on the local workstation. The data paths below are only
# defined when it is on.
LOCAL = True
if LOCAL:
    # extra module search path; the helper imports below presumably resolve here
    sys.path.append("/Users/sarahfong/tools/py_")

    # data directory and the three files this notebook reads / writes
    PATH = "/Users/sarahfong/Desktop/local_data/Biomarin_Jun_2023/"
    META_DATA = os.path.join(PATH, "lib2.meta_data.tsv")
    TILE_BED = os.path.join(PATH, "lib2.processed.bed")
    ENH_BED = os.path.join(PATH, "lib2.processed.merged.bed")

# helper modules; imported after the sys.path tweak above
import config_readwrite as crw
import plot_params as pp

# apply the plotting font settings (the returned value is shown as cell output)
pp.fonts()

('sans-serif', 'Arial', 18)

# make tile bed

In [None]:
# load the library metadata table (tab-separated) from META_DATA
df = pd.read_csv(
    META_DATA,
    sep="\t",
)

In [11]:
# Keep only the rows whose "#chr" value contains "chr".
# na=False treats a missing "#chr" as "no match"; without it the mask holds
# NaN and boolean indexing raises instead of filtering.
# NOTE(review): this is a substring match, so labels such as
# "Background seq2 chr1" are kept too — confirm that is intended, otherwise
# use .str.startswith("chr").
bed = df.loc[df["#chr"].str.contains("chr", na=False)]
print(bed.shape)

# Write the unique tiles, sorted by the BED columns, as a tab-separated file.
# The header line starts with "#" (from the "#chr" column name) — presumably
# so bedtools skips it as a comment; confirm.
bedcols = ["#chr", "start", "end", "name"]
bed[bedcols].drop_duplicates().sort_values(by=bedcols).to_csv(TILE_BED, sep='\t', index=False)

(43370, 12)


# merge overlapping tiles into loci and pair each tile with its locus

In [12]:
# wrap the tile BED file written above
m = pb.BedTool(TILE_BED)
# sort the tiles, then merge them into loci (bedtools sort followed by bedtools merge)
merged = m.sort().merge()
# intersect every tile (A) with the merged loci (B); wa/wb keep both the tile
# and the locus columns on each output line, and the result is written to ENH_BED
m.intersect(merged, wa=True, wb=True, output=ENH_BED)

chr1	10057	10563

chr1	10057	10563



<BedTool(/Users/sarahfong/Desktop/local_data/Biomarin_Jun_2023/lib2.processed.merged.bed)>

# add enhancer id and tile order

In [13]:
def assignTileOrder(df):
    """Number the tiles inside each enhancer by genomic position.

    Rows are grouped by "enh.id". Within each group they are sorted by
    ("#chr", "start.tile", "end.tile") and labelled "0", "1", ... in a new
    "tile.order" column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "enh.id", "#chr", "start.tile" and
        "end.tile". The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The per-enhancer frames concatenated in order of first appearance of
        each "enh.id". The index restarts at 0 for every enhancer and
        "tile.order" holds that position as a string. When there is nothing
        to order (no rows, or every "enh.id" is missing) an empty frame with
        the input columns plus "tile.order" is returned instead of raising.
    """
    sort_cols = ["#chr", "start.tile", "end.tile"]
    ordered = []

    # One pass over the groups instead of re-filtering the whole frame for
    # every enhancer id. sort=False keeps the enhancers in order of first
    # appearance, matching iteration over df["enh.id"].unique().
    for _, tiles in df.groupby("enh.id", sort=False):
        # drop=True discards the old index directly, so a named index or an
        # existing "index" column cannot interfere
        tiles = tiles.sort_values(by=sort_cols).reset_index(drop=True)

        # position within the enhancer, stored as a string
        tiles["tile.order"] = tiles.index.map(str)
        ordered.append(tiles)

    if not ordered:
        # pd.concat raises on an empty list; return an explicit empty result
        empty = df.iloc[0:0].copy()
        empty["tile.order"] = pd.Series(dtype=object)
        return empty

    return pd.concat(ordered)

In [14]:
# ENH_BED exists in two states: the raw, header-less bedtools intersect output,
# and the annotated table written at the end of the first branch (which has a
# "#chr" header). Reading one row and checking the column names tells them apart.
if "#chr" not in list(pd.read_csv(ENH_BED, sep='\t', nrows=1)):

    # raw intersect output: tile columns followed by merged-locus columns
    merge = pd.read_csv(ENH_BED, sep='\t', header=None)
    merge.columns = ["#chr", "start.tile", "end.tile", "coor", "#chr.enh", "start.enh", "end.enh"]

    # one row per merged locus, re-indexed 0..n-1 in order of first appearance
    enh_keys = ["#chr.enh", "start.enh", "end.enh"]
    enh = merge[enh_keys].drop_duplicates().reset_index(drop=True)

    # enhancer coordinate id, e.g. "chr1:100-200"
    enh['enh.id'] = enh['#chr.enh'] + ":" + enh["start.enh"].map(str) + "-" + enh["end.enh"].map(str)

    # enhancer name id, "enh.<row number>"
    enh['enh.name'] = "enh." + enh.index.map(str)

    # add enhancer info back to tile info; the join keys are spelled out so the
    # merge does not silently depend on which column names happen to overlap
    merge = pd.merge(merge, enh, how="left", on=enh_keys)

    # add tile order within each enhancer
    merge = assignTileOrder(merge)

    # save the annotated table over the raw intersect output
    merge.to_csv(ENH_BED, sep='\t', index=False)
else:
    # assignTileOrder stores tile.order as strings; read it back as str so the
    # column has the same dtype whether the table was just built or loaded
    merge = pd.read_csv(ENH_BED, sep='\t', dtype={"tile.order": str})

merge.head()

Unnamed: 0,#chr,start.tile,end.tile,coor,#chr.enh,start.enh,end.enh,enh.id,enh.name,tile.order
0,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893:+|Pos...,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893,enh.0,0
1,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893:+|Pos...,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893,enh.0,1
2,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893:+|Pos...,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893,enh.0,2
3,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893:+|Pos...,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893,enh.0,3
4,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893:+|Pos...,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893,enh.0,4


In [15]:
# number of distinct merged enhancer loci
merge["enh.id"].nunique()

1827