In [2]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import numpy as np
import os, sys
import pandas as pd
import pybedtools as pb
import matplotlib.pyplot as plt
import seaborn as sns

LOCAL = False
if LOCAL is True:
    sys.path.append("/Users/sarahfong/tools/py_")
    PATH = "/Users/sarahfong/Desktop/local_data/Biomarin_Jun_2023/"
    RE = os.path.join(PATH, "results")
    FASTA = os.path.join(PATH, "library_2", "Design", "biomarin-lib2-hg38-final.fasta")
else:
    PATH="/wynton/group/ahituv/biomarin/data"
    RE = "/wynton/group/ahituv/biomarin/results"
    FASTA = os.path.join(PATH, "biomarin-lib2-hg38-final.fa")
    
    
META_DATA= os.path.join(PATH, "lib2.meta_data.tsv")

# write
TILE_BED = os.path.join(PATH, "lib2.processed.bed")
ENH_BED = os.path.join(PATH,'lib2.processed.merged.bed')
                         
import config_readwrite as crw
import plot_params as pp
pp.fonts()

('sans-serif', 'Arial', 18)

# make tile bed

In [16]:
df = pd.read_csv(META_DATA, sep='\t')

df.head()

df['#chr'].unique()

array(['Background seq2 chr1', nan, 'chr1', 'chr10', 'chr11', 'chr12',
       'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19',
       'chr2', 'chr20', 'chr21', 'chr22', 'chr3', 'chr4', 'chr5', 'chr6',
       'chr7', 'chr8', 'chr9', 'chrX', 'chrY'], dtype=object)

In [18]:
df.loc[df["#chr"].map(str)=="nan"]

Unnamed: 0,name,sequence,bkgd,h,ctrl,cl,coor,top_bottom,strand,#chr,start,end
648,Background seq1 72hr_top_98|Pos:135|Motif fami...,AGGACCGGATCAACTCTACTCCTGTGCGGGGTTAAGACCTAAGGAA...,False,,,,,top,+,,,
649,"Background seq1 72hr_top_98|Pos:115,155|Motif ...",AGGACCGGATCAACTCTACTCCTGTGCGGGGTTAAGACCTAAGGAA...,False,,,,,top,+,,,
650,"Background seq1 72hr_top_98|Pos:105,135,165|Mo...",AGGACCGGATCAACTCTACTCCTGTGCGGGGTTAAGACCTAAGGAA...,False,,,,,top,+,,,
651,"Background seq1 72hr_top_98|Pos:105,125,145,16...",AGGACCGGATCAACTCTACTCCTGTGCGGGGTTAAGACCTAAGGAA...,False,,,,,top,+,,,
652,Background seq1 72hr_top_98|Pos:135|Motif fami...,AGGACCGGATCAACTCTACTCCTGTGCGGGGTTAAGACCTAAGGAA...,False,,,,,top,+,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1317,Negative-GLUT_72hr_top_102,AGGACCGGATCAACTCCCCCCAGGCAAGGGCCTCTAACCCTGGGGC...,False,72h,negative,GLUT,,top,,,,
1318,Negative-GLUT_72hr_bottom_113,AGGACCGGATCAACTTGCTGGCTGCCTGGGTCGGAGTCGACGCCAC...,False,72h,negative,GLUT,,bottom,,,,
1319,Negative-GLUT_72hr_top_53,AGGACCGGATCAACTGGCAGTCAGGTTCGGGTCGCGGTCAGAGTAC...,False,72h,negative,GLUT,,top,,,,
1320,Negative-GLUT_72hr_top_52,AGGACCGGATCAACTGAAGTCACGGTCACCAGGCTCCAAACCGAGG...,False,72h,negative,GLUT,,top,,,,


In [21]:
# sequences that don't have a chr
df["#chr"] = df["#chr"].fillna("0")

bed = df.loc[df["#chr"].str.contains("chr")]
print(bed.shape)
bedcols=["#chr", "start", "end", "name"]
bed[bedcols].drop_duplicates().sort_values(by=bedcols).to_csv(TILE_BED, sep='\t', index=False)

(43370, 12)


# merge tiles w/ loci 

In [22]:
m = pb.BedTool(TILE_BED)
merged = m.sort().merge()
m.intersect(merged, wa=True, wb=True, output=ENH_BED)

BEDToolsError: 
Command was:

	bedtools sort -i /wynton/group/ahituv/biomarin/data/lib2.processed.bed

Error message was:
Unexpected file format.  Please use tab-delimited BED, GFF, or VCF. Perhaps you have non-integer starts or ends at line 2?


# add enhancer id and tile order

In [23]:
def assignTileOrder(df):
    
    """ assign tile order within each peak, return dataframe"""
    dfs = {}
    for enh_id in df["enh.id"].unique():
        # enh dataframe
        test = df.loc[df["enh.id"] ==enh_id].copy()
        
        # sort by smallest start coordinate
        test = test.sort_values(by= ["#chr", "start.tile", "end.tile"]).reset_index()
        
        # drop index column
        test = test.drop(columns = ["index"])
        
        # add tile.order
        test["tile.order"] = test.index.map(str)
    
        # add to dictionary
        dfs[enh_id] = test
        
    return pd.concat(dfs.values())

In [24]:
if "#chr" not in list(pd.read_csv(ENH_BED, sep='\t', nrows=1)):
    
    merge = pd.read_csv(ENH_BED, sep='\t', header=None)
    merge.columns=["#chr", "start.tile", "end.tile", "coor", "#chr.enh", "start.enh", "end.enh", "name"]
    
    # enhancer only dataframe
    enh = merge[[ "#chr.enh", "start.enh", "end.enh"]].drop_duplicates().copy().reset_index()
    
    # make enhancer coordinate id
    enh['enh.id'] = enh['#chr.enh'] + ":" + enh["start.enh"].map(str) + "-" + enh["end.enh"].map(str)
    
    # make enhancer name id
    enh['enh.name'] = "enh." + enh.index.map(str)
    
    # drop the index colunm
    enh = enh.drop(columns = ["index"])
    
    # add enhancer info back to tile info
    merge = pd.merge( merge, enh, how="left")
    
    # add tile order
    merge = assignTileOrder(merge)
    
    # add name column back in, recover somem lost synthetic tiles
    merge = pd.merge(df[["name", "coor"]], merge, how='left')
    
    # save
    merge.to_csv(ENH_BED, sep='\t', index=False)
else:
     merge = pd.read_csv(ENH_BED, sep='\t')

merge.head()

Unnamed: 0,#chr,start.tile,end.tile,coor,#chr.enh,start.enh,end.enh,enh.id,enh.name,tile.order
0,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893:+|Pos...,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893,enh.0,0
1,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893:+|Pos...,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893,enh.0,1
2,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893:+|Pos...,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893,enh.0,2
3,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893:+|Pos...,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893,enh.0,3
4,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893:+|Pos...,Background seq2 chr1,244100624,244100893,Background seq2 chr1:244100624-244100893,enh.0,4
