In [1]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import os, sys
import config_readwrite as crw
import pandas as pd
import pybedtools as pbt

In [3]:
# Load the project config stored one directory above the current working
# directory. crw.read returns the config object plus a second value (cfn) that
# is handed back to crw.write below.
config, cfn = crw.read(os.path.join(os.path.dirname(os.getcwd()), "config.ini"))

FASTA = config['data']["fasta"]

# The tile BED is written next to the FASTA. os.path.join (instead of gluing
# "/tiles.bed" onto the directory string) keeps the path relative when FASTA
# has no directory component rather than silently pointing at "/tiles.bed".
BED = os.path.join(os.path.dirname(FASTA), "tiles.bed")

# Directory holding the differential-peak BED files used by this notebook.
US_DATA_DIR = "/wynton/home/ahituv/fongsl/EMF/US/data"

ATAC = os.path.join(US_DATA_DIR, "atac_diff.bed")
K27AC = os.path.join(US_DATA_DIR, "k27ac_diff.bed")
BOTH = os.path.join(US_DATA_DIR, "atac_k27ac_diff.bed")
ALL = os.path.join(US_DATA_DIR, "tiles.x.atac_k27ac_diff.bed")

# Record every path in the [data] section so it can be read back from the config.
config['data']["bed.atac"] = ATAC
config['data']["bed.k27ac"] = K27AC
config['data']["bed.atac.k27ac"] = BOTH
config['data']["bed.atac.k27ac.tiles"] = ALL

config['data']["bed"] = BED
crw.write(config, cfn)

In [4]:
def cleanUp(variable):
    """Return `variable` cut off at the first "_hg19" marker.

    Values that do not contain "_hg19" are returned unchanged.
    """
    if "_hg19" not in variable:
        return variable

    # keep only the text in front of the first "_hg19"
    prefix, _, _ = variable.partition("_hg19")
    return prefix

# FASTA -> bed

In [19]:
# Split the FASTA records in two:
#   * headers carrying a "chr...:start-end" coordinate become one BED line each
#   * every other record (and any "_motif_" record) is kept in `mystery`,
#     keyed by its FASTA header, for the sorting step that follows.
mystery = {}

# Both files are managed by a single `with`, so the BED output is flushed and
# closed even when a header fails to parse part-way through the FASTA.
with open(BED, "w") as bedwriter, open(FASTA, 'r') as in_handle:
    for name, seq in SimpleFastaParser(in_handle):

        if "chr" not in name:
            # no coordinate in the header
            mystery[name] = seq
        elif "_motif_" in name:
            print(name)
            mystery[name] = seq
        else:
            # "chr" is guaranteed to be in `name` here, so the split below
            # always yields at least two pieces.
            coor = "chr" + name.split("chr")[1]
            locus = coor.split(":")
            span = locus[1].split("-")

            # strip any trailing "_hg19..." suffix from each field
            chr_ = cleanUp(locus[0])
            start = cleanUp(span[0])
            end = cleanUp(span[1])

            bedwriter.write(f"{chr_}\t{start}\t{end}\t{name}\n")

# Sort the remaining (non-tile) sequences into synthetic, positive, and negative groups

In [26]:
# Bucket the sequences held in `mystery` by the tag found in their FASTA header.
synthetic, pos, neg = {}, {}, {}

# Tags are tried in this order and the first one found in the header wins.
tagged_buckets = (
    ("SYNTHETIC", synthetic),
    ("Pos", pos),
    ("Neg", neg),
)

for k, v in mystery.items():
    for tag, bucket in tagged_buckets:
        if tag in k:
            bucket[k] = v.upper()  # sequences are stored upper-cased
            break
    else:
        # header carries none of the tags: print it for inspection
        print(k, v)


## merge ATAC + H3K27ac

- 43047 enhancers with ATAC, H3K27ac annotations

In [None]:
# Concatenate the ATAC and H3K27ac BED files into BOTH; skipped when BOTH
# already exists.
#
# This is done in Python rather than through a shell `cat ... > ...` string:
# paths are never re-parsed by a shell, and a missing or unreadable input
# raises here instead of being ignored.
if not os.path.exists(BOTH):
    # Write to a temporary name first and rename at the end, so a failed run
    # cannot leave a partial BOTH that the existence check would then accept.
    partial = BOTH + ".tmp"
    with open(partial, "wb") as merged:
        for bed_path in (ATAC, K27AC):
            with open(bed_path, "rb") as part:
                # copy in 1 MiB chunks; bytes are passed through unchanged
                for chunk in iter(lambda: part.read(1 << 20), b""):
                    merged.write(chunk)
    os.replace(partial, BOTH)

## intersect tiles x enhancers
- N= 43989 tiles + enhancers 

In [None]:
# Build the tile x enhancer overlap file once; an existing ALL is left untouched.
if not os.path.exists(ALL):
    tiles = pbt.BedTool(BED)
    enh = pbt.BedTool(BOTH)

    # wa + wb: each output row carries the tile record followed by the
    # overlapping enhancer record; the result is saved straight to ALL.
    tiles.intersect(enh, wa=True, wb=True, output=ALL)
    # n = 43989 tiles + 10639 enhancers

# add enhancer id to endogenous sequences

In [52]:
# Annotate the tile x enhancer table stored in ALL with a shuffle flag and an
# enhancer id. ALL is rewritten in place *with* a header row, so the presence
# of an "enh_id" column is what tells an annotated file from the raw,
# header-less intersect output.
if "enh_id" not in list(pd.read_csv(ALL, sep='\t', nrows=1)):
    df = pd.read_csv(
        ALL,
        sep='\t',
        header=None,
        names=["#chr", "start", "end", "name", "#chr_", "start_", "end_", "id"],
    ).drop_duplicates()
    print(df.shape)

    # annotate shuffles: plain substring match on the tile name; na=False keeps
    # a missing name from producing a NaN in the boolean column
    df["shuf"] = df["name"].str.contains("shuffle", regex=False, na=False)

    # one row per distinct enhancer record
    enh_cols = ["#chr_", "start_", "end_", "id"]
    enh = df[enh_cols].drop_duplicates().copy()

    # add enhancer id, formatted as "<chr>:<start>-<end>|<id>"
    enh["enh_id"] = (
        enh["#chr_"] + ':' + enh["start_"].map(str) + "-" + enh["end_"].map(str)
        + "|" + enh["id"]
    )

    # join keys are spelled out so that columns added later can never become
    # accidental merge keys
    df = pd.merge(df, enh, how="left", on=enh_cols)

    df.to_csv(ALL, sep='\t', index=False)
else:
    df = pd.read_csv(ALL, sep='\t')
df.head()

Unnamed: 0,#chr,start,end,name,#chr_,start_,end_,id,enh_id
0,chr3,29837526,29837795,bj_k27ac_down_chr3:29837526-29837795,chr3,29837260,29838061,bj_k27ac_down,chr3:29837260-29838061|bj_k27ac_down
1,chr3,29837260,29837529,bj_k27ac_down_chr3:29837260-29837529,chr3,29837260,29838061,bj_k27ac_down,chr3:29837260-29838061|bj_k27ac_down
2,chr3,29837792,29838061,bj_k27ac_down_chr3:29837792-29838061,chr3,29837260,29838061,bj_k27ac_down,chr3:29837260-29838061|bj_k27ac_down
3,chr3,29837392,29837661,bj_k27ac_down_chr3:29837392-29837661,chr3,29837260,29838061,bj_k27ac_down,chr3:29837260-29838061|bj_k27ac_down
4,chr3,29837659,29837928,bj_k27ac_down_chr3:29837659-29837928,chr3,29837260,29838061,bj_k27ac_down,chr3:29837260-29838061|bj_k27ac_down


In [54]:
# Show the first rows of df.
df.head()

Unnamed: 0,#chr,start,end,name,#chr_,start_,end_,id,enh_id,shuf
0,chr3,29837526,29837795,bj_k27ac_down_chr3:29837526-29837795,chr3,29837260,29838061,bj_k27ac_down,chr3:29837260-29838061|bj_k27ac_down,False
1,chr3,29837260,29837529,bj_k27ac_down_chr3:29837260-29837529,chr3,29837260,29838061,bj_k27ac_down,chr3:29837260-29838061|bj_k27ac_down,False
2,chr3,29837792,29838061,bj_k27ac_down_chr3:29837792-29838061,chr3,29837260,29838061,bj_k27ac_down,chr3:29837260-29838061|bj_k27ac_down,False
3,chr3,29837392,29837661,bj_k27ac_down_chr3:29837392-29837661,chr3,29837260,29838061,bj_k27ac_down,chr3:29837260-29838061|bj_k27ac_down,False
4,chr3,29837659,29837928,bj_k27ac_down_chr3:29837659-29837928,chr3,29837260,29838061,bj_k27ac_down,chr3:29837260-29838061|bj_k27ac_down,False
