In [3]:
"""
Filter COMMON MPRA elements from HepG2, K562, WTC11 

Label.

Get cell-type-specific + shared elements 
    - Bedtools intersect elements together.

Intersect w/ FANTOM

Turn into .fa

Quantify GC dinucleotide content. 

Find cCRE centers -> run through sei
    
"""
from Bio.SeqUtils import gc_fraction

import config_readwrite as crw
import glob
import gzip

import numpy as np
import os
import pandas as pd
import pybedtools as pbt
import sys
import subprocess as sp
import time

# config

## load

In [4]:
config_tag = "config.ini"
config, cfn = crw.read(os.path.join(os.getcwd(), config_tag))

## add ccre section

In [5]:
SECTION = "MPRA_AGARWAL"

crw.check(config, SECTION)

# write

# source datapath
SRC = "/wynton/home/ahituv/fongsl/MPRA/agarwal_2023"

# add cell lines string
config[SECTION]["joint_library"] = os.path.join(SRC, "joint_library.csv")
config[SECTION]["all_cell_types_summary"] = os.path.join(SRC, "all_cell_types_summary.csv")

# add source path
config[SECTION]["src"] = SRC

# add output datapath
PATH = "/wynton/home/ahituv/fongsl/nullomers/data/lock"
config[SECTION]["path"] = PATH

# functions

## format data files

In [6]:
def formatLibFile(library_file):
    """
    turn library csv file (sup table 6 of agarwal 2023) into bed-like tsv

    input
        library file (str) - library csv w/ full path

    method
        1. make the file to write
        2. read library file as csv
        3. rename columns as first row. 
        4. get rid of first row w/ column names
        5. reformat "#chr" column
        6. rearrange data as .bed file like, keeping only hg38 annotations
        7. keep only the bed columns, write outfile
        8. make bed file

    return 
        outfile (str) - .tsv w/ hg38 .bedfile coordinates.
        outfile_bed (str) -  hg38 .bedfile coordinates

    """
    # 1
    outfile = library_file.strip(".csv") + '.tsv'
    outfile_bed = library_file.strip(".csv") + '.bed'

    # 2

    df = pd.read_csv(library_file, sep=",")

    # 3 name columns as first row
    df.columns = list(df.iloc[0])

    # 4 get rid of header in first row
    df = df[1:]

    # 5
    df["#chr"] = "chr" + df["chr.hg38"]

    # 6
    keep_cols = [
        "#chr",
        'start.hg38',
        'stop.hg38',
        'str.hg38',
        'name',
        'category',
        "230nt sequence (15nt 5' adaptor - 200nt element - 15nt 3' adaptor)"
    ]
    # 7            ]
    keep = df[keep_cols].dropna()

    keep.to_csv(outfile, sep='\t', index=False)

    #8

    bedcols = ["#chr",
               'start.hg38',
               'stop.hg38',
               'name', 'str.hg38']
    bed = keep[bedcols].dropna()
    bed[['start.hg38','stop.hg38',]] = bed[['start.hg38','stop.hg38',]].astype(int)
    bed.dropna().to_csv(outfile_bed, sep='\t', index=False)

    return outfile, outfile_bed

In [7]:
def makeLibFa(lib_tsvfile):
    """
    write fa file from library file after trimming 230nt hg38 sequence to 200bp test sequence 

    input
        lib_tsvfile (str) - library .tsv file (.bed like)

    method
        1. make outfile.fa, open for writing fa info
        2. parse through library file
        3. trim sequence to remove adaptors. 
        4. make fa name
        5. write row to .fa file

    return
        outfile (str) - path to .fa file

    """
    five_prime, three_prime = set(), set()
    # make .fa file to write
    outfile = lib_tsvfile.strip(".tsv") + ".fa"

    # open the file
    writer = open(outfile, "w")

    # read tsv file
    with open(lib_tsvfile, "r") as reader:
        for line in reader:
            if "#chr" not in line:

                # get info
                chr_, start, end, strand, name, cat, seq = line.strip(
                    "\n").split("\t")

                # trim sequence adaptors
                trim_seq = seq[15:-15]
                five_prime.add(seq[:15]), three_prime.add(seq[-15:])

                # make fa name
                fa_name = f">{chr_}:{start}-{end}"#_{strand}_{name}"
                row = f"{fa_name}\n{trim_seq}\n"

                writer.write(row)

    reader.close(), writer.close()
    print(five_prime, three_prime)
    return outfile

## FANTOM CAGE-seq intersection

In [8]:
# intersect w/ CAGE enhancers 

def fantomIntersection(bedfile, outdir):
    
    print("intersecting bedfile x FANTOM5", bedfile)
    
    CAGE = "/wynton/home/ahituv/fongsl/FANTOM5/F5.hg38.enhancers.bed.gz"

    # make the outfile 
    outfile = os.path.join(outdir, bedfile.strip(".bed") + "_x_FANTOM5.bed")
    
    # make bed objects
    A = pbt.BedTool(bedfile)
    B = pbt.BedTool(CAGE)
    
    # do intersection
    A.intersect(B, wa=True, output=outfile)
    
    return outfile

## liftOver

In [9]:
def liftOver(bedfile, from_build, to_build):
    
    print("liftover from", from_build, "to", "to_build", bedfile)
    
    #1
    SCRIPT = "/wynton/home/ahituv/fongsl/tools/evo/liftover_bed-wynton.py"
    
    #2
    cmd = ["python", 
           SCRIPT, 
           bedfile, 
           "-f", from_build, 
           "-t", to_build
          ]
    
    #3
    print(" ".join(cmd))
    #sp.call(" ".join(cmd), shell=True)
    
    #4
    outfile = bedfile.strip(".bed") + f".liftOver.to.{to_build}.bed"
    
    return outfile

## .bed -> .fa

In [10]:
def bed2fa(bedfile, build):
    """
    convert .bed -> .fa using custom script (which uses bedtools' getfasta command)
    
    input
        bedfile (str) - path to filtered bed file
        build (str) - genome build
        
    method
        1. make script variable
        2. construct command list
        3. call command with subprocess
        
    return
        out (str) - resultant .fa file. 
    """
    print(".bed -> .fa", bedfile)
    #1
    SCRIPT = "/wynton/home/ahituv/fongsl/tools/genome/fasta_from_bed.py"
    
    #2
    cmd = ["python", 
           SCRIPT, 
           bedfile, 
           '-b', build,
          ]
    
    #3
    print(" ".join(cmd))
    #sp.call(" ".join(cmd), shell=True)
    
    #4
    out = bedfile.strip(".bed") + ".fa"
    
    return out

## centering bed file and extending from midpoint
- for sei analysis

In [21]:
def centerBed(bedfile):

    # check for header
    is_header = [True if "#chr" in list(
        pd.read_csv(bedfile, sep='\t', nrows=1)) else False]

    if is_header == True:
        df = pd.read_csv(bedfile, sep='\t').dropna()

    else:
        df = pd.read_csv(bedfile, sep='\t', header=None).dropna()

        #chr 	start.hg38 	stop.hg38 	str.hg38 	name
    df.columns = ["#chr", "start", "end", "id",  'strand']
    df = df.loc[df["end"] != "stop.hg38"]
    df[["start", "end"]]=df[["start", "end"]].astype(int)
    df["len"] = df["end"]-df["start"] # calculate the length
    df["center_start"] = df["len"].astype(int).divide(2).astype(
        int) + df["start"]  # find center, int to round,
    
    # plus 1 so that end is zero coordinate (kind of like SNP)
    df["center_end"] = df["center_start"] + 1

    outfile = bedfile.strip(".bed") + ".centered.bed"

    df[["#chr", "center_start", "center_end", "id"]
       ].drop_duplicates().to_csv(outfile, sep='\t', index=False)

    return outfile


def extendBed(bedfile, flanksize):
    """
    expand bed

    require
        bedtools slop
        wynton

    input
        bed (str) - path to bed file
        flanksize (int) - length to extend bed coordinates by

    method 
        1. get genome size. 
        2. make outfile name
        3. bedtools slop command

    return
        out (str) - path to results.bed
    """
    genome = "/wynton/home/ahituv/fongsl/dna/hg38/hg38.chrom.sizes"

    path, file = os.path.split(bedfile)

    outfile = os.path.join(path, file.strip(".bed") +
                           f".ext.{flanksize*2}bp.bed")
    cmd = f"bedtools slop -i {bedfile} -g {genome} -b {flanksize} > {outfile}"
    os.system(cmd)

    return outfile

## sei

In [12]:
def launchSei(bedfile, build):

    SCRIPT = "/wynton/home/ahituv/fongsl/bin/sei-framework/sarah_scripts/launch_qsub.py"
    
    SEIDIR = os.path.join(os.path.split(bedfile)[0], "sei_predictions")
    
    if os.path.exists(SEIDIR) is False:
        os.mkdir(SEIDIR)
        
    cmd = ['python',
           SCRIPT, 
           bedfile, 
           build, 
           SEIDIR
          ]
    
    sp.call(" ".join(cmd), shell=True)

## make kmer space

In [13]:
def makeKmerSpace(fasta, kmer_len):
    SCRIPT = "/wynton/home/ahituv/fongsl/nullomers/bin-lock/make_element_kmer_space.py"

    cmd = ["python",
           SCRIPT,
           fasta,
           str(kmer_len),
           "5",
           ]

    print(" ".join(cmd))
    sp.call(" ".join(cmd), shell=True)
    


## map between kmer space and genome coordinates

In [14]:
def mapKmerSpace(fasta, kmer_len, cl, config):
    SCRIPT = "/wynton/home/ahituv/fongsl/nullomers/bin-lock/map_to_kmer_space.py"
    cmd = ["python",
           SCRIPT,
           fasta,
           cl, 
           str(kmer_len),
           "5",
           config
           ]

    print(" ".join(cmd))
    sp.call(" ".join(cmd), shell=True)

## mutagenize kmers

In [15]:
def mutagenizeKmers(cl, kmer_len, nmuts, build, first_order, config):
    SCRIPT = "/wynton/home/ahituv/fongsl/nullomers/bin-lock/mutagenize_kmers.sh"

    cmd = ["qsub",
           SCRIPT,
           cl,
           str(kmer_len),
           str(nmuts),
           build,
           first_order,
           config
           ]

    print(" ".join(cmd))
    sp.call(" ".join(cmd), shell=True)

## predict TFBS w/ FIMO

In [16]:
def fimo(cl, kmer_len, nmuts, config):
    SCRIPT = "/wynton/home/ahituv/fongsl/nullomers/bin-lock/fimo_null.sh"

    cmd = ["qsub",
           SCRIPT,
           cl,
           str(kmer_len),
           str(nmuts),
           config
           ]

    print(" ".join(cmd))
    sp.call(" ".join(cmd), shell=True)

# build datasets for each cell type

In [17]:
from_build, to_build = "hg38", 'hs1'

In [18]:
# add cell line specific paths, bedfiles.
filtered_files = []

LIB = config[SECTION]["joint_library"]

# add section
SECTIONCL = "common"

crw.check(config, SECTIONCL)

# format library.csv file -> .tsv, .bed
TSV, BED = formatLibFile(LIB)
config[SECTIONCL][f"tsv_{from_build}"] = TSV
config[SECTIONCL][f"bed_{from_build}"] = BED

# make fa file (hg38)
FA = makeLibFa(TSV)
config[SECTIONCL][f"fasta_{from_build}"] = FA

{'AGGACCGGATCAACT'} {'CATTGCGTGAACCGA'}


In [19]:
PATH

'/wynton/home/ahituv/fongsl/nullomers/data/lock'

In [22]:
# add data output path
PATH_CL = os.path.join(PATH,  "processed")
config[SECTIONCL]["path"] = PATH_CL

###
# hg38
###

# center and extend filtered.hg38.bed for sei
CENTERED_hg38 = centerBed(BED)
config[SECTIONCL][f"bed_centered_{from_build}"] = CENTERED_hg38

EXTENDED_hg38 = extendBed(CENTERED_hg38, 2048)  # extend 2048 bases in each direction for sei
config[SECTIONCL][f"bed_extended_{from_build}"] = EXTENDED_hg38

# center and extend filtered.hg38.bed for sei
FA_EXTENDED_hg38 = bed2fa(EXTENDED_hg38, from_build)
config[SECTIONCL][f"fa_extended_{from_build}"] = FA_EXTENDED_hg38

# FANTOM overlap
BED_X_FANTOM = fantomIntersection(BED, PATH_CL)
config[SECTIONCL]["xFANTOM"] = BED_X_FANTOM
  

.bed -> .fa /wynton/home/ahituv/fongsl/MPRA/agarwal_2023/joint_library.center.ext.4096bp.bed
python /wynton/home/ahituv/fongsl/tools/genome/fasta_from_bed.py /wynton/home/ahituv/fongsl/MPRA/agarwal_2023/joint_library.center.ext.4096bp.bed -b hg38
intersecting bedfile x FANTOM5 /wynton/home/ahituv/fongsl/MPRA/agarwal_2023/joint_library.bed


In [23]:
###
# HS1
###

# LiftOver - to make kmer space. 

LIFTED = liftOver(BED, from_build, to_build)
config[SECTIONCL][f"bed_{to_build}"] = LIFTED
    
# .bed -> .fa for kmer space
FA_LIFTED = bed2fa(LIFTED, to_build)
config[SECTIONCL][f"fasta_{to_build}"] = FA_LIFTED
    
# center and extend filtered.hs1.bed for sei
CENTERED = centerBed(LIFTED)
config[SECTIONCL][f"bed_centered_{to_build}"] = CENTERED

    
EXTENDED = extendBed(CENTERED, 2048)  # extend 2048 bases in each direction for sei
config[SECTIONCL][f"bed_extended_{to_build}"] = EXTENDED

FA_EXTENDED = bed2fa(EXTENDED, to_build)
config[SECTIONCL][f"fa_extended_{to_build}"] = FA_EXTENDED


crw.write(config, cfn)

liftover from hg38 to to_build /wynton/home/ahituv/fongsl/MPRA/agarwal_2023/joint_library.bed
python /wynton/home/ahituv/fongsl/tools/evo/liftover_bed-wynton.py /wynton/home/ahituv/fongsl/MPRA/agarwal_2023/joint_library.bed -f hg38 -t hs1
.bed -> .fa /wynton/home/ahituv/fongsl/MPRA/agarwal_2023/joint_library.liftOver.to.hs1.bed
python /wynton/home/ahituv/fongsl/tools/genome/fasta_from_bed.py /wynton/home/ahituv/fongsl/MPRA/agarwal_2023/joint_library.liftOver.to.hs1.bed -b hs1
.bed -> .fa /wynton/home/ahituv/fongsl/MPRA/agarwal_2023/joint_library.liftOver.to.hs1.center.ext.4096bp.bed
python /wynton/home/ahituv/fongsl/tools/genome/fasta_from_bed.py /wynton/home/ahituv/fongsl/MPRA/agarwal_2023/joint_library.liftOver.to.hs1.center.ext.4096bp.bed -b hs1


In [22]:
FA_LIFTED = '/wynton/home/ahituv/fongsl/MPRA/agarwal_2023/joint_library.liftOver.to.hs1.fa'    
# make kmer spaces and mutagenize
CL = "common"
NMUTS = 2
to_build = 'hs1'
for len_k in np.arange(11, 24):
    if len_k ==15:
        # quantify kmer space
        #makeKmerSpace(FA_LIFTED, len_k)
    
        # map kmer space
        # mapKmerSpace(FA_LIFTED, len_k, CL, cfn)
    
        FO = "True" # keep only first order? 
        
        # mutagenize to create nullomers.  
        mutagenizeKmers(CL, len_k, NMUTS, to_build, FO, cfn)
        
        # TFBS motif/ binding prediction
        #fimo(CL, len_k, NMUTS, cfn)
    
    # launch Sei predictions - runs one at a time. Might be best to parallelize in the future. 
    # launchSei(EXTENDED, from_build)
    
    

qsub /wynton/home/ahituv/fongsl/nullomers/bin-lock/mutagenize_kmers.sh common 15 2 hs1 True /wynton/home/ahituv/fongsl/nullomers/bin-lock/config.ini
Your job 9242317 ("mutagenize_kmers.sh") has been submitted


In [None]:
FA_EXTENDED

In [None]:
9292835/(58593973 + 55384369 + 55406747 + 58350816),(58593973 + 55384369 + 55406747 + 58350816)

In [None]:
4**14, 9292835/(4**14)