In [42]:
"""
Filter ENCODE cCREs from HepG2, K562, 293T for pELS, dELS, PLS +/- CTCF elements

Label each element with its cell line.

Get cell-type-specific + shared elements 
    - Bedtools intersect elements together.

Intersect w/ FANTOM

Turn into .fa

Quantify GC dinucleotide content. 

Find cCRE centers -> run through sei
    
"""
from Bio.SeqUtils import gc_fraction

import config_readwrite as crw
import glob
import gzip

import numpy as np
import os
import pandas as pd
import pybedtools as pbt
import sys
import subprocess as sp
import time

# config

## load

In [2]:
# The config file is expected in the current working directory.
config_tag = "config.ini"
config_path = os.path.join(os.getcwd(), config_tag)

# crw.read returns the config object plus a second value (kept as `cfn`).
config, cfn = crw.read(config_path)

## add ccre section

In [3]:
# name of the config section that holds the shared cCRE settings
SECTION = "CCRE"

# root directory of the ENCODE source data
SRC = "/wynton/home/ahituv/fongsl/encode"

# root directory for everything this notebook writes
PATH = "/wynton/home/ahituv/fongsl/nullomers/data/lock"

# cell lines to process ("293t" is currently left out)
CLS = ["k562", "hepg2"]

# presumably creates the section when it is missing - confirm in config_readwrite
crw.check(config, SECTION)

# record the settings (cell lines are stored as one comma-separated string)
config[SECTION]["cls"] = ",".join(CLS)
config[SECTION]["src"] = SRC
config[SECTION]["path"] = PATH

# functions

## filtering cCREs

In [4]:
def filterCcre(bedfile, CL, outdir):
    """
    filter cCRE file for proximal/distal enhancer elements, promoters +/- CTCF

    input
        bedfile (str) - cCRE .bed file for one cell line (tab-separated, no header;
                        columns 0-3 are chr/start/end/encodeId, column 9 is the label)
        CL (str) - name of cell line
        outdir (str) - path to output directory

    require
        pandas

    method
        1. Open bed file
        2. filter bed coordinates based on annotations

            - "dELS" - distal enhancer-like signature,
                determined from z-scoring of DHS plus
                high DNase and H3K27ac signals,
                more than 2 kb from the nearest TSS

            - "pELS" - proximal enhancer-like signature,
                determined from z-scoring of DHS plus
                within 2 kb of a TSS,
                high DNase and H3K27ac signals,
                low relative H3K4me3 signal

            - "PLS" - promoter-like signature,
                determined from z-scoring of DHS plus
                falls within 200 bp (centre-to-centre)
                of an annotated GENCODE TSS that has high DNase and H3K4me3 signals.

            - more info? All of this was taken from Box 1. See - https://doi.org/10.1038/s41586-020-2493-4

        3. annotate w CL ("CL-id" column = "<CL>-<encodeId>")
        4. make the outdir (and any missing parents) if not already made.
        5. write the filtered table, with a header line, to
           <outdir>/filtered.<CL>.cCRE.bed

    return
        (str) - path to filtered, indexed cl-specific cCRE coordinates

    """
    print("filtering cCREs for dels, pels, pls")

    # labels to keep: each element class with and without CTCF binding
    keep_labels = [
        'PLS',
        'PLS,CTCF-bound',
        'dELS',
        'dELS,CTCF-bound',
        'pELS',
        'pELS,CTCF-bound',
    ]

    # 1 open bed file, keep only bed coordinate columns, encodeId, and feature label
    df = pd.read_csv(bedfile, sep='\t',
                     header=None, usecols=[0, 1, 2, 3, 9],
                     names=["#chr", "start", "end", "encodeId", "label"]
                     )
    print("source df", df.shape)

    # 2 filter dataframe for labels
    # .copy() so the column added below is written to an independent frame,
    # not to a slice of df (avoids pandas' SettingWithCopyWarning)
    filtered = df.loc[df["label"].isin(keep_labels)].copy()

    print("filtered", filtered.shape)

    # 3 annotate CL-id
    filtered["CL-id"] = CL + "-" + filtered["encodeId"]

    # 4 makedirs also creates missing parent directories and is a no-op
    #   when the directory already exists
    os.makedirs(outdir, exist_ok=True)

    # 5 write file
    outfile = os.path.join(outdir, f"filtered.{CL}.cCRE.bed")

    filtered.to_csv(outfile, sep='\t', index=False)

    # return filtered file
    return outfile

## FANTOM CAGE-seq intersection

In [5]:
# intersect w/ CAGE enhancers 

def fantomIntersection(bedfile, CL, outdir):
    """
    intersect a bed file with FANTOM5 CAGE enhancers

    input
        bedfile (str) - path to the bed file to intersect
        CL (str) - name of cell line (not used in the body; kept for callers)
        outdir (str) - directory the result is written to

    require
        pybedtools

    method
        1. build the output name from the bed file's base name
        2. make BedTool objects for the bed file and the FANTOM5 enhancers
        3. intersect, keeping the original bedfile entries (wa=True)

    return
        outfile (str) - <outdir>/<bed file name without .bed>_x_FANTOM5.bed
    """
    print("intersecting bedfile x FANTOM5", bedfile)

    CAGE = "/wynton/home/ahituv/fongsl/FANTOM5/F5.hg38.enhancers.bed.gz"

    # 1 make the outfile
    # use the base name so outdir is honoured even when bedfile is an absolute
    # path, and drop only a trailing ".bed" (str.strip(".bed") would remove any
    # leading/trailing '.', 'b', 'e', 'd' characters)
    fname = os.path.basename(bedfile)
    stem = fname[:-len(".bed")] if fname.endswith(".bed") else fname
    outfile = os.path.join(outdir, stem + "_x_FANTOM5.bed")

    # 2 make bed objects
    A = pbt.BedTool(bedfile)
    B = pbt.BedTool(CAGE)

    # 3 do intersection, written straight to outfile
    A.intersect(B, wa=True, output=outfile)

    return outfile

## liftOver

In [6]:
def liftOver(bedfile, from_build, to_build):
    """
    build the liftOver command for a bed file and the name of the lifted file

    input
        bedfile (str) - path to bed file
        from_build (str) - genome build of bedfile
        to_build (str) - genome build to lift to

    method
        1. make script variable
        2. construct command list
        3. print the command (the subprocess call is commented out, so
           nothing is executed here)
        4. make the lifted file name

    return
        outfile (str) - <bedfile without .bed>.liftOver.to.<to_build>.bed
                        (presumably where the script writes - confirm against the script)
    """
    print("liftover from", from_build, "to", to_build, bedfile)

    #1
    SCRIPT = "/wynton/home/ahituv/fongsl/tools/evo/liftover_bed-wynton.py"

    #2
    cmd = ["python",
           SCRIPT,
           bedfile,
           "-f", from_build,
           "-t", to_build
          ]

    #3
    print(" ".join(cmd))
    #sp.call(" ".join(cmd), shell=True)

    #4 drop only a trailing ".bed"; str.strip(".bed") would remove any
    #  leading/trailing '.', 'b', 'e', 'd' characters from the path
    stem = bedfile[:-len(".bed")] if bedfile.endswith(".bed") else bedfile
    outfile = stem + f".liftOver.to.{to_build}.bed"

    return outfile

## .bed -> .fa

In [7]:
def bed2fa(bedfile, build):
    """
    convert .bed -> .fa using custom script (which uses bedtools' getfasta command)

    input
        bedfile (str) - path to filtered bed file
        build (str) - genome build

    method
        1. make script variable
        2. construct command list
        3. print the command (the subprocess call is commented out, so
           nothing is executed here)
        4. make the .fa file name

    return
        out (str) - resultant .fa file name: <bedfile without .bed>.fa
    """
    print(".bed -> .fa", bedfile)
    #1
    SCRIPT = "/wynton/home/ahituv/fongsl/tools/genome/fasta_from_bed.py"

    #2
    cmd = ["python",
           SCRIPT,
           bedfile,
           '-b', build,
          ]

    #3
    print(" ".join(cmd))
    #sp.call(" ".join(cmd), shell=True)

    #4 drop only a trailing ".bed"; str.strip(".bed") would remove any
    #  leading/trailing '.', 'b', 'e', 'd' characters from the path
    stem = bedfile[:-len(".bed")] if bedfile.endswith(".bed") else bedfile
    out = stem + ".fa"

    return out

## centering bed file and extending from midpoint
- for sei analysis

In [8]:
def centerBed(bedfile):
    """
    reduce each bed element to a 1 bp interval at its midpoint

    input
        bedfile (str) - path to tab-separated bed file, with or without a
                        header line containing "#chr"; without a header the
                        file must have exactly six columns

    require
        pandas

    method
        1. check whether the first line is a header
        2. read the file, naming the columns when there is no header
        3. center_start = start + int((end - start) / 2), center_end = center_start + 1
        4. write #chr, center_start, center_end, encodeId (duplicates dropped)

    return
        outfile (str) - <bedfile without .bed>.centered.bed
    """
    # 1 check for header - a plain bool (a one-element list never equals True)
    has_header = "#chr" in list(pd.read_csv(bedfile, sep='\t', nrows=1))

    # 2
    if has_header:
        df = pd.read_csv(bedfile, sep='\t')

    else:
        df = pd.read_csv(bedfile, sep='\t', header=None)
        df.columns = ["#chr", "start", "end", 'encodeId', "label", "CL-id"]

    # 3
    df["len"] = df["end"] - df["start"]  # calculate the length
    df["center_start"] = df["len"].divide(2).astype(
        int) + df["start"]  # find center, int to round,

    # plus 1 so that end is zero coordinate (kind of like SNP)
    df["center_end"] = df["center_start"] + 1

    # drop only a trailing ".bed"; str.strip(".bed") would remove any
    # leading/trailing '.', 'b', 'e', 'd' characters from the path
    stem = bedfile[:-len(".bed")] if bedfile.endswith(".bed") else bedfile
    outfile = stem + ".centered.bed"

    # 4
    df[["#chr", "center_start", "center_end", "encodeId"]
       ].drop_duplicates().to_csv(outfile, sep='\t', index=False)

    return outfile


def extendBed(bedfile, flanksize):
    """
    expand bed

    require
        bedtools slop
        wynton

    input
        bedfile (str) - path to bed file
        flanksize (int) - length to extend bed coordinates by, in each direction

    method
        1. get genome size.
        2. make outfile name
        3. bedtools slop command, stdout written to the outfile

    return
        outfile (str) - path to results.bed, named
                        <bedfile without .bed>.ext.<2*flanksize>bp.bed

    raise
        subprocess.CalledProcessError - if bedtools exits with a non-zero status
    """
    # 1
    genome = "/wynton/home/ahituv/fongsl/dna/hg38/hg38.chrom.sizes"

    # 2 drop only a trailing ".bed"; str.strip(".bed") would remove any
    #   leading/trailing '.', 'b', 'e', 'd' characters from the name
    path, file = os.path.split(bedfile)
    stem = file[:-len(".bed")] if file.endswith(".bed") else file

    outfile = os.path.join(path, stem + f".ext.{flanksize*2}bp.bed")

    # 3 argument list (no shell), so paths need no quoting; check=True
    #   surfaces a failed bedtools run instead of silently ignoring it
    cmd = ["bedtools", "slop",
           "-i", bedfile,
           "-g", genome,
           "-b", str(flanksize),
           ]
    with open(outfile, "w") as handle:
        sp.run(cmd, stdout=handle, check=True)

    return outfile

## sei

In [9]:
def launchSei(bedfile, build):
    """
    launch the sei launcher script on a bed file

    input
        bedfile (str) - path to bed file
        build (str) - genome build, passed through to the launcher

    method
        1. make a "sei_predictions" directory next to the bed file if missing
        2. run the launcher script through the shell with
           bedfile, build and that directory as arguments

    return
        None
    """
    launcher = "/wynton/home/ahituv/fongsl/bin/sei-framework/sarah_scripts/launch_qsub.py"

    # 1 predictions directory sits beside the input bed file
    bed_dir = os.path.split(bedfile)[0]
    sei_dir = os.path.join(bed_dir, "sei_predictions")

    if not os.path.exists(sei_dir):
        os.mkdir(sei_dir)

    # 2
    command = " ".join(['python', launcher, bedfile, build, sei_dir])

    sp.call(command, shell=True)

## make kmer space

In [36]:
def makeKmerSpace(fasta, kmer_len):
    """
    run the make_kmer_space.py script on a fasta file

    input
        fasta (str) - path to .fa file
        kmer_len (int) - kmer length, passed to the script as a string

    method
        1. build the command string (the script also gets a fixed third
           argument "5" - meaning not visible here, check the script)
        2. print it, then run it through the shell

    return
        None
    """
    script = "/wynton/home/ahituv/fongsl/nullomers/bin-lock/make_kmer_space.py"

    # 1
    command = " ".join(("python", script, fasta, str(kmer_len), "5"))

    # 2
    print(command)
    sp.call(command, shell=True)

## mutagenize kmers

In [49]:
def mutagenizeKmers(cl, kmer_len, nmuts, build):
    """
    submit the mutagenize_kmers.sh script with qsub

    input
        cl (str) - name of cell line
        kmer_len (int) - kmer length, passed as a string
        nmuts (int) - number of mutations, passed as a string
        build (str) - genome build

    method
        1. build the qsub command string
        2. print it, then run it through the shell

    return
        None
    """
    script = "/wynton/home/ahituv/fongsl/nullomers/bin-lock/mutagenize_kmers.sh"

    # 1
    args = (cl, str(kmer_len), str(nmuts), build)
    command = " ".join(("qsub", script) + args)

    # 2
    print(command)
    sp.call(command, shell=True)

# build datasets for each cell type

In [None]:
# Build the per-cell-line files and record every path in the config,
# one "<SECTION>_<cell line>" section per cell line.
filtered_files = []  # (cell line, filtered bed path) pairs

for CL in CLS:

    # add section
    SECTIONCL = SECTION + "_" + CL

    crw.check(config, SECTIONCL)

    # add path
    SRC_PATH_CL = os.path.join(SRC, CL)
    config[SECTIONCL]["src_path"] = SRC_PATH_CL

    # add data output path
    PATH_CL = os.path.join(PATH, CL)
    config[SECTIONCL]["path"] = PATH_CL

    # add bed - fail with a readable message when no source file matches,
    # and sort so the pick is stable if several files match
    bed_pattern = os.path.join(SRC_PATH_CL, CL + "*ENCFF*.bed")
    bed_matches = sorted(glob.glob(bed_pattern))
    if not bed_matches:
        raise FileNotFoundError(f"no cCRE bed file matches {bed_pattern}")
    BED = bed_matches[0]
    config[SECTIONCL]["bed"] = BED

    # add filtered bed
    FILTEREDBED = filterCcre(BED, CL, PATH_CL)
    config[SECTIONCL]["bed_filtered"] = FILTEREDBED

    filtered_files.append((CL, FILTEREDBED))

    # FANTOM overlap
    BED_X_FANTOM = fantomIntersection(FILTEREDBED, CL, PATH_CL)
    config[SECTIONCL]["xFANTOM"] = BED_X_FANTOM

    # LiftOver - to make kmer space.
    from_build, to_build = "hg38", 'hs1'
    LIFTED = liftOver(FILTEREDBED, from_build, to_build)
    config[SECTIONCL]["lifted_hs1"] = LIFTED

    # .bed -> .fa for kmer space
    FASTA = bed2fa(LIFTED, to_build)
    config[SECTIONCL]["hs1_fasta"] = FASTA

    # center and extend filtered.hg38.bed for sei
    # (the filtered bed of this cell line is the input)
    CENTERED = centerBed(FILTEREDBED)
    config[SECTIONCL]["bed_centered"] = CENTERED

    EXTENDED = extendBed(CENTERED, 2048)  # extend 2048 bases in each direction for sei
    # stored under its own key so the centered path above is not overwritten
    config[SECTIONCL]["bed_centered_ext"] = EXTENDED

    # launch Sei predictions - runs one at a time. Might be best to parallelize in the future.
    # launchSei(EXTENDED, from_build)

    # make kmer spaces for k = 11..23
    for len_k in np.arange(11, 24):
        makeKmerSpace(FASTA, len_k)

    # mutagenize to create nullomers.

    # conservation

    # TFBS motif/ binding prediction
    
    

# find overlap between these sequences

In [None]:
# hepg2 cCRE sequences lifted to hs1
FASTA = "/wynton/home/ahituv/fongsl/nullomers/data/lock/hepg2/filtered.hepg2.cCRE.liftOver.to.hs1.fa"

# only the first k of 11..23 (k = 11) is run in this cell
len_k = np.arange(11, 24)[0]
makeKmerSpace(FASTA, len_k)

In [None]:
#crw.checkOpt(config, "11mer", "results")

# mutagenize kmers

In [56]:
# parameters for one mutagenesis job
CL = "hepg2"      # cell line
KMER_LEN = 14     # kmer length
NMUTS = 2         # number of mutations
BUILD = "hs1"     # genome build

# submit the job
mutagenizeKmers(cl=CL, kmer_len=KMER_LEN, nmuts=NMUTS, build=BUILD)

qsub /wynton/home/ahituv/fongsl/nullomers/bin-lock/mutagenize_kmers.sh hepg2 14 2 hs1
Your job 2423269 ("mutagenize_kmers.sh") has been submitted
