In [1]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import config_readwrite as crw

import glob
import gzip
import numpy as np
import os, sys
import pandas as pd
import subprocess as sp

# find coordinates of kmers

In [2]:
CL, MER, NMUTS, BUILD, FO_ONLY = "common", "15", "2", "hs1", True
config, cfn = crw.read(os.path.join(os.getcwd(), "config.ini"))

In [7]:
FA_MUT =  config[f"common.{MER}mer.{NMUTS}mut"]["fa"]
TSV_MUT =  config[f"common.{MER}mer.{NMUTS}mut"]["tsv"]
#TSV_MUT= '/wynton/home/ahituv/fongsl/nullomers/data/lock/common/_common.14mers.2mut.nulls.fo.tsv'
#FA_MUT = '/wynton/home/ahituv/fongsl/nullomers/data/lock/common/_common.14mers.2mut.nulls.fo.fa'

BED = config[CL]["bed_hs1"]
FA_EXT_BED = config[CL]["fa_extended_hs1"]
EXTENDED = config[CL]["bed_extended_hs1"]
FA = config[CL]["fasta_hs1"]

In [8]:
def faToDict(fa_file):
    """
    turn all the fa coordinates into a dictionary. 
    """

    fa_dict = {}
    with open(fa_file, "r") as fasta_reader:
        for values in SimpleFastaParser(fasta_reader):
            
            name, seq = values
            fa_dict[name] = seq
            
    return fa_dict

In [9]:
def dissectFaId(line):
    """
    dissect mutated fa id line for kmer, nullomer information, and get the coordinate file
    """
    # process the line
    kmer, chr_, coor, strand, id_ = line.strip('\n').strip(">").split(".")
            
    # get key and value sequence to look up
    key, value_seq = kmer[:5], kmer[5:]

    bed_coor = chr_ + ":" + coor
    
    region_str = ".".join([kmer, bed_coor, strand, id_]) #TACCCATTCGAGTC.chr1.100333924-100334124.fwd
    
    
    return kmer, key, value_seq, bed_coor, region_str

In [10]:
def seqInsert(seq, insert_start, insert_seq):
    """
    insert sequence fragment (insert_seq) at position (insert_start) within full sequence (seq)
    
    return inserted sequence. 
    """

    insert_size = len(insert_seq)
    
    insert_end = insert_start + insert_size  # find center end

    return seq[:int(insert_start)] + insert_seq + seq[int(insert_end):]

In [11]:
def findKmerInsertNull(kmer, null, fa_sequence, region_id):
    
    """
    return positions where the nullomer is in fa sequence
    """
    pos = {}
    kmerlen = len(kmer)
        
    nwindows = len(fa_sequence) - kmerlen + 1

    for n in np.arange(nwindows):

        start, stop = n, n+kmerlen
        snippet = fa_sequence[start:stop]
        if kmer.upper() == snippet.upper():
            null_insert = seqInsert(fa_sequence.lower(), start, null.upper())  # insert the nullomer sequence into the sequence

            pos[f"{region_id}.{start}"] = null_insert
    return pos

In [12]:
def mapBed(bedfile, expandedfile):

    bed = pd.read_csv(bedfile, sep='\t', header=None, names =["#chr", "start", "end", "strand", "id"])  
    bed["coor"] = bed["#chr"] +":"+bed["start"].map(str) + "-" + bed["end"].map(str)

    ext = pd.read_csv(expandedfile, sep='\t', header=None, names =["#chr_e", "start_e", "end_e", "id"])
    ext["coor_e"] = ext["#chr_e"] +":"+ext["start_e"].map(str) + "-" + ext["end_e"].map(str)

    joint = pd.merge(ext[["id", "coor_e"]], bed[["id", "coor"]])
    
    return joint

In [13]:
def insert4096(nulls_dict, joint_bedex, fa_ex_dict, coor):
    
    extended_nullseqs = {}
    
    for key, null_seq in nulls_dict.items():
        
        extended_coor = joint_bedex.loc[joint_bedex["coor"]==coor, "coor_e"].iloc[0] # extend the coordinates to 4096
        
        extended_seq = fa_ex_dict[extended_coor]  # get the fasta file linked to the extended coordinates
        
        insert_start = len(extended_seq)/2 - 100 # get the start to insert  # figure out where to insert the sequence
        
        null4096 = seqInsert(extended_seq.lower(), insert_start, null_seq.upper())  # insert the 200bp MPRA tile w/ nullomer variants into sequence
        
        extended_nullseqs[key]=null4096  # add to the collection dictionary along with original key
        
    
    return extended_nullseqs, extended_seq

In [14]:
def prettifySeq(original, mut):
    """
    prettify sequence. 
        All matching bases will be written as "."
        All non matching bases will be written w mutated base identity. 
    """
    prettyseq = ""
    for o, m in zip(original, mut):
        if o != m:
            prettyseq += m
        else:
            prettyseq += "."

    return prettyseq

In [16]:
FA_MUT

'/wynton/home/ahituv/fongsl/nullomers/data/lock/common/common.15mers.2mut.nulls.fo.pam.fa'

In [15]:
fa_dict = faToDict(FA)
fa_ex_dict = faToDict(FA_EXT_BED)
joint_bedex=mapBed(BED, EXTENDED)


v, partial = 0, 0
issues = []
with open(FA_MUT, "r") as fasta_reader:
    out = FA_MUT.strip(".fa") + ".ext4096.fa"
    
    for values in SimpleFastaParser(fasta_reader):
        
        # parse fasta
        name, null = values
        
        # get kmer information
        kmer, key, value_seq, coor, region_id = dissectFaId(name)
        
        if 'rev' in region_id:  # skip reverse elements for now. 
            continue
        else:

            # fa linked to regions (linked to kmer)
            region_fa = fa_dict[coor]

            # get sequences w/ nulls inserted.  
            nulls = findKmerInsertNull(kmer, null, region_fa, region_id)

            # insert 200bp null sequence into the 4096 sequence
            null_inserts, original_seq = insert4096(nulls, joint_bedex, fa_ex_dict, coor)

            if v < 5:
                print("\nkmer\n", kmer, "\nkey\n", key, "\nvalue\n", value_seq)
                print("\nthe nullomer sequence\n", null, "\n\ninserted\n", nulls)
                print("\n4096 w/ null sequence\n", null_inserts)
                print("\n pretty 4096", prettifySeq(original_seq.upper(), list(null_inserts.values())[0].upper()))

                # open writer
                writer = open(FA_MUT.strip(".fa") + f".ext4096.{partial}.fa", "w")

            # write original row
            if len(original_seq)>=4095:

                writer.write(f">{region_id}\n{original_seq.upper()}\n")

                # write nullomers
                for null_seqname, null_seq in null_inserts.items():
                    writer.write(f">{null_seqname}\n{null_seq.upper()}\n")
                v+=1
            else:
                #print("ISSUES", name, region_str, len(original_seq))
                issues.append(region_id)
                continue

            if v%999==0:
                # finish writing the 100th sequence, close the file, then write a new file for the next 100 lines
                writer.close()
                partial +=1
                writer = open(FA_MUT.strip(".fa") + f".ext4096.{partial}.fa", "w")

writer.close()


FileNotFoundError: [Errno 2] No such file or directory: '/wynton/home/ahituv/fongsl/nullomers/data/lock/common/common.15mers.2mut.nulls.fo.pam.fa'

In [None]:
partial

# write config

In [28]:
crw.check(config, f"common.{MER}mer")
config[f"common.{MER}mer"]["coor"] = COOR
config[f"common.{MER}mer.{NMUTS}mut"]["ext_fa"]=out

crw.write(config, cfn)

# predict w sei

In [29]:
def launchSei(file, build, qsub):
    
    SEIDIR = os.path.join(os.path.split(file)[0], "sei_predictions")
    
    if os.path.exists(SEIDIR) is False:
        os.mkdir(SEIDIR)
    print(SEIDIR)
    
    if qsub is True:
        SCRIPT = "/wynton/home/ahituv/fongsl/nullomers/bin-lock/interpret_sei_array.sh"
        cmd = ['qsub',
           SCRIPT, 
           build, 
           SEIDIR
          ]
        
    else:
        SCRIPT = "/wynton/home/ahituv/fongsl/bin/sei-framework/sarah_scripts/launch_qsub.py"

        cmd = ['python',
           SCRIPT, 
           file, 
           build, 
           SEIDIR
          ]
    
    sp.call(" ".join(cmd), shell=True)

## launch sei

In [33]:
fs = glob.glob("/wynton/home/ahituv/fongsl/nullomers/data/lock/common/common.14mers.2mut.nulls.fo.ext4096.*.fa")
print(len(fs))

115


In [34]:
qsub = False
for out in fs:
    launchSei(out, "hs1", qsub)

/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 2596392 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 2596393 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 2596394 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 2596395 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 2596396 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 2596397 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 2596398 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 2596399 ("interpret_sei.sh") has

Your job 2596456 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 2596457 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 2596458 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 2596459 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 2596460 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 2596461 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 2596462 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 2596463 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/

In [None]:
len(fs)