In [1]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import config_readwrite as crw

import glob
import gzip
import numpy as np
import os, sys
import pandas as pd
import subprocess as sp
import time

# find coordinates of kmers

In [2]:
CL, MER, NMUTS, BUILD, FO_ONLY = "common", "15", "2", "hg38", True
config, cfn = crw.read(os.path.join(os.getcwd(), "config.ini"))

In [3]:
TSV_MUT= config[f"{CL}.{MER}mer.{NMUTS}mut"]["concat-10"]

BED = config[CL][f"bed_{BUILD}"]
FA_EXT_BED = config[CL][f"fa_extended_{BUILD}"]
EXTENDED = config[CL][f"bed_extended_{BUILD}"]
FA = config[CL][f"fasta_{BUILD}"]


In [4]:
def faToDict(fa_file):
    """
    turn all the fa coordinates into a dictionary. 
    """

    fa_dict = {}
    with open(fa_file, "r") as fasta_reader:
        for values in SimpleFastaParser(fasta_reader):
            
            name, seq = values
            fa_dict[name] = seq
            
    return fa_dict

In [5]:
def dissectFaId(line):
    """
    dissect mutated fa id line for kmer, nullomer information, and get the coordinate file
    """
    # process the line
    kmer, chr_, coor, strand, id_ = line.strip('\n').strip(">").split(".")
            
    # get key and value sequence to look up
    key, value_seq = kmer[:5], kmer[5:]

    bed_coor = chr_ + ":" + coor
    
    region_str = ".".join([kmer, bed_coor, strand, id_]) #TACCCATTCGAGTC.chr1.100333924-100334124.fwd
    
    
    return kmer, key, value_seq, bed_coor, region_str

In [6]:
def seqInsert(seq, insert_start, insert_seq):
    """
    insert sequence fragment (insert_seq) at position (insert_start) within full sequence (seq)
    
    return inserted sequence. 
    """

    insert_size = len(insert_seq)
    
    insert_end = insert_start + insert_size  # find center end

    return seq[:int(insert_start)] + insert_seq + seq[int(insert_end):]

In [7]:
def MPRAInsertNull(null, fa_sequence):
    
    """
    return positions where the nullomer is in fa sequence
    """
    insert_start = (len(fa_sequence)- len(null))/2
        

    null_insert = seqInsert(fa_sequence.lower(), insert_start, null.upper())  # insert the nullomer sequence into the sequence
    
    return null_insert

In [8]:
def mapBed(bedfile, expandedfile):

   
    bed = pd.read_csv(bedfile, sep='\t')
    bed.columns = names =["#chr", "start", "end", "strand", "id"]
    bed["coor"] = bed["#chr"] +":"+bed["start"].map(str) + "-" + bed["end"].map(str)


    ext = pd.read_csv(expandedfile, sep='\t', header=None, names =["#chr_e", "start_e", "end_e", "id"])
    ext["coor_e"] = ext["#chr_e"] +":"+ext["start_e"].map(str) + "-" + ext["end_e"].map(str)

    joint = pd.merge(ext[["id", "coor_e"]], bed[["id", "coor"]])
    
    return joint

In [9]:
def insert4096(nulls_dict, joint_bedex, fa_ex_dict, coor):
    
    extended_nullseqs = {}
    
    for key, null_seq in nulls_dict.items():
        
        extended_coor = joint_bedex.loc[joint_bedex["coor"]==coor, "coor_e"].iloc[0] # extend the coordinates to 4096
        
        extended_seq = fa_ex_dict[extended_coor]  # get the fasta file linked to the extended coordinates
        
        insert_start = len(extended_seq)/2 - 100 # get the start to insert  # figure out where to insert the sequence
        
        null4096 = seqInsert(extended_seq.lower(), insert_start, null_seq.upper())  # insert the 200bp MPRA tile w/ nullomer variants into sequence
        
        extended_nullseqs[key]=null4096  # add to the collection dictionary along with original key
        
    
    return extended_nullseqs, extended_seq

In [10]:
def prettifySeq(original, mut):
    """
    prettify sequence. 
        All matching bases will be written as "."
        All non matching bases will be written w mutated base identity. 
    """
    prettyseq = ""
    for o, m in zip(original, mut):
        if o != m:
            prettyseq += m
        else:
            prettyseq += "."

    return prettyseq

# need to cross over from hs1 to hg38
find coordinates in hs1, map back to hg38, and perform sequence scan/ insertion on hg38 build

# scaffold selection

Scaffold selection? 

See: ./nullomers/bin-lock/Analysis/mpra_corr_scaffold.ipynb

Basically, 60K sequences commonly active|inactive 
across 3 cell lines from Agarwal 2023 were evaluated 
for MPRA activity (log(rna/dna)). I quantified the mean 
activity across these three cell lines, then selected 
the top 3 active and inactive sequences based on the 
ranked mean. 

## active 

        name  HepG2 [log2(rna/dna)]  \
        20512                  K562_peak79186                  3.581   
        54878                 ENSG00000180198                  2.774   
        53330  HepG2_DNasePeakNoPromoter13177                  3.786   
        31584                  K562_peak30340                  3.169   
        55371                 ENSG00000156508                  3.668   

               K562 [log2(rna/dna)]  WTC11 [log2(rna/dna)]  
        20512                 4.780                  3.674  
        54878                 3.669                  5.110  
        53330                 4.569                  4.022  
        31584                 4.017                  4.591  
        55371                 4.151                  6.533  
                                         name                  category 
        23842                  K562_peak30340   putative enhancer, K562 
        35593                  K562_peak79186   putative enhancer, K562 
        38931  HepG2_DNasePeakNoPromoter13177  putative enhancer, HepG2 
        58186                 ENSG00000156508                  promoter 
        58694                 ENSG00000180198                  promoter 

               chr.hg38  start.hg38   stop.hg38  \
        23842  16  22922244.0  22922444.0   
        35593   8  41156753.0  41156953.0   
        38931  12  50933777.0  50933977.0   
        58186   6  73520947.0  73521147.0   
        58694   1  28505946.0  28506146.0   

              str.hg38  \
        23842        +   
        35593        +   
        38931        +   
        58186        -   
        58694        +   


## inactive


                    name  HepG2 [log2(rna/dna)]  \
    17644                WTC11_seq11545_F                 -2.196   
    54971                 ENSG00000174521                 -2.043   
    5529                  WTC11_seq6263_F                 -2.113   
    51378  HepG2_DNasePeakNoPromoter19554                 -2.123   

           K562 [log2(rna/dna)]  WTC11 [log2(rna/dna)]  
    17644                -2.246                 -2.776  
    54971                -2.306                 -2.808  
    5529                 -2.097                 -2.385  
    51378                -2.002                 -2.380  

                                     name                  category 
    396                  WTC11_seq11545_F  putative enhancer, WTC11 
    13182                 WTC11_seq6263_F  putative enhancer, WTC11 
    40971  HepG2_DNasePeakNoPromoter19554  putative enhancer, HepG2 
    58596                 ENSG00000174521                  promoter 


            chr.hg38   start.hg38    stop.hg38  \
    396           10  133080140.0  133080340.0   
    13182          1  206663895.0  206664095.0   
    40971         15   45387502.0   45387702.0   
    58596         19   40218269.0   40218469.0   
    59562  

          str.hg38  \
    396          +   
    13182        +   
    40971        +   
    58596        -   


## pick active coordinates

In [18]:
ACTIVE_COOR = [#"chr16:22922244-22922444", 
               #"chr8:41156753-41156953", 
               #"chr12:50933777-50933977", 
               "chr6:73520947-73521147",
               #"chr1:28505946-28506146"
              ]
INACTIVE_COOR = [#"chr10:133080140-133080340",
                 #"chr1:206663895-206664095",
                 #"chr15:45387502-45387702",
                # "chr19:40218269-40218469"
                ]


fa_dict = faToDict(FA)
fa_ex_dict = faToDict(FA_EXT_BED)
joint_bedex=mapBed(BED, EXTENDED)

ACTIVE_COOR.extend(INACTIVE_COOR)

ACTIVE_COOR

['chr6:73520947-73521147']

# run 

In [19]:
def visualInspection(kmer, null, null_ins200, inserts4096, original_seq):
    """
    print a bunch of information related to how this script is 
    inserting kmers into 200bp regions, 
    inserting those 200bp regions into 4096mers
    """
    
    print("\nkmer\n", kmer)
    
    print("\nthe nullomer sequence\n",
          null, "\n\ninserted\n", null_ins200)
    
    print("\n4096 w/ null sequence\n", inserts4096)
    
    print("\n pretty 4096", prettifySeq(
        original_seq.upper(), list(inserts4096.values())[0].upper()))

In [20]:
TSV_MUT

'/wynton/home/ahituv/fongsl/nullomers/data/lock/common/common.15mers.2mut.nulls.fo.pam.CONCAT-10.tsv'

In [21]:
last_coor = ""
wrote200, wrote4096, issues = [], [], []
F = TSV_MUT

# write files per coor
for coor in ACTIVE_COOR:
                
    with open(F, "r") as reader:
        for seq_id, line in enumerate(reader):
    
            if "nullConcat" not in line:
                null, kmer, gcnull, gcnull, null_list = line.strip(
                "\n").split('\t')
            
                # a new scaffold
                if last_coor != coor:
                    last_coor, v = coor, 0 # set last coordinate value
                    
                    # open writer files
                    writer200 = open(F.strip(".tsv") +
                                  f".ext200.{coor}.fa", "w")
                    writer4096 = open(F.strip(".tsv") +
                                  f".ext4096.{coor}.fa", "w")
                    
                    print("new coor", coor, "\n")
                    
                # get the fa sequence for the coordinate
                region_fa = fa_dict[coor]
                
                # make an id from the coordinate and the sequence index
                region_id = coor + "_" + str(seq_id)

                # insert nulls, kmers into center of 200bp MPRA sequences.
                null_ins200 = MPRAInsertNull(
                    null, region_fa)  # insert null concatemer
                
                kmers_ins200 = MPRAInsertNull(
                    kmer, region_fa)  # insert kmer concatemer

                # dictionary for inserting 200bp into 4096bp sequence
                inserts200 = {
                    region_id+"_concat-null": null_ins200,
                    region_id+"_concat-kmer": kmers_ins200,
                    
                }

                # insert 200bp null sequence into the 4096 sequence
                inserts4096, original_seq = insert4096(inserts200, joint_bedex, fa_ex_dict, coor)

                if v == 0:

                    # write the original sequences
                    
                    writer200.write(f"{coor}_concat-noinsert\n{region_fa}\n")
                    writer4096.write(f"{coor}_concat-noinsert\n{original_seq}\n")
                    
                # write 200mers
                for seqname, seq in inserts200.items():
                    
                    # filter duplicates
                    if seqname not in wrote200:
                        writer200.write(f">{seqname}\n{seq.upper()}\n")
                        wrote200.append(seqname)
                        
                # write 4096 nullomers
                if len(original_seq) >= 4095:
                    
                    for seqname, seq in inserts4096.items():

                        # filter duplicates
                        if seqname not in wrote4096:
                            writer4096.write(f">{seqname}\n{seq.upper()}\n")
                            wrote4096.append(seqname)
                    v += 1

                else:
                    # print("ISSUES", name, region_str, len(original_seq))
                    issues.append(region_id)
                    continue


    writer200.close(), writer4096.close()

new coor chr6:73520947-73521147 



In [None]:
len(issues)/32

In [None]:
partial

# predict w sei

In [24]:
def launchSei(file, build, qsub):
    
    SEIDIR = os.path.join(os.path.split(file)[0], "sei_predictions")
    
    if os.path.exists(SEIDIR) is False:
        os.mkdir(SEIDIR)
    print(SEIDIR)
    
    if qsub is True:
        SCRIPT = "/wynton/home/ahituv/fongsl/nullomers/bin-lock/interpret_sei_array.sh"
        cmd = ['qsub',
           SCRIPT, 
           build, 
           SEIDIR
          ]
        
    else:
        SCRIPT = "/wynton/home/ahituv/fongsl/bin/sei-framework/sarah_scripts/launch_qsub.py"

        cmd = ['python',
           SCRIPT, 
           file, 
           build, 
           SEIDIR
          ]
    print(" ".join(cmd))
    sp.call(" ".join(cmd), shell=True)

## launch sei

In [22]:
fs = glob.glob(f"/wynton/home/ahituv/fongsl/nullomers/data/lock/common/common.{MER}mers.2mut.nulls.fo.pam.CONCAT-10.ext4096.*.fa")
print(len(fs))

7


In [None]:
qsub = False
for n, out in enumerate(fs):
    launchSei(out, "hs1", qsub)

/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
python /wynton/home/ahituv/fongsl/bin/sei-framework/sarah_scripts/launch_qsub.py /wynton/home/ahituv/fongsl/nullomers/data/lock/common/common.15mers.2mut.nulls.fo.pam.CONCAT-10.ext4096.chr16:22922244-22922444.fa hs1 /wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 3134963 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
python /wynton/home/ahituv/fongsl/bin/sei-framework/sarah_scripts/launch_qsub.py /wynton/home/ahituv/fongsl/nullomers/data/lock/common/common.15mers.2mut.nulls.fo.pam.CONCAT-10.ext4096.chr8:41156753-41156953.fa hs1 /wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
Your job 3134977 ("interpret_sei.sh") has been submitted
/wynton/home/ahituv/fongsl/nullomers/data/lock/common/sei_predictions
python /wynton/home/ahituv/fongsl/bin/sei-framework/sarah_scripts/launch_qsub.py /wynton/home/ahituv/fong

In [18]:
names = []
with open(out, "r") as reader:
    for value in SimpleFastaParser(reader):
        name, seq = value
        if "chr10:133080140-133080340" in name:
            print(name,"\n")

chr10:133080140-133080340_1_concat-null 

chr10:133080140-133080340_1_concat-kmer 

chr10:133080140-133080340_1_concat-null 

chr10:133080140-133080340_1_concat-kmer 

chr10:133080140-133080340_2_concat-null 

chr10:133080140-133080340_2_concat-kmer 

chr10:133080140-133080340_2_concat-null 

chr10:133080140-133080340_2_concat-kmer 

chr10:133080140-133080340_3_concat-null 

chr10:133080140-133080340_3_concat-kmer 

chr10:133080140-133080340_3_concat-null 

chr10:133080140-133080340_3_concat-kmer 

chr10:133080140-133080340_4_concat-null 

chr10:133080140-133080340_4_concat-kmer 

chr10:133080140-133080340_4_concat-null 

chr10:133080140-133080340_4_concat-kmer 

chr10:133080140-133080340_5_concat-null 

chr10:133080140-133080340_5_concat-kmer 

chr10:133080140-133080340_5_concat-null 

chr10:133080140-133080340_5_concat-kmer 

chr10:133080140-133080340_6_concat-null 

chr10:133080140-133080340_6_concat-kmer 

chr10:133080140-133080340_6_concat-null 

chr10:133080140-133080340_6_concat