In [2]:
from Bio.SeqIO.FastaIO import SimpleFastaParser
import config_readwrite as crw

import glob
import gzip
import numpy as np
import os, sys
import pandas as pd
import subprocess as sp
import time

# find coordinates of kmers

In [3]:
# Run parameters. CL, MER, NMUTS and BUILD are used below to build config
# section/key names; FO_ONLY is a boolean flag (not used in the cells shown here).
CL = "common"
MER = "15"
NMUTS = "2"
BUILD = "hg38"
FO_ONLY = True

# Read config.ini from the current working directory.
# NOTE(review): crw.read presumably returns (config object, config file name) — confirm.
config, cfn = crw.read(
    os.path.join(os.getcwd(), "config.ini")
)

In [4]:
# "tsv" entry of the k-mer/mutation config section.
# NOTE(review): this section name hard-codes "common" while the lookups below
# use CL — confirm whether it should be f"{CL}.{MER}mer.{NMUTS}mut".
TSV_MUT= config[f"common.{MER}mer.{NMUTS}mut"]["tsv"]

# Build-specific entries read from the CL section of the config.
# Presumably: BED = tile coordinates, EXTENDED = extended coordinates,
# FA / FA_EXT_BED = FASTA files for the tile / extended regions — verify against config.ini.
BED = config[CL][f"bed_{BUILD}"]
FA_EXT_BED = config[CL][f"fa_extended_{BUILD}"]
EXTENDED = config[CL][f"bed_extended_{BUILD}"]
FA = config[CL][f"fasta_{BUILD}"]


In [5]:
# Scaffold table to sample from.
# Alternative input (previously assigned here, then immediately overwritten):
#   /wynton/home/ahituv/fongsl/MPRA/agarwal_2023/scaffold-deciles.n150.tsv
SCAFFOLDS = "/wynton/home/ahituv/fongsl/MPRA/agarwal_2023/scaffold-max_min.n150.tsv"

# SCAFFOLDS without its file extension; serves as the prefix for derived file names.
SAMPLE_ROOT = os.path.splitext(SCAFFOLDS)[0]

In [6]:
def faToDict(fa_file):
    """
    Load a FASTA file into a dictionary.

    input
        fa_file (str) - path to a plain-text FASTA file

    return
        dict mapping each record title to its sequence string.
        If a title appears more than once, the last record read is kept.
    """
    with open(fa_file, "r") as handle:
        # SimpleFastaParser yields (title, sequence) pairs; dict() consumes them directly
        return dict(SimpleFastaParser(handle))

In [7]:
def dissectFaId(line):
    """
    Split a mutated-FASTA id line into its parts.

    input
        line (str) - id line made of five "."-separated fields:
                     kmer.chr.coor.strand.id
                     Surrounding newlines and ">" characters are stripped first.
                     Any other number of fields raises ValueError.

    return
        kmer (str)       - first field, unchanged
        key (str)        - first 5 characters of kmer
        value_seq (str)  - remaining characters of kmer
        bed_coor (str)   - "chr:coor"
        region_str (str) - "kmer.chr:coor.strand.id"
                           e.g. TACCCATTCGAGTC.chr1:100333924-100334124.fwd.0
    """
    fields = line.strip('\n').strip(">").split(".")
    kmer, chr_, coor, strand, id_ = fields

    # the kmer is split into a 5-character lookup key and the rest of the sequence
    key = kmer[:5]
    value_seq = kmer[5:]

    bed_coor = f"{chr_}:{coor}"
    region_str = f"{kmer}.{bed_coor}.{strand}.{id_}"

    return kmer, key, value_seq, bed_coor, region_str

In [8]:
def seqInsert(seq, insert_start, insert_seq):
    """
    Overwrite part of a sequence with a fragment.

    The characters of seq from insert_start up to insert_start + len(insert_seq)
    are replaced by insert_seq, so the result has the same length as seq
    whenever the fragment fits inside it.

    input
        seq (str)              - full sequence
        insert_start (int|float) - position where the fragment begins;
                                   truncated with int() before slicing
        insert_seq (str)       - fragment to place into seq

    return
        str - seq with the fragment written over the original bases
    """
    # int() is applied to each boundary separately, so float starts are accepted
    left_end = int(insert_start)
    right_start = int(insert_start + len(insert_seq))

    upstream = seq[:left_end]
    downstream = seq[right_start:]

    return upstream + insert_seq + downstream

In [9]:
def MPRAInsertNull(null, fa_sequence):
    """
    Write a nullomer over the center of a sequence.

    input
        null (str)        - nullomer sequence; placed in upper case
        fa_sequence (str) - background sequence; converted to lower case

    return
        str - lower-case fa_sequence with the upper-case nullomer replacing
              the bases at its center
    """
    # offset that centers the nullomer; may be a float, which seqInsert truncates with int()
    centered_start = (len(fa_sequence) - len(null)) / 2

    return seqInsert(fa_sequence.lower(), centered_start, null.upper())

In [10]:
def mapBed(bedfile, expandedfile):
    """
    Pair each region in a bed file with its extended region via the shared "id" column.

    input
        bedfile (str)      - tab-separated file with 5 columns; its first line is
                             consumed as a header and the columns are then renamed
                             to #chr, start, end, strand, id
        expandedfile (str) - tab-separated file with no header line; columns are
                             read as #chr_e, start_e, end_e, id

    return
        pandas DataFrame with columns id, coor_e, coor, where coor and coor_e
        are "chr:start-end" strings (inner join on id)
    """

    def _coor_string(frame, chr_col, start_col, end_col):
        # build "chr:start-end" for every row
        return frame[chr_col] + ":" + frame[start_col].map(str) + "-" + frame[end_col].map(str)

    # original regions
    bed = pd.read_csv(bedfile, sep='\t')
    bed.columns = ["#chr", "start", "end", "strand", "id"]
    bed["coor"] = _coor_string(bed, "#chr", "start", "end")

    # extended regions
    ext = pd.read_csv(
        expandedfile,
        sep='\t',
        header=None,
        names=["#chr_e", "start_e", "end_e", "id"],
    )
    ext["coor_e"] = _coor_string(ext, "#chr_e", "start_e", "end_e")

    # "id" is the only column the two slices share, so it is the join key
    return pd.merge(ext[["id", "coor_e"]], bed[["id", "coor"]], on="id")

In [11]:
def insert4096(nulls_dict, joint_bedex, fa_ex_dict, coor):
    """
    Place every tile sequence in nulls_dict into the center of the extended
    sequence that belongs to coor.

    input
        nulls_dict (dict)       - key -> tile sequence (with nullomer variants)
        joint_bedex (DataFrame) - must have columns "coor" and "coor_e";
                                  the first row whose "coor" equals coor is used
        fa_ex_dict (dict)       - extended coordinate string -> extended sequence
        coor (str)              - coordinate of the original tile

    return
        extended_nullseqs (dict) - same keys as nulls_dict; each value is the
                                   lower-cased extended sequence with the
                                   upper-cased tile written over its center
        extended_seq (str)       - the unmodified extended sequence

    raises
        IndexError - coor is not present in joint_bedex["coor"]
        KeyError   - the extended coordinate is not a key of fa_ex_dict
    """
    # These lookups depend only on coor, so they are done once, before the loop.
    # This also keeps extended_seq defined when nulls_dict is empty.
    extended_coor = joint_bedex.loc[joint_bedex["coor"] == coor, "coor_e"].iloc[0]
    extended_seq = fa_ex_dict[extended_coor]

    # start of the central window: midpoint minus 100 (half of the 200bp MPRA tile)
    insert_start = len(extended_seq) / 2 - 100

    background = extended_seq.lower()

    extended_nullseqs = {
        key: seqInsert(background, insert_start, null_seq.upper())
        for key, null_seq in nulls_dict.items()
    }

    return extended_nullseqs, extended_seq

In [12]:
def prettifySeq(original, mut):
    """
    Render a mutated sequence relative to its original.

    Positions where the two sequences agree are written as ".";
    positions where they differ show the base from mut.
    The comparison is case-sensitive and stops at the end of the shorter sequence.

    input
        original (str) - reference sequence
        mut (str)      - mutated sequence

    return
        str - dot-masked version of mut
    """
    return "".join(
        "." if ref_base == mut_base else mut_base
        for ref_base, mut_base in zip(original, mut)
    )

# need to cross over from hs1 to hg38
Find coordinates in hs1, map them back to hg38, and perform the sequence scan/insertion on the hg38 build.

# scaffold selection

Scaffold selection? 

See: ./nullomers/bin-lock/Analysis/mpra_corr_scaffold.ipynb

In brief, 60K sequences that are commonly active or inactive
across 3 cell lines from Agarwal 2023 were evaluated
for MPRA activity (log(RNA/DNA)). I computed the mean
activity across these three cell lines, then selected
the top 3 active and top 3 inactive sequences based on the
ranked mean.

## pick active coordinates

In [22]:
# Collect (PCT, COOR) pairs from the scaffold table: tab-separated fields 8 and 12
# of every line that does not contain "log2".
# NOTE(review): lines containing "log2" are presumably the header row — confirm.
ACTIVE_COOR = []
with open(SCAFFOLDS, "r") as reader:
    for line in reader:
        if "log2" in line:
            continue
        fields = line.split("\t")
        PCT, COOR = fields[8], fields[12]
        ACTIVE_COOR.append((PCT, COOR))
        

In [23]:
# Sequences from the FA file, keyed by FASTA record title.
fa_dict = faToDict(FA)
# Sequences from the FA_EXT_BED file (presumably the extended regions — verify against config.ini).
fa_ex_dict = faToDict(FA_EXT_BED)
# Table linking each original coordinate ("coor") to its extended coordinate ("coor_e").
joint_bedex=mapBed(BED, EXTENDED)

# extract 4096 bp sequences 

In [24]:
# open writer files
# Write the unmodified sequence of every selected coordinate to two FASTA files:
# one holding the sequences from fa_dict, one holding the matching extended
# sequences from fa_ex_dict. The context manager closes both files even if a
# lookup below raises part-way through the loop.
with open(SAMPLE_ROOT + ".ext200.fa", "w") as writer200, \
        open(SAMPLE_ROOT + ".ext4096.fa", "w") as writer4096:

    for pct, coor in ACTIVE_COOR:

        # sequence for this coordinate
        region_fa = fa_dict[coor]

        # extended coordinate paired with this coordinate (first match in joint_bedex)
        ext_region = joint_bedex.loc[joint_bedex["coor"] == coor, "coor_e"].iloc[0]

        # extended sequence
        region_fa_ext = fa_ex_dict[ext_region]

        # both records share the same header: ><coor>_<pct>
        writer200.write(f">{coor}_{pct}\n{region_fa}\n")
        writer4096.write(f">{coor}_{pct}\n{region_fa_ext}\n")

(None, None)

# predict w sei

In [25]:
def launchSei(file, build, gpu):
    """
    Run the launch_qsub.py helper script on a FASTA file.

    A "sei_predictions" directory is created next to `file` (if it does not
    already exist) and passed to the script as the output directory.

    input
        file (str)  - path to the FASTA file
        build (str) - genome build string handed to the script (e.g. "hg38")
        gpu (bool)  - handed to the script as str(gpu)

    return
        None. The script's exit status is not checked.

    note
        The command is executed as an argument list without a shell, so paths
        containing spaces or shell metacharacters are passed through verbatim.
        If the "python" executable cannot be found, FileNotFoundError is raised.
    """
    SEIDIR = os.path.join(os.path.split(file)[0], "sei_predictions")

    # create the output directory; no error if it is already there
    os.makedirs(SEIDIR, exist_ok=True)
    print(SEIDIR)

    SCRIPT = "/wynton/home/ahituv/fongsl/bin/sei-framework/sarah_scripts/launch_qsub.py"

    cmd = ['python',
           SCRIPT,
           file,
           build,
           SEIDIR,
           str(gpu)
          ]

    # log the command for the notebook record, then run it without a shell
    print(" ".join(cmd))
    sp.call(cmd)

## launch sei

In [26]:
# FASTA file(s) to run through Sei: the ".ext4096.fa" file derived from SAMPLE_ROOT
fs = [f"{SAMPLE_ROOT}.ext4096.fa"]
fs

['/wynton/home/ahituv/fongsl/MPRA/agarwal_2023/scaffold-max_min.n50.ext4096.fa']

In [27]:
# Hand every FASTA in `fs` to launchSei with the "hg38" build string;
# `gpu` is forwarded unchanged as the third argument.
gpu = False
# NOTE(review): the enumerate index `n` is not used inside the loop.
for n, out in enumerate(fs):
    launchSei(out, "hg38", gpu)

/wynton/home/ahituv/fongsl/MPRA/agarwal_2023/sei_predictions
/wynton/home/ahituv/fongsl/MPRA/agarwal_2023/sei_predictions
python /wynton/home/ahituv/fongsl/bin/sei-framework/sarah_scripts/launch_qsub.py /wynton/home/ahituv/fongsl/MPRA/agarwal_2023/scaffold-max_min.n50.ext4096.fa hg38 /wynton/home/ahituv/fongsl/MPRA/agarwal_2023/sei_predictions False
Your job 3579367 ("interpret_sei.sh") has been submitted
