In [12]:
from Bio import SeqIO
import config_readwrite as crw
import os, sys

In [13]:
# Load config.ini from two directory levels above the current working directory.
# NOTE(review): crw.read presumably returns the parsed config object and the config file's path — confirm.
config, cfn = crw.read(os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "config.ini"))

In [14]:
# --- input locations read from the config ---
section = "local_path"
PATH = config[section]["data"]

section = "data"
FASTA = config[section]["fasta"]

# --- Sei input FASTA names: the source FASTA path minus its extension, plus a new suffix ---
FASTA_SEI = f"{os.path.splitext(FASTA)[0]}.sei_padded.fa"
FASTA_SEI_TEST = f"{os.path.splitext(FASTA)[0]}.sei_padded.test.fa"

# --- record the derived names under the "sei" section and save the config ---
section = "sei"
crw.check(config, section)  # presumably makes sure the section exists — confirm in config_readwrite

config[section]["fasta"] = FASTA_SEI
config[section]["fasta_test"] = FASTA_SEI_TEST
crw.write(config, cfn)

# Functions

## Insert a sequence fragment

In [4]:
def seqInsert(seq, insert_start, insert_seq):
    """
    Overwrite a window of `seq` with `insert_seq` and return the result.

    seq          -- full sequence that receives the fragment
    insert_start -- position in `seq` where the fragment begins;
                    pass None to centre the fragment within `seq`
    insert_seq   -- fragment to place into `seq`

    The window replaced has the same length as `insert_seq`, so for a
    fragment no longer than `seq` the output has the same length as `seq`.
    """

    fragment_len = len(insert_seq)

    if insert_start is None:
        # centre the fragment: half of the full length minus half of the fragment length
        insert_start = (len(seq)/2) - (fragment_len/2)

    # positions may be fractional at this point; int() truncates them for slicing
    left_flank = seq[:int(insert_start)]
    right_flank = seq[int(insert_start + fragment_len):]

    return left_flank + insert_seq + right_flank

## Parse FASTA: pad or trim sequences

In [16]:
def trimSeq(seq, size):
    """
    Trim a sequence down to `size` elements, keeping the central window.

    seq  -- any sliceable sequence with a length (e.g. str)
    size -- number of elements to keep

    Returns the central `size` elements of `seq`. When the surplus is odd,
    the extra element is dropped from the right-hand side. A sequence that is
    already `size` long or shorter is returned unchanged.
    """

    surplus = len(seq) - size

    if surplus <= 0:
        # nothing to trim
        return seq

    # integer arithmetic only: slice indices must be ints in Python 3
    start = surplus // 2

    return seq[start:start + size]

def padSeq(fasta, write=False):
    """
    Bring every record of a FASTA file to a length of 4096.

    Records shorter than 4096 are placed in the centre of a run of "N"
    (via seqInsert); all other records are trimmed around their centre
    (via trimSeq).

    fasta -- path to the input FASTA file
    write -- when True, write the processed records to the output path.
             Defaults to False, in which case no file is written.

    Returns the output path: `fasta` with its extension replaced by
    ".sei_padded.fa".
    """

    max_len, PAD = 4096, "N"
    OUT = os.path.splitext(fasta)[0] + ".sei_padded.fa"

    padded_sequences = []

    # parse the file passed in as `fasta`, not the notebook-level FASTA global
    for record in SeqIO.parse(fasta, 'fasta'):
        if len(record.seq) < max_len:
            padding = PAD * max_len  # full-length run of the pad character
            # overwrite the centre of the padding with the record
            padded_sequences.append(seqInsert(padding, None, record))
        else:
            padded_sequences.append(trimSeq(record, max_len))

    if write:
        SeqIO.write(padded_sequences, OUT, 'fasta')  # write all the sequences

    return OUT


## Sei commands

In [6]:
def getHandles(fasta):
    """
    Make a dictionary of all the path handles for one fasta file's Sei run.

    fasta -- path to the input FASTA file

    Every name is derived from `fasta` by string manipulation only; nothing
    is read from or written to disk. Returned keys:

    FASTA        the input path as given
    FASTA_CLEAN  input path with its extension replaced by ".clean.fa"
    FASTA_INDEX  FASTA_CLEAN with its extension replaced by ".index.txt"
    PADDED       FASTA_CLEAN with its extension replaced by ".sei_padded.fa"
    PATH         directory part of PADDED
    HANDLE       file name of PADDED without its final extension
    SEI_PATH     "sei_predictions" directory next to the input FASTA
    SEI_SRC      location of the sei-framework checkout (hard-coded)
    CHROM_PATH   "chromatin-profiles-hdf5" directory inside SEI_PATH
    CHROM_PRED   "<HANDLE>_predictions.h5" inside CHROM_PATH
    CLASS_PRED   "<HANDLE>.raw_sequence_class_scores.npy" inside CHROM_PATH
    LABELS       "<HANDLE>_row_labels.txt" inside CHROM_PATH
    CLASS_TABLE  "<HANDLE>.raw_sequence_class_scores.table.tsv.gz" inside CHROM_PATH
    """

    SEI_SRC = "/wynton/home/ahituv/fongsl/bin/sei-framework/"
    SEI_PATH = os.path.join(os.path.split(fasta)[0], "sei_predictions")

    FASTA_CLEAN = os.path.splitext(fasta)[0] + ".clean.fa"
    FASTA_INDEX = os.path.splitext(FASTA_CLEAN)[0] + ".index.txt"

    PADDED = os.path.splitext(FASTA_CLEAN)[0] + ".sei_padded.fa"
    PATH, HANDLE = os.path.split(PADDED)
    # Drop only the trailing extension. str.strip(".fa") would instead remove
    # any run of the characters ".", "f", "a" from BOTH ends of the name
    # (e.g. "alpha..." would lose its leading "a").
    HANDLE = os.path.splitext(HANDLE)[0]

    CHROM_PATH = os.path.join(SEI_PATH, "chromatin-profiles-hdf5")

    CHROM_OUT = os.path.join(CHROM_PATH, f"{HANDLE}_predictions.h5")
    CLASS_OUT = os.path.join(CHROM_PATH, f"{HANDLE}.raw_sequence_class_scores.npy")
    LABEL_OUT = os.path.join(CHROM_PATH, f"{HANDLE}_row_labels.txt")
    CLASS_TABLE = os.path.join(CHROM_PATH, f"{HANDLE}.raw_sequence_class_scores.table.tsv.gz")

    path_dict = {
        "FASTA": fasta,
        "FASTA_CLEAN": FASTA_CLEAN,
        "FASTA_INDEX": FASTA_INDEX,
        "PADDED": PADDED,
        "PATH": PATH,
        "HANDLE": HANDLE,
        "SEI_PATH": SEI_PATH,
        "SEI_SRC": SEI_SRC,
        "CHROM_PATH": CHROM_PATH,
        "CHROM_PRED": CHROM_OUT,
        "CLASS_PRED": CLASS_OUT,
        "LABELS": LABEL_OUT,
        "CLASS_TABLE": CLASS_TABLE
    }

    return path_dict
def launchSeiPart1(fasta, build, gpu):
    """
    Prepare Sei part 1 (the launch_qsub.py script) for one fasta file.

    fasta -- path to the FASTA file handed to the launch script
    build -- genome build string handed to the launch script (e.g. "hg38")
    gpu   -- pass True to hand "True" to the launch script; any other
             value hands "False"

    Creates the Sei output directory if it is missing, then prints the
    launch command unless the part 1 prediction file already exists.
    The command is only printed: the os.system call is commented out.

    Returns the expected part 1 prediction file path (CHROM_PRED).
    """

    path_dict = getHandles(fasta)

    SEI_SRC = path_dict["SEI_SRC"]
    SEI_PATH = path_dict["SEI_PATH"]
    CHROM_PRED = path_dict["CHROM_PRED"]

    # exist_ok avoids a separate existence check and tolerates re-runs
    os.makedirs(SEI_PATH, exist_ok=True)

    # report the output directory and the expected prediction file
    print(SEI_PATH, "\n", CHROM_PRED)

    GPU_BOOL = "True" if gpu is True else "False"

    SCRIPT = os.path.join(SEI_SRC, "sarah_scripts/launch_qsub.py")

    cmd = " ".join(['python',
                    SCRIPT,
                    fasta,
                    build,
                    SEI_PATH,
                    GPU_BOOL
                    ])

    # if not already run
    if os.path.exists(CHROM_PRED) is False:
        print(cmd)
        #os.system(cmd)
    else:
        print('ran part1 already', CHROM_PRED)

    return CHROM_PRED



def launchSeiPart2(fasta):
    """
    Prepare Sei part 2 (the 2_raw_sc_score.sh script) for one fasta file.

    fasta -- path to the FASTA file whose part 1 predictions are scored

    Changes the working directory to the sei-framework checkout, then
    prints the part 2 command unless the class score file already exists.
    The command is only printed: the os.system call is commented out.

    Returns the expected part 2 class score file path (CLASS_PRED).
    """

    path_dict = getHandles(fasta)

    SEI_SRC = path_dict["SEI_SRC"]
    # NOTE: this changes the working directory for the rest of the session,
    # so any later code that relies on os.getcwd() is affected.
    os.chdir(SEI_SRC)

    # results file
    CLASS_PRED = path_dict["CLASS_PRED"]

    # infile
    CHROM_PATH = path_dict["CHROM_PATH"]
    CHROM_PRED = path_dict["CHROM_PRED"]

    # sei part 2 command; the script path is built from SEI_SRC so the
    # checkout location is defined in one place only (getHandles)
    cmd = [
        "bash " + os.path.join(SEI_SRC, "2_raw_sc_score.sh"),
        CHROM_PRED,
        CHROM_PATH
    ]

    # if not already run
    if os.path.exists(CLASS_PRED) is False:
        print(" ".join(cmd))
        #os.system(" ".join(cmd))  # rn run in command line because of environment problem that you need to solve.
    else:
        print("ran part 2", CLASS_PRED)

    return CLASS_PRED
        


# Params

In [7]:
# Passed to launchSeiPart1 as its `gpu` argument.
# To run on a GPU node, first log in to gpudev1.wynton.ucsf.edu.
GPU = True

# Main 

In [17]:
# Sequence padding with "N": padSeq returns the ".sei_padded.fa" path derived from its argument.
FASTA_SEI = padSeq(FASTA)

In [None]:
# Part 1: launch_qsub.py step on the padded FASTA with the hg38 build.
# The function prints the command; its os.system call is commented out.
CHROM_PRED = launchSeiPart1(FASTA_SEI, "hg38", GPU)  

# Part 2: 2_raw_sc_score.sh step on the part 1 predictions.
# The function prints the command; its os.system call is commented out.
CLASS_PRED = launchSeiPart2(FASTA_SEI)

In [19]:
# Write to config: store every handle from getHandles(FASTA) under config[section], then save.
# NOTE(review): `section` is whatever an earlier cell last assigned ("sei" after the config cell) — confirm that is the intended section.
# NOTE(review): handles here are derived from FASTA, while the Sei launch functions were called with FASTA_SEI — confirm which is intended.
path_dict = getHandles(FASTA)
for key, value in path_dict.items():
    config[section][key] = value
    
crw.write(config, cfn)