In [34]:
from Bio import SeqIO
from Bio.SeqIO.FastaIO import SimpleFastaParser
import config_readwrite as crw
import numpy as np
import os, sys
import pandas as pd

In [2]:
config, cfn = crw.read(os.path.join(os.getcwd(), "config.neuron.mpra.ini"))

In [14]:
FASTA="/wynton/group/ahituv/biomarin/library_2/Design/biomarin-lib2-hg38-final.fa"


write_config={
    "FASTA":FASTA,
    "FASTA_SEI": os.path.splitext(FASTA)[0] + ".clean.sei_padded.fa", 
    "FASTA_CLEAN":os.path.splitext(FASTA)[0] + '.clean.fa',
    "FASTA_INDEX":os.path.splitext(FASTA)[0] + '.clean.index.txt',
}
FASTA_CLEAN = write_config["FASTA_CLEAN"]
FASTA_INDEX = write_config["FASTA_INDEX"]
section = "sei"
crw.check(config, section)

for key, value in write_config.items():
    config[section][key] = value

crw.write(config, cfn)

In [15]:
def cleanFasta(fasta):
    """
    clean fasta files for sei run by indexing sequence ids
    
    goal 
        index sequence ids 
        remove sequence id duplicates
    
    requires
        Bio.SeqIO.FastaIO import SimpleFastaParser
        getHandles function
        
    input 
        fasta file (str) - path to fasta file
        path_dict (dictionary) - dictionary with files

    method    
        1. get dictionary of file handles
        2. instantiate fasta clean adn index handles to write
        3. open files to write
        4. make empty list to collect unique sequence ids
        5. open fasta and iterate through rows, making uniform sequence ids
            5.1. check to make sure id has not been processed, if not, append to id_list
            5.2. write the index_id and the sequence
            5.3. write the index_id and sequence_id to a separate file.  
        
    writes
        FASTA_CLEAN - fasta file with sequence id indexes
        FASTA_INDEX - .txt file with sequence id index + sequence id
    """
    
    #1
    path_dict = getHandles(fasta)
    
    #2 get files (strs) to write
    FASTA_CLEAN, FASTA_INDEX = path_dict["FASTA_CLEAN"], path_dict["FASTA_INDEX"]
    
    #3 open the write files
    writer, indexer = open(FASTA_CLEAN, "w"), open(FASTA_INDEX, "w")

    #4 list to collect sequence ids
    id_list = []
    
    #5 open the fasta
    with open(fasta, "r") as handle:
        for i, value in enumerate(SimpleFastaParser(handle)):
            
            index_id = f"seq.{i}"  # uniform sequence id
            id_, seq = value
            
            #5.1
            if id_ not in id_list:
            
                id_list.append(id_)
                
                #5.2.
                writer.write(f'>{index_id}\n{seq}\n')
                
                #5.3.
                indexer.write(f'{index_id}\t{id_}\n')
            else:
                print("sequence_id duplicate!", id_)
                
    writer.close(), indexer.close()

# function

## for inserting sequence

In [8]:
def seqInsert(seq, insert_start, insert_seq):
    """
    insert sequence fragment (insert_seq) at position (insert_start) within full sequence (seq)
    
    Insert into center if insert_start is None 

    return inserted sequence. 
    """

    insert_size = len(insert_seq)
    
    if insert_start is None:
        
        insert_start = (len(seq)/2) - (insert_size/2)
        #print(insert_start)

    insert_end = insert_start + insert_size  # find center end

    return seq[:int(insert_start)] + insert_seq + seq[int(insert_end):]

## parse fasta

In [22]:
def trimSeq(seq, size):
    """ find center of sequence and trim down to size"""
    
    center = len(seq)/2
    start = center - (size/2)
    end = center + (size/2) + 1
    
    return seq[start:end]

def padSeq(fasta):
    """ if sequence is shorter than 4096, pad, else trim"""
    
    max_len, PAD = 4096, "N"
    OUT = os.path.splitext(fasta)[0] + ".sei_padded.fa"
    
    sequences = [s for s in SeqIO.parse(fasta, 'fasta')]

    padded_sequences = []
    
    for n, seq in enumerate(sequences):
        if len(seq.seq)<max_len:
            padding = PAD*max_len # creating the padding string
            padded_sequences.append(seqInsert(padding, None, seq)) # insert the sequence in the center, append to list
        else:
            padded_sequences.append(trimSeq(seq, max_len))
    SeqIO.write(padded_sequences, OUT, 'fasta')  # write all the sequences

    return OUT


## Sei commands

In [10]:
def getHandles(fasta):

    SEI_SRC = "/wynton/home/ahituv/fongsl/bin/sei-framework/"
    SEI_PATH = os.path.join(os.path.split(fasta)[0], "sei_predictions")

    FASTA_CLEAN = os.path.splitext(fasta)[0] + '.clean.fa',
    PADDED  = os.path.splitext(fasta)[0] + ".clean.sei_padded.fa"
    INDEX = os.path.splitext(fasta)[0] + '.clean.index.txt'
    
    PATH, HANDLE = os.path.split(PADDED)
    HANDLE = HANDLE.strip(".fa")
    
    
    CHROM_PATH= os.path.join(
        SEI_PATH, "chromatin-profiles-hdf5")
    
    CHROM_OUT = os.path.join(CHROM_PATH, f"{HANDLE}_predictions.h5")
    CLASS_OUT = os.path.join(CHROM_PATH, f"{HANDLE}.raw_sequence_class_scores.npy")
    LABEL_OUT = os.path.join(CHROM_PATH, f"{HANDLE}_row_labels.txt")
    
    
    path_dict = {
        "FASTA": fasta,
        "FASTA_CLEAN":FASTA_CLEAN, 
        "PADDED":PADDED,
        'INDEX':INDEX, 
        "PATH": PATH,
        "HANDLE":HANDLE, 
        "SEI_PATH":SEI_PATH, 
        "SEI_SRC":SEI_SRC,
        "CHROM_PATH": CHROM_PATH, 
        "CHROM_PRED": CHROM_OUT,
        "CLASS_PRED" : CLASS_OUT, 
        "LABELS" : LABEL_OUT
    }
    
    return path_dict

def launchSei(fasta, build, gpu):

    
    path_dict = getHandles(fasta)
    
    SEI_SRC = path_dict["SEI_SRC"]
    SEI_PATH = path_dict["SEI_PATH"]
    
    CHROM_PRED = path_dict["CHROM_PRED"]
    CHROM_PATH = path_dict["CHROM_PATH"]
    CLASS_PRED = path_dict["CLASS_PRED"]
    
    if os.path.exists(SEI_PATH) is False:
        os.mkdir(SEI_PATH)
        
    print(SEI_PATH, "\n", OUT)

    GPU_BOOL = "True" if gpu is True else "False"

    SCRIPT = os.path.join(SEI_SRC, "sarah_scripts/launch_qsub.py")

    cmd = " ".join(['python',
           SCRIPT,
           fasta,
           build,
           SEI_PATH,
           GPU_BOOL,
           CHROM_PRED, 
           CHROM_PATH
           ])

    if os.path.exists(CLASS_PRED) is False:
        print(cmd)
        #os.system(cmd)
    else:
        print('ran sei already', CLASS_PRED)


In [30]:
path_dict["CHROM_PRED"]

'/wynton/group/ahituv/biomarin/library_2/Design/sei_predictions/chromatin-profiles-hdf5/biomarin-lib2-hg38-final.sei_padded_predictions.h5'

# Params

In [11]:
GPU = True # if running on GPU node, need to login to gpudev1.wynton.ucsf.edu

# Main 

In [23]:
# sequence padding w n
FASTA_SEI = padSeq(FASTA_CLEAN)

In [26]:
# part1
launchSei(FASTA_SEI, "hg38", GPU)  

/wynton/group/ahituv/biomarin/library_2/Design/sei_predictions 
 /wynton/group/ahituv/biomarin/library_2/Design/sei_predictions/chromatin-profiles-hdf5/biomarin-lib2-hg38-final.clean.sei_padded.sei_padded_predictions.h5
python /wynton/home/ahituv/fongsl/bin/sei-framework/sarah_scripts/launch_qsub.py /wynton/group/ahituv/biomarin/library_2/Design/biomarin-lib2-hg38-final.clean.sei_padded.fa hg38 /wynton/group/ahituv/biomarin/library_2/Design/sei_predictions True
bash /wynton/home/ahituv/fongsl/bin/sei-framework/2_raw_sc_score.sh /wynton/group/ahituv/biomarin/library_2/Design/sei_predictions/chromatin-profiles-hdf5/biomarin-lib2-hg38-final.clean.sei_padded.sei_padded_predictions.h5 /wynton/group/ahituv/biomarin/library_2/Design/sei_predictions/chromatin-profiles-hdf5


# write results to config

In [40]:
# write to config
path_dict = getHandles(FASTA_CLEAN)
for key, value in path_dict.items():
    config[section][key] = value
    
crw.write(config, cfn)

# format table for ML

In [56]:
def returnSequenceClassLabels():
    file = "/wynton/home/ahituv/fongsl/bin/sei-framework/sequence_class_labels.csv"
    lab = pd.read_csv(file)

    return lab


def processLabel(label_file, index):
    """
    input 
        label_file  (str) - path with the labels for the sequences run through the DNN

    Method
        1. opens sequence label file as pd dataframe
        2. drop the index column
        3. make UCSC genome browswer coordinates
        4. avg pct? 
        5. "bin"?
    """

    # read label file as pd dataframe
    lab = pd.read_csv(label_file, sep='\t')
    lab = lab.drop(columns=["index"])  # redundant index column

    ind = pd.read_csv(index, sep='\t', header=None, names=["name", 'tile.coor'])
    lab = pd.merge(lab, ind)
    return lab[["tile.coor"]]


def seqClassLookup(annot_list):
    print(seqClass.loc[seqClass[seqClass.columns[0]].isin(annot_list)])



## get sequence class labels

In [42]:
"""
# get sequence class labels. See Methods section of Chen 2022 for interpretation of these PC labels.
# apparently labels >40 are low active/heterochromatin. 
# Make up <2% of the genome. But 2% of the genome can still be significant.
"""

seqClass = returnSequenceClassLabels()

seqClass

Unnamed: 0,#Sequence class label,Sequence class name,Rank by size,Group
0,PC1,Polycomb / Heterochromatin,0,PC
1,L1,Low signal,1,L
2,TN1,Transcription,2,TN
3,TN2,Transcription,3,TN
4,L2,Low signal,4,L
...,...,...,...,...
57,L/HET,L/HET,56,L/HET
58,L/HET,L/HET,57,L/HET
59,L/HET,L/HET,58,L/HET
60,L/HET,L/HET,59,L/HET


## load and merge data w labels 

In [57]:
# open npy data
data = np.load(path_dict["CLASS_PRED"], allow_pickle=True)

labels = path_dict["LABELS"]
index=FASTA_INDEX

# turn into pd dataframe
df = pd.DataFrame(data)

# rename columns
df.columns = list(seqClass["#Sequence class label"])[:-1]

# process labels file
lab = processLabel(labels, index)

#lab = pd.merge(lab, act[["avg_activity", "coor"]], how="left")

# add labels and data together
df = pd.merge(lab, df, left_index=True, right_index=True)

df.head()

Unnamed: 0,tile.coor,PC1,L1,TN1,TN2,L2,E1,E2,E3,L3,...,L/HET,L/HET.1,L/HET.2,L/HET.3,L/HET.4,L/HET.5,L/HET.6,L/HET.7,L/HET.8,L/HET.9
0,chr13:112073561-112073831,7.209285,4.05559,1.129364,1.672152,2.010914,6.306994,3.543542,4.151067,3.872594,...,0.136094,1.029508,2.706808,5.068393,2.676638,1.974579,2.257275,1.949697,3.996029,2.041928
1,chr13:112073581-112073851,6.763016,3.67692,1.057404,1.526739,1.768337,5.004991,3.244346,3.477757,3.827772,...,0.143304,0.983652,2.671671,4.851279,2.377328,1.874483,2.30502,1.756132,3.79252,1.924091
2,chr13:112073601-112073871,5.812169,3.321641,0.868044,1.329495,1.599547,4.47778,2.515773,2.977341,3.491707,...,0.098984,0.836275,2.267405,4.201287,2.078791,1.583026,2.04803,1.567999,3.405484,1.614445
3,chr13:112073621-112073891,5.819405,3.612925,1.029199,1.532729,1.780364,5.322221,3.231455,3.482349,3.605277,...,0.122046,1.111357,2.134765,4.067479,2.413321,1.473149,1.982637,1.517266,3.490395,1.56392
4,chr13:112073641-112073911,4.844888,2.97364,0.758278,1.205471,1.48495,4.356035,2.2875,2.76016,3.087883,...,0.090943,0.792066,1.787849,3.338714,1.787607,1.283991,1.699739,1.332905,2.913985,1.39911


## write the table

In [58]:
SEI_PRED = os.path.join(os.path.splitext(path_dict["CLASS_PRED"])[0] + ".table.tsv.gz")
df.to_csv(SEI_PRED, sep='\t', compression="gzip", index=False)

In [59]:
SEI_PRED

'/wynton/group/ahituv/biomarin/library_2/Design/sei_predictions/chromatin-profiles-hdf5/biomarin-lib2-hg38-final.clean.sei_padded.raw_sequence_class_scores.table.tsv.gz'