In [1]:
import os 
import pickle
import numpy as np
import pandas as pd

from Bio import SeqIO

In [2]:
# Configuration: where the FASTA inputs live and where the CSV goes.
# NOTE(review): this cell was previously labelled "PredNTS", yet every name
# below refers to iNitroY -- confirm which dataset label is correct.

# Input: folder holding the positive and negative FASTA files.
input_data_folder = os.path.join("Data", "iNitroY-Deep-Dataset")
pos_data_file = "raw-nitrotyrosine-pos.fasta"
neg_data_file = "cdhit70-nitrotyr-neg.fasta"

# Output: folder and file name of the combined label/sequence CSV.
output_data_folder = "iNitroY_ensemble_data"
output_file = "iNitroY_full_SEQ.csv"

In [3]:
def read_fasta_file(file_path):
    """Read a FASTA file and return its record ids and sequences.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A ``(name_list, seq_list)`` tuple of two parallel lists: the ``id``
        of every record and that record's sequence converted to ``str``,
        both in the order the records appear in the file.

    Raises:
        OSError: If ``file_path`` cannot be opened.
    """
    name_list = []
    seq_list = []

    # The context manager closes the handle even if parsing fails part-way,
    # which the previous manual open()/close() pair did not guarantee.
    with open(file_path) as fasta_handle:
        for record in SeqIO.parse(fasta_handle, "fasta"):
            name_list.append(record.id)
            seq_list.append(str(record.seq))

    return name_list, seq_list

In [4]:
# ---------------------------------------------------------------------------
# Load the positive and negative FASTA files and build the labelled data set
# ---------------------------------------------------------------------------

# Resolve both input paths, then read each file; the record ids returned by
# read_fasta_file are discarded because only the sequences are used below.
pos_file_path = os.path.join(input_data_folder, pos_data_file)
neg_file_path = os.path.join(input_data_folder, neg_data_file)

_, pos_seq_list = read_fasta_file(pos_file_path)
_, neg_seq_list = read_fasta_file(neg_file_path)

# Replace every 'X' character in a sequence with '-'.
pos_seq_list = [sequence.replace('X', '-') for sequence in pos_seq_list]
neg_seq_list = [sequence.replace('X', '-') for sequence in neg_seq_list]

# Duplicate sequences are kept: the set()-based de-duplication is disabled.
# pos_seq_list = list(set(pos_seq_list))
# neg_seq_list = list(set(neg_seq_list))

# Positives come first, then negatives; labels follow the same order
# (1 for each positive sequence, 0 for each negative sequence).
all_seq_list = [*pos_seq_list, *neg_seq_list]
all_seq_label_list = [1] * len(pos_seq_list) + [0] * len(neg_seq_list)

In [5]:
# Show the number of sequences next to the number of labels; the two counts
# are expected to match because one label is generated per sequence.
tuple(len(items) for items in (all_seq_list, all_seq_label_list))

(834, 834)

In [6]:
# Two-column frame: the 0/1 label first, then the sequence it belongs to.
initro_df = pd.DataFrame({
    'Label': all_seq_label_list,
    'Sequence': all_seq_list,
})

In [7]:
# to_csv does not create missing directories, so make sure the output folder
# exists first; otherwise a fresh checkout fails on this cell.
os.makedirs(output_data_folder, exist_ok=True)

outpath = os.path.join(output_data_folder, output_file)

# Written without a header row and without the index: each line is
# "<label>,<sequence>".
initro_df.to_csv(path_or_buf=outpath, sep=',', header=False, index=False)