In [26]:
from Bio import SeqIO
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd

In [None]:
# Progress marker printed once this cell is reached, after the import cell.
print('Libraries loaded successfully')

In [27]:
def preprocessing(fasta_file, temp_file, window=1000, step=100):
    """Write a FASTA file in which no record is longer than ``window`` residues.

    Records of at most ``window`` residues are copied through, with leading
    and trailing whitespace stripped from the sequence. Longer records are
    replaced by overlapping fragments of exactly ``window`` residues whose
    start offsets advance by ``step``. Each fragment is named
    ``<id>_SEPARATED_SEQUENCE_(<a>_<b>)``.

    Args:
        fasta_file: Path of the FASTA file to read.
        temp_file: Path of the FASTA file to write; overwritten if it exists.
        window: Maximum record length and the length of every fragment.
            Defaults to 1000.
        step: Distance between consecutive fragment start offsets.
            Defaults to 100.

    Returns:
        None.
    """
    # Context managers close both handles even if parsing or writing raises.
    with open(fasta_file, "r") as input_handle, open(temp_file, 'w') as fp:
        for seq_record in SeqIO.parse(input_handle, "fasta"):
            seq_id = seq_record.id
            seq = seq_record.seq

            if len(seq) <= window:
                fp.write('>%s\n' % (seq_id))
                fp.write('%s\n' % (seq.strip()))
                continue

            # Start offsets stop at len(seq) - window, so when that difference
            # is not a multiple of `step` the residues after the last full
            # window are not part of any fragment.
            for i in range(0, len(seq) - window + 1, step):
                # The label numbers are (i + 1, i + window + 1); this numbering
                # is kept unchanged so fragment identifiers stay stable.
                new_seq_id = '%s_SEPARATED_SEQUENCE_(%s_%s)' % (seq_id, i + 1, i + window + 1)
                new_seq = seq[i:i + window]
                fp.write('>%s\n' % (new_seq_id))
                fp.write('%s\n' % (new_seq))
    return

In [28]:
def fill_aa(seq):
    """Right-pad ``seq`` with '_' characters up to a length of 1000.

    A sequence that already has 1000 or more characters comes back with
    nothing appended, because the repeat count is then zero or negative and
    the padding string is empty.
    """
    padding = '_' * (1000 - len(seq))
    return seq + padding

In [29]:
def score_info():
    """Build the pairwise identity score table for the residue alphabet.

    Returns:
        A dict keyed by ``(symbol, symbol)`` tuples over the 20 standard
        residue letters plus 'X' and the padding symbol '_'. The value is
        1.0 when both symbols are equal and 0.0 otherwise.
    """
    symbols = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'X', '_']
    table = {}
    for first in symbols:
        for second in symbols:
            score = 1.0 if first == second else 0.0
            # Store both orientations of the pair, forward one first.
            for key in ((first, second), (second, first)):
                table[key] = score
    return table

In [30]:
def one_hot_encoding(seq, aa_score_info):
    """Encode ``seq`` as a 1000 x 21 float32 matrix of pairwise scores.

    Row ``r`` describes the r-th symbol of ``seq``; column ``c`` holds
    ``aa_score_info[(symbol, alphabet[c])]`` for the 21-letter alphabet
    below. Rows past the end of ``seq`` stay zero.

    Args:
        seq: Iterable of single-character residue symbols.
        aa_score_info: Mapping from ``(symbol, symbol)`` tuples to scores.

    Returns:
        numpy array of shape (1000, 21) and dtype float32.

    Raises:
        KeyError: if a looked-up pair is missing from ``aa_score_info``.
        IndexError: if ``seq`` has more than 1000 symbols.
    """
    alphabet = ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'X')
    encoded = np.zeros((1000, len(alphabet)), dtype=np.float32)
    for row, residue in enumerate(seq):
        for col, reference in enumerate(alphabet):
            encoded[row, col] = aa_score_info[(residue, reference)]
    return encoded

In [31]:
def run_one_hot_encoding(fasta_file, temp_file):
    """Encode every usable FASTA record and write the result as CSV.

    The output starts with a header row ``ID,Feature1,...,Feature21000``.
    Each record whose length is between 10 and 1000 (inclusive) is padded
    with ``fill_aa`` when shorter than 1000, encoded with
    ``one_hot_encoding``, flattened, and written as one CSV row that starts
    with the record id. Records outside that length range are skipped.

    A record for which ``one_hot_encoding`` raises ``KeyError`` is skipped as
    well, and a single warning summarising those records is issued at the
    end. Any other exception (for example an I/O error while writing)
    propagates instead of being swallowed.

    Args:
        fasta_file: Path of the FASTA file to read.
        temp_file: Path of the CSV file to write; overwritten if it exists.

    Returns:
        None.
    """
    import warnings  # local import: only needed for the skip summary

    aa_score_info = score_info()
    feature_names = ['Feature%s' % (i) for i in range(1, 21001)]
    skipped_ids = []

    # Context managers close both handles even if encoding or writing raises.
    with open(fasta_file, "r") as input_handle, open(temp_file, 'w') as fp:
        fp.write('%s\n' % (','.join(['ID'] + feature_names)))

        for seq_record in SeqIO.parse(input_handle, "fasta"):
            seq_id = seq_record.id
            seq = seq_record.seq
            if not (10 <= len(seq) <= 1000):
                continue
            if len(seq) < 1000:
                seq = fill_aa(seq)

            try:
                encoded_vector = one_hot_encoding(seq, aa_score_info)
            except KeyError:
                # The score table has no entry for one of this record's
                # symbols; leave the record out rather than abort the run.
                skipped_ids.append(seq_id)
                continue

            flatten_encoded_vector_str = [str(each_val) for each_val in encoded_vector.flatten()]
            fp.write('%s\n' % (','.join([seq_id] + flatten_encoded_vector_str)))

    if skipped_ids:
        warnings.warn(
            'run_one_hot_encoding: skipped %d record(s) that could not be '
            'encoded (first: %s)' % (len(skipped_ids), skipped_ids[0])
        )

In [32]:
def create_df(df):
    """Convert a frame of flattened encodings into a 4-D feature array.

    Args:
        df: DataFrame in which every row holds 21000 values, i.e. one
            flattened 1000 x 21 matrix. The index is not used.

    Returns:
        A newly allocated numpy array of shape ``(len(df), 1000, 21, 1)``.
        Each row of ``df`` becomes one 1000 x 21 matrix with a trailing axis
        of length one. The result does not share memory with ``df``.

    Raises:
        ValueError: if the rows cannot be reshaped to 1000 x 21.
    """
    # One explicit C-ordered copy, then a reshape that is just a view of it;
    # this replaces a per-row Python loop followed by a second full copy.
    values = np.array(df.values, order='C')
    return values.reshape(values.shape[0], 1000, 21, 1)

In [33]:
#creating dataset
def create_dataset(fasta, val):
    """Build the feature array and a constant label vector for one FASTA file.

    The FASTA file is passed through ``preprocessing`` into ``tmp.txt``, then
    through ``run_one_hot_encoding`` into ``tmp2.txt``; that CSV is read back
    with its first column as the index and reshaped by ``create_df``. Both
    intermediate files are written relative to the current working directory
    and are not removed here.

    Args:
        fasta: Path of the input FASTA file.
        val: Label value assigned to every row.

    Returns:
        Tuple ``(X, y)``: ``X`` is the array returned by ``create_df`` and
        ``y`` is a 1-D array filled with ``val``, one entry per CSV row.
    """
    windowed_fasta = 'tmp.txt'
    encoded_csv = 'tmp2.txt'

    preprocessing(fasta, windowed_fasta)
    run_one_hot_encoding(windowed_fasta, encoded_csv)

    temp_df = pd.read_csv(encoded_csv, index_col=0)
    features = create_df(temp_df)
    labels = np.full(len(temp_df.index), val)
    return features, labels

In [None]:
# Progress marker printed once this cell is reached, after the helper definitions.
print('functions defined')

In [None]:
# Encode the non-enzyme FASTA file; every resulting row gets label 0.
X_non_enzyme, y_non_enzyme=create_dataset('non_enzyme_40.fasta',0)
print('non enzyme dataset loaded')

In [None]:
# Shell escape: delete tmp.txt, the intermediate FASTA written by create_dataset.
!rm tmp.txt

In [None]:
# Shell escape: delete tmp2.txt, the intermediate CSV written by create_dataset.
!rm tmp2.txt

In [1]:
# Encode the enzyme FASTA file; every resulting row gets label 1.
# Requires the cell that defines create_dataset to have been run in this kernel.
X_enzyme, y_enzyme=create_dataset('enzyme_40.fasta',1)
print('enzyme dataset loaded')

NameError: name 'create_dataset' is not defined

In [None]:
# Shell escape: delete tmp.txt, the intermediate FASTA written by create_dataset.
!rm tmp.txt

In [None]:
# Shell escape: delete tmp2.txt, the intermediate CSV written by create_dataset.
!rm tmp2.txt

In [14]:
# Stack both classes along the first axis: enzyme rows first, then non-enzyme rows.
X=np.concatenate((X_enzyme, X_non_enzyme), axis=0)
print('concatenated: shape of X:', X.shape)

(11247, 1000, 21, 1)

In [15]:
# Join the labels in the same enzyme-then-non-enzyme order used for X so rows stay aligned.
y=np.concatenate((y_enzyme, y_non_enzyme), axis=0)
print('concatenated: shape of y:',y.shape)

(11247,)

In [17]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer class labels (0 = non-enzyme, 1 = enzyme) into
# two columns. The guard makes this cell safe to re-run: without it, a second
# run would encode the already one-hot `y` again and silently produce a
# 3-D label array.
if np.ndim(y) == 1:
    y = to_categorical(y, 2)
#np.save('X',X)
#np.save('y',y)

In [21]:
# Rebind the per-class arrays to empty lists so the notebook no longer holds
# references to them; X and y were built from them with np.concatenate.
X_enzyme, X_non_enzyme,y_enzyme, y_non_enzyme=[],[],[],[]
print('memory cleared')

In [6]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 20% of the samples for validation; shuffling uses a fixed
# random_state so the same split is produced on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=10)
print('train and validation split done')

In [15]:
# Persist the four split arrays to the working directory; np.save appends
# the ".npy" extension to each of these base names.
np.save('X_train',X_train)
np.save('y_train',y_train)
np.save('X_val',X_val)
np.save('y_val',y_val)

print('numpy files saved')

In [24]:
# Rebind X and y to empty lists so the notebook no longer holds references
# to the full arrays; the train/validation splits were saved in the cell above.
X,y=[],[]

print('clearing memory')

print('Files for training ready')