In [10]:
# Install tqdm into the environment of the running kernel.
# (%pip targets the kernel's interpreter; the stray trailing backtick that
# left the shell command unterminated has been removed.)
%pip install tqdm

In [11]:
import os
import numpy as np
import pandas as pd
import tqdm

In [12]:
print('Libraries loaded successfully')

In [3]:
def preprocessing(fasta_file, temp_file, max_len=1000, step=100):
    """Rewrite a FASTA file so that no record is longer than ``max_len``.

    Records with at most ``max_len`` residues are copied as a single
    header line plus a single sequence line.  Longer records are replaced by
    overlapping windows of exactly ``max_len`` residues taken every ``step``
    residues; each window gets the ID
    ``<id>_SEPARATED_SEQUENCE_(<start>_<end>)``.

    Parameters
    ----------
    fasta_file : str
        Path of the input FASTA file.  Sequences may be wrapped over several
        lines; the lines of one record are joined.
    temp_file : str
        Path of the FASTA file to write (overwritten if it exists).
    max_len : int, optional
        Maximum record length and window size (default 1000).
    step : int, optional
        Offset between the starts of consecutive windows (default 100).

    Returns
    -------
    None

    Notes
    -----
    * Windows start at 0, step, 2*step, ... while a full window still fits,
      so trailing residues are not covered when ``len(seq) - max_len`` is not
      a multiple of ``step``.
    * ``<end>`` in the window ID is written as ``start + max_len + 1``
      (1-based start, end one past the last residue of the slice).
      NOTE(review): kept as-is so IDs stay compatible with files already
      produced; confirm before changing.
    """
    seq_ids = []
    seqs = []
    # Residue lines of the record currently being read; None until the first
    # header so that any text before it is ignored.
    chunks = None
    with open(fasta_file, 'r') as file:
        for line in file:
            line = line.rstrip()
            if line.startswith('>'):
                if chunks is not None:
                    seqs.append(''.join(chunks))
                seq_ids.append(line.lstrip('>'))
                chunks = []
            elif chunks is not None:
                chunks.append(line)
    if chunks is not None:
        seqs.append(''.join(chunks))

    with open(temp_file, 'w') as fp:
        for sid, sq in zip(seq_ids, seqs):
            if len(sq) <= max_len:
                fp.write('>%s\n' % sid)
                fp.write('%s\n' % sq)
                continue
            # The range is bounded by the length of *this* sequence, not by
            # the number of records in the file.
            for i in range(0, len(sq) - max_len + 1, step):
                new_seq_id = '%s_SEPARATED_SEQUENCE_(%s_%s)' % (sid, i + 1, i + max_len + 1)
                fp.write('>%s\n' % new_seq_id)
                fp.write('%s\n' % sq[i:i + max_len])
    return

In [4]:
def fill_aa(seq):
    """Right-pad ``seq`` with '_' up to 1000 characters.

    A sequence that already has 1000 or more characters is returned with no
    padding added.
    """
    return seq.ljust(1000, '_')

In [5]:
def score_info():
    """Build the identity scoring table used for one-hot encoding.

    Returns a dict keyed by ``(residue_a, residue_b)`` over the 20 standard
    amino-acid letters plus 'X' and the padding symbol '_'.  The value is
    1.0 when both residues are the same and 0.0 otherwise.
    """
    symbols = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
               'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'X', '_']
    table = {}
    for first in symbols:
        for second in symbols:
            match = float(first == second)
            # Both orientations are stored, in the same order as before.
            table[(first, second)] = match
            table[(second, first)] = match
    return table

In [6]:
def one_hot_encoding(seq, aa_score_info):
    """Encode ``seq`` as a (1000, 21) float32 matrix.

    Row ``i`` holds, for each of the 21 column residues
    (A C D E F G H I K L M N P Q R S T V W Y X), the value
    ``aa_score_info[seq[i], column_residue]``.  Rows beyond ``len(seq)`` stay
    zero.

    Raises ``KeyError`` when a residue pair is missing from
    ``aa_score_info`` and ``IndexError`` when ``seq`` has more than 1000
    characters.
    """
    columns = 'ACDEFGHIKLMNPQRSTVWYX'
    encoded = np.zeros((1000, len(columns)), dtype=np.float32)
    for row, residue in enumerate(seq):
        encoded[row] = [aa_score_info[residue, column_aa] for column_aa in columns]
    return encoded

In [7]:
def run_one_hot_encoding(fasta_file2, temp_file):
    """One-hot encode every usable sequence of a FASTA file into a CSV file.

    The CSV has a header row ``ID,Feature1,...,Feature21000`` followed by one
    row per encoded sequence: the sequence ID and the flattened (1000 x 21)
    encoding produced by ``one_hot_encoding``.

    Parameters
    ----------
    fasta_file2 : str
        Path of the input FASTA file.  Sequences wrapped over several lines
        are joined per record.
    temp_file : str
        Path of the CSV file to write (overwritten if it exists).

    Returns
    -------
    None

    Notes
    -----
    * Only sequences with 10 to 1000 residues are encoded; shorter ones are
      padded to 1000 with ``fill_aa``, the others are skipped.
    * A sequence containing a residue that is missing from the score table
      (``KeyError`` from ``one_hot_encoding``) is skipped and counted; the
      count is printed at the end.  Any other error now propagates.
    * Rows are written with the ``csv`` module so that an ID containing a
      comma is quoted and stays in a single column.
    """
    import csv

    aa_score_info = score_info()
    feature_names = ['Feature%s' % (i) for i in range(1, 21001)]

    seq_ids = []
    seqs = []
    # Residue lines of the record currently being read; None until the first
    # header so that any text before it is ignored.
    chunks = None
    with open(fasta_file2, 'r') as file:
        for line in file:
            line = line.rstrip()
            if line.startswith('>'):
                if chunks is not None:
                    seqs.append(''.join(chunks))
                seq_ids.append(line.lstrip('>'))
                chunks = []
            elif chunks is not None:
                chunks.append(line)
    if chunks is not None:
        seqs.append(''.join(chunks))

    skipped = 0
    with open(temp_file, 'w') as fp:
        writer = csv.writer(fp, lineterminator='\n')
        writer.writerow(['ID'] + feature_names)
        for sid, sq in zip(seq_ids, seqs):
            if len(sq) < 10 or len(sq) > 1000:
                continue
            if len(sq) < 1000:
                sq = fill_aa(sq)
            try:
                encoded_vector = one_hot_encoding(sq, aa_score_info)
            except KeyError:
                # Residue outside the encoding alphabet: skip this sequence.
                skipped += 1
                continue
            flatten_encoded_vector_str = [str(each_val) for each_val in encoded_vector.flatten()]
            writer.writerow([sid] + flatten_encoded_vector_str)

    if skipped:
        print('skipped %d sequence(s) with residues outside the encoding alphabet' % skipped)

In [8]:
def create_df(df):
    """Turn a table of flattened encodings into a 4-D array.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sequence with exactly 21000 value columns
        (a flattened 1000 x 21 encoding).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), 1000, 21, 1)``.  It may share memory with
        ``df``; copy it if ``df`` is going to be modified afterwards.

    Raises
    ------
    ValueError
        If the rows of ``df`` do not hold exactly 21000 values.
    """
    values = np.asarray(df.values)
    # A single reshape is equivalent to reshaping each row to (1000, 21) and
    # stacking; it also avoids calling the ``tqdm`` module object, which is
    # not callable when imported with ``import tqdm``.
    X = values.reshape(values.shape[0], 1000, 21, 1)
    print('numpy array created')
    return X

In [12]:
#creating dataset
def create_dataset(fasta, tmp_file):
    """Run the two-stage pipeline: FASTA -> length-limited FASTA -> one-hot CSV.

    ``fasta`` is passed through ``preprocessing`` into the intermediate file
    'tmp.txt' in the working directory, which is then encoded by
    ``run_one_hot_encoding`` into ``tmp_file``.  The intermediate file is
    left on disk.
    """
    intermediate_path = 'tmp.txt'

    preprocessing(fasta, intermediate_path)
    print('preprocessing done')

    run_one_hot_encoding(intermediate_path, tmp_file)
    print('one hot encoding done')

In [13]:
def create_array(temp_file, val):
    """Load an encoded CSV and build the feature array and label vector.

    Parameters
    ----------
    temp_file : str
        Path of a CSV whose first column is the sequence ID (used as index)
        and whose remaining columns are the flattened encoding.
    val : scalar
        Label assigned to every row of the file.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the array returned by ``create_df`` and
        ``y`` is a 1-D array of length ``len(X)`` filled with ``val``.
    """
    temp_df = pd.read_csv(temp_file, index_col=0)
    print('temp df loaded')
    X = create_df(temp_df)
    print('X created')
    y = np.full(len(temp_df.index), val)
    print('y created')
    # Callers unpack two values, so the arrays must be returned.
    return X, y

In [14]:
print('functions defined')

functions defined


In [None]:
# Encode the non-enzyme FASTA into the CSV 'tmp_non_enz.txt'.
# create_dataset also writes an intermediate 'tmp.txt' in the working directory.
create_dataset('non_enzyme_40.fasta', 'tmp_non_enz.txt')
print('non enzyme dataset loaded')

In [None]:
# Delete the intermediate 'tmp.txt' left behind by create_dataset.
os.remove('tmp.txt')
print('removed tmp seq file')

In [None]:
# Encode the enzyme FASTA into the CSV 'tmp_enz.txt'.
# create_dataset also writes an intermediate 'tmp.txt' in the working directory.
create_dataset('enzyme_40.fasta', 'tmp_enz.txt')
print('enzyme dataset loaded')

In [None]:
# Delete the intermediate 'tmp.txt' left behind by create_dataset.
os.remove('tmp.txt')
print('removed tmp seq file')

In [11]:
# Load the non-enzyme CSV and label every row 0.
# The unpacking requires create_array to return an (X, y) pair.
X_non_enzyme, y_non_enzyme=create_array('tmp_non_enz.txt',0)
print('non enzyme dataset loaded')

KeyboardInterrupt: 

In [None]:
# Delete the non-enzyme CSV; after this it can no longer be reloaded
# without re-running create_dataset.
os.remove('tmp_non_enz.txt')

In [1]:
# Load the enzyme CSV and label every row 1.
# The unpacking requires create_array to return an (X, y) pair.
X_enzyme, y_enzyme=create_array('tmp_enz.txt',1)
print('enzyme dataset loaded')

NameError: name 'create_dataset' is not defined

In [14]:
# Stack the samples along axis 0: enzyme rows first, then non-enzyme rows.
X=np.concatenate((X_enzyme, X_non_enzyme), axis=0)
print('concatenated: shape of X:', X.shape)

(11247, 1000, 21, 1)

In [15]:
# Stack the labels in the same order as X: enzyme first, then non-enzyme.
y=np.concatenate((y_enzyme, y_non_enzyme), axis=0)
print('concatenated: shape of y:',y.shape)

(11247,)

In [6]:
from tensorflow.keras.utils import to_categorical
# Convert the labels to a 2-class categorical encoding.
# NOTE(review): y is overwritten, so re-running this cell applies
# to_categorical to the already-converted array.
y = to_categorical(y, 2)
#np.save('X',X)
#np.save('y',y)

In [21]:
#rebinding the old variables to empty lists to save memory
X_enzyme, X_non_enzyme,y_enzyme, y_non_enzyme=[],[],[],[]
print('memory cleared')

In [5]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 20% of the samples for validation; the shuffle uses a fixed
# random_state so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=10)
print('train and validation split done')

In [15]:
# Persist the four splits in the working directory
# (np.save appends the '.npy' extension to each name).
np.save('X_train',X_train)
np.save('y_train',y_train)
np.save('X_val',X_val)
np.save('y_val',y_val)

print('numpy files saved')

In [24]:
#rebinding X, y to empty lists to save memory
X,y=[],[]

print('clearing memory')

print('Files for training ready')