In [20]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split

from keras.callbacks import EarlyStopping


In [21]:
# Configuration: dataset locations and preprocessing settings.
# (Earlier runs read only the first 2000 rows from 'drive/MyDrive/train_in.csv' and
# 'drive/MyDrive/train_out.csv'; the paths below replace those.)

INPUT_TRAIN_IN = "../../data/train_in.csv"
INPUT_TRAIN_OUT = "../../data/train_out.csv"  
INPUT_TEST_IN = "../../data/test_in.csv"
INPUT_TEST_OUT = "../../data/test_out.csv"

INPUT_VALIDATION_IN = "../../data/valid_in_nucleo.csv"
INPUT_VALIDATION_OUT  = "../../data/valid_out.csv"

# NOTE(review): TARGET_MODEL_PATH, MAX_LENGTH, ENCODING_METHOD, PERFORM_DATA_BALANCING and
# K_MERS_SIZE are not referenced by the cells visible in this notebook — confirm they are still needed.
TARGET_MODEL_PATH = '../../webapp/model_files'

WINDOW_SIZE =  100 # Columns kept on each side of the centre column; final sequence length is 2*WINDOW_SIZE + 1 = 201
MAX_LENGTH = (WINDOW_SIZE*2) +1
ENCODING_METHOD = 6 # 1 - ANF Encoding , 2 - One Hot Encoding  , 3 - Complex Network  , 4 - Word2Vec Model , 5 - K-mer with One Hot , 6 - Multi RM Encoding
PERFORM_DATA_BALANCING = True
K_MERS_SIZE = 3


# Read the train / test / validation inputs and labels, in that order.
# Each CSV has its first line skipped and gets positional column labels (header=None).
(
    x_train_raw,
    y_train_raw,
    x_test_raw,
    y_test_raw,
    x_valid_raw,
    y_valid_raw,
) = (
    pd.read_csv(csv_path, header=None, skiprows=1)
    for csv_path in (
        INPUT_TRAIN_IN,
        INPUT_TRAIN_OUT,
        INPUT_TEST_IN,
        INPUT_TEST_OUT,
        INPUT_VALIDATION_IN,
        INPUT_VALIDATION_OUT,
    )
)

# Stack the three splits row-wise with a fresh 0..n-1 index.
# After this statement x_train_raw holds the rows of ALL three splits, not just the training file.
x_train_raw = pd.concat(
    [x_train_raw, x_test_raw, x_valid_raw],
    ignore_index=True,
    axis=0,
)
labels = pd.concat(
    [y_train_raw, y_test_raw, y_valid_raw],
    ignore_index=True,
    axis=0,
)


In [22]:
# Keep a window of 2*WINDOW_SIZE + 1 columns around the middle of each row.
middle_index = x_train_raw.shape[1] // 2 + 1
STRAT_INEDX = middle_index - (WINDOW_SIZE + 1)
END_INDEX = WINDOW_SIZE + middle_index
# Half-open positional slice: columns STRAT_INEDX .. END_INDEX - 1.
data = x_train_raw.iloc[:, slice(STRAT_INEDX, END_INDEX)]

data.shape

(309460, 201)

In [23]:
# Preview the windowed sequence table (one column per retained position).
data

Unnamed: 0,400,401,402,403,404,405,406,407,408,409,...,591,592,593,594,595,596,597,598,599,600
0,G,G,G,C,G,C,C,T,C,G,...,C,C,T,C,T,C,C,T,A,G
1,A,A,C,A,G,T,A,A,A,C,...,T,T,T,T,T,T,T,T,T,C
2,A,A,A,G,A,A,C,A,T,G,...,C,A,G,A,A,T,T,A,G,G
3,G,G,G,T,G,G,T,G,C,C,...,G,T,C,T,C,T,A,A,G,A
4,T,C,A,T,T,G,A,A,G,T,...,A,A,A,T,A,T,G,C,T,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309455,T,T,T,T,C,T,A,G,T,G,...,C,C,A,C,T,A,A,G,N,N
309456,A,T,T,C,T,G,A,T,C,A,...,A,T,G,T,G,T,C,C,A,N
309457,C,A,G,A,A,G,A,A,A,T,...,C,T,G,G,A,A,C,C,A,N
309458,A,A,G,G,T,C,T,C,C,T,...,G,C,T,G,A,G,C,A,N,N


In [24]:
# Preview the label table built by concatenating the three label files.
labels

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
309455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
309456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
309457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
309458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# One-hot encoding of RNA sequences
nucleotides = ["A", "C", "G", "T"]
nucleotide_dict = {base: column for column, base in enumerate(nucleotides)}


def one_hot_encode(sequence):
    """Return a (len(sequence), 4) array with a 1 in the column of each base.

    Columns follow the order of ``nucleotides`` (A, C, G, T). An 'N' leaves
    its row all zeros; any other symbol that is missing from
    ``nucleotide_dict`` raises ``KeyError``.
    """
    encoding = np.zeros((len(sequence), len(nucleotides)))
    # Resolve every non-'N' position to its (row, column) cell first, then set those cells.
    hits = [
        (row, nucleotide_dict[base])
        for row, base in enumerate(sequence)
        if base != 'N'
    ]
    for row, column in hits:
        encoding[row, column] = 1
    return encoding

# Encode every row of `data` into a (positions, 4) matrix and stack them into one array.
# itertuples(index=False, name=None) yields each row as a plain tuple of cell values; the previous
# `data.iloc(0)` only iterated by accident (legacy __getitem__/IndexError fallback on the indexer
# object) and built a full Series per row.
X = np.array([
    one_hot_encode(seq)
    for seq in data.itertuples(index=False, name=None)
])
# Targets are the label table as-is (same row order as `data`, both built with ignore_index=True).
y = labels


In [26]:
# Shape of the stacked one-hot array.
X.shape

(309460, 201, 4)

In [27]:
# Print the encoded array to eyeball the one-hot rows.
print(X)

[[[0. 0. 1. 0.]
  [0. 0. 1. 0.]
  [0. 0. 1. 0.]
  ...
  [0. 0. 0. 1.]
  [1. 0. 0. 0.]
  [0. 0. 1. 0.]]

 [[1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [0. 1. 0. 0.]
  ...
  [0. 0. 0. 1.]
  [0. 0. 0. 1.]
  [0. 1. 0. 0.]]

 [[1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  ...
  [1. 0. 0. 0.]
  [0. 0. 1. 0.]
  [0. 0. 1. 0.]]

 ...

 [[0. 1. 0. 0.]
  [1. 0. 0. 0.]
  [0. 0. 1. 0.]
  ...
  [0. 1. 0. 0.]
  [1. 0. 0. 0.]
  [0. 0. 0. 0.]]

 [[1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [0. 0. 1. 0.]
  ...
  [1. 0. 0. 0.]
  [0. 0. 0. 0.]
  [0. 0. 0. 0.]]

 [[0. 0. 1. 0.]
  [0. 0. 0. 1.]
  [0. 1. 0. 0.]
  ...
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [0. 0. 0. 0.]]]


In [28]:
# Hold out 20% of the rows; the fixed random_state keeps the shuffle the same between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [29]:
# Shape of the training inputs after the split.
X_train.shape

(247568, 201, 4)

In [30]:
# Shape of the training labels after the split.
y_train.shape

(247568, 12)

In [31]:
# Sequence classifier: one LSTM layer followed by two dense layers.
model = Sequential([
    # 64-unit LSTM; input_shape=(None, 4) means any number of steps with 4 features per step.
    LSTM(64, input_shape=(None, 4)),
    Dense(32, activation='relu'),
    # 12 outputs passed through softmax.
    Dense(12, activation='softmax'),
])

In [32]:
# Compile with Adam and categorical cross-entropy, tracking accuracy.
# NOTE(review): the label preview shows rows whose 12 targets are all 0.0. With one-hot style
# categorical cross-entropy such rows have no positive class to fit — confirm whether a
# multi-label setup (sigmoid outputs + binary cross-entropy) was intended.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so it must not be wrapped in print()
# (that appended a stray "None" line to the output).
model.summary()

# NOTE(review): X_test / y_test are used as validation data here and are evaluated again as the
# test set in later cells, so later "test" scores are not from unseen data — confirm.
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 64)                17664     
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 12)                396       
                                                                 
Total params: 20,140
Trainable params: 20,140
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50

KeyboardInterrupt: 

In [None]:
# Score the trained model on the held-out split; scores[1] is the accuracy metric
# (the only entry in `metrics` at compile time), reported as a percentage.
scores = model.evaluate(x=X_test, y=y_test, verbose=0)
print("Accuracy: %.2f%%" % (100 * scores[1],))

Accuracy: 16.89%


In [None]:
# Evaluate again (with the progress bar) and report loss and accuracy as fractions.
accr = model.evaluate(X_test, y_test)
test_loss, test_accuracy = accr[0], accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_accuracy))

Test set
  Loss: 46.858
  Accuracy: 0.169


In [None]:
# Collect the accuracy (as a percentage) in a list and summarise it as mean +/- std.
# Only one evaluation is stored, so the reported standard deviation is always 0.
scores = model.evaluate(X_test, y_test, verbose=0)
cvscores = [scores[1] * 100]
print("%s: %.2f%%" % (model.metrics_names[1], cvscores[0]))

print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

accuracy: 16.89%
16.89% (+/- 0.00%)


In [None]:
# Run the model over every encoded sequence in X (the full array, i.e. both the rows that
# went into X_train and those in X_test).
prediction = model.predict(X)



In [None]:
# Model output (12 values) for the first sequence.
prediction[0]

array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       3.1391329e-17, 0.0000000e+00, 0.0000000e+00, 4.4737793e-38,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00],
      dtype=float32)

In [None]:
# Pull two individual rows (positions 2001 and 4001) from the training inputs and labels.
# The paths come from the configuration constants so this cell reads the same files as the main
# load cell; the previous hard-coded 'drive/MyDrive/...' paths were the ones already retired there.
# NOTE(review): iloc[[2001, 4001]] selects exactly two rows, not the range 2001:4001 — confirm.
data2 = pd.read_csv(INPUT_TRAIN_IN).iloc[[2001, 4001]]
labels2 = pd.read_csv(INPUT_TRAIN_OUT).iloc[[2001, 4001]]

In [None]:
# Use the first selected row of each frame as its column labels, then drop that row.
# NOTE(review): data2 / labels2 were built from exactly two rows, so only one row remains after
# this step, and the new column labels are that first row's cell values. pd.read_csv already
# consumed the file's first line as the header — confirm this promotion is intended.
data2.columns = data2.iloc[0]
data2 = data2[1:]
labels2.columns = labels2.iloc[0]
labels2 = labels2[1:]

# Convert RNA sequences to one-hot encoding
nucleotides = ["A", "C", "G", "T"]
nucleotide_dict = dict(zip(nucleotides, range(len(nucleotides))))

def one_hot_encode(sequence):
    """One-hot encode a sequence of nucleotide symbols.

    Returns a (len(sequence), 4) array whose columns follow ``nucleotides``
    (A, C, G, T). An 'N' is encoded as an all-zero row, matching the first
    definition of this function earlier in the notebook; without that guard
    this redefinition raised ``KeyError`` on 'N', which does occur in the
    data. Any other unknown symbol still raises ``KeyError``.
    """
    encoding = np.zeros((len(sequence), len(nucleotides)))
    for i, nucleotide in enumerate(sequence):
        if nucleotide == 'N':
            # Unknown base: leave the row as zeros.
            continue
        encoding[i, nucleotide_dict[nucleotide]] = 1
    return encoding

# Encode each remaining row of data2 and stack the results.
# itertuples(index=False, name=None) yields every row as a plain tuple of cell values; iterating
# `data2.iloc(0)` only worked through the legacy __getitem__/IndexError iteration fallback.
X2 = np.array([
    one_hot_encode(seq)
    for seq in data2.itertuples(index=False, name=None)
])
y2 = labels2

In [None]:
# Evaluate on the held-out split and print the accuracy as a percentage.
# NOTE(review): this repeats the earlier evaluation on X_test / y_test; the X2 / y2 arrays built
# in the preceding cell are never used — presumably they were meant to be scored here. Confirm.
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))