In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split

from keras.callbacks import EarlyStopping


In [2]:
# Load dataset
data = pd.read_csv('drive/MyDrive/train_in.csv').head(2000)
labels = pd.read_csv('drive/MyDrive/train_out.csv').head(2000)

In [3]:
data.columns = data.iloc[0]
data = data[1:]
data

Unnamed: 0,C,C.1,C.2,C.3,A,T,A.1,C.4,C.5,C.6,...,C.7,G,G.1,G.2,G.3,G.4,T.1,C.8,C.9,T.2
1,T,C,T,C,C,T,G,C,C,T,...,G,G,G,T,G,A,C,A,G,A
2,C,A,G,A,T,A,G,T,A,A,...,A,A,G,C,T,C,T,G,T,T
3,T,G,T,C,T,T,T,T,A,C,...,T,G,A,A,G,A,G,G,A,C
4,A,T,T,A,C,T,T,A,A,T,...,T,T,T,C,G,T,T,T,T,G
5,A,A,A,G,A,C,T,T,T,T,...,T,G,T,T,T,G,T,T,A,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,T,C,C,C,A,G,C,T,C,A,...,G,T,C,C,T,C,T,C,G,G
1996,A,G,T,T,G,C,T,G,T,A,...,C,T,A,A,T,G,A,T,C,T
1997,A,T,A,G,T,G,G,T,T,A,...,G,A,G,A,A,T,G,T,A,A
1998,G,A,C,T,G,T,A,T,T,T,...,C,A,A,C,A,T,G,C,A,T


In [4]:
labels.columns = labels.iloc[0]
labels = labels[1:]
labels

Unnamed: 0,1.0,0.0,0.0.1,0.0.2,0.0.3,0.0.4,0.0.5,0.0.6,0.0.7,0.0.8,0.0.9,0.0.10
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Convert RNA sequences to one-hot encoding
nucleotides = ["A", "C", "G", "T"]
nucleotide_dict = dict(zip(nucleotides, range(len(nucleotides))))

def one_hot_encode(sequence):
    encoding = np.zeros((len(sequence), len(nucleotides)))
    for i, nucleotide in enumerate(sequence):
        encoding[i, nucleotide_dict[nucleotide]] = 1
    return encoding

X = np.array([one_hot_encode(seq) for seq in data.iloc(0)])
y = labels


In [6]:
X.shape

(1999, 1001, 4)

In [7]:
print(X)

[[[0. 0. 0. 1.]
  [0. 1. 0. 0.]
  [0. 0. 0. 1.]
  ...
  [1. 0. 0. 0.]
  [0. 0. 1. 0.]
  [1. 0. 0. 0.]]

 [[0. 1. 0. 0.]
  [1. 0. 0. 0.]
  [0. 0. 1. 0.]
  ...
  [0. 0. 1. 0.]
  [0. 0. 0. 1.]
  [0. 0. 0. 1.]]

 [[0. 0. 0. 1.]
  [0. 0. 1. 0.]
  [0. 0. 0. 1.]
  ...
  [0. 0. 1. 0.]
  [1. 0. 0. 0.]
  [0. 1. 0. 0.]]

 ...

 [[1. 0. 0. 0.]
  [0. 0. 0. 1.]
  [1. 0. 0. 0.]
  ...
  [0. 0. 0. 1.]
  [1. 0. 0. 0.]
  [1. 0. 0. 0.]]

 [[0. 0. 1. 0.]
  [1. 0. 0. 0.]
  [0. 1. 0. 0.]
  ...
  [0. 1. 0. 0.]
  [1. 0. 0. 0.]
  [0. 0. 0. 1.]]

 [[1. 0. 0. 0.]
  [1. 0. 0. 0.]
  [0. 0. 0. 1.]
  ...
  [0. 1. 0. 0.]
  [0. 1. 0. 0.]
  [1. 0. 0. 0.]]]


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
X_train.shape

(1599, 1001, 4)

In [10]:
y_train.shape

(1599, 12)

In [11]:
model = Sequential()
model.add(LSTM(64, input_shape=(None, 4)))
model.add(Dense(32, activation='relu'))
model.add(Dense(12, activation='softmax'))

In [12]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 64)                17664     
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                                 
 dense_1 (Dense)             (None, 12)                396       
                                                                 
Total params: 20,140
Trainable params: 20,140
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27

In [13]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 100.00%


In [14]:
accr = model.evaluate(X_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.027
  Accuracy: 1.000


In [15]:
cvscores = []
scores = model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
cvscores.append(scores[1] * 100)
 
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

accuracy: 100.00%
100.00% (+/- 0.00%)


In [16]:
prediction = model.predict(X)



In [17]:
prediction[0]

array([9.6224630e-01, 7.7189761e-04, 2.5865505e-03, 5.2819069e-04,
       7.4316286e-03, 3.3304439e-04, 8.6469541e-04, 4.6313326e-03,
       2.3344727e-03, 1.4557529e-02, 1.4069700e-03, 2.3072753e-03],
      dtype=float32)

In [18]:
# Load dataset
data2 = pd.read_csv('drive/MyDrive/train_in.csv').iloc[[2001, 4001]]
labels2 = pd.read_csv('drive/MyDrive/train_out.csv').iloc[[2001, 4001]]

In [19]:
data2.columns = data2.iloc[0]
data2 = data2[1:]
labels2.columns = labels2.iloc[0]
labels2 = labels2[1:]

# Convert RNA sequences to one-hot encoding
nucleotides = ["A", "C", "G", "T"]
nucleotide_dict = dict(zip(nucleotides, range(len(nucleotides))))

def one_hot_encode(sequence):
    encoding = np.zeros((len(sequence), len(nucleotides)))
    for i, nucleotide in enumerate(sequence):
        encoding[i, nucleotide_dict[nucleotide]] = 1
    return encoding

X2 = np.array([one_hot_encode(seq) for seq in data2.iloc(0)])
y2 = labels2

In [20]:
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 100.00%
