# RNA Reactivity Prediction

## 1. Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional
from keras.optimizers import Adam

import warnings
warnings.filterwarnings('ignore')

## 2. Data Loading and Preliminary Exploration

In [None]:
# Read the competition training table into a DataFrame.
train_csv_path = '/kaggle/input/stanford-ribonanza-rna-folding/train_data.csv'
data = pd.read_csv(train_csv_path)

## 3. Data Preprocessing

In [None]:
# Right-pad every sequence with 'N' up to the length of the longest one so
# that all encoded rows have the same width and can be stacked into a matrix.
max_sequence_length = data['sequence'].apply(len).max()
data['padded_sequence'] = data['sequence'].apply(lambda x: x.ljust(max_sequence_length, 'N'))

# Collect the set of characters used anywhere in the padded data.
# Iterating over the unique strings avoids re-scanning duplicated sequences.
vocabulary = set()
for padded in data['padded_sequence'].unique():
    vocabulary.update(padded)

# Fit the encoder ONCE on the full vocabulary. Calling fit_transform on each
# row separately re-learns the character -> integer mapping per sequence, so a
# row that lacks some character (e.g. a full-length row without 'N' padding)
# would assign different codes to the same nucleotide than other rows do.
encoder = LabelEncoder()
encoder.fit(sorted(vocabulary))
data['sequence_encoded'] = data['padded_sequence'].apply(lambda x: encoder.transform(list(x)))

# Stack the per-row code arrays into a (n_samples, max_sequence_length) matrix.
X = np.array(data['sequence_encoded'].tolist())
# Regression target: a single reactivity column.
# NOTE(review): if this column contains NaN values the training loss will be
# NaN and the metric computation will fail — confirm and filter/impute if so.
y = data['reactivity_0001'].values

# The LSTM expects (samples, timesteps, features); each timestep carries one
# feature, the integer code of the character at that position.
X = X.reshape(X.shape[0], X.shape[1], 1)

# Hold out 20% of the rows for validation; fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## 4. Deep Learning Model with LSTM

In [None]:
# Two stacked bidirectional LSTM layers followed by a small dense head that
# emits one linear output per input sequence. The first LSTM returns its full
# output sequence so the second LSTM has a sequence to consume.
model = Sequential([
    Bidirectional(
        LSTM(128, return_sequences=True),
        input_shape=(X_train.shape[1], 1),
    ),
    Dropout(0.5),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='linear'),
])

# Adam optimizer, trained directly on mean absolute error.
optimizer = Adam(learning_rate=0.001)
model.compile(loss='mean_absolute_error', optimizer=optimizer)
model.summary()

# Train for 10 epochs, evaluating on the validation split after each epoch;
# the returned History object keeps the per-epoch loss values.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=10,
    verbose=1,
    validation_data=(X_val, y_val),
)

## 5. Model Evaluation

In [None]:
# Predict on the validation split and report the mean absolute error
# between those predictions and the true targets.
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
report_line = f'Mean Absolute Error on the validation set: {mae:.4f}'
print(report_line)

## 6. Visualizing Loss

In [None]:
# Draw the per-epoch training and validation loss stored in the History
# object on a single figure.
plt.figure(figsize=(12,6))
for history_key, curve_label in (
    ('loss', 'Training Loss'),
    ('val_loss', 'Validation Loss'),
):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.title('Training and Validation Loss over Epochs')
plt.show()