# Modeling

## Prepare Dataset

In [1]:
import pandas as pd

In [2]:
train_path = r'/kaggle/input/stanford-ribonanza-rna-folding/train_data_QUICK_START.csv'

In [3]:
train_df = pd.read_csv(train_path)

In [4]:
train_df.head()

Unnamed: 0,sequence_id,sequence,experiment_type,dataset_name,reactivity_0001,reactivity_0002,reactivity_0003,reactivity_0004,reactivity_0005,reactivity_0006,...,reactivity_error_0197,reactivity_error_0198,reactivity_error_0199,reactivity_error_0200,reactivity_error_0201,reactivity_error_0202,reactivity_error_0203,reactivity_error_0204,reactivity_error_0205,reactivity_error_0206
0,0000d87cab97,GGGAACGACUCGAGUAGAGUCGAAAAAGAUCGCCACGCACUUACGA...,2A3_MaP,DasLabBigLib_OneMil_RFAM_windows_100mers_2A3,,,,,,,...,,,,,,,,,,
1,0000d87cab97,GGGAACGACUCGAGUAGAGUCGAAAAAGAUCGCCACGCACUUACGA...,DMS_MaP,DasLabBigLib_OneMil_RFAM_windows_100mers_DMS,,,,,,,...,,,,,,,,,,
2,0001ca9d21b0,GGGAACGACUCGAGUAGAGUCGAAAAGGUGGCCGGCAGAAUCGCGA...,2A3_MaP,DasLabBigLib_OneMil_OpenKnot_Round_2_train_2A3,,,,,,,...,,,,,,,,,,
3,0001ca9d21b0,GGGAACGACUCGAGUAGAGUCGAAAAGGUGGCCGGCAGAAUCGCGA...,DMS_MaP,DasLabBigLib_OneMil_OpenKnot_Round_2_train_DMS,,,,,,,...,,,,,,,,,,
4,00021f968267,GGGAACGACUCGAGUAGAGUCGAAAACAUUGUUAAUGCCUAUAUUA...,2A3_MaP,DasLabBigLib_OneMil_Replicates_from_previous_l...,,,,,,,...,,,,,,,,,,


In [5]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(train_df, test_size=0.2, random_state=283)



In [6]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [7]:
max_seq_length = 206
nucleotides = 'ACGU'

In [8]:
def str_to_seq(s):
    mapping = {nucleotide: idx for idx, nucleotide in enumerate(nucleotides)}
    return [mapping[c] for c in s]


class RNADataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.label_idx = [idx for idx, column in enumerate(self.df.columns) if not column.startswith('reactivity_error') and column.startswith('reactivity')]
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        input_seq = self.df['sequence'].iloc[idx]
        input_seq = str_to_seq(input_seq)
        input_seq = torch.LongTensor(input_seq)
        input_seq = F.one_hot(input_seq, num_classes=len(nucleotides))
        input_seq = input_seq.float()
        input_seq = F.pad(input_seq, pad=(0, 0, 0, max_seq_length - input_seq.size(0)))
        
        label_seq = self.df.iloc[idx, self.label_idx]
        label_seq = torch.FloatTensor(label_seq)
        label_seq = torch.nan_to_num(label_seq)
        label_seq = F.pad(label_seq, pad=(0, max_seq_length - label_seq.size(0)))
        
        return input_seq, label_seq

In [9]:
batch_size = 128

In [10]:
train_loader = DataLoader(RNADataset(train), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(RNADataset(val), batch_size=batch_size, shuffle=True)

In [11]:
for inputs, labels in train_loader:
    print(inputs)
    print(labels)
    break

tensor([[[0., 0., 1., 0.],
         [0., 0., 1., 0.],
         [0., 0., 1., 0.],
         ...,
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 1., 0.],
         [0., 0., 1., 0.],
         [0., 0., 1., 0.],
         ...,
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 1., 0.],
         [0., 0., 1., 0.],
         [0., 0., 1., 0.],
         ...,
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        ...,

        [[0., 0., 1., 0.],
         [0., 0., 1., 0.],
         [0., 0., 1., 0.],
         ...,
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 1., 0.],
         [0., 0., 1., 0.],
         [0., 0., 1., 0.],
         ...,
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],

        [[0., 0., 1., 0.],
         [0., 0., 1., 0.],
         [0., 0., 1., 0.],
         ...,
 

## Define and Train Model

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
class RNAModel(nn.Module):
    def __init__(self, embed_dim, d_model=128):
        super().__init__()
        
        self.conv = nn.Conv1d(embed_dim, d_model, kernel_size=3, padding=1)
        self.te = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=4, batch_first=True),
            num_layers=4
        )
        self.linear = nn.Linear(d_model, 1)
        
    def forward(self, x):
#         x += pos emb
        x = self.conv(x.transpose(-1, -2)).transpose(-1, -2)
        x = self.te(x)
        x = self.linear(x)
        
        return x.squeeze()

In [22]:
model = RNAModel(embed_dim=4).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

num_epochs = 10

In [None]:
from tqdm import tqdm

for epoch in tqdm(range(num_epochs)):
    model.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

 30%|███       | 3/10 [21:12<49:30, 424.38s/it]  