# RNN Challenge

In [1]:
import numpy as np

with open('rnn-challenge-data.npz', 'rb') as f:
    X = np.load(f)
    data_x = X['data_x']
    data_y = X['data_y']
    val_x = X['val_x']
    val_y = X['val_y']
    test_x = X['test_x']

# TRAINING DATA: INPUT (x) AND OUTPUT (y)
print(data_x.shape, data_x.dtype)
print(data_y.shape, data_y.dtype)

# VALIDATION DATA: INPUT (x) AND OUTPUT (y)
print(val_x.shape, val_x.dtype)
print(val_y.shape, val_y.dtype)

# TEST DATA: INPUT (x) ONLY
print(test_x.shape, test_x.dtype)

# PREDICT prediction FROM test_x


(400,) <U400
(400,) int64
(100,) <U1200
(100,) int64
(250,) <U2000


In [7]:
{"A":0,"C":1,"G":2,"T":3}["C"]#test_x[0]

1

## Preprocess Data

In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader

In [3]:
device = torch.device("cpu")
ctx = {"device": device, "dtype": torch.float32}

In [13]:
def parse_bases(data):
    all_data = []
    for x in data:
        string = []
        for char in x:
            string.append({"A":0,"C":1,"G":2,"T":3}[char])
        tensor = torch.tensor(string)
        one_hot = torch.nn.functional.one_hot(tensor)
        all_data.append(one_hot)
    return torch.stack(all_data).to(**ctx)

parse_bases(val_x).shape

torch.Size([100, 1200, 4])

In [14]:
train_data = TensorDataset(parse_bases(data_x), torch.tensor(data_y))
train_loader = DataLoader(
    train_data,
    batch_size=16,
    shuffle=True
)
val_data = TensorDataset(parse_bases(val_x), torch.tensor(val_y))
val_loader = DataLoader(
    val_data,
    batch_size=100,
    shuffle=False
)

## Define LSTM Network

In [29]:
#lstm = torch.nn.LSTM(
#    input_size=4, 
#    hidden_size=5, 
#    batch_first=True
#)

In [30]:
#all_outputs, (final_h, final_c) = lstm.forward(train_data[:2][0])

In [31]:
#final_h[0,:,:], all_outputs[:,-1,:]

In [32]:
class RNN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm = torch.nn.LSTM(input_size=4, hidden_size=5, batch_first=True)
        self.softmax = torch.nn.LogSoftmax(dim=-1)
        
    def forward(self, x):
        all_outputs, (last_hidden, last_cell) = self.lstm.forward(x)
        return self.softmax(all_outputs.sum(-2))

In [34]:
def accuracy(log_prob, category):
    return (log_prob.argmax(dim=-1) == category).sum() / len(category)

## Train

In [35]:
rnn = RNN()

In [37]:
nll = torch.nn.NLLLoss()
optim = torch.optim.Adam(rnn.parameters(), lr=1e-2)
#scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=100, gamma=0.5)

In [38]:
from tqdm.notebook import tqdm

In [48]:
n_epochs = 100
best_accuracy = 0.0

for epoch in tqdm(range(n_epochs)):
    
    # Training
    rnn.train(True)
    for seq, category in train_loader:
        optim.zero_grad()
        result = rnn.forward(seq)
        loss = nll(result, category)
        loss.backward()
        optim.step()
        
    # Validation
    rnn.train(False)
    for seq, category in val_loader:
        result = rnn.forward(seq)
        acc = accuracy(result, category).item()
        print(f"Epoch: {epoch}      "
              f"Accuracy: {100*acc:.0f}%    "
              f"Best Accuracy: {100*best_accuracy:.0f}%",
              end="\r"
             )
        if acc >= best_accuracy:
            best_accuracy = acc
            torch.save(rnn.state_dict(), "best_model.pt")
        
    #scheduler.step()
    
    

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 0      Accuracy: 100%    Best Accuracy: 0%

KeyboardInterrupt: 

## Predict

In [49]:
loaded_rnn = RNN()
loaded_rnn.load_state_dict(torch.load("best_model.pt"))

<All keys matched successfully>

In [50]:
prediction = loaded_rnn.forward(parse_bases(test_x)).argmax(dim=-1).detach().cpu().numpy()

In [53]:
# MAKE SURE THAT YOU HAVE THE RIGHT FORMAT
assert prediction.ndim == 1
assert prediction.shape[0] == 250

# AND SAVE EXACTLY AS SHOWN BELOW
np.save('prediction_m2m.npy', prediction.astype(int))

# MAKE SURE THAT THE FILE HAS THE CORRECT FORMAT
def validate_prediction_format():
    loaded = np.load('prediction_m2m.npy')
    assert loaded.shape == (250, )
    assert loaded.dtype == int
    assert (loaded <= 4).all()
    assert (loaded >= 0).all()
validate_prediction_format()

In [52]:
prediction

array([2, 4, 1, 1, 0, 4, 2, 0, 4, 2, 4, 3, 3, 2, 0, 3, 3, 2, 3, 2, 0, 4,
       2, 4, 0, 3, 2, 0, 1, 4, 1, 1, 1, 1, 0, 0, 4, 3, 1, 3, 2, 2, 2, 4,
       3, 4, 1, 0, 1, 0, 1, 2, 4, 4, 3, 0, 0, 4, 4, 2, 1, 2, 3, 0, 3, 1,
       2, 2, 4, 3, 3, 4, 2, 3, 3, 1, 1, 4, 4, 0, 1, 0, 0, 1, 2, 0, 4, 0,
       4, 2, 3, 3, 2, 3, 2, 3, 4, 1, 2, 1, 2, 4, 2, 1, 0, 3, 3, 1, 3, 3,
       0, 1, 1, 0, 4, 4, 2, 0, 1, 4, 2, 0, 4, 2, 3, 2, 4, 0, 1, 0, 2, 4,
       0, 1, 2, 0, 4, 2, 2, 1, 3, 0, 1, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 3,
       3, 4, 4, 4, 2, 1, 1, 0, 3, 1, 1, 1, 2, 2, 1, 3, 4, 4, 1, 3, 1, 3,
       4, 0, 1, 2, 4, 3, 0, 4, 2, 1, 3, 1, 4, 3, 2, 3, 1, 0, 0, 0, 4, 2,
       4, 2, 4, 3, 2, 1, 1, 4, 3, 1, 4, 0, 1, 1, 1, 1, 0, 3, 4, 3, 1, 3,
       4, 3, 1, 3, 1, 0, 2, 4, 2, 3, 0, 4, 4, 3, 0, 2, 3, 3, 3, 3, 0, 4,
       0, 4, 3, 0, 2, 2, 0, 0])

In [54]:
other = np.array([2, 4, 1, 1, 0, 4, 2, 0, 4, 2, 4, 3, 3, 2, 0, 3, 4, 2, 3, 2, 0, 4,
       2, 4, 0, 3, 2, 0, 1, 4, 1, 1, 1, 1, 0, 0, 4, 3, 1, 3, 2, 2, 2, 4,
       3, 4, 1, 0, 1, 0, 1, 2, 4, 4, 3, 0, 0, 4, 4, 2, 1, 2, 3, 2, 3, 1,
       2, 2, 4, 4, 3, 4, 2, 3, 3, 1, 1, 4, 4, 0, 1, 0, 0, 1, 2, 0, 4, 0,
       4, 2, 3, 4, 2, 3, 2, 3, 4, 1, 2, 1, 2, 4, 2, 1, 0, 3, 3, 2, 3, 3,
       0, 1, 1, 0, 4, 4, 2, 0, 1, 4, 2, 0, 4, 2, 3, 2, 4, 0, 1, 0, 2, 4,
       0, 1, 2, 0, 4, 2, 2, 1, 3, 0, 1, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 3,
       3, 4, 4, 4, 2, 1, 1, 0, 3, 1, 1, 1, 2, 2, 1, 3, 4, 4, 1, 4, 1, 3,
       4, 0, 1, 2, 4, 3, 0, 4, 2, 1, 3, 1, 4, 3, 2, 3, 1, 0, 0, 0, 4, 2,
       4, 2, 4, 3, 2, 1, 1, 4, 3, 1, 4, 0, 1, 1, 1, 1, 0, 3, 4, 3, 1, 3,
       4, 3, 1, 3, 1, 0, 2, 4, 2, 3, 0, 4, 4, 3, 0, 2, 3, 3, 3, 4, 0, 4,
       0, 4, 3, 0, 2, 2, 0, 0])

In [56]:
ratio = (prediction == other).sum() / len(other)
print(f"Agreement: {100*ratio:.0f}%")

Agreement: 97%
