In [5]:
from typing import Sequence
from functools import partial
import random
import torch
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd

In [6]:
# hint we will need following imports
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [7]:
# DO NOT CHANGE HERE
seed_value = 13
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed(seed_value)

# Use this for getting x label
def rand_sequence_var_len(n_seqs: int, lb: int=16, ub: int=128) -> Sequence[int]:
    for i in range(n_seqs):
        seq_len = random.randint(lb, ub)
        yield [random.randint(1, 5) for _ in range(seq_len)]


# Use this for getting y label
def count_cpgs(seq: str) -> int:
    cgs = 0
    for i in range(0, len(seq) - 1):
        dimer = seq[i:i+2]
        # note that seq is a string, not a list
        if dimer == "CG":
            cgs += 1
    return cgs


# Alphabet helpers   
alphabet = 'NACGT'
dna2int = {a: i for a, i in zip(alphabet, range(1, 6))}
int2dna = {i: a for a, i in zip(alphabet, range(1, 6))}
dna2int.update({"pad": 0})
int2dna.update({0: "pad"})

intseq_to_dnaseq = partial(map, int2dna.get)
dnaseq_to_intseq = partial(map, dna2int.get)

In [9]:
# TODO complete the task based on the change
def prepare_data(num_samples=100, min_len=16, max_len=128):
    # TODO prepared the training and test data
    # you need to call rand_sequence and count_cpgs here to create the dataset
    #step 1
    X_dna_seqs_train = list(rand_sequence_var_len(num_samples, min_len, max_len))
    #step 2
    temp = [list(intseq_to_dnaseq(seq)) for seq in X_dna_seqs_train] # use intseq_to_dnaseq here to convert ids back to DNA seqs
    #step3
    y_dna_seqs = [count_cpgs("".join(seq)) for seq in temp] # use count_cpgs here to generate labels with temp generated in step2
    return X_dna_seqs_train, y_dna_seqs
    
    
min_len, max_len = 64, 128
train_x, train_y = prepare_data(2048, min_len, max_len)
test_x, test_y = prepare_data(512, min_len, max_len)
pop_index = 1386
train_x.pop(1386)
train_y.pop(1386)

classes = np.unique(train_y)
num_classes = len(np.unique(train_y))

In [10]:
classes

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [11]:
# some config
LSTM_HIDDEN = 64
LSTM_LAYER = 1
batch_size = 4
learning_rate = 1e-3
epoch_num = 100

In [12]:
class DnaDataset(Dataset):
    def __init__(self, x, y) -> None:
        super().__init__()
        self.x = x
        self.y = y
    
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
def custom_collate(batch):
    data, targets = zip(*batch)
    targets = [torch.tensor(x) for x in targets]
    seqs = [torch.Tensor(x) for x in data]
    seqs[0] = nn.ConstantPad1d((0, max_len - seqs[0].shape[0]), 0)(seqs[0])
    # Pad sequences dynamically
    seqs = nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=0)
    return seqs, torch.stack(targets)

train_data = DnaDataset(train_x, train_y)
train_data_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)

test_data = DnaDataset(test_x, test_y)
test_data_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)

In [13]:
# Model
class CpGPredictor(torch.nn.Module):
    ''' Simple model that uses a LSTM to count the number of CpGs in a sequence '''
    def __init__(self):
        super(CpGPredictor, self).__init__()
        # TODO complete model, you are free to add whatever layers you need here
        # We do need a lstm and a classifier layer here but you are free to implement them in your way
        self.lstm = nn.LSTM(input_size=128, hidden_size=LSTM_HIDDEN)
        # self.fc = nn.Linear(LSTM_HIDDEN, 32)
        self.classifier = nn.Linear(LSTM_HIDDEN, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # TODO complete forward function
        x, _ = self.lstm(x)
        logits = self.classifier(x)
        logits = self.softmax(logits)
        return logits

In [14]:
# init model / loss function / optimizer etc.
model = CpGPredictor()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

In [15]:
epochs = 400
for epoch in range(epochs):
  running_loss = 0.0
  for i, data in enumerate(train_data_loader):
    inputs, labels = data
    # labels = torch.argmax(labels, dim=1)
    # forward propagation
    outputs = model(inputs)
    # outputs = torch.tensor([torch.argmax(o) for o in outputs])
    loss = loss_fn(outputs, labels)
    # set optimizer to zero grad
    # to remove previous epoch gradients
    optimizer.zero_grad()
    # backward propagation
    loss.backward()
    # optimize
    optimizer.step()
    running_loss += loss.item()
  # display statistics
  if not ((epoch + 1) % (epochs // 10)):
    print(f'Epochs:{epoch + 1:5d} | ' \
          f'Batches per epoch: {i + 1:3d} | ' \
          f'Loss: {running_loss / (i + 1):.10f}')

Epochs:   40 | Batches per epoch: 512 | Loss: 2.2464367663
Epochs:   80 | Batches per epoch: 512 | Loss: 2.1031293173
Epochs:  120 | Batches per epoch: 512 | Loss: 1.9948719332
Epochs:  160 | Batches per epoch: 512 | Loss: 1.9537221873
Epochs:  200 | Batches per epoch: 512 | Loss: 1.9728409878
Epochs:  240 | Batches per epoch: 512 | Loss: 1.9305884615
Epochs:  280 | Batches per epoch: 512 | Loss: 1.8602278507
Epochs:  320 | Batches per epoch: 512 | Loss: 1.7887364433
Epochs:  360 | Batches per epoch: 512 | Loss: 1.7753624846
Epochs:  400 | Batches per epoch: 512 | Loss: 1.7702160720


In [16]:
model.eval()

res_gs = []
res_pred = []

with torch.no_grad():
  loss = 0
  for i, (inputs, labels) in enumerate(test_data_loader):
    # calculate output by running through the network
    predictions = model(inputs)
    res_pred.extend(predictions)
    # labels = torch.argmax(labels, dim=1)
    res_gs.extend(labels)
    loss += loss_fn(predictions, labels)
  print(f'Loss: {loss / (i + 1)}')

Loss: 2.441842555999756


## Live test

In [25]:
# sample = list(rand_sequence(128))[0]
classes = np.unique(train_y)
sample = test_x[10]
gt = "".join([int2dna.get(i) for i in sample]).count("CG")
sample = torch.Tensor(sample)
sample = nn.ConstantPad1d((0, max_len - sample.shape[0]), 0)(sample)
pred_logit = model(sample.unsqueeze(0))
pred = torch.argmax(pred_logit)
# conf = float(F.softmax(pred_logit, dim=1)[0][int(pred)]*100)
print(gt, classes[pred])

4 4


In [18]:
torch.save(model.state_dict() ,"128dim_padding_model.pt")

In [19]:
model_128_dim = CpGPredictor()
model_128_dim.load_state_dict(torch.load("128dim_padding_model.pt"))

<All keys matched successfully>

In [26]:
import json

with open("128_padded_dim_config.json", "w") as f:
    f.write(json.dumps(
        {
            "alphabet": alphabet,
            "dna2int" : dna2int,
            "int2dna":int2dna,
            "classes":classes.tolist(),
            "num_classes": int(num_classes)
        }

    ))