# CpGs Detector

Here we have a simple problem, given a DNA sequence (of N, A, C, G, T), count the number of CpGs in the sequence (consecutive CGs).

We have defined a few helper functions / parameters for performing this task.

We need you to build a LSTM model and train it to complish this task in PyTorch.

A good solution will be a model that can be trained, with high confidence in correctness.

In [7]:
import random
import numpy as np
import torch
from typing import Sequence
from functools import partial

In [8]:
def set_seed(seed=13):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(13)

# Use this for getting x label
def rand_sequence(n_seqs: int, seq_len: int=128) -> Sequence[int]:
    for i in range(n_seqs):
        yield [random.randint(0, 4) for _ in range(seq_len)]

# Use this for getting y label
def count_cpgs(seq: str) -> int:
    cgs = 0
    for i in range(0, len(seq) - 1):
        dimer = seq[i:i+2]
        # note that seq is a string, not a list
        if dimer == "CG":
            cgs += 1
    return cgs

# Alphabet helpers
alphabet = 'NACGT'
dna2int = { a: i for a, i in zip(alphabet, range(5))}
int2dna = { i: a for a, i in zip(alphabet, range(5))}

intseq_to_dnaseq = partial(map, int2dna.get)
dnaseq_to_intseq = partial(map, dna2int.get)

In [9]:
# we prepared two datasets for training and evaluation
# training data scale we set to 2048
# we test on 512

def prepare_data(num_samples=100):
    # prepared the training and test data
    # you need to call rand_sequence and count_cpgs here to create the dataset
    # step 1
    X_dna_seqs_train = list(rand_sequence(num_samples))
    temp =  [list(intseq_to_dnaseq(i)) for i in X_dna_seqs_train] #
    y_dna_seqs =  [count_cpgs("".join(i)) for i in temp]
    return X_dna_seqs_train, y_dna_seqs

train_x, train_y = prepare_data(2048)
test_x, test_y = prepare_data(512)

In [11]:
batch_size = 32

In [12]:
# create data loader
from torch.utils.data import DataLoader, TensorDataset

def convert_to_tensor(data, dtype=torch.long):
    return torch.tensor(data, dtype=dtype)

train_x_tensor = convert_to_tensor(train_x)
train_y_tensor = convert_to_tensor(train_y)
test_x_tensor = convert_to_tensor(test_x)
test_y_tensor = convert_to_tensor(test_y)

# Prepare DataLoader
train_dataset = TensorDataset(train_x_tensor, train_y_tensor)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)



In [15]:
class CpGPredictor(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob=0.5):
        super(CpGPredictor, self).__init__()
        self.embedding = torch.nn.Embedding(5, input_dim)
        self.lstm = torch.nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.classifier = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, hidden = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]
        logits = self.classifier(lstm_out)
        return logits
    


In [24]:
input_dim = 128
hidden_dim = 128
layer_dim = 1
output_dim = 1
learning_rate = 0.001

## Check if gpu is available
cuda_or_cpu = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(cuda_or_cpu)
print(f"Using device: {device}")

model = CpGPredictor(input_dim, hidden_dim, layer_dim, output_dim)
model = model.to(device) # Transfer the model to GPU/CPU

loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

Using device: cpu


In [25]:
def train_model(model, train_data_loader, loss_fn, epochs=100):
    for epoch in range(epochs):
        model.train()
        for inputs, labels in train_data_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs.squeeze(), labels.float())
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")     

train_model(model, train_data_loader, loss_fn, epochs=10)

Epoch 1/10, Loss: 6.772491931915283
Epoch 2/10, Loss: 4.583024978637695
Epoch 3/10, Loss: 3.4164929389953613
Epoch 4/10, Loss: 4.15935754776001
Epoch 5/10, Loss: 3.899125099182129
Epoch 6/10, Loss: 1.675019383430481
Epoch 7/10, Loss: 0.2483929544687271
Epoch 8/10, Loss: 0.8165625333786011
Epoch 9/10, Loss: 0.3686717450618744
Epoch 10/10, Loss: 0.2534410357475281


In [19]:
def evaluate_model(model, test_data_loader, loss_fn):
    total_loss = 0
    for inputs, labels in test_data_loader:
        outputs = model(inputs)
        loss = loss_fn(outputs.squeeze(), labels.float())
        total_loss += loss.item()

    avg_loss = total_loss/len(test_data_loader)
    print(f"Average loss on test data: {avg_loss}")

test_dataset = TensorDataset(test_x_tensor, test_y_tensor)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
evaluate_model(model, test_data_loader, loss_fn)

Average loss on test data: 4.090817257761955


In [21]:
# Testing wth examples (Should be of te same size)
# To use examples of different size, we need to use padding

examples = ["NCACANNTNCGGAGGCGNANCACANNTNCGGAGGCGNA", "NCACANNTNCGGAGGCGCGNCACANNTNCGGAGGCGCG"]

def get_actual_count(examples):
  return [count_cpgs("".join(i)) for i in examples]

def encode_to_integer(examples):
  int_sequence = [list(dnaseq_to_intseq(i)) for i in examples]
  return int_sequence

def transform_examples(data):
  int_sequence = encode_to_integer(data)
  test_sequnce = convert_to_tensor(int_sequence)
  return test_sequnce

def predict(model, unseen_data):
  with torch.no_grad():
    model.eval()
    inputs = transform_examples(unseen_data)
    inputs = inputs.to(device)
    outputs = model(inputs)
    predictions = outputs.squeeze().cpu().numpy()
  return predictions

predictions = predict(model, examples)
actual_vals = get_actual_count(examples)

for i in range(len(examples)):
  print(f"Sequence: {examples[i]}, Predicted CpGs: {round(predictions[i])}, Actual CpGs: {actual_vals[i]}")

Sequence: NCACANNTNCGGAGGCGNANCACANNTNCGGAGGCGNA, Predicted CpGs: 5, Actual CpGs: 4
Sequence: NCACANNTNCGGAGGCGCGNCACANNTNCGGAGGCGCG, Predicted CpGs: 6, Actual CpGs: 6


Using device: cpu
