In [56]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset 
from torch.nn.utils.rnn import pad_sequence
import numpy as np

In [38]:
train_data ='train_features.csv'
test_data ='test_features.csv'
labels = 'train_labels.csv'

In [64]:
def load_and_preprocess_data(filename):
    df = pd.read_csv(filename)
    df['data'] = df['data'].str.replace('[^ACGT]', 'N')
    char_to_int = {'A': 1, 'C': 2, 'G': 3, 'T': 4, 'N': 0}
    
    data_array = []
    
    for data_str in df['data']:
        sequence = []
        for char in data_str:
            try:
                sequence.append(char_to_int[char])
            except KeyError: # There are some unexpected characters in the data
                continue
                
        data_array.append(torch.tensor(sequence))
    padded_sequences = pad_sequence(data_array, batch_first=True)
            
    return padded_sequences

In [69]:
train_tensor = load_and_preprocess_data(train_data)
test_tensor = load_and_preprocess_data(test_data)

RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 2 is not equal to len(dims) = 3

In [86]:
train_tensor.shape

torch.Size([16969, 1058])

In [87]:
test_tensor = F.pad(test_tensor, (0, 196), 'constant', 0)

In [88]:
test_tensor.shape

torch.Size([4243, 1058])

In [91]:
def one_hot_encode(sequence, num_classes=5):
    # One-hot encode
    return F.one_hot(sequence, num_classes=num_classes).float()

In [97]:
train_one_hot_encoded = one_hot_encode(train_tensor)
test_one_hot_encoded = one_hot_encode(test_tensor)

In [190]:
torch.flatten(train_one_hot_encoded, start_dim=1).shape

torch.Size([16969, 5290])

In [98]:
print(train_one_hot_encoded.shape)
print(test_one_hot_encoded.shape)

torch.Size([16969, 1058, 5])
torch.Size([4243, 1058, 5])


## Dataset

In [109]:
class SequenceDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]
        return torch.tensor(sequence, dtype=torch.float), torch.tensor(label, dtype=torch.long)

In [102]:
labels_df = pd.read_csv('train_labels.csv')

In [152]:
num_classes = len(labels_df['labels'])

In [154]:
num_classes

16969

## Model

In [215]:
class CNN1D(nn.Module):
    def __init__(self, num_classes):
        super(CNN1D, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=5, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=5, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(5290, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        # Assuming the sequences are fixed length, we flatten the output for the fully connected layer
        x = torch.flatten(x, start_dim=1)
        # print(x.shape)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return self.softmax(x)

In [216]:
model = CNN1D(num_classes=num_classes)
dataset = SequenceDataset(train_one_hot_encoded, labels_df['labels'].values)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
# Loss and optimizer
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)

## Training Loop

In [217]:
num_epochs = 100
for epoch in range(num_epochs):
    for sequences, labels in dataloader:
        # Forward pass 
        outputs = model(sequences.permute(0, 2, 1))
        # print(labels.shape)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

  return torch.tensor(sequence, dtype=torch.float), torch.tensor(label, dtype=torch.long)


RuntimeError: Given groups=1, weight of size [16, 5, 5], expected input[32, 1058, 5] to have 5 channels, but got 1058 channels instead