In [1]:
# Importing modules
import pandas as pd
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader

In [2]:
# Custom dataset
class datasets:
    
    def __init__(self, data):
        self.items = data.to_numpy()
        self.length = seq_data.shape[0]

    @staticmethod
    def OneHotEncoder(seqArray):
        seq_lis = list(seqArray)
        mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N':4}
        int_seq = [mapping[base] for base in seq_lis]
        encoded_seq = (np.eye(len(mapping))[int_seq]).flatten()
        return encoded_seq
        
    def __getitem__(self, index):
        seq, label = self.items[index]
        seq = self.OneHotEncoder(seq)
        seq_tensor, label_tensor = torch.tensor(seq).float(), torch.tensor(label)
        return seq_tensor.unsqueeze(0), label_tensor

    def __len__(self):
        return self.length

In [3]:
# 1D CNN AutoEncoder
class CNN1dAutoEncoder(nn.Module):
    def __init__(self):
        super(CNN1dAutoEncoder, self).__init__()

        # Bottleneck
        self.hidden = 100

        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv1d(1, 4, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=5),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(4*151, self.hidden),
            nn.ReLU()
        )

        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(self.hidden, 151*4),
            nn.ReLU(),
            nn.Unflatten(1, (4, 151)),
            nn.ConvTranspose1d(4, 1, kernel_size=5, stride=5),
            nn.Sigmoid()
            
        )

    # Forward pass
    def forward(self, x):
        encoded_x = self.encoder(x)
        decoded_x = self.decoder(encoded_x)
        return encoded_x, decoded_x

In [4]:
# Data
seq_data = pd.read_csv("sequence_data.csv")

seq_data

Unnamed: 0,sequence,class
0,AATGTACAGTATTGCGTTTTGGAAAGAGTCTGGATTTTTAGGGCTC...,1
1,AATGTACAGTATTGCGTTTTGCCTCCACCTCATTCCAGGCCTAAGA...,1
2,CACCGGTGGGAGATTGGAGTCCTAGCCCGACTCGCCGGGCAGAGCG...,1
3,AATGTACAGTATTGCGTTTTGCTGTGCCAGGGACCTTACCTTATAC...,1
4,TTAATCGCGTCCATTGAAGTCCTCTACCGTGCAGCTCATCACGCAG...,1
...,...,...
99995,AATGTACAGTATTGCGTTTTGGACTGTTTGGGAGTTGATGACCTTT...,1
99996,AATGTACAGTATTGCGTTTTGCCAGCTGCTCAGGAGTCATGCTTAG...,1
99997,ATTGTGAATTAAATTGGAGTCCTGCCATCGGAACTGCTGTCTGCAT...,1
99998,AATGTACAGTATTGCGTTTTGAGAGTAAAGTAGATGATGGAAATAT...,1


In [5]:
# Dataset
train_datasets = datasets(seq_data)

# Dataloader
train_dataloader = DataLoader(train_datasets, batch_size=64)

# Device
device = torch.device("cpu")

# Model 
model = CNN1dAutoEncoder()
model = model.to(device)

# Optimizer 
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Loss function
criterion = nn.MSELoss()

In [6]:
# Training loop
for epoch in range(10):
    final_loss = 0
    #model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        X, _ = batch
        X = X.to(device)
        X_encoded, X_decoded = model(X)
        loss = criterion(X_decoded, X)
        loss.backward()
        optimizer.step()
        final_loss += loss.item()
    print(f"Epoch: {epoch+1} | Loss: {final_loss/len(train_dataloader)}")

Epoch: 1 | Loss: 0.15962200187103762
Epoch: 2 | Loss: 0.12071478009815027
Epoch: 3 | Loss: 0.11179393361145612
Epoch: 4 | Loss: 0.10786414622631274
Epoch: 5 | Loss: 0.10629736199279054
Epoch: 6 | Loss: 0.10544891859466116
Epoch: 7 | Loss: 0.1048283058904488
Epoch: 8 | Loss: 0.10426931917400445
Epoch: 9 | Loss: 0.10388430528299822
Epoch: 10 | Loss: 0.10344692425932246
