In [1]:
# Importing modules
import pandas as pd
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader

In [2]:
# Custom dataset
class datasets:
    
    def __init__(self, data):
        self.items = data.to_numpy()
        self.length = data.shape[0]

    @staticmethod
    def OneHotEncoder(seqArray):
        seq_lis = list(seqArray)
        mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N':4}
        int_seq = [mapping[base] for base in seq_lis]
        encoded_seq = (np.eye(len(mapping))[int_seq]).flatten()
        return encoded_seq
        
    def __getitem__(self, index):
        seq, label = self.items[index]
        seq = self.OneHotEncoder(seq)
        seq_tensor, label_tensor = torch.tensor(seq).float(), torch.tensor(label)
        return seq_tensor, label_tensor

    def __len__(self):
        return self.length

In [3]:
# AutoEncoder Model
class AutoEncoder(nn.Module):
    
    def __init__(self):
        super(AutoEncoder, self).__init__()
        
        # Bottleneck layer
        self.hidden = 50
        
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(755, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, self.hidden),
            nn.ReLU()
        )
        
        # Attention
        self.attention = nn.Sequential(
            nn.Linear(self.hidden, self.hidden),
            nn.Softmax(dim=1)
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(self.hidden, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 755),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded_x = self.encoder(x)
        attention_wt = self.attention(encoded_x)
        decoded_x = self.decoder(encoded_x*attention_wt)
        return encoded_x, decoded_x

In [4]:
# Reading data
data = pd.read_csv("sequence_data.csv", sep=",")

# Dataframe
data

Unnamed: 0,sequence,class
0,AATGTACAGTATTGCGTTTTGGAAAGAGTCTGGATTTTTAGGGCTC...,1
1,AATGTACAGTATTGCGTTTTGCCTCCACCTCATTCCAGGCCTAAGA...,1
2,CACCGGTGGGAGATTGGAGTCCTAGCCCGACTCGCCGGGCAGAGCG...,1
3,AATGTACAGTATTGCGTTTTGCTGTGCCAGGGACCTTACCTTATAC...,1
4,TTAATCGCGTCCATTGAAGTCCTCTACCGTGCAGCTCATCACGCAG...,1
...,...,...
99995,AATGTACAGTATTGCGTTTTGGACTGTTTGGGAGTTGATGACCTTT...,1
99996,AATGTACAGTATTGCGTTTTGCCAGCTGCTCAGGAGTCATGCTTAG...,1
99997,ATTGTGAATTAAATTGGAGTCCTGCCATCGGAACTGCTGTCTGCAT...,1
99998,AATGTACAGTATTGCGTTTTGAGAGTAAAGTAGATGATGGAAATAT...,1


In [5]:
# Training dataset
train_datasets = datasets(data)

# Training dataloader
train_dataloader = DataLoader(train_datasets, batch_size=128)

# Device
device = torch.device("cpu")

# Model
model = AutoEncoder()
model = model.to(device)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr = 0.001)

# Loss function
criterion = nn.MSELoss()

In [6]:
# Training loop
for epoch in range(10):
    final_loss = 0
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        X, _ = batch
        X = X.to(device)
        X_encoded, X_decoded = model(X)
        loss = criterion(X_decoded, X)
        loss.backward()
        optimizer.step()
        final_loss += loss.item()
    print(f"Epoch: {epoch+1} | Loss: {final_loss/len(train_dataloader)}")

Epoch: 1 | Loss: 0.12391351870334971
Epoch: 2 | Loss: 0.0871530064688924
Epoch: 3 | Loss: 0.07850154622188767
Epoch: 4 | Loss: 0.07664421450375292
Epoch: 5 | Loss: 0.07507425736721672
Epoch: 6 | Loss: 0.07455064782210628
Epoch: 7 | Loss: 0.07312488348682976
Epoch: 8 | Loss: 0.07189612225879488
Epoch: 9 | Loss: 0.07111929995400826
Epoch: 10 | Loss: 0.07008293082418345
