In [5]:
import random
import pandas as pd

# Function to generate random DNA sequence of a given length
def generate_dna_sequence(length=50):
    bases = ['A', 'T', 'C', 'G']
    return ''.join(random.choices(bases, k=length))

# Function to generate synthetic dataset of DNA sequences
def generate_dna_dataset(num_sequences=100000, sequence_length=50):
    dataset = []
    for _ in range(num_sequences):
        clean_seq = generate_dna_sequence(sequence_length)
        noisy_seq = generate_dna_sequence(sequence_length)
        dataset.append([clean_seq, noisy_seq])
    
    # Create a DataFrame
    df = pd.DataFrame(dataset, columns=['Clean', 'Noisy'])
    return df

# Generate dataset of 100,000 DNA sequences
num_sequences = 10000
sequence_length = 50  # Length of each DNA sequence
dna_dataset = generate_dna_dataset(num_sequences, sequence_length)

# Save the dataset to CSV
dna_dataset.to_csv('synthetic_dna_dataset.csv', index=False)
print(f"Dataset of {num_sequences} sequences generated and saved as 'synthetic_dna_dataset.csv'")


Dataset of 10000 sequences generated and saved as 'synthetic_dna_dataset.csv'


In [2]:
pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-win_amd64.whl (11.6 MB)
     ---------------------------------------- 11.6/11.6 MB 3.4 MB/s eta 0:00:00
Collecting numpy>=1.23.2
  Downloading numpy-2.2.2-cp311-cp311-win_amd64.whl (12.9 MB)
     ---------------------------------------- 12.9/12.9 MB 3.7 MB/s eta 0:00:00
Collecting pytz>=2020.1
  Downloading pytz-2024.2-py2.py3-none-any.whl (508 kB)
     -------------------------------------- 508.0/508.0 kB 3.2 MB/s eta 0:00:00
Collecting tzdata>=2022.7
  Downloading tzdata-2024.2-py2.py3-none-any.whl (346 kB)
     -------------------------------------- 346.6/346.6 kB 2.1 MB/s eta 0:00:00
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-2.2.2 pandas-2.2.3 pytz-2024.2 tzdata-2024.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

# Positional Encoding (for the Transformer)
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x

# Transformer VAE Model
class TransformerVAE(nn.Module):
    def __init__(self, input_dim, latent_dim, hidden_dim, num_heads, num_layers):
        super(TransformerVAE, self).__init__()

        # Embedding and Positional Encoding
        self.embedding = nn.Embedding(4, hidden_dim)  # 4 DNA bases (A, T, C, G)
        self.positional_encoding = PositionalEncoding(hidden_dim)

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Latent space mapping (ensure dimensions are correct for the flattened encoder output)
        self.to_latent_mu = nn.Linear(hidden_dim * input_dim, latent_dim)
        self.to_latent_logvar = nn.Linear(hidden_dim * input_dim, latent_dim)

        # Latent to hidden dimension for decoder (after reshaping)
        self.latent_to_hidden = nn.Linear(latent_dim, hidden_dim * input_dim)

        # Transformer Decoder
        decoder_layer = nn.TransformerDecoderLayer(d_model=hidden_dim, nhead=num_heads, batch_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

        # Output layer
        self.output_layer = nn.Linear(hidden_dim, 4)  # 4 DNA classes

    def forward(self, x):
        # Input embedding
        embedded = self.embedding(x.long())  # Shape: [batch_size, seq_length, hidden_dim]
        embedded = self.positional_encoding(embedded)

        # Encoding
        encoded = self.encoder(embedded)  # Shape: [batch_size, seq_length, hidden_dim]

        # Flatten for latent space (ensure the correct dimension matching)
        encoded_flat = encoded.view(x.size(0), -1)  # Flatten to [batch_size, hidden_dim * seq_length]

        # Latent space (mean and log-variance)
        mu = self.to_latent_mu(encoded_flat)
        log_var = self.to_latent_logvar(encoded_flat)
        std = torch.exp(0.5 * log_var)
        z = mu + std * torch.randn_like(std)  # Reparameterization trick

        # Latent to hidden dimension (reshaped for decoder)
        hidden = self.latent_to_hidden(z).view(x.size(0), x.size(1), -1)

        # Decoding
        decoded = self.decoder(hidden, encoded)
        output = self.output_layer(decoded)  # Predict probabilities for 4 classes

        # Apply softmax to get the probabilities
        return nn.Softmax(dim=-1)(output), mu, log_var


# Function to convert DNA sequence to numeric values
def dna_to_numeric(dna_sequence):
    mapping = {'A': 0, 'T': 1, 'G': 2, 'C': 3}
    return np.array([mapping[base] for base in dna_sequence])


# Function to calculate accuracy
def calculate_accuracy(predicted, target):
    correct_predictions = (predicted == target).sum().item()  # Count the correct bases
    accuracy = correct_predictions / target.size(0)  # Divide by the total number of bases
    return accuracy


# Function to load synthetic dataset
def load_synthetic_dataset(filepath):
    df = pd.read_csv(filepath)
    clean_sequences = df['Clean'].values
    noisy_sequences = df['Noisy'].values
    clean_sequences = [dna_to_numeric(seq) for seq in clean_sequences]
    noisy_sequences = [dna_to_numeric(seq) for seq in noisy_sequences]

    return torch.tensor(np.array(clean_sequences)), torch.tensor(np.array(noisy_sequences))


# Model Training Code
def train_model(model, train_data, targets, optimizer, criterion):
    model.train()
    optimizer.zero_grad()

    # Forward pass
    output, mu, log_var = model(train_data)

    # Calculate loss (using reconstruction loss + KL divergence)
    recon_loss = criterion(output.view(-1, 4), targets.view(-1))
    kl_divergence = -0.5 * torch.mean(1 + log_var - mu.pow(2) - log_var.exp())

    loss = recon_loss + kl_divergence

    # Backward pass
    loss.backward()
    optimizer.step()

    return loss.item()


# Model Evaluation Code
def evaluate_model(model, test_data, targets):
    model.eval()

    with torch.no_grad():
        output, mu, log_var = model(test_data)

        # Calculate accuracy
        predicted_seq = torch.argmax(output, dim=-1)
        accuracy = calculate_accuracy(predicted_seq, targets)
        return accuracy


# Example usage (using synthetic dataset)
train_data, train_labels = load_synthetic_dataset('synthetic_dna_dataset.csv')  # Load the dataset

# Hyperparameters
latent_dim = 32
hidden_dim = 64
num_heads = 8
num_layers = 3
learning_rate = 0.001
batch_size = 64
epochs = 10

# Initialize the model, optimizer, and loss function
model = TransformerVAE(input_dim=train_data.shape[1], latent_dim=latent_dim, hidden_dim=hidden_dim, num_heads=num_heads, num_layers=num_layers)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Training Loop
for epoch in range(epochs):
    epoch_loss = 0
    for i in range(0, len(train_data), batch_size):
        batch_data = train_data[i:i+batch_size]
        batch_labels = train_labels[i:i+batch_size]

        loss = train_model(model, batch_data, batch_labels, optimizer, criterion)
        epoch_loss += loss

    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(train_data)}")

    # Evaluate Model (example after each epoch)
    accuracy = evaluate_model(model, train_data, train_labels)
    print(f"Accuracy: {accuracy * 100:.2f}%")


Epoch 1, Loss: 0.02308432354927063
Accuracy: 1258.49%
Epoch 2, Loss: 0.021901263928413392
Accuracy: 1258.49%
Epoch 3, Loss: 0.021827258455753326
Accuracy: 1258.49%
Epoch 4, Loss: 0.021799881076812744
Accuracy: 1258.49%
Epoch 5, Loss: 0.02179001553058624
Accuracy: 1258.49%
Epoch 6, Loss: 0.021784262883663176
Accuracy: 1258.49%
Epoch 7, Loss: 0.021780766475200653
Accuracy: 1258.49%
Epoch 8, Loss: 0.021777846717834472
Accuracy: 1258.49%
Epoch 9, Loss: 0.021775509226322175
Accuracy: 1258.49%
Epoch 10, Loss: 0.021774122273921965
Accuracy: 1258.49%


In [5]:
pip install torch

Collecting torch
  Using cached torch-2.5.1-cp311-cp311-win_amd64.whl (203.1 MB)
Collecting filelock
  Downloading filelock-3.16.1-py3-none-any.whl (16 kB)
Collecting networkx
  Downloading networkx-3.4.2-py3-none-any.whl (1.7 MB)
     ---------------------------------------- 1.7/1.7 MB 3.3 MB/s eta 0:00:00
Collecting jinja2
  Downloading jinja2-3.1.5-py3-none-any.whl (134 kB)
     -------------------------------------- 134.6/134.6 kB 4.0 MB/s eta 0:00:00
Collecting fsspec
  Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
     -------------------------------------- 183.9/183.9 kB 3.7 MB/s eta 0:00:00
Collecting sympy==1.13.1
  Downloading sympy-1.13.1-py3-none-any.whl (6.2 MB)
     ---------------------------------------- 6.2/6.2 MB 4.0 MB/s eta 0:00:00
Collecting mpmath<1.4,>=1.1.0
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
     -------------------------------------- 536.2/536.2 kB 4.8 MB/s eta 0:00:00
Collecting MarkupSafe>=2.0
  Downloading MarkupSafe-3.0.2-cp311-cp


[notice] A new release of pip available: 22.2.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
