In [1]:
import torch
from torch import nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import pandas as pd
print(torch.__version__)
print("CUDA available?" if torch.cuda.is_available() else "Using CPU")

2.7.1
Using CPU


In [3]:
# Transform: normalize MNIST to [0,1]
#this next line converts the MNIST images to tensors
#and normalizes them to the range [0, 1]
transform = transforms.ToTensor()

# Download MNIST
#train_data is the training set, test_data is the test set
#both are downloaded from the 'data' directory

train_data = datasets.MNIST(root='data', train=True, download=True, transform=transform)
test_data  = datasets.MNIST(root='data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader  = DataLoader(test_data, batch_size=64, shuffle=False)

# Define a simple 2-layer neural net
# this is a feedforward neural network 
model = nn.Sequential(
    nn.Flatten(),# flatten the image to a 1D tensor 28x28 → 784
    nn.Linear(784, 128), # first layer: 784 inputs, 128 outputs
    nn.ReLU(), #activation function that introduces non-linearity
    nn.Linear(128, 10) # second layer: 128 inputs, 10 outputs (one for each digit 0-9)
)

# Loss and optimizer
loss_fn = nn.CrossEntropyLoss() #specifies the loss function 
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) #defines the optimizer to update the model's parameters

# Training loop
#makes 5 full passes over the training data
#for each epoch, it iterates over the training data, where X are images and y are labels
# loss computes the loss between the predicted outputs and the true labels
# optimizer.zero_grad() resets the gradients to zero before backpropagation
# loss.backward() computes the gradients of the loss with respect to the model parameters
# optimizer.step() updates the model parameters based on the computed gradients
for epoch in range(5):
    for X, y in train_loader:
        pred = model(X)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch+1}: loss = {loss.item():.4f}")

Epoch 1: loss = 0.0848
Epoch 2: loss = 0.0326
Epoch 3: loss = 0.0269
Epoch 4: loss = 0.1157
Epoch 5: loss = 0.0028


In [4]:
correct = 0
total = 0

with torch.no_grad():
    for X, y in test_loader:
        pred = model(X)
        predicted = pred.argmax(1)
        correct += (predicted == y).sum().item()
        total += y.size(0)

print(f"Test Accuracy: {correct / total:.2%}")

Test Accuracy: 97.50%


In [14]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader



data_df = pd.read_csv('data/embeddings/mutation_embeddings_esm2_t30_150M_UR50D.csv', index_col=0)
X = data_df.drop(columns=['ID', 'Effect'])
y = data_df['Effect']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)  # or torch.long for classification
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)    # or torch.long for classification


train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=64, shuffle=False)

(856, 640) (215, 640)


  data_df = pd.read_csv('data/embeddings/mutation_embeddings_esm2_t30_150M_UR50D.csv', index_col=0)


In [16]:
model = nn.Sequential(
    nn.Linear(640, 128), # first layer: 640 inputs, 128 outputs
    nn.ReLU(), #activation function that introduces non-linearity
    nn.Linear(128, 1) # second layer: 128 inputs, 1 output
)

# Loss and optimizer
loss_fn = nn.MSELoss() #specifies the loss function 
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) #defines the optimizer to update the model's parameters

# Training loop
#makes 5 full passes over the training data
#for each epoch, it iterates over the training data, where X are images and y are labels
# loss computes the loss between the predicted outputs and the true labels
# optimizer.zero_grad() resets the gradients to zero before backpropagation
# loss.backward() computes the gradients of the loss with respect to the model parameters
# optimizer.step() updates the model parameters based on the computed gradients
for epoch in range(5):
    for X, y in train_loader:
        pred = model(X)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch+1}: loss = {loss.item():.4f}")

Epoch 1: loss = 0.3221
Epoch 2: loss = 0.1778
Epoch 3: loss = 0.3842
Epoch 4: loss = 0.1770
Epoch 5: loss = 0.1356


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


In [20]:
import numpy as np
mse = 0
total = 0
y_true = []
y_pred = []

with torch.no_grad():
    for X, y in test_loader:
        pred = model(X).squeeze()
        mse += ((pred - y) ** 2).sum().item()
        total += y.size(0)
        y_true.extend(y.cpu().numpy())
        y_pred.extend(pred.cpu().numpy())

print(f"Test MSE: {mse / total:.4f}")

# Calculate R²
y_true = np.array(y_true)
y_pred = np.array(y_pred)
ss_res = ((y_true - y_pred) ** 2).sum()
ss_tot = ((y_true - y_true.mean()) ** 2).sum()
r2 = 1 - ss_res / ss_tot
print(f"Test R²: {r2:.4f}")

Test MSE: 0.2288
Test R²: -0.0864
