# Generating protein_embeddings.parquet
The following cells were used to first preprocess the train.csv file to create the mutated sequences. Then, we run the ESM model to get the embeddings and store these in a parquet file for later use.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('train.csv')

# get the sequence
seq = open('sequence.fasta', 'r').read()
seq = seq.split("\n")[1]

# create each mutated sequence using the info
sequences = []
for i in df['mutant']:
    ind = int(i[1:-1])
    tmp = seq[:ind] + i[-1] + seq[ind+1:]
    sequences.append(tmp)
df['Sequence'] = sequences

Unnamed: 0,mutant,DMS_score
0,M0Y,0.273
1,M0W,0.2857
2,M0V,0.2153
3,M0T,0.3122
4,M0S,0.218


In [38]:
import torch
import esm

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load pre-trained ESM model and move it to GPU
esm_model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()  # Example: ESM-2 model
batch_converter = alphabet.get_batch_converter()
esm_model = esm_model.to(device)
esm_model.eval()  # Set to eval mode

# Extract ESM embeddings using GPU
def extract_esm_embedding(sequence):
    batch_labels, batch_strs, batch_tokens = batch_converter([(None, sequence)])
    batch_tokens = batch_tokens.to(device)  # Move input to GPU

    with torch.no_grad():
        results = esm_model(batch_tokens, repr_layers=[33], return_contacts=False)
    
    token_representations = results["representations"][33]  # Use final layer
    sequence_embedding = token_representations.mean(dim=1).squeeze().cpu().numpy()  # Move back to CPU for NumPy
    return sequence_embedding

Using device: cuda
Epoch 10/100, Loss: 0.0505
Epoch 20/100, Loss: 0.0431
Epoch 30/100, Loss: 0.0419
Epoch 40/100, Loss: 0.0414
Epoch 50/100, Loss: 0.0412
Epoch 60/100, Loss: 0.0412
Epoch 70/100, Loss: 0.0412
Epoch 80/100, Loss: 0.0412
Epoch 90/100, Loss: 0.0412
Epoch 100/100, Loss: 0.0412
Test MSE: 0.0497


In [None]:
# Apply embedding extraction
df["Embedding"] = df["Sequence"].apply(lambda seq: extract_esm_embedding(seq))

df.to_parquet("protein_embeddings.parquet", engine="pyarrow")

# Training Models
We initially train a MLP, LightGBM, and XGBoost.

In [40]:
df = pd.read_parquet('protein_embeddings.parquet')

In [55]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X = np.vstack(df["Embedding"].values)
y = df["DMS_score"].values

# Convert to PyTorch tensors and move to GPU
X_tensor = torch.tensor(X, dtype=torch.float32, device=device)
y_tensor = torch.tensor(y, dtype=torch.float32, device=device).view(-1, 1)  # Reshape for MLP

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)


In [46]:
from scipy.stats import spearmanr

# Define a simple MLP regression model using GPU
class MLPRegressor(nn.Module):
    def __init__(self, input_dim):
        super(MLPRegressor, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1)  # Single output value
        )

    def forward(self, x):
        return self.model(x)

# Initialize model, loss function, and optimizer
model = MLPRegressor(input_dim=X.shape[1]).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 100
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    predictions = model(X_train)
    loss = criterion(predictions, y_train)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

# Evaluate on test set
model.eval()
with torch.no_grad():
    test_predictions = model(X_test)
    test_loss = criterion(test_predictions, y_test)
print(f"Test MSE: {test_loss.item():.4f}")

spearman_corr_xgb, _ = spearmanr(y_test.cpu().numpy().flatten(), test_predictions.cpu().numpy().flatten())
print(f"MLP Spearman Correlation: {spearman_corr_xgb:.4f}")

Epoch 10/100, Loss: 0.0467
Epoch 20/100, Loss: 0.0423
Epoch 30/100, Loss: 0.0413
Epoch 40/100, Loss: 0.0412
Epoch 50/100, Loss: 0.0412
Epoch 60/100, Loss: 0.0412
Epoch 70/100, Loss: 0.0412
Epoch 80/100, Loss: 0.0412
Epoch 90/100, Loss: 0.0412
Epoch 100/100, Loss: 0.0412
Test MSE: 0.0497
MLP Spearman Correlation: 0.1130


In [47]:
import xgboost as xgb
from scipy.stats import spearmanr

# Convert to DMatrix (XGBoost's optimized data structure)
dtrain = xgb.DMatrix(X_train.cpu().numpy(), label=y_train.cpu().numpy().flatten())
dtest = xgb.DMatrix(X_test.cpu().numpy(), label=y_test.cpu().numpy().flatten())

# Train XGBoost Model
params = {"objective": "reg:squarederror", "eval_metric": "rmse"}
xgb_model = xgb.train(params, dtrain, num_boost_round=100)

# Predict
y_pred_xgb = xgb_model.predict(dtest)

# Compute Spearman correlation
spearman_corr_xgb, _ = spearmanr(y_test.cpu().numpy().flatten(), y_pred_xgb)
print(f"XGBoost Spearman Correlation: {spearman_corr_xgb:.4f}")


XGBoost Spearman Correlation: 0.4483


In [48]:
import lightgbm as lgb

# Train LightGBM Model
lgb_train = lgb.Dataset(X_train.cpu().numpy(), label=y_train.cpu().numpy().flatten())
lgb_test = lgb.Dataset(X_test.cpu().numpy(), label=y_test.cpu().numpy().flatten(), reference=lgb_train)

params = {"objective": "regression", "metric": "rmse"}
lgb_model = lgb.train(params, lgb_train, num_boost_round=100)

# Predict
y_pred_lgb = lgb_model.predict(X_test.cpu().numpy())

# Compute Spearman correlation
spearman_corr_lgb, _ = spearmanr(y_test.cpu().numpy().flatten(), y_pred_lgb)
print(f"LightGBM Spearman Correlation: {spearman_corr_lgb:.4f}")


LightGBM Spearman Correlation: 0.4641


# Ensemble of LightGBM & Linear Regression 
We use linear regression to weigh the different models in the ensemble instead of taking the mean. We use boostrapping to train each model on a different subset of the data.

In [113]:
import lightgbm as lgb
import numpy as np
from scipy.stats import spearmanr
from tqdm import tqdm
from sklearn.utils import resample

# Define hyperparameters for LightGBM
lgb_params = {"objective": "regression", "metric": "rmse",'learning_rate': 0.04931791757724383, 'num_leaves': 85, 'min_child_samples': 48, 'max_depth': 7, 'subsample': 0.5749253739933733, 'colsample_bytree': 0.8371746709650121,'verbosity':-1}

# Number of bootstrapped models
num_bootstraps = 100
lgb_boot_models = []
lgb_predictions = []

progress = tqdm(total=num_bootstraps, desc="Training Bootstrapped LightGBM Models")

for i in range(num_bootstraps):
    # Bootstrap resampling
    X_resampled, y_resampled = resample(X_train.cpu().numpy(), y_train.cpu().numpy().flatten(), random_state=i)
    
    # Train LightGBM model on resampled data
    lgb_train = lgb.Dataset(X_resampled, label=y_resampled)
    lgb_model = lgb.train(lgb_params, lgb_train, num_boost_round=100)
    
    # Store trained model
    lgb_boot_models.append(lgb_model)

    # Predict on test set
    y_pred_lgb = lgb_model.predict(X_test.cpu().numpy())
    lgb_predictions.append(y_pred_lgb)

    progress.update(1)

progress.close()

# Average predictions from bootstrapped models
y_pred_ensemble = np.mean(np.column_stack(lgb_predictions), axis=1)

# Compute Spearman correlation
spearman_corr_ensemble, _ = spearmanr(y_test.cpu().numpy().flatten(), y_pred_ensemble)
print(f"Bootstrapped LightGBM Ensemble Spearman Correlation: {spearman_corr_ensemble:.4f}")


Training Bootstrapped LightGBM Models: 100%|██████████| 100/100 [00:38<00:00,  2.59it/s]

Bootstrapped LightGBM Ensemble Spearman Correlation: 0.4910





In [114]:
from sklearn.linear_model import Ridge

X_meta_train = np.column_stack(lgb_predictions)
y_train_np = y_test.cpu().numpy().flatten()

# Train a Ridge Regression as the meta-model
meta_model = Ridge(alpha=1.0)
meta_model.fit(X_meta_train, y_train_np)

# Predict with stacked model
y_pred_stacked = meta_model.predict(X_meta_train)

# Compute Spearman correlation
spearman_corr_stacked, _ = spearmanr(y_train_np, y_pred_stacked)
print(f"Stacked LightGBM Model Spearman Correlation: {spearman_corr_stacked:.4f}")


Stacked LightGBM Model Spearman Correlation: 0.6692


# Predictions

In [115]:
df_test = pd.read_parquet('protein_embeddings_test.parquet')
X_unlabeled = df_test['Embedding'].values

In [116]:
X_unlabeled = np.vstack(X_unlabeled)
X_unlabeled = torch.tensor(X_unlabeled, dtype=torch.float32, device=device)

In [117]:
import numpy as np

# Compute UCB for LightGBM ensemble
beta = 1.5  # Adjust exploration factor

y_pred_all = np.column_stack([model.predict(X_unlabeled) for model in lgb_boot_models])
std_pred = np.std(y_pred_all, axis=1)    # Uncertainty (std dev)
mean_pred = meta_model.predict(y_pred_all)



In [118]:
ucb_scores = std_pred
top_mutations = np.argsort(ucb_scores)[-100:]
df_test.iloc[top_mutations]['mutant'].values

array(['P384F', 'R356I', 'E585F', 'A301G', 'L479I', 'D27A', 'H212Y',
       'F405G', 'D456N', 'P564T', 'F431G', 'F515G', 'F579G', 'F236G',
       'D244C', 'E601F', 'R644F', 'P495G', 'K47V', 'K410V', 'F171G',
       'K280V', 'L553T', 'E519F', 'R362F', 'E379F', 'F230G', 'D91Q',
       'D575Q', 'D494N', 'K269V', 'R472F', 'E567F', 'F581G', 'E427F',
       'F59G', 'F64G', 'N546G', 'K583V', 'F238G', 'F235G', 'R593F',
       'K240V', 'K408V', 'A510G', 'A395G', 'N445G', 'D209Q', 'K490V',
       'N323G', 'E441F', 'H612Y', 'R605F', 'D381Q', 'K145V', 'R30F',
       'K272V', 'F579T', 'R356F', 'K143V', 'K150V', 'P370F', 'W338N',
       'D37Q', 'N11G', 'R156F', 'A523G', 'D288Y', 'K467V', 'K333V',
       'R131F', 'K198V', 'N541G', 'E274W', 'E299F', 'R650F', 'K355V',
       'R387F', 'E15F', 'R50F', 'E592F', 'A352G', 'N536G', 'E599F',
       'E39F', 'E552F', 'E636F', 'R444F', 'K596V', 'K287V', 'N614G',
       'Q306C', 'A349G', 'D624A', 'N486G', 'W94P', 'R43F', 'D494Q',
       'N74G', 'D613N'], dtype=ob

In [112]:
arr = list(df_test.iloc[top_mutations]['mutant'].values)
f = open('query.txt','w')
for i in arr:
    f.write(i+'\n')
f.close()

# Generating protein_embeddings_test.parquet

In [None]:
import torch
import esm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Load pre-trained ESM model and move it to GPU
esm_model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()  # Example: ESM-2 model
batch_converter = alphabet.get_batch_converter()
esm_model = esm_model.to(device)
esm_model.eval()  # Set to eval mode

# Extract ESM embeddings using GPU
def extract_esm_embedding(sequence):
    batch_labels, batch_strs, batch_tokens = batch_converter([(None, sequence)])
    batch_tokens = batch_tokens.to(device)  # Move input to GPU

    with torch.no_grad():
        results = esm_model(batch_tokens, repr_layers=[33], return_contacts=False)
    
    token_representations = results["representations"][33]  # Use final layer
    sequence_embedding = token_representations.mean(dim=1).squeeze().cpu().numpy()  # Move back to CPU for NumPy
    return sequence_embedding

Using device: mps


Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt" to /Users/saianoopavunuri/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t33_650M_UR50D-contact-regression.pt" to /Users/saianoopavunuri/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D-contact-regression.pt


In [9]:
df_test = pd.read_csv('test.csv')
df_test['Sequence'] = df_test['mutant'].apply(lambda x: seq[:int(x[1:-1])] + x[-1] + seq[int(x[1:-1])+1:])
df_test['Embedding'] = df_test['Sequence'].apply(lambda seq: extract_esm_embedding(seq))
df_test.to_parquet("protein_embeddings_test.parquet", engine="pyarrow")

# Finding Optimal Hyperparams for LightGBM

In [35]:
#Best LightGBM Parameters: {'learning_rate': 0.04931791757724383, 'num_leaves': 85, 'min_child_samples': 48, 'max_depth': 7, 'subsample': 0.5749253739933733, 'colsample_bytree': 0.8371746709650121}
import optuna

def objective(trial):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
        "num_leaves": trial.suggest_int("num_leaves", 10, 100),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
    }

    lgb_train = lgb.Dataset(X_train.cpu().numpy(), label=y_train.cpu().numpy().flatten())
    lgb_model = lgb.train(params, lgb_train, num_boost_round=100)
    
    y_pred = lgb_model.predict(X_test.cpu().numpy())
    spearman_corr, _ = spearmanr(y_test.cpu().numpy().flatten(), y_pred)
    
    return spearman_corr

# Run Bayesian Optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# Best parameters
print("Best LightGBM Parameters:", study.best_params)


[I 2025-03-10 00:15:24,680] A new study created in memory with name: no-name-f5c83e0b-a90a-424a-9b32-5231e3e74cd6
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
[I 2025-03-10 00:15:27,372] Trial 0 finished with value: 0.48682221872956294 and parameters: {'learning_rate': 0.020241813661913466, 'num_leaves': 75, 'min_child_samples': 17, 'max_depth': 9, 'subsample': 0.9668683023238955, 'colsample_bytree': 0.6882141183975881}. Best is trial 0 with value: 0.48682221872956294.
[I 2025-03-10 00:15:29,431] Trial 1 finished with value: 0.4819516931547437 and parameters: {'learning_rate': 0.05180092726555603, 'num_leaves': 80, 'min_child_samples': 14, 'max_depth': 6, 'subsample': 0.877212391871884, 'colsample_bytree': 0.8325616995237557}. Best is trial 0 with value: 0.48682221872956294.
[I 2025-03-10 00:15:30,595] Trial 2 finished with 

Best LightGBM Parameters: {'learning_rate': 0.04931791757724383, 'num_leaves': 85, 'min_child_samples': 48, 'max_depth': 7, 'subsample': 0.5749253739933733, 'colsample_bytree': 0.8371746709650121}


# Best Beta
There is not much difference amongst the top options, so we just use 1.5 for simplicity.

In [67]:
import optuna
from scipy.stats import spearmanr

# Function to optimize beta
def optimize_beta(trial):
    beta = trial.suggest_uniform("beta", 0.1, 5.0)  # Search range

    # Compute UCB scores with candidate beta
    mean_pred = y_pred_stacked

    ucb_scores = mean_pred + beta * std_pred


    # Compute Spearman correlation on these selected samples
    spearman_corr, _ = spearmanr(y_test.cpu().numpy().flatten(), ucb_scores)

    return spearman_corr

# Run optimization
study = optuna.create_study(direction="maximize")
study.optimize(optimize_beta, n_trials=50)

# Get best beta value
best_beta = study.best_params["beta"]
print(f"Optimized Beta: {best_beta:.4f}")

[I 2025-03-17 16:03:03,560] A new study created in memory with name: no-name-851416c2-c7fe-49b6-80d7-f0afdc401062
  beta = trial.suggest_uniform("beta", 0.1, 5.0)  # Search range
[I 2025-03-17 16:03:03,563] Trial 0 finished with value: 0.6385678700732556 and parameters: {'beta': 3.3069204246460018}. Best is trial 0 with value: 0.6385678700732556.
[I 2025-03-17 16:03:03,564] Trial 1 finished with value: 0.6579983359928808 and parameters: {'beta': 1.4317035811412975}. Best is trial 1 with value: 0.6579983359928808.
[I 2025-03-17 16:03:03,565] Trial 2 finished with value: 0.6536122811565609 and parameters: {'beta': 1.8821328093500957}. Best is trial 1 with value: 0.6579983359928808.
[I 2025-03-17 16:03:03,566] Trial 3 finished with value: 0.6241623326283892 and parameters: {'beta': 4.752371274581152}. Best is trial 1 with value: 0.6579983359928808.
[I 2025-03-17 16:03:03,567] Trial 4 finished with value: 0.6446123639266328 and parameters: {'beta': 2.8106424962011256}. Best is trial 1 with

Optimized Beta: 0.1184
