In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, Dataset
import random
from copy import deepcopy
import pandas as pd
from scipy.stats import spearmanr
import argparse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load Dataset

In [2]:
df_train = pd.read_parquet("../protein_embeddings.parquet")
df_train.head()

Unnamed: 0,mutant,DMS_score,Sequence,Embedding
0,M0Y,0.273,YVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,"[0.027215108, -0.07895891, 0.024665592, 0.0871..."
1,M0W,0.2857,WVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,"[0.027446767, -0.08002912, 0.024973774, 0.0876..."
2,M0V,0.2153,VVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,"[0.027746048, -0.08058312, 0.024816252, 0.0884..."
3,M0T,0.3122,TVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,"[0.027021278, -0.07973976, 0.024042634, 0.0892..."
4,M0S,0.218,SVNEARGNSSLNPCLEGSASSGSESSKDSSRCSTPGLDPERHERLR...,"[0.027436633, -0.07952297, 0.02438334, 0.09035..."


# Apply LORA

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load tokenizer and model
model_name = "facebook/esm2_t6_8M_UR50D"  # Smallest ESM-2 model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)  # Predict a single fitness score


  from .autonotebook import tqdm as notebook_tqdm
Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from peft import LoraConfig, get_peft_model, TaskType

# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,  # Rank of LoRA
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.05,
    target_modules=["query", "key", "value"]  # Apply LoRA to transformer attention layers
)

# Wrap the model with LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 287,361 || all params: 8,127,803 || trainable%: 3.5355


In [5]:
from datasets import Dataset

def preprocess_function(examples):
    return tokenizer(examples["Sequence"], padding="max_length", truncation=True, max_length=720)

# Convert Pandas DataFrame to Dataset
dataset = Dataset.from_pandas(df_train)
dataset = dataset.map(preprocess_function, batched=True)
dataset = dataset.rename_column("DMS_score", "labels")  # Rename for HF training
dataset = dataset.train_test_split(test_size=0.2)  # 80% train, 20% test


Map: 100%|██████████| 1140/1140 [00:02<00:00, 517.97 examples/s]


In [6]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./protein_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=50,
    logging_dir="./logs",
    logging_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.02,
    fp16=True,
    report_to="none"  # Avoids WandB integration unless needed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
)
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=5700, training_loss=0.04123162217307509, metrics={'train_runtime': 542.5457, 'train_samples_per_second': 84.048, 'train_steps_per_second': 10.506, 'total_flos': 1534355519616000.0, 'train_loss': 0.04123162217307509, 'epoch': 50.0})

# Eval

In [10]:
import torch
from scipy.stats import spearmanr

# Get predictions
preds = trainer.predict(dataset["test"])
predicted_scores = preds.predictions.squeeze()  # Remove extra dimension

# Compute Spearman Correlation
actual_scores = dataset["test"]["labels"]
spearman_corr, _ = spearmanr(predicted_scores, actual_scores)

print(f"Spearman Correlation: {spearman_corr:.4f}")


Spearman Correlation: 0.3481


# Save

In [8]:
model.save_pretrained("./fine_tuned_protein_model")
tokenizer.save_pretrained("./fine_tuned_protein_model")


('./fine_tuned_protein_model/tokenizer_config.json',
 './fine_tuned_protein_model/special_tokens_map.json',
 './fine_tuned_protein_model/vocab.txt',
 './fine_tuned_protein_model/added_tokens.json')

Eval

In [9]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("./fine_tuned_protein_model")
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_protein_model")


Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: Error(s) in loading state_dict for EsmForSequenceClassification:
	size mismatch for classifier.modules_to_save.default.out_proj.weight: copying a param with shape torch.Size([1, 320]) from checkpoint, the shape in current model is torch.Size([2, 320]).
	size mismatch for classifier.modules_to_save.default.out_proj.bias: copying a param with shape torch.Size([1]) from checkpoint, the shape in current model is torch.Size([2]).