In [1]:
# Import packages
import torch
from tqdm import tqdm
import math
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM
from peft import PeftModel, PeftConfig
from autoamp.evolveFinetune import *
from Bio import SeqIO 
import json

2025-04-18 17:08:16.634109: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-18 17:08:16.652484: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-18 17:08:16.652517: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-18 17:08:16.665639: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load pretrained and finetuned models

In [2]:
# Show the kernel's current working directory (IPython automagic form of %pwd).
pwd

'/home/sdowell/scratch/Thesis/ADP1/results'

In [3]:

# --- Locations: base checkpoint on the Hub, local LoRA adapter, held-out test FASTA ---
base_model_name = "facebook/esm2_t30_150M_UR50D"
adapter_checkpoint = "/home/sdowell/scratch/Thesis/ADP1/runs/esm2_dgoa_finetune_1/checkpoint-3000"
test_set_path = "/home/sdowell/scratch/Thesis/ADP1/finetuning_data/test/dgoa_mutants_test.fasta"

# One tokenizer is shared by both models, since the adapter sits on the same base.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

# Baseline: the untouched pretrained masked-LM, switched to inference mode.
model_pretrained = AutoModelForMaskedLM.from_pretrained(
    base_model_name, trust_remote_code=True
).eval()

# Finetuned: a second, independent copy of the base model wrapped with the LoRA
# adapter weights, so the baseline above is never modified.
model_with_adapter = AutoModelForMaskedLM.from_pretrained(
    base_model_name, trust_remote_code=True
)
model_finetuned = PeftModel.from_pretrained(
    model_with_adapter, adapter_checkpoint
).eval()



# Compute perplexity for both models

In [5]:
def load_sequences_from_fasta(fasta_path, max_seqs=None):
    """
    Collect the sequence strings stored in a FASTA file, in file order.

    Args:
        fasta_path (str): Location of the FASTA file to parse.
        max_seqs (int, optional): Stop after this many records. A falsy
            value (None or 0) means every record is read.

    Returns:
        list[str]: One string per FASTA record.
    """
    collected = []
    for count, entry in enumerate(SeqIO.parse(fasta_path, "fasta"), start=1):
        collected.append(str(entry.seq))
        # The limit is only enforced when max_seqs is truthy.
        if max_seqs and count >= max_seqs:
            break
    return collected

def evaluate_models(pretrained_model, finetuned_model, tokenizer, test_sequences,
                    batch_size=16, mask_prob=0.15, verbose=True):
    """
    Evaluate and compare perplexity between pretrained and finetuned models.

    Both models are scored on the same list of sequences with
    `compute_mlm_perplexity_batched` (not defined in this notebook; presumably
    provided by the `autoamp.evolveFinetune` star import -- TODO confirm), and
    a short comparison is printed.

    Args:
        pretrained_model: Baseline masked language model.
        finetuned_model: Finetuned masked language model.
        tokenizer: Tokenizer shared by both models.
        test_sequences (list[str] | None): Sequences to evaluate. Previously
            this argument was ignored and a hard-coded FASTA file was always
            re-read; it is now honoured. Passing None keeps the old behaviour
            of loading the default test FASTA.
        batch_size (int): Batch size forwarded to the perplexity routine.
        mask_prob (float): Masking fraction forwarded to the perplexity routine.
        verbose (bool): When True, the results dictionary is returned;
            otherwise the function returns None after printing.

    Returns:
        dict | None: Keys "pretrained", "finetuned", "delta" and
        "relative_change" (percent) when `verbose` is True, else None.
    """
    if test_sequences is None:
        # Backward-compatible fallback: the default held-out test set.
        test_set_path = "/home/sdowell/scratch/Thesis/ADP1/finetuning_data/test/dgoa_mutants_test.fasta"
        sequences = load_sequences_from_fasta(test_set_path, max_seqs=None)
    else:
        # Materialise once so both models see exactly the same sequences,
        # even if the caller handed in a one-shot iterator.
        sequences = list(test_sequences)

    # Evaluate pretrained model
    print("Evaluating pretrained model...")
    ppl_pretrained = compute_mlm_perplexity_batched(
        pretrained_model, tokenizer, sequences, batch_size, mask_prob
    )

    # Evaluate finetuned model
    print("Evaluating finetuned model...")
    ppl_finetuned = compute_mlm_perplexity_batched(
        finetuned_model, tokenizer, sequences, batch_size, mask_prob
    )

    # Derived metrics are computed once and reused for printing and returning.
    delta = ppl_finetuned - ppl_pretrained
    relative_change = delta / ppl_pretrained * 100

    # Report results
    print("\n----- PERPLEXITY RESULTS -----")
    print(f"Pretrained Model: {ppl_pretrained:.4f}")
    print(f"Finetuned Model:  {ppl_finetuned:.4f}")
    print(f"Delta (Fine-Pre): {delta:.4f}")
    print(f"Relative Change:  {relative_change:.2f}%")

    # Only hand the numbers back when the caller asked for them.
    if verbose:
        return {
            "pretrained": ppl_pretrained,
            "finetuned": ppl_finetuned,
            "delta": delta,
            "relative_change": relative_change
        }

# Prepare test dataset

In [6]:
# Held-out FASTA file of mutant sequences used for evaluation
test_set_path = "/home/sdowell/scratch/Thesis/ADP1/finetuning_data/test/dgoa_mutants_test.fasta"

# Parse every record (no cap on the number of sequences)
test_sequences = load_sequences_from_fasta(test_set_path)

print(f"Loaded {len(test_sequences)} sequences")

Loaded 1953 sequences


# Delta perplexity

In [None]:
def compute_mlm_perplexity_multiple_runs(model, tokenizer, sequences, batch_size=16, mask_prob=0.15, num_runs=10, seed=None):
    """
    Computes perplexity for masked language models (MLM) over multiple runs.

    For each run, a new random masking is applied to the input sequences and the
    MLM loss is computed. The perplexity of a run is the exponential of the
    average loss across all masked tokens of that run.

    Only real sequence tokens are eligible for masking: padding and the special
    tokens added by the tokenizer are excluded, so they can never be masked or
    scored.

    Args:
        model: The masked language model. It is moved to the selected device
            (CUDA when available, otherwise CPU) and put in eval mode.
        tokenizer: The corresponding tokenizer; must support
            `return_special_tokens_mask=True` and expose `mask_token_id`.
        sequences (list[str]): List of sequences (e.g., protein sequences) to evaluate.
        batch_size (int): Number of sequences per batch.
        mask_prob (float): Fraction of eligible tokens to mask in each sequence
            (at least one token is masked whenever a sequence has any eligible token).
        num_runs (int): Number of runs to perform (to capture run-to-run variability).
        seed (int, optional): When given, the masking pattern is drawn from a
            dedicated generator seeded with this value, so repeated calls (and
            calls for different models) use identical masks. When None, the
            global torch RNG is used.

    Returns:
        tuple: (average_perplexity, standard_deviation) across the runs. The
        standard deviation is the population form (divides by `num_runs`). A run
        with no masked tokens contributes a perplexity of infinity.

    Raises:
        ValueError: If `num_runs` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # Disable dropout and other training-specific layers

    # Dedicated CPU generator so seeding does not disturb the global RNG state.
    generator = None
    if seed is not None:
        generator = torch.Generator()
        generator.manual_seed(seed)

    # The batching does not depend on the run, so build it once.
    batches = [sequences[i:i + batch_size] for i in range(0, len(sequences), batch_size)]

    perplexities = []
    for run in range(num_runs):
        total_loss = 0.0
        total_masked_tokens = 0

        for batch in tqdm(batches, desc=f"Run {run+1}/{num_runs}"):
            # Tokenize and pad the batch; also ask which positions are special tokens.
            encoded_inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_special_tokens_mask=True,
                return_tensors="pt",
            )
            input_ids = encoded_inputs["input_ids"].to(device)
            attention_mask = encoded_inputs["attention_mask"].to(device)
            special_tokens_mask = encoded_inputs["special_tokens_mask"].to(device).bool()

            # Eligible positions: attended (non-padding) and not a special token.
            maskable = (attention_mask == 1) & ~special_tokens_mask

            masked_input_ids = input_ids.clone()
            # Labels start at -100 (ignored by the loss); only masked positions get a target.
            labels = torch.full_like(input_ids, -100)

            # Apply random masking per sequence in the batch
            for i in range(input_ids.shape[0]):
                candidates = torch.where(maskable[i])[0]
                num_candidates = candidates.numel()
                if num_candidates == 0:
                    continue  # nothing maskable in this sequence
                num_to_mask = max(1, int(num_candidates * mask_prob))
                order = torch.randperm(num_candidates, generator=generator)[:num_to_mask]
                to_mask = candidates[order.to(device)]
                labels[i, to_mask] = input_ids[i, to_mask]
                masked_input_ids[i, to_mask] = tokenizer.mask_token_id

            # Count how many tokens were masked (i.e. tokens with a label other than -100)
            batch_masked_tokens = int((labels != -100).sum().item())
            if batch_masked_tokens == 0:
                continue  # no targets: the loss would be undefined, skip the forward pass

            with torch.no_grad():
                outputs = model(
                    input_ids=masked_input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

            # outputs.loss is the mean over masked tokens; re-weight to a sum so
            # batches with different numbers of masked tokens are combined correctly.
            total_loss += outputs.loss.item() * batch_masked_tokens
            total_masked_tokens += batch_masked_tokens

        # Compute perplexity for this run
        if total_masked_tokens > 0:
            avg_loss = total_loss / total_masked_tokens
            perplexity = math.exp(avg_loss)
        else:
            perplexity = float('inf')

        perplexities.append(perplexity)

    # Calculate average and (population) standard deviation of perplexities over the runs
    average_perplexity = sum(perplexities) / len(perplexities)
    variance = sum((p - average_perplexity) ** 2 for p in perplexities) / len(perplexities)
    std_dev = math.sqrt(variance)

    return average_perplexity, std_dev

# Example usage:
# Assume `model`, `tokenizer`, and `test_sequences` are predefined.
# test_sequences can be loaded from a FASTA file using the function below:

def load_sequences_from_fasta(fasta_path, max_seqs=None):
    """
    Parse a FASTA file and return its sequences as plain strings.

    Args:
        fasta_path (str): FASTA file to read.
        max_seqs (int, optional): Upper bound on the number of records
            returned; None or 0 disables the bound.

    Returns:
        list[str]: Sequence strings in the order they appear in the file.
    """
    residues = []
    parser = SeqIO.parse(fasta_path, "fasta")
    for entry in parser:
        residues.append(str(entry.seq))
        if not max_seqs:
            # No cap requested: keep reading to the end of the file.
            continue
        if len(residues) >= max_seqs:
            break
    return residues

# Usage: score both models on the full test set and print mean ± std perplexity.
test_sequences = load_sequences_from_fasta(test_set_path)

avg_perp, perp_std = compute_mlm_perplexity_multiple_runs(
    model=model_pretrained, tokenizer=tokenizer, sequences=test_sequences
)
print(f"Pretrained Average Perplexity: {avg_perp:.2f} ± {perp_std:.2f}")

# The same two names are reused, so after this cell they hold the finetuned values.
avg_perp, perp_std = compute_mlm_perplexity_multiple_runs(
    model=model_finetuned, tokenizer=tokenizer, sequences=test_sequences
)
print(f"Finetuned Average Perplexity: {avg_perp:.2f} ± {perp_std:.2f}")

In [7]:

def compute_mlm_perplexity_multiple_runs(model, tokenizer, sequences, batch_size=16, mask_prob=0.15, num_runs=10, seed=None):
    """
    Computes perplexity for masked language models (MLM) over multiple runs.

    For each run, a new random masking is applied to the input sequences and the
    MLM loss is computed. The perplexity of a run is the exponential of the
    average loss across all masked tokens of that run.

    Only real sequence tokens are eligible for masking: padding and the special
    tokens added by the tokenizer are excluded, so they can never be masked or
    scored.

    Args:
        model: The masked language model. It is moved to the selected device
            (CUDA when available, otherwise CPU) and put in eval mode.
        tokenizer: The corresponding tokenizer; must support
            `return_special_tokens_mask=True` and expose `mask_token_id`.
        sequences (list[str]): List of sequences (e.g., protein sequences) to evaluate.
        batch_size (int): Number of sequences per batch.
        mask_prob (float): Fraction of eligible tokens to mask in each sequence
            (at least one token is masked whenever a sequence has any eligible token).
        num_runs (int): Number of runs to perform (to capture run-to-run variability).
        seed (int, optional): When given, the masking pattern is drawn from a
            dedicated generator seeded with this value, so repeated calls (and
            calls for different models) use identical masks. When None, the
            global torch RNG is used.

    Returns:
        tuple: (average_perplexity, standard_deviation) across the runs. The
        standard deviation is the population form (divides by `num_runs`). A run
        with no masked tokens contributes a perplexity of infinity.

    Raises:
        ValueError: If `num_runs` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # Disable dropout and other training-specific layers

    # Dedicated CPU generator so seeding does not disturb the global RNG state.
    generator = None
    if seed is not None:
        generator = torch.Generator()
        generator.manual_seed(seed)

    # The batching does not depend on the run, so build it once.
    batches = [sequences[i:i + batch_size] for i in range(0, len(sequences), batch_size)]

    perplexities = []
    for run in range(num_runs):
        total_loss = 0.0
        total_masked_tokens = 0

        for batch in tqdm(batches, desc=f"Run {run+1}/{num_runs}"):
            # Tokenize and pad the batch; also ask which positions are special tokens.
            encoded_inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_special_tokens_mask=True,
                return_tensors="pt",
            )
            input_ids = encoded_inputs["input_ids"].to(device)
            attention_mask = encoded_inputs["attention_mask"].to(device)
            special_tokens_mask = encoded_inputs["special_tokens_mask"].to(device).bool()

            # Eligible positions: attended (non-padding) and not a special token.
            maskable = (attention_mask == 1) & ~special_tokens_mask

            masked_input_ids = input_ids.clone()
            # Labels start at -100 (ignored by the loss); only masked positions get a target.
            labels = torch.full_like(input_ids, -100)

            # Apply random masking per sequence in the batch
            for i in range(input_ids.shape[0]):
                candidates = torch.where(maskable[i])[0]
                num_candidates = candidates.numel()
                if num_candidates == 0:
                    continue  # nothing maskable in this sequence
                num_to_mask = max(1, int(num_candidates * mask_prob))
                order = torch.randperm(num_candidates, generator=generator)[:num_to_mask]
                to_mask = candidates[order.to(device)]
                labels[i, to_mask] = input_ids[i, to_mask]
                masked_input_ids[i, to_mask] = tokenizer.mask_token_id

            # Count how many tokens were masked (i.e. tokens with a label other than -100)
            batch_masked_tokens = int((labels != -100).sum().item())
            if batch_masked_tokens == 0:
                continue  # no targets: the loss would be undefined, skip the forward pass

            with torch.no_grad():
                outputs = model(
                    input_ids=masked_input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

            # outputs.loss is the mean over masked tokens; re-weight to a sum so
            # batches with different numbers of masked tokens are combined correctly.
            total_loss += outputs.loss.item() * batch_masked_tokens
            total_masked_tokens += batch_masked_tokens

        # Compute perplexity for this run
        if total_masked_tokens > 0:
            avg_loss = total_loss / total_masked_tokens
            perplexity = math.exp(avg_loss)
        else:
            perplexity = float('inf')

        perplexities.append(perplexity)

    # Calculate average perplexity and (population) standard deviation across runs
    average_perplexity = sum(perplexities) / len(perplexities)
    variance = sum((p - average_perplexity) ** 2 for p in perplexities) / len(perplexities)
    std_dev = math.sqrt(variance)

    return average_perplexity, std_dev

# Function to load sequences from a FASTA file
def load_sequences_from_fasta(fasta_path, max_seqs=None):
    """
    Return the sequences of a FASTA file as a list of strings.

    Args:
        fasta_path (str): Path of the FASTA file.
        max_seqs (int, optional): Maximum number of records to keep. The cap
            applies only when the value is truthy (None and 0 read everything).

    Returns:
        list[str]: The sequence of each record, in file order.
    """
    cap_enabled = bool(max_seqs)
    out = []
    for rec in SeqIO.parse(fasta_path, "fasta"):
        out += [str(rec.seq)]
        if cap_enabled and len(out) >= max_seqs:
            break
    return out

# --- Main evaluation and saving results to JSON --- #

# Read the held-out test sequences (no cap on the number of records)
test_fasta_path = test_set_path
test_sequences = load_sequences_from_fasta(test_fasta_path)

# Masked-LM evaluation settings, forwarded as keyword arguments to both calls
test_params = {"batch_size": 16, "mask_prob": 0.15, "num_runs": 10}

# Score the base model first, then the LoRA-finetuned model, with identical settings
print("Evaluating pretrained model...")
avg_pretrained, std_pretrained = compute_mlm_perplexity_multiple_runs(
    model_pretrained, tokenizer, test_sequences, **test_params
)

print("Evaluating finetuned model...")
avg_finetuned, std_finetuned = compute_mlm_perplexity_multiple_runs(
    model_finetuned, tokenizer, test_sequences, **test_params
)

# Summary: per-model statistics plus absolute and percentage change.
# The percentage is left as None when the baseline is exactly zero.
results = {
    "pretrained": {"average_perplexity": avg_pretrained, "std_dev": std_pretrained},
    "finetuned": {"average_perplexity": avg_finetuned, "std_dev": std_finetuned},
    "delta": avg_finetuned - avg_pretrained,
    "relative_change": (
        None
        if avg_pretrained == 0
        else ((avg_finetuned - avg_pretrained) / avg_pretrained * 100)
    ),
}

# Persist the summary next to the notebook (path is relative to the working directory)
output_json_path = "esm_dgoa_perplexity_results.json"
with open(output_json_path, "w") as f:
    f.write(json.dumps(results, indent=4))

print(f"Results saved to {output_json_path}")


Evaluating pretrained model...


Run 1/10:   0%|                                                                                                                                                                     | 0/123 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Run 1/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:13<00:00,  8.84it/s]
Run 2/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:13<00:00,  9.25it/s]
Run 3/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:13<00:00,  9.24it/s]
Run 4/10: 100%|█████████████████████████

Evaluating finetuned model...


Run 1/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:33<00:00,  3.68it/s]
Run 2/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:34<00:00,  3.62it/s]
Run 3/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:34<00:00,  3.61it/s]
Run 4/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:33<00:00,  3.65it/s]
Run 5/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 

Results saved to esm_dgoa_perplexity_results.json





In [8]:
# Show the current working directory again (IPython automagic form of %pwd);
# files opened with a relative path, such as the results JSON, resolve against it.
pwd

'/home/sdowell/scratch/Thesis/ADP1/results'