In [2]:
# Import packages
import torch
from tqdm import tqdm
import math
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM
from peft import PeftModel, PeftConfig
from autoamp.evolveFinetune import *
from Bio import SeqIO 
import json

2025-04-17 18:44:50.853442: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-17 18:44:50.872195: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-17 18:44:50.872226: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-17 18:44:50.885161: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load pretrained and finetuned models

In [5]:
# IPython automagic (equivalent to %pwd): displays the kernel's working directory.
# The relative output path used later in this notebook (perplexity_results.json)
# resolves against this directory.
pwd

'/home/sdowell/scratch/Thesis/ADP1'

In [6]:

# Hugging Face hub id of the ESM2 base checkpoint and the local directory that holds
# the LoRA adapter produced by fine-tuning.
base_model_name = "facebook/esm2_t30_150M_UR50D"
adapter_checkpoint = "/home/sdowell/scratch/Thesis/ADP1/runs/esm2_dgoa_finetune_1/checkpoint-3000"

# Load tokenizer (shared by the pretrained and the finetuned model)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load base model; this instance stays adapter-free and serves as the baseline
model_pretrained = AutoModelForMaskedLM.from_pretrained(base_model_name)
model_pretrained.eval()

# Load finetuned model with LoRA adapter.
# A second copy of the base weights is loaded on purpose: the repr displayed below shows
# lora.Linear layers inside the wrapped EsmForMaskedLM, so reusing model_pretrained here
# would presumably alter the baseline as well.
# NOTE(review): the displayed wrapper class is PeftModelForCausalLM although the base is a
# masked LM - presumably the adapter was saved with a causal-LM task type; confirm.
model_with_adapter = AutoModelForMaskedLM.from_pretrained(base_model_name)
model_finetuned = PeftModel.from_pretrained(model_with_adapter, adapter_checkpoint)
# Last expression of the cell: switches to eval mode and displays the module tree.
model_finetuned.eval()


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): EsmForMaskedLM(
      (esm): EsmModel(
        (embeddings): EsmEmbeddings(
          (word_embeddings): Embedding(33, 640, padding_idx=1)
          (dropout): Dropout(p=0.0, inplace=False)
          (position_embeddings): Embedding(1026, 640, padding_idx=1)
        )
        (encoder): EsmEncoder(
          (layer): ModuleList(
            (0-29): 30 x EsmLayer(
              (attention): EsmAttention(
                (self): EsmSelfAttention(
                  (query): Linear(in_features=640, out_features=640, bias=True)
                  (key): lora.Linear(
                    (base_layer): Linear(in_features=640, out_features=640, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=640, out_features=8, bias=False)
                    )
  

# Compute perplexity for both models

In [7]:
# Pseudoperplexity

def compute_mlm_perplexity_batched(model, tokenizer, sequences, batch_size=16, mask_prob=0.15, seed=None):
    """
    Batch-based (pseudo-)perplexity estimate for masked language models.

    For every sequence a random subset of its tokens is replaced by the mask
    token, the model is asked to reconstruct them, and the cross-entropy over
    all masked positions of all batches is averaged and exponentiated.

    Only real sequence tokens are eligible for masking: padding positions and
    the tokenizer's special tokens (taken from ``tokenizer.all_special_ids``
    when that attribute exists) are never masked, because scoring them would
    distort the estimate.

    Args:
        model: Masked language model. It is called as
            ``model(input_ids, attention_mask=..., labels=...)`` and must return
            an object with a scalar ``loss``. The loss is assumed to be the mean
            cross-entropy over positions whose label is not -100 (the Hugging
            Face convention).
        tokenizer: Tokenizer matching the model; must expose ``mask_token_id``.
        sequences (list[str]): Sequences to score.
        batch_size (int): Number of sequences per forward pass.
        mask_prob (float): Fraction of eligible tokens masked per sequence;
            at least one token is masked whenever a sequence has any.
        seed (int, optional): Seed for the mask selection. When given, the same
            positions are masked on every call, so results are reproducible and
            two models can be compared on identical masks. When None, the
            global torch RNG is used.

    Returns:
        float: exp(mean masked-token loss), or ``inf`` when nothing was scored.

    Side effects:
        Moves the model to CUDA when available (otherwise CPU) and puts it in
        eval mode.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # dropout (including LoRA dropout) must be off while scoring

    # Dedicated generator so that seeding does not disturb the global RNG state.
    generator = None
    if seed is not None:
        generator = torch.Generator()
        generator.manual_seed(seed)

    special_id_list = sorted(set(getattr(tokenizer, "all_special_ids", None) or []))
    special_ids = torch.tensor(special_id_list, dtype=torch.long, device=device)

    total_loss = 0.0
    total_masked_tokens = 0

    # Create batches
    batches = [sequences[i:i + batch_size] for i in range(0, len(sequences), batch_size)]

    for batch in tqdm(batches):
        # Tokenize and pad the batch
        encoded_inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        input_ids = encoded_inputs.input_ids.to(device)
        attention_mask = encoded_inputs.attention_mask.to(device)

        # Positions that may be masked: attended and not a special token.
        maskable = attention_mask == 1
        if special_ids.numel() > 0:
            is_special = (input_ids.unsqueeze(-1) == special_ids).any(dim=-1)
            maskable = maskable & ~is_special

        masked_input_ids = input_ids.clone()
        labels = torch.full_like(input_ids, -100)  # -100 = ignored by the loss

        # Apply masking (separately for each sequence in batch)
        for row in range(input_ids.shape[0]):
            candidates = torch.where(maskable[row])[0]
            if candidates.numel() == 0:
                continue  # e.g. an empty sequence: nothing to score

            num_to_mask = max(1, int(candidates.numel() * mask_prob))
            order = torch.randperm(candidates.numel(), generator=generator)[:num_to_mask]
            chosen = candidates[order.to(device)]

            # Set labels and mask tokens
            labels[row, chosen] = input_ids[row, chosen]
            masked_input_ids[row, chosen] = tokenizer.mask_token_id

        batch_masked_tokens = int((labels != -100).sum().item())
        if batch_masked_tokens == 0:
            continue  # a forward pass without labels would yield a NaN loss

        with torch.no_grad():
            outputs = model(
                masked_input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

        # The batch loss is a mean; re-weight by its token count so the global
        # average is a per-token mean.
        total_loss += outputs.loss.item() * batch_masked_tokens
        total_masked_tokens += batch_masked_tokens

    if total_masked_tokens == 0:
        return float('inf')
    return math.exp(total_loss / total_masked_tokens)

def load_sequences_from_fasta(fasta_path, max_seqs=None):
    """
    Collect the sequences stored in a FASTA file as plain strings.

    Args:
        fasta_path (str): Location of the FASTA file.
        max_seqs (int, optional): Stop after this many records. A falsy value
            (None or 0) disables the limit.

    Returns:
        list[str]: One string per record, in file order.
    """
    collected = []
    for count, entry in enumerate(SeqIO.parse(fasta_path, "fasta"), start=1):
        collected.append(str(entry.seq))
        # The cap is checked after appending, so at least one record is kept.
        reached_cap = bool(max_seqs) and count >= max_seqs
        if reached_cap:
            break
    return collected

def evaluate_models(pretrained_model, finetuned_model, tokenizer, test_sequences,
                    batch_size=16, mask_prob=0.15, verbose=True, seed=None):
    """
    Evaluate and compare masked-LM perplexity between pretrained and finetuned models.

    Both models are scored with ``compute_mlm_perplexity_batched`` on exactly
    the sequences passed in ``test_sequences``; no file is read here.

    Args:
        pretrained_model: The original (pretrained) masked language model.
        finetuned_model: The model after fine-tuning.
        tokenizer: The tokenizer corresponding to both models.
        test_sequences (list[str]): Sequences to evaluate on.
        batch_size (int): Batch size for evaluation.
        mask_prob (float): Fraction of tokens masked per sequence.
        verbose (bool): When truthy, the results are returned as a dict;
            otherwise the function only prints them and returns None.
        seed (int, optional): When given, the global torch RNG is re-seeded
            with this value before each of the two evaluations so that both
            models are scored on the same random masks. This changes global
            RNG state. When None, the RNG is left untouched and the two models
            see different masks.

    Returns:
        dict | None: Keys "pretrained", "finetuned", "delta" and
        "relative_change" (percent; None if the pretrained perplexity is 0)
        when ``verbose`` is truthy, else None.
    """
    # Evaluate pretrained model
    print("Evaluating pretrained model...")
    if seed is not None:
        torch.manual_seed(seed)
    ppl_pretrained = compute_mlm_perplexity_batched(
        pretrained_model, tokenizer, test_sequences, batch_size, mask_prob
    )

    # Evaluate finetuned model
    print("Evaluating finetuned model...")
    if seed is not None:
        torch.manual_seed(seed)  # same seed again -> same masks as above
    ppl_finetuned = compute_mlm_perplexity_batched(
        finetuned_model, tokenizer, test_sequences, batch_size, mask_prob
    )

    delta = ppl_finetuned - ppl_pretrained
    # Guard the division the same way the causal variant of this helper does.
    relative_change = (delta / ppl_pretrained * 100) if ppl_pretrained != 0 else None

    # Report results
    print("\n----- PERPLEXITY RESULTS -----")
    print(f"Pretrained Model: {ppl_pretrained:.4f}")
    print(f"Finetuned Model:  {ppl_finetuned:.4f}")
    print(f"Delta (Fine-Pre): {delta:.4f}")
    if relative_change is not None:
        print(f"Relative Change:  {relative_change:.2f}%")

    # Hand the numbers back so the caller can persist them
    if verbose:
        return {
            "pretrained": ppl_pretrained,
            "finetuned": ppl_finetuned,
            "delta": delta,
            "relative_change": relative_change
        }

# Prepare Test dataset

In [8]:
# Define fasta test set path (absolute path on the author's machine; adjust when
# running elsewhere). test_set_path is reused by the ProGen2 section further down.
test_set_path = "/home/sdowell/scratch/Thesis/ADP1/finetuning_data/test/dgoa_mutants_test.fasta"

# Read in sequences from fasta into list (max_seqs=None -> no limit)
test_sequences = load_sequences_from_fasta(test_set_path, max_seqs=None)

# Sanity check: report how many sequences were read
print(f"Loaded {len(test_sequences)} sequences")

Loaded 1953 sequences


# Delta perplexity

In [41]:
# Requires model_pretrained, model_finetuned, tokenizer (model-loading cell) and
# test_sequences (test-set cell) to exist in the kernel.

# Define testing parameters; the dict is unpacked into evaluate_models as keyword arguments.
# verbose must stay True here: evaluate_models returns the results dict only when
# verbose is truthy, and that dict is what gets written to disk below.
test_params = {
    "batch_size": 16,
    "mask_prob": 0.15,
    "verbose": True
}

# Evaluate
results = evaluate_models(
    model_pretrained, 
    model_finetuned, 
    tokenizer, 
    test_sequences, 
    **test_params
)

# Save results to JSON file (relative path: lands in the kernel's working directory)
with open("perplexity_results.json", "w") as f:
    json.dump(results, f, indent=4)

Evaluating pretrained model...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:13<00:00,  9.26it/s]


Evaluating finetuned model...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:13<00:00,  9.00it/s]


----- PERPLEXITY RESULTS -----
Pretrained Model: 4.0412
Finetuned Model:  1.0673
Delta (Fine-Pre): -2.9740
Relative Change:  -73.59%





In [44]:
import torch
from tqdm import tqdm
import math
from Bio import SeqIO

def compute_causal_perplexity_batched(model, tokenizer, sequences, batch_size=16):
    """
    Batch-based perplexity computation for causal (autoregressive) language models.

    The result is exp of the mean next-token loss over all predicted, non-padding
    tokens of all sequences, and therefore does not depend on how sequences
    happen to be grouped into batches.

    Assumes the model follows the Hugging Face causal-LM convention: when
    ``labels`` are passed it shifts them internally by one position and returns
    the mean cross-entropy over positions whose label is not -100.

    Args:
        model: The causal language model.
        tokenizer: The tokenizer corresponding to the model.
        sequences (list[str]): List of sequences to evaluate.
        batch_size (int): Number of sequences per batch.

    Returns:
        float: The computed perplexity, or ``inf`` when no token could be scored
        (no sequences, or every sequence tokenizes to a single token).

    Side effects:
        Sets ``tokenizer.pad_token`` to the eos token if it was None, moves the
        model to CUDA when available (otherwise CPU) and puts it in eval mode.
    """
    # Ensure the tokenizer has a pad token; if not, use the eos_token as the pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    total_loss = 0.0
    total_tokens = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # disable dropout etc.

    # Create batches of sequences
    batches = [sequences[i:i + batch_size] for i in range(0, len(sequences), batch_size)]

    for batch in tqdm(batches, desc="Evaluating batches"):
        # Tokenize and pad the batch
        encoded_inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        input_ids = encoded_inputs.input_ids.to(device)
        attention_mask = encoded_inputs.attention_mask.to(device)

        # Labels are the inputs, except that padding is marked -100 so it is
        # excluded from the loss; otherwise shorter sequences in a batch would
        # be scored on predicting pad tokens.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        # Number of tokens the batch loss is averaged over: every non-ignored
        # label except column 0, which has no preceding context to be predicted from.
        num_tokens = int((labels[:, 1:] != -100).sum().item())
        if num_tokens == 0:
            continue  # nothing to predict; the mean loss would be NaN

        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

        # Undo the per-batch mean so batches are combined per token.
        total_loss += outputs.loss.item() * num_tokens
        total_tokens += num_tokens

    avg_loss = total_loss / total_tokens if total_tokens > 0 else float('inf')
    perplexity = math.exp(avg_loss)
    return perplexity

def load_sequences_from_fasta(fasta_path, max_seqs=None):
    """
    Parse a FASTA file and return its sequences as strings.

    Args:
        fasta_path (str): FASTA file to read.
        max_seqs (int, optional): Upper bound on the number of sequences
            returned; None or 0 means "read everything".

    Returns:
        list[str]: The sequences in the order they appear in the file.
    """
    # A falsy max_seqs switches the limit off entirely.
    limit = max_seqs if max_seqs else None

    sequences = []
    for record in SeqIO.parse(fasta_path, "fasta"):
        sequences += [str(record.seq)]
        if limit is not None and len(sequences) >= limit:
            break
    return sequences

def evaluate_models(pretrained_model, finetuned_model, tokenizer, test_sequences,
                    batch_size=16, verbose=True):
    """
    Score a pretrained and a finetuned causal model on the same sequences and
    report how their perplexities differ.

    Args:
        pretrained_model: Baseline model before fine-tuning.
        finetuned_model: Model after fine-tuning.
        tokenizer: Tokenizer shared by both models.
        test_sequences (list[str]): Sequences both models are scored on.
        batch_size (int): Sequences per forward pass.
        verbose (bool): If truthy, the summary dict is returned; the printed
            report is produced either way.

    Returns:
        dict: "pretrained", "finetuned", "delta" and "relative_change" (percent,
        None when the pretrained perplexity is 0) if verbose is truthy;
        otherwise None.
    """
    print("Evaluating pretrained model...")
    base_ppl = compute_causal_perplexity_batched(
        pretrained_model, tokenizer, test_sequences, batch_size
    )

    print("Evaluating finetuned model...")
    tuned_ppl = compute_causal_perplexity_batched(
        finetuned_model, tokenizer, test_sequences, batch_size
    )

    difference = tuned_ppl - base_ppl
    # Relative change is undefined for a zero baseline.
    if base_ppl != 0:
        percent_change = (tuned_ppl - base_ppl) / base_ppl * 100
    else:
        percent_change = None

    print("\n----- PERPLEXITY RESULTS -----")
    print(f"Pretrained Model: {base_ppl:.4f}")
    print(f"Finetuned Model:  {tuned_ppl:.4f}")
    print(f"Delta (Fine - Pre): {difference:.4f}")
    if percent_change is not None:
        print(f"Relative Change:  {percent_change:.2f}%")

    if not verbose:
        return None
    return {
        "pretrained": base_ppl,
        "finetuned": tuned_ppl,
        "delta": difference,
        "relative_change": percent_change,
    }


Average Perplexity: 1.07 ± 0.00


In [45]:
# Usage example: repeated-run pseudo-perplexity of the pretrained model.
# NOTE(review): compute_mlm_perplexity_multiple_runs is not defined in any cell shown in
# this notebook. It presumably comes from a deleted cell or from the wildcard import
# `from autoamp.evolveFinetune import *` - confirm, otherwise this cell fails on a fresh kernel.
avg_perp, perp_std = compute_mlm_perplexity_multiple_runs(model_pretrained, tokenizer, test_sequences)
print(f"Average Perplexity: {avg_perp:.2f} ± {perp_std:.2f}")

Average Perplexity: 4.03 ± 0.02


# ProGen2

In [10]:

# ProGen2 base checkpoint (hub id) and the local LoRA adapter directory.
# This cell rebinds base_model_name, adapter_checkpoint, tokenizer, model_pretrained and
# model_finetuned, replacing the ESM2 objects created earlier; re-running the ESM2
# evaluation cells after this one would therefore use the ProGen2 models.
base_model_name = "hugohrban/progen2-small"
adapter_checkpoint = "/home/sdowell/scratch/Thesis/ADP1/runs/progen2_dgoa_finetune_1/checkpoint-3000"

# Load tokenizer
# trust_remote_code=True executes Python code shipped with the hub repository.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

# Load base model (kept adapter-free as the baseline)
model_pretrained = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code=True)
model_pretrained.eval()

# Load finetuned model with LoRA adapter (on a separate copy of the base weights)
model_with_adapter = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code=True)
model_finetuned = PeftModel.from_pretrained(model_with_adapter, adapter_checkpoint)
# Last expression of the cell: switches to eval mode and displays the module tree.
model_finetuned.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): ProGenForCausalLM(
      (transformer): ProGenModel(
        (wte): Embedding(32, 1024)
        (drop): Dropout(p=0.0, inplace=False)
        (h): ModuleList(
          (0-11): 12 x ProGenBlock(
            (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (attn): ProGenAttention(
              (attn_dropout): Dropout(p=0.0, inplace=False)
              (resid_dropout): Dropout(p=0.0, inplace=False)
              (qkv_proj): lora.Linear(
                (base_layer): Linear(in_features=1024, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
      

In [12]:
import torch
from tqdm import tqdm
import math
from Bio import SeqIO

def compute_autoregressive_perplexity_multiple_runs(model, tokenizer, sequences, batch_size=16, num_runs=10):
    """
    Computes perplexity for causal (autoregressive) models (e.g., ProGen2)
    across multiple runs and returns the average perplexity and standard deviation.

    Each run scores every sequence once: the inputs serve as labels, padding
    positions are excluded from the loss, and the per-batch mean losses are
    combined weighted by the number of tokens they were averaged over. Nothing
    in a run is randomised, so with the model in eval mode the runs are
    expected to agree and the standard deviation to be (close to) zero.

    Assumes the model follows the Hugging Face causal-LM convention: when
    ``labels`` are passed it shifts them internally by one position and returns
    the mean cross-entropy over positions whose label is not -100.

    Args:
        model: The causal language model.
        tokenizer: The corresponding tokenizer.
        sequences (list[str]): List of sequences to evaluate.
        batch_size (int): Number of sequences per batch.
        num_runs (int): Number of evaluation runs (at least 1).

    Returns:
        tuple: (average_perplexity, standard_deviation), where the standard
        deviation is the population standard deviation over the runs. If no
        token can be scored the average is ``inf`` and the deviation is NaN.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.

    Side effects:
        Sets ``tokenizer.pad_token`` to the eos token if it was None, moves the
        model to CUDA when available (otherwise CPU) and puts it in eval mode.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    # Ensure the tokenizer has a pad token; if not, set it to the eos token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # Ensure dropout is disabled.

    # The batching does not change between runs, so build it once.
    batches = [sequences[i:i + batch_size] for i in range(0, len(sequences), batch_size)]

    perplexities = []
    for run in range(num_runs):
        total_loss = 0.0
        total_tokens = 0

        for batch in tqdm(batches, desc=f"Run {run+1}/{num_runs}"):
            # Tokenize and pad the batch
            encoded_inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
            input_ids = encoded_inputs.input_ids.to(device)
            attention_mask = encoded_inputs.attention_mask.to(device)

            # Inputs double as labels; padding gets -100 so the loss ignores it.
            labels = input_ids.masked_fill(attention_mask == 0, -100)

            # Tokens the batch loss is averaged over: all non-ignored labels
            # except column 0, which is never a prediction target.
            num_tokens = int((labels[:, 1:] != -100).sum().item())
            if num_tokens == 0:
                continue  # nothing to predict; the mean loss would be NaN

            with torch.no_grad():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

            total_loss += outputs.loss.item() * num_tokens
            total_tokens += num_tokens

        # Compute perplexity for this run.
        if total_tokens > 0:
            avg_loss = total_loss / total_tokens
            perplexity = math.exp(avg_loss)
        else:
            perplexity = float('inf')

        perplexities.append(perplexity)

    # Compute average perplexity and standard deviation across runs.
    average_perplexity = sum(perplexities) / len(perplexities)
    variance = sum((p - average_perplexity) ** 2 for p in perplexities) / len(perplexities)
    std_dev = math.sqrt(variance)

    return average_perplexity, std_dev

# Usage example:
# Load sequences from FASTA file (modify the path as needed)
def load_sequences_from_fasta(fasta_path, max_seqs=None):
    """
    Read a FASTA file into a list of sequence strings.

    Args:
        fasta_path (str): FASTA file to parse.
        max_seqs (int, optional): Maximum number of sequences to keep. Leave
            as None (or pass 0) to keep all of them.

    Returns:
        list[str]: Sequence strings in file order.
    """
    def _sequence_strings():
        # Lazily convert each parsed record to its plain-string sequence.
        for rec in SeqIO.parse(fasta_path, "fasta"):
            yield str(rec.seq)

    gathered = []
    for seq_text in _sequence_strings():
        gathered.append(seq_text)
        if not max_seqs:
            continue  # no limit requested
        if len(gathered) >= max_seqs:
            break
    return gathered

# Requires 'model_pretrained', 'model_finetuned' and 'tokenizer' from the ProGen2 loading
# cell, and 'test_set_path' from the earlier "Prepare Test dataset" cell (it is not
# defined in this cell).
test_sequences = load_sequences_from_fasta(test_set_path, max_seqs=None)

# Testing parameters for causal models; unpacked as keyword arguments below.
# There is no mask_prob here because the causal evaluation does not mask tokens.
test_params = {
    "batch_size": 16,
    "num_runs": 10,
}

# Evaluate both models with the multi-run causal perplexity function
print("Evaluating pretrained model...")
avg_pretrained, std_pretrained = compute_autoregressive_perplexity_multiple_runs(
    model_pretrained, tokenizer, test_sequences, **test_params
)

print("Evaluating finetuned model...")
avg_finetuned, std_finetuned = compute_autoregressive_perplexity_multiple_runs(
    model_finetuned, tokenizer, test_sequences, **test_params
)

# Report mean ± std over the runs. The recorded output shows ± 0.0000 for both models,
# i.e. the repeated runs produced identical values.
print("\n----- PERPLEXITY RESULTS -----")
print(f"Pretrained Model: {avg_pretrained:.4f} ± {std_pretrained:.4f}")
print(f"Finetuned Model:  {avg_finetuned:.4f} ± {std_finetuned:.4f}")
print(f"Delta (Finetuned - Pretrained): {avg_finetuned - avg_pretrained:.4f}")
# Relative change is only defined for a non-zero baseline.
if avg_pretrained != 0:
    print(f"Relative Change: {(avg_finetuned - avg_pretrained) / avg_pretrained * 100:.2f}%")


Evaluating pretrained model...


Run 1/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:11<00:00, 10.81it/s]
Run 2/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:10<00:00, 11.40it/s]
Run 3/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:10<00:00, 11.39it/s]
Run 4/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:10<00:00, 11.41it/s]
Run 5/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 

Evaluating finetuned model...


Run 1/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:10<00:00, 11.19it/s]
Run 2/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:10<00:00, 11.19it/s]
Run 3/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:11<00:00, 11.18it/s]
Run 4/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:10<00:00, 11.19it/s]
Run 5/10: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 


----- PERPLEXITY RESULTS -----
Pretrained Model: 14.2677 ± 0.0000
Finetuned Model:  1.8445 ± 0.0000
Delta (Finetuned - Pretrained): -12.4232
Relative Change: -87.07%



