In [8]:
# Import packages
import torch
from tqdm import tqdm
import math
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM
from peft import PeftModel, PeftConfig
from autoamp.evolveFinetune import *
from Bio import SeqIO 
import json

# Load pretrained and finetuned models

In [9]:
pwd

'/home/sdowell/scratch/Thesis/Ch3Metrics'

In [10]:

base_model_name = "hugohrban/progen2-small"
adapter_checkpoint = "/home/sdowell/scratch/Thesis/BenchmarkingFinetuning/runs/progen2_151m_ecoli_finetuning_1/checkpoint-11500"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

# Load base model
model_pretrained = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code=True)
model_pretrained.eval()

# Load finetuned model with LoRA adapter
model_with_adapter = AutoModelForCausalLM.from_pretrained(base_model_name, trust_remote_code=True)
model_finetuned = PeftModel.from_pretrained(model_with_adapter, adapter_checkpoint)
model_finetuned.eval()


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): ProGenForCausalLM(
      (transformer): ProGenModel(
        (wte): Embedding(32, 1024)
        (drop): Dropout(p=0.0, inplace=False)
        (h): ModuleList(
          (0-11): 12 x ProGenBlock(
            (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (attn): ProGenAttention(
              (attn_dropout): Dropout(p=0.0, inplace=False)
              (resid_dropout): Dropout(p=0.0, inplace=False)
              (qkv_proj): lora.Linear(
                (base_layer): Linear(in_features=1024, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
      

# Compute perplexity for both models

In [11]:
import torch
from tqdm import tqdm
import math
from Bio import SeqIO

def compute_causal_perplexity_batched(model, tokenizer, sequences, batch_size=16):
    # ensure we have a PAD token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device).eval()

    total_loss = 0.0
    total_tokens = 0

    # Make batches
    for i in range(0, len(sequences), batch_size):
        batch = sequences[i : i + batch_size]

        # tokenize *without* truncation
        enc = tokenizer(batch,
                        padding=True,
                        return_tensors="pt")  # no truncation
        input_ids = enc.input_ids.to(device)
        attention_mask = enc.attention_mask.to(device)

        # prepare labels: mask pad tokens
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100  # HuggingFace’s ignore_index

        with torch.no_grad():
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            # outputs.loss is already avg per *unmasked* token
            batch_loss = outputs.loss * attention_mask.sum().item()
            total_loss += batch_loss
            total_tokens += attention_mask.sum().item()

    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)
    return perplexity


def load_sequences_from_fasta(fasta_path, max_seqs=None):
    """
    Reads protein sequences from a FASTA file.
    
    Args:
        fasta_path (str): Path to the FASTA file.
        max_seqs (int, optional): Maximum number of sequences to read.
        
    Returns:
        list[str]: List of amino acid sequences.
    """
    sequences = []
    for record in SeqIO.parse(fasta_path, "fasta"):
        seq_str = str(record.seq)
        sequences.append(seq_str)
        if max_seqs and len(sequences) >= max_seqs:
            break
    return sequences

def evaluate_models(pretrained_model, finetuned_model, tokenizer, test_sequences, 
                    batch_size=16, verbose=True):
    """
    Evaluate and compare perplexity between pretrained and finetuned causal models.
    
    Args:
        pretrained_model: The original (pretrained) model.
        finetuned_model: The model after fine-tuning.
        tokenizer: The tokenizer corresponding to the models.
        test_sequences (list[str]): List of test sequences.
        batch_size (int): Batch size for evaluation.
        verbose (bool): Whether to print and return evaluation results.
        
    Returns:
        dict: A dictionary with perplexity results if verbose is True.
    """
    # Evaluate pretrained model
    print("Evaluating pretrained model...")
    ppl_pretrained = compute_causal_perplexity_batched(
        pretrained_model, tokenizer, test_sequences, batch_size
    )
    
    # Evaluate finetuned model
    print("Evaluating finetuned model...")
    ppl_finetuned = compute_causal_perplexity_batched(
        finetuned_model, tokenizer, test_sequences, batch_size
    )
    
    # Report results
    print("\n----- PERPLEXITY RESULTS -----")
    print(f"Pretrained Model: {ppl_pretrained:.4f}")
    print(f"Finetuned Model:  {ppl_finetuned:.4f}")
    print(f"Delta (Fine - Pre): {ppl_finetuned - ppl_pretrained:.4f}")
    if ppl_pretrained != 0:
        print(f"Relative Change:  {(ppl_finetuned - ppl_pretrained) / ppl_pretrained * 100:.2f}%")
    
    if verbose:
        return {
            "pretrained": ppl_pretrained,
            "finetuned": ppl_finetuned,
            "delta": ppl_finetuned - ppl_pretrained,
            "relative_change": ((ppl_finetuned - ppl_pretrained) / ppl_pretrained * 100) if ppl_pretrained != 0 else None
        }


# Prepare Test dataset

In [12]:
# Define fasta test set path
test_set_path = "/home/sdowell/scratch/Thesis/BenchmarkingFinetuning/dataset_splits/finetuning_dataset/test.fasta"

# Read in sequences from fasta into list
test_sequences = load_sequences_from_fasta(test_set_path, max_seqs=None)

print(f"Loaded {len(test_sequences)} sequences")

Loaded 469 sequences


# Delta perplexity

In [13]:

# Define testing parameters
test_params = {
    "batch_size": 16,
    "verbose": True
}


# Evaluate
results = evaluate_models(
    model_pretrained, 
    model_finetuned, 
    tokenizer, 
    test_sequences, 
    **test_params
)

# Save results to JSON file
#with open("progen2_151m_perplexity_results.json", "w") as f:
#    json.dump(results, f, indent=4)

Evaluating pretrained model...
Evaluating finetuned model...

----- PERPLEXITY RESULTS -----
Pretrained Model: 3.7176
Finetuned Model:  1.0250
Delta (Fine - Pre): -2.6926
Relative Change:  -72.43%


In [14]:
import torch
from tqdm import tqdm
import math
from Bio import SeqIO

def compute_autoregressive_perplexity_multiple_runs(
    model, tokenizer, sequences,
    batch_size=16, num_runs=10
):
    """
    Computes perplexity for causal (autoregressive) models (e.g., ProGen2)
    across multiple runs and returns the average perplexity and standard deviation.
    """

    # Ensure we have a pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device).eval()

    perplexities = []

    for run in range(num_runs):
        total_loss = 0.0
        total_tokens = 0

        # Batch the sequences
        for i in range(0, len(sequences), batch_size):
            batch = sequences[i : i + batch_size]

            # Tokenize without truncation
            enc = tokenizer(
                batch,
                padding=True,
                return_tensors="pt"  # NO truncation!
            )
            input_ids = enc.input_ids.to(device)
            attention_mask = enc.attention_mask.to(device)

            # Prepare labels, masking padding tokens
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100  # ignore_index

            with torch.no_grad():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                # outputs.loss is mean over non-ignored tokens
                # multiply by token count to get summed loss
                token_count = attention_mask.sum().item()
                total_loss += outputs.loss.item() * token_count
                total_tokens += token_count

        # Compute run perplexity
        if total_tokens > 0:
            avg_loss = total_loss / total_tokens
            perplexities.append(math.exp(avg_loss))
        else:
            perplexities.append(float("inf"))

    # Aggregate
    avg_ppl = sum(perplexities) / len(perplexities)
    var = sum((p - avg_ppl) ** 2 for p in perplexities) / len(perplexities)
    std_ppl = math.sqrt(var)

    return avg_ppl, std_ppl


# ---------- Usage Example ----------

def load_sequences_from_fasta(fasta_path, max_seqs=None):
    sequences = []
    for record in SeqIO.parse(fasta_path, "fasta"):
        sequences.append(str(record.seq))
        if max_seqs and len(sequences) >= max_seqs:
            break
    return sequences

# load your test sequences
test_sequences = load_sequences_from_fasta(test_set_path)

# evaluate
print("Evaluating pretrained model...")
avg_pre, std_pre = compute_autoregressive_perplexity_multiple_runs(
    model_pretrained, tokenizer, test_sequences,
    batch_size=16, num_runs=10
)

print("Evaluating finetuned model...")
avg_fine, std_fine = compute_autoregressive_perplexity_multiple_runs(
    model_finetuned, tokenizer, test_sequences,
    batch_size=16, num_runs=10
)

print("\n----- PERPLEXITY RESULTS -----")
print(f"Pretrained Model: {avg_pre:.4f} ± {std_pre:.4f}")
print(f"Finetuned Model:  {avg_fine:.4f} ± {std_fine:.4f}")
print(f"Delta (Fine - Pre): {avg_fine - avg_pre:.4f}")
if avg_pre != 0:
    pct = (avg_fine - avg_pre) / avg_pre * 100
    print(f"Relative Change:  {pct:.2f}%")


Evaluating pretrained model...
Evaluating finetuned model...

----- PERPLEXITY RESULTS -----
Pretrained Model: 3.7176 ± 0.0000
Finetuned Model:  1.0250 ± 0.0000
Delta (Fine - Pre): -2.6926
Relative Change:  -72.43%
