In [8]:
cd /home/sdowell/scratch/Thesis/ADP1

/home/sdowell/scratch/Thesis/ADP1


In [1]:
# Import packages
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM
from peft import PeftModel, PeftConfig
from autoamp.evolveFinetune import *
import torch
from tqdm import tqdm
import math
from Bio import SeqIO 
import json
import warnings
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from transformers import PreTrainedTokenizer

# Example inputs
base_model_name = "facebook/esm2_t6_8M_UR50D" 
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
adapter_checkpoint = "/home/sdowell/scratch/Thesis/distillation/distilled-esm2-8M/checkpoint-epoch-50/"

# Load models
model_pretrained = AutoModelForMaskedLM.from_pretrained(base_model_name)
model_with_adapter = AutoModelForMaskedLM.from_pretrained(base_model_name)
model_finetuned = PeftModel.from_pretrained(model_with_adapter, adapter_checkpoint)

def compute_mlm_loss(model, sequence, mask_prob=0.15, device=torch.device("cpu")):
    """
    Computes the MLM (masked language model) cross entropy loss for a given sequence.
    
    Args:
        sequence: Protein sequence as a string.
        mask_prob: The probability of masking a token.
        device: torch.device to run the computation.
    
    Returns:
        loss: The MLM cross entropy loss.
    """
    model.to(device)
    model.train()  # or model.eval() if you don't want dropout, etc.
    
    # Tokenize the sequence
    encoded = tokenizer(sequence, return_tensors="pt")
    input_ids = encoded.input_ids.to(device)
    
    # Create labels as a copy of input_ids.
    labels = input_ids.clone()
    
    # Create a mask for positions to replace.
    # Generate random values in [0, 1) for each token.
    probability_matrix = torch.rand(input_ids.shape).to(device)
    # Create a boolean mask for tokens to mask.
    mask = probability_matrix < mask_prob
    
    # For positions NOT selected for masking, set the corresponding label to -100 so they are ignored.
    labels[~mask] = -100
    
    # Replace the selected input positions with the mask token.
    mask_token_id = tokenizer.mask_token_id
    input_ids[mask] = mask_token_id
    
    # Forward pass: the model automatically computes the loss when labels are provided.
    outputs = model(input_ids, labels=labels)
    loss = outputs.loss
    return loss

# Example usage:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sequence = "MQWQTKLPLIAILRGITPDEALAHVGAVIDAGFDAVEIPLNSPQWEQSIPAIVDAYGDKALIGAGTVLKPEQVDALARMGCQLIVTPNIHSEVIRRAVGYGMTVCPGCATATEAFTALEAGAQALKIFPSSAFGPQYIKALKAVLPSDIAVFAVGGVTPENLAQWIDAGCAGAGLGSDLYRAGQSVERTAQQAAAFVKAYREAVQ"
loss = compute_mlm_loss(model_pretrained, sequence, mask_prob=0.15, device=device)
print(f"Pretrained MLM Cross Entropy Loss: {loss.item():.4f}")
loss = compute_mlm_loss(model_finetuned, sequence, mask_prob=0.15, device=device)
print(f"Finetuned MLM Cross Entropy Loss: {loss.item():.4f}")



2025-05-13 23:25:57.954318: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-13 23:25:57.970974: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-13 23:25:57.971003: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-13 23:25:57.982413: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Pretrained MLM Cross Entropy Loss: 1.8642
Finetuned MLM Cross Entropy Loss: 2.8570


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import numpy as np

def compute_token_prediction_accuracy(model, tokenizer, sequence, mask_prob=0.15, device=torch.device("cpu")):
    """
    Computes token prediction accuracy for a masked language model.
    
    Args:
        model: The masked language model.
        tokenizer: The corresponding tokenizer.
        sequence: A string representing the protein (or other) sequence.
        mask_prob: The probability with which to mask tokens (e.g., 0.15 for 15%).
        device: torch.device to run the computations.
    
    Returns:
        A tuple (accuracy, correct, total_masked) where:
          - accuracy is the fraction of masked tokens correctly predicted.
          - correct is the number of correct predictions.
          - total_masked is the total number of masked tokens.
    """
    # Tokenize the input sequence.
    encoded = tokenizer(sequence, return_tensors="pt")
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)
    
    # Create a copy of the original tokens to serve as labels.
    labels = input_ids.clone()
    
    # Create a random mask for tokens according to mask_prob.
    probability_matrix = torch.rand(input_ids.shape).to(device)
    mask_positions = probability_matrix < mask_prob
    
    # Replace the selected token positions in input_ids with the mask token ID.
    mask_token_id = tokenizer.mask_token_id
    if mask_token_id is None:
        raise ValueError("The tokenizer does not have a mask token.")
    
    input_ids[mask_positions] = mask_token_id
    
    # Forward pass through the model.
    model.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # shape: [batch_size, sequence_length, vocab_size]
    
    # Get predictions (top candidate from the logits) using argmax.
    predictions = torch.argmax(logits, dim=-1)
    
    # Consider only the masked positions.
    masked_labels = labels[mask_positions]
    masked_predictions = predictions[mask_positions]
    
    # Calculate the number of correct predictions.
    correct = (masked_predictions == masked_labels).sum().item()
    total_masked = mask_positions.sum().item()
    
    accuracy = correct / total_masked if total_masked > 0 else 0.0
    return accuracy, correct, total_masked

sequence = (
    'MQWQTKLPLIAILRGITPDEALAHVGAVIDAGFDAVEIPLNSPQWEQSIPAIVDAYGDKA'
    'LIGAGTVLKPEQVDALARMGCQLIVTPNIHSEVIRRAVGYGMTVCPGCATATEAFTALEA'
    'GAQALKIFPSSAFGPQYIKALKAVLPSDIAVFAVGGVTPENLAQWIDAGCAGAGLGSDLY'
    'RAGQSVERTAQQAAAFVKAYREAVQ'
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
accuracy, correct, total_masked = compute_token_prediction_accuracy(
    model_pretrained, tokenizer, sequence, mask_prob=0.15, device=device
)
print(f"pretrained Token Prediction Accuracy: {accuracy:.4f} ({correct}/{total_masked})")
accuracy, correct, total_masked = compute_token_prediction_accuracy(
    model_finetuned, tokenizer, sequence, mask_prob=0.15, device=device
)
print(f"finetuned Token Prediction Accuracy: {accuracy:.4f} ({correct}/{total_masked})")


pretrained Token Prediction Accuracy: 0.2812 (9/32)
finetuned Token Prediction Accuracy: 0.1935 (6/31)


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from Bio import SeqIO
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM
from peft import PeftModel, PeftConfig
from autoamp.evolveFinetune import *
import torch
from tqdm import tqdm
import math
from Bio import SeqIO 
import json
import warnings
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from transformers import PreTrainedTokenizer


base_model_name = "facebook/esm2_t6_8M_UR50D" 
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load models
model_pretrained = AutoModelForMaskedLM.from_pretrained(base_model_name)
model_with_adapter = AutoModelForMaskedLM.from_pretrained(base_model_name)
model_finetuned = PeftModel.from_pretrained(model_with_adapter, adapter_checkpoint)

def compute_token_prediction_accuracy(model, tokenizer, sequence, mask_prob=0.15, device=torch.device("cpu")):
    """
    Computes token prediction accuracy for a given sequence from a masked language model.
    
    Args:
        model: The masked language model.
        tokenizer: The corresponding tokenizer.
        sequence: A string representing the protein (or other) sequence.
        mask_prob: The probability of masking a token (default is 15%).
        device: Torch device on which to run computations.
    
    Returns:
        A tuple (correct, total_masked) where:
          - correct: number of masked tokens correctly predicted.
          - total_masked: total number of masked tokens.
    """
    # Tokenize input sequence.
    encoded = tokenizer(sequence, return_tensors="pt")
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)
    
    # Create a copy of input_ids for ground-truth labels.
    labels = input_ids.clone()
    
    # Create a random mask for tokens.
    probability_matrix = torch.rand(input_ids.shape).to(device)
    mask_positions = probability_matrix < mask_prob
    
    # Replace tokens at masked positions with the mask token.
    mask_token_id = tokenizer.mask_token_id
    if mask_token_id is None:
        raise ValueError("The tokenizer does not have a mask token.")
    input_ids[mask_positions] = mask_token_id
    
    # Forward pass through the model.
    model.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # shape: [batch, seq_length, vocab_size]
    
    # Get predicted token IDs.
    predictions = torch.argmax(logits, dim=-1)
    
    # Evaluate only on masked positions.
    masked_labels = labels[mask_positions]
    masked_predictions = predictions[mask_positions]
    
    correct = (masked_predictions == masked_labels).sum().item()
    total_masked = mask_positions.sum().item()
    
    return correct, total_masked

# ----- Main script to process FASTA file with a progress bar -----

fasta_file = "/home/sdowell/scratch/Thesis/BenchmarkingFinetuning/dataset_splits/finetuning_dataset/test.fasta"

# Set device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

total_correct = 0
total_masked = 0
n_sequences = 0

# Count number of records in FASTA for progress bar (optional, if file is large)
records = list(SeqIO.parse(fasta_file, "fasta"))
n_records = len(records)

# Iterate over sequences with a progress bar.
for record in tqdm(records, desc="Processing sequences"):
    seq = str(record.seq).strip()
    if not seq:
        continue
    correct, masked = compute_token_prediction_accuracy(model_pretrained, tokenizer, seq, mask_prob=0.15, device=device)
    total_correct += correct
    total_masked += masked
    n_sequences += 1

# Compute overall accuracy.
overall_accuracy = total_correct / total_masked if total_masked > 0 else 0.0

print(f"\nProcessed {n_sequences} sequences.")
print(f"pretrained Overall Token Prediction Accuracy: {overall_accuracy:.4f} ({total_correct}/{total_masked})")



total_correct = 0
total_masked = 0
n_sequences = 0

# Count number of records in FASTA for progress bar (optional, if file is large)
records = list(SeqIO.parse(fasta_file, "fasta"))
n_records = len(records)

# Iterate over sequences with a progress bar.
for record in tqdm(records, desc="Processing sequences"):
    seq = str(record.seq).strip()
    if not seq:
        continue
    correct, masked = compute_token_prediction_accuracy(model_finetuned, tokenizer, seq, mask_prob=0.15, device=device)
    total_correct += correct
    total_masked += masked
    n_sequences += 1

# Compute overall accuracy.
overall_accuracy = total_correct / total_masked if total_masked > 0 else 0.0

print(f"\nProcessed {n_sequences} sequences.")
print(f"finetuned Overall Token Prediction Accuracy: {overall_accuracy:.4f} ({total_correct}/{total_masked})")


Processing sequences: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 469/469 [00:09<00:00, 50.80it/s]



Processed 469 sequences.
pretrained Overall Token Prediction Accuracy: 0.1965 (9423/47957)


Processing sequences: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 469/469 [00:09<00:00, 49.52it/s]


Processed 469 sequences.
finetuned Overall Token Prediction Accuracy: 0.2446 (11686/47771)





# ESM-2 Pretrained Recall, Precision, F1

In [5]:
def compute_token_predictions_for_metrics(model, tokenizer, sequence, mask_prob=0.15, device=torch.device("cpu")):
    """
    Returns token-level predictions and labels for masked positions, to support precision/recall/F1.
    
    Returns:
        Two lists: true token IDs and predicted token IDs at masked positions.
    """
    encoded = tokenizer(sequence, return_tensors="pt")
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)
    
    labels = input_ids.clone()
    probability_matrix = torch.rand(input_ids.shape).to(device)
    mask_positions = probability_matrix < mask_prob

    # Ensure mask token is defined
    mask_token_id = tokenizer.mask_token_id
    if mask_token_id is None:
        raise ValueError("Tokenizer does not have a mask token.")
    input_ids[mask_positions] = mask_token_id

    model.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    predictions = torch.argmax(logits, dim=-1)

    true_tokens = labels[mask_positions].tolist()
    predicted_tokens = predictions[mask_positions].tolist()

    return true_tokens, predicted_tokens

from sklearn.metrics import classification_report

all_trues = []
all_preds = []

# Use same file path and device setup as before
for record in tqdm(SeqIO.parse(fasta_file, "fasta"), desc="Processing sequences"):
    seq = str(record.seq).strip()
    if not seq:
        continue
    trues, preds = compute_token_predictions_for_metrics(model_pretrained, tokenizer, seq, mask_prob=0.15, device=device)
    all_trues.extend(trues)
    all_preds.extend(preds)

# Optionally convert to token strings for readability
id2token = tokenizer.convert_ids_to_tokens
true_tokens = [id2token(t) for t in all_trues]
pred_tokens = [id2token(p) for p in all_preds]

# Print precision, recall, F1 for each amino acid
print(classification_report(true_tokens, pred_tokens, zero_division=0, digits=10))


Processing sequences: 469it [00:08, 54.97it/s]


              precision    recall  f1-score   support

       <cls>  1.0000000000 1.0000000000 1.0000000000        70
       <eos>  1.0000000000 1.0000000000 1.0000000000        59
           A  0.1623203832 0.1401654412 0.1504315660      4352
           C  0.4672897196 0.5223880597 0.4933051445       670
           D  0.2033755274 0.1916500994 0.1973387922      2515
           E  0.1417518652 0.3074617920 0.1940425532      3337
           F  0.2912621359 0.0665188470 0.1083032491      1804
           G  0.2363601311 0.3773468760 0.2906590801      3249
           H  0.7938144330 0.1015831135 0.1801169591       758
           I  0.3655462185 0.0763492760 0.1263157895      2279
           K  0.1733905579 0.1957996769 0.1839150228      3095
           L  0.1791778976 0.5633474576 0.2718813906      4720
           M  0.9241379310 0.1012849584 0.1825613079      1323
           N  0.5243243243 0.0562645012 0.1016238869      1724
           P  0.1956034096 0.1981818182 0.1968841725      2200


# ESM-2 fine-tuned Recall, Precision, F1

In [6]:

all_trues = []
all_preds = []

# Use same file path and device setup as before
for record in tqdm(SeqIO.parse(fasta_file, "fasta"), desc="Processing sequences"):
    seq = str(record.seq).strip()
    if not seq:
        continue
    trues, preds = compute_token_predictions_for_metrics(model_finetuned, tokenizer, seq, mask_prob=0.15, device=device)
    all_trues.extend(trues)
    all_preds.extend(preds)

# Optionally convert to token strings for readability
id2token = tokenizer.convert_ids_to_tokens
true_tokens = [id2token(t) for t in all_trues]
pred_tokens = [id2token(p) for p in all_preds]

# Print precision, recall, F1 for each amino acid
print(classification_report(true_tokens, pred_tokens, zero_division=0, digits=10))


Processing sequences: 469it [00:09, 48.83it/s]


              precision    recall  f1-score   support

       <cls>  1.0000000000 0.9661016949 0.9827586207        59
       <eos>  1.0000000000 0.5660377358 0.7228915663        53
           A  0.1571646084 0.4321935334 0.2305067924      4299
           C  0.8794326241 0.5462555066 0.6739130435       681
           D  0.5100671141 0.1871921182 0.2738738739      2436
           E  0.2095214481 0.2565270188 0.2306537464      3294
           F  0.2355263158 0.1028144744 0.1431427429      1741
           G  0.3323833274 0.2777612385 0.3026273111      3359
           H  0.2081447964 0.0589743590 0.0919080919       780
           I  0.2682682683 0.1178540018 0.1637641308      2274
           K  0.4443277311 0.1372040221 0.2096654275      3083
           L  0.2272926338 0.5626626193 0.3237881340      4612
           M  0.6120218579 0.1728395062 0.2695547533      1296
           N  0.3649815043 0.1734036321 0.2351072280      1707
           P  0.2342215989 0.0779281381 0.1169467787      2143


In [17]:
import torch
from tqdm import tqdm
from sklearn.metrics import classification_report
from Bio import SeqIO

# ---------- 1. Helper functions ----------

def prepare_masked_input(tokenizer, sequence, mask_prob=0.15, device=torch.device("cpu")):
    """
    Prepares masked input_ids and labels for consistent evaluation.
    
    Returns:
        masked input_ids, ground truth labels, attention_mask, mask_positions
    """
    encoded = tokenizer(sequence, return_tensors="pt")
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)

    labels = input_ids.clone()
    probability_matrix = torch.rand(input_ids.shape).to(device)
    mask_positions = probability_matrix < mask_prob

    mask_token_id = tokenizer.mask_token_id
    if mask_token_id is None:
        raise ValueError("Tokenizer does not have a mask token.")
    
    masked_input_ids = input_ids.clone()
    masked_input_ids[mask_positions] = mask_token_id

    return masked_input_ids, labels, attention_mask, mask_positions

def compute_predictions(model, input_ids, attention_mask):
    """
    Computes model predictions.
    
    Returns:
        predictions tensor
    """
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    return predictions

# ---------- 2. Evaluation loop ----------

def evaluate_models(fasta_file, model_pretrained, model_finetuned, tokenizer, mask_prob=0.15, device=torch.device("cpu")):
    """
    Evaluates both models on the same masked inputs.
    """

    all_true_tokens = []

    all_preds_pretrained = []
    all_preds_finetuned = []

    for record in tqdm(SeqIO.parse(fasta_file, "fasta"), desc="Processing sequences"):
        seq = str(record.seq).strip()
        if not seq:
            continue

        # Step 1: Prepare masked inputs
        masked_input_ids, labels, attention_mask, mask_positions = prepare_masked_input(tokenizer, seq, mask_prob=mask_prob, device=device)

        # Step 2: Predictions
        preds_pretrained = compute_predictions(model_pretrained, masked_input_ids, attention_mask)
        preds_finetuned = compute_predictions(model_finetuned, masked_input_ids, attention_mask)

        # Step 3: Select only masked positions
        true_at_mask = labels[mask_positions].tolist()
        preds_pretrained_at_mask = preds_pretrained[mask_positions].tolist()
        preds_finetuned_at_mask = preds_finetuned[mask_positions].tolist()

        all_true_tokens.extend(true_at_mask)
        all_preds_pretrained.extend(preds_pretrained_at_mask)
        all_preds_finetuned.extend(preds_finetuned_at_mask)

    # ---------- 3. Generate reports ----------

    id2token = tokenizer.convert_ids_to_tokens

    true_tokens = [id2token(t) for t in all_true_tokens]
    preds_pretrained_tokens = [id2token(p) for p in all_preds_pretrained]
    preds_finetuned_tokens = [id2token(p) for p in all_preds_finetuned]

    print("=== Pretrained Model Report ===")
    print(classification_report(true_tokens, preds_pretrained_tokens, zero_division=0, digits=10))

    print("\n=== Finetuned Model Report ===")
    print(classification_report(true_tokens, preds_finetuned_tokens, zero_division=0, digits=10))

# ---------- 4. Usage example ----------

# Assuming you already have these:
# model_pretrained, model_finetuned, tokenizer, fasta_file, device

evaluate_models(fasta_file, model_pretrained, model_finetuned, tokenizer, mask_prob=0.15, device=device)


Processing sequences: 469it [00:36, 12.80it/s]


=== Pretrained Model Report ===
              precision    recall  f1-score   support

       <cls>  1.0000000000 1.0000000000 1.0000000000        67
       <eos>  1.0000000000 1.0000000000 1.0000000000        77
           A  0.4179775281 0.4384724187 0.4279797515      4242
           C  0.7202702703 0.7454545455 0.7326460481       715
           D  0.3835068054 0.3880113406 0.3857459231      2469
           E  0.3353526612 0.4807692308 0.3951057864      3224
           F  0.4971464807 0.4260869565 0.4588820603      1840
           G  0.5311727547 0.5985762922 0.5628637951      3231
           H  0.7008797654 0.3208053691 0.4401473297       745
           I  0.5675029869 0.4190560212 0.4821111393      2267
           K  0.2875275938 0.5024108004 0.3657423657      3111
           L  0.4108950453 0.6286937029 0.4969794603      4907
           M  0.7707509881 0.2950075643 0.4266958425      1322
           N  0.4785932722 0.1899271845 0.2719374457      1648
           P  0.4022503516 0.52