In [1]:
cd /home/sdowell/scratch/Thesis/ADP1

/home/sdowell/scratch/Thesis/ADP1


In [2]:
# Import packages
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM
from peft import PeftModel, PeftConfig
from autoamp.evolveFinetune import *
import torch
from tqdm import tqdm
import math
from Bio import SeqIO 
import json
import warnings
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from transformers import PreTrainedTokenizer

# Example inputs
base_model_name = "facebook/esm2_t30_150M_UR50D" 
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
adapter_checkpoint = "/home/sdowell/scratch/Thesis/ADP1/runs/esm2_dgoa_finetune_1/checkpoint-3000"

# Load models
model_pretrained = AutoModelForMaskedLM.from_pretrained(base_model_name)
model_with_adapter = AutoModelForMaskedLM.from_pretrained(base_model_name)
model_finetuned = PeftModel.from_pretrained(model_with_adapter, adapter_checkpoint)

def compute_mlm_loss(model, sequence, mask_prob=0.15, device=torch.device("cpu")):
    """
    Computes the MLM (masked language model) cross entropy loss for a given sequence.
    
    Args:
        sequence: Protein sequence as a string.
        mask_prob: The probability of masking a token.
        device: torch.device to run the computation.
    
    Returns:
        loss: The MLM cross entropy loss.
    """
    model.to(device)
    model.train()  # or model.eval() if you don't want dropout, etc.
    
    # Tokenize the sequence
    encoded = tokenizer(sequence, return_tensors="pt")
    input_ids = encoded.input_ids.to(device)
    
    # Create labels as a copy of input_ids.
    labels = input_ids.clone()
    
    # Create a mask for positions to replace.
    # Generate random values in [0, 1) for each token.
    probability_matrix = torch.rand(input_ids.shape).to(device)
    # Create a boolean mask for tokens to mask.
    mask = probability_matrix < mask_prob
    
    # For positions NOT selected for masking, set the corresponding label to -100 so they are ignored.
    labels[~mask] = -100
    
    # Replace the selected input positions with the mask token.
    mask_token_id = tokenizer.mask_token_id
    input_ids[mask] = mask_token_id
    
    # Forward pass: the model automatically computes the loss when labels are provided.
    outputs = model(input_ids, labels=labels)
    loss = outputs.loss
    return loss

# Example usage:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sequence = "MQWQTKLPLIAILRGITPDEALAHVGAVIDAGFDAVEIPLNSPQWEQSIPAIVDAYGDKALIGAGTVLKPEQVDALARMGCQLIVTPNIHSEVIRRAVGYGMTVCPGCATATEAFTALEAGAQALKIFPSSAFGPQYIKALKAVLPSDIAVFAVGGVTPENLAQWIDAGCAGAGLGSDLYRAGQSVERTAQQAAAFVKAYREAVQ"
loss = compute_mlm_loss(model_pretrained, sequence, mask_prob=0.15, device=device)
print(f"Pretrained MLM Cross Entropy Loss: {loss.item():.4f}")
loss = compute_mlm_loss(model_finetuned, sequence, mask_prob=0.15, device=device)
print(f"Finetuned MLM Cross Entropy Loss: {loss.item():.4f}")



2025-04-18 17:30:47.472310: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-18 17:30:47.490832: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-18 17:30:47.490866: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-18 17:30:47.503816: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Pretrained MLM Cross Entropy Loss: 1.3070
Finetuned MLM Cross Entropy Loss: 0.0118


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import numpy as np

def compute_token_prediction_accuracy(model, tokenizer, sequence, mask_prob=0.15, device=torch.device("cpu")):
    """
    Computes token prediction accuracy for a masked language model.
    
    Args:
        model: The masked language model.
        tokenizer: The corresponding tokenizer.
        sequence: A string representing the protein (or other) sequence.
        mask_prob: The probability with which to mask tokens (e.g., 0.15 for 15%).
        device: torch.device to run the computations.
    
    Returns:
        A tuple (accuracy, correct, total_masked) where:
          - accuracy is the fraction of masked tokens correctly predicted.
          - correct is the number of correct predictions.
          - total_masked is the total number of masked tokens.
    """
    # Tokenize the input sequence.
    encoded = tokenizer(sequence, return_tensors="pt")
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)
    
    # Create a copy of the original tokens to serve as labels.
    labels = input_ids.clone()
    
    # Create a random mask for tokens according to mask_prob.
    probability_matrix = torch.rand(input_ids.shape).to(device)
    mask_positions = probability_matrix < mask_prob
    
    # Replace the selected token positions in input_ids with the mask token ID.
    mask_token_id = tokenizer.mask_token_id
    if mask_token_id is None:
        raise ValueError("The tokenizer does not have a mask token.")
    
    input_ids[mask_positions] = mask_token_id
    
    # Forward pass through the model.
    model.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # shape: [batch_size, sequence_length, vocab_size]
    
    # Get predictions (top candidate from the logits) using argmax.
    predictions = torch.argmax(logits, dim=-1)
    
    # Consider only the masked positions.
    masked_labels = labels[mask_positions]
    masked_predictions = predictions[mask_positions]
    
    # Calculate the number of correct predictions.
    correct = (masked_predictions == masked_labels).sum().item()
    total_masked = mask_positions.sum().item()
    
    accuracy = correct / total_masked if total_masked > 0 else 0.0
    return accuracy, correct, total_masked

sequence = (
    'MQWQTKLPLIAILRGITPDEALAHVGAVIDAGFDAVEIPLNSPQWEQSIPAIVDAYGDKA'
    'LIGAGTVLKPEQVDALARMGCQLIVTPNIHSEVIRRAVGYGMTVCPGCATATEAFTALEA'
    'GAQALKIFPSSAFGPQYIKALKAVLPSDIAVFAVGGVTPENLAQWIDAGCAGAGLGSDLY'
    'RAGQSVERTAQQAAAFVKAYREAVQ'
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
accuracy, correct, total_masked = compute_token_prediction_accuracy(
    model_pretrained, tokenizer, sequence, mask_prob=0.15, device=device
)
print(f"pretrained Token Prediction Accuracy: {accuracy:.4f} ({correct}/{total_masked})")
accuracy, correct, total_masked = compute_token_prediction_accuracy(
    model_finetuned, tokenizer, sequence, mask_prob=0.15, device=device
)
print(f"finetuned Token Prediction Accuracy: {accuracy:.4f} ({correct}/{total_masked})")


pretrained Token Prediction Accuracy: 0.5000 (14/28)
finetuned Token Prediction Accuracy: 0.9545 (21/22)


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from Bio import SeqIO
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM
from peft import PeftModel, PeftConfig
from autoamp.evolveFinetune import *
import torch
from tqdm import tqdm
import math
from Bio import SeqIO 
import json
import warnings
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from transformers import PreTrainedTokenizer


base_model_name = "facebook/esm2_t30_150M_UR50D" 
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
adapter_checkpoint = "/home/sdowell/scratch/Thesis/ADP1/runs/esm2_dgoa_finetune_1/checkpoint-3000"

# Load models
model_pretrained = AutoModelForMaskedLM.from_pretrained(base_model_name)
model_with_adapter = AutoModelForMaskedLM.from_pretrained(base_model_name)
model_finetuned = PeftModel.from_pretrained(model_with_adapter, adapter_checkpoint)

def compute_token_prediction_accuracy(model, tokenizer, sequence, mask_prob=0.15, device=torch.device("cpu")):
    """
    Computes token prediction accuracy for a given sequence from a masked language model.
    
    Args:
        model: The masked language model.
        tokenizer: The corresponding tokenizer.
        sequence: A string representing the protein (or other) sequence.
        mask_prob: The probability of masking a token (default is 15%).
        device: Torch device on which to run computations.
    
    Returns:
        A tuple (correct, total_masked) where:
          - correct: number of masked tokens correctly predicted.
          - total_masked: total number of masked tokens.
    """
    # Tokenize input sequence.
    encoded = tokenizer(sequence, return_tensors="pt")
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)
    
    # Create a copy of input_ids for ground-truth labels.
    labels = input_ids.clone()
    
    # Create a random mask for tokens.
    probability_matrix = torch.rand(input_ids.shape).to(device)
    mask_positions = probability_matrix < mask_prob
    
    # Replace tokens at masked positions with the mask token.
    mask_token_id = tokenizer.mask_token_id
    if mask_token_id is None:
        raise ValueError("The tokenizer does not have a mask token.")
    input_ids[mask_positions] = mask_token_id
    
    # Forward pass through the model.
    model.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # shape: [batch, seq_length, vocab_size]
    
    # Get predicted token IDs.
    predictions = torch.argmax(logits, dim=-1)
    
    # Evaluate only on masked positions.
    masked_labels = labels[mask_positions]
    masked_predictions = predictions[mask_positions]
    
    correct = (masked_predictions == masked_labels).sum().item()
    total_masked = mask_positions.sum().item()
    
    return correct, total_masked

# ----- Main script to process FASTA file with a progress bar -----

fasta_file = "/home/sdowell/scratch/Thesis/ADP1/finetuning_data/test/dgoa_mutants_test.fasta"


# Set device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

total_correct = 0
total_masked = 0
n_sequences = 0

# Count number of records in FASTA for progress bar (optional, if file is large)
records = list(SeqIO.parse(fasta_file, "fasta"))
n_records = len(records)

# Iterate over sequences with a progress bar.
for record in tqdm(records, desc="Processing sequences"):
    seq = str(record.seq).strip()
    if not seq:
        continue
    correct, masked = compute_token_prediction_accuracy(model_pretrained, tokenizer, seq, mask_prob=0.15, device=device)
    total_correct += correct
    total_masked += masked
    n_sequences += 1

# Compute overall accuracy.
overall_accuracy = total_correct / total_masked if total_masked > 0 else 0.0

print(f"\nProcessed {n_sequences} sequences.")
print(f"pretrained Overall Token Prediction Accuracy: {overall_accuracy:.4f} ({total_correct}/{total_masked})")



total_correct = 0
total_masked = 0
n_sequences = 0

# Count number of records in FASTA for progress bar (optional, if file is large)
records = list(SeqIO.parse(fasta_file, "fasta"))
n_records = len(records)

# Iterate over sequences with a progress bar.
for record in tqdm(records, desc="Processing sequences"):
    seq = str(record.seq).strip()
    if not seq:
        continue
    correct, masked = compute_token_prediction_accuracy(model_finetuned, tokenizer, seq, mask_prob=0.15, device=device)
    total_correct += correct
    total_masked += masked
    n_sequences += 1

# Compute overall accuracy.
overall_accuracy = total_correct / total_masked if total_masked > 0 else 0.0

print(f"\nProcessed {n_sequences} sequences.")
print(f"finetuned Overall Token Prediction Accuracy: {overall_accuracy:.4f} ({total_correct}/{total_masked})")


Processing sequences: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1953/1953 [00:51<00:00, 37.93it/s]



Processed 1953 sequences.
pretrained Overall Token Prediction Accuracy: 0.5534 (33101/59819)


Processing sequences: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1953/1953 [01:07<00:00, 28.72it/s]


Processed 1953 sequences.
finetuned Overall Token Prediction Accuracy: 0.9831 (58754/59761)





# ESM-2 Pretrained Recall, Precision, F1

In [5]:
def compute_token_predictions_for_metrics(model, tokenizer, sequence, mask_prob=0.15, device=torch.device("cpu")):
    """
    Returns token-level predictions and labels for masked positions, to support precision/recall/F1.
    
    Returns:
        Two lists: true token IDs and predicted token IDs at masked positions.
    """
    encoded = tokenizer(sequence, return_tensors="pt")
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)
    
    labels = input_ids.clone()
    probability_matrix = torch.rand(input_ids.shape).to(device)
    mask_positions = probability_matrix < mask_prob

    # Ensure mask token is defined
    mask_token_id = tokenizer.mask_token_id
    if mask_token_id is None:
        raise ValueError("Tokenizer does not have a mask token.")
    input_ids[mask_positions] = mask_token_id

    model.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    predictions = torch.argmax(logits, dim=-1)

    true_tokens = labels[mask_positions].tolist()
    predicted_tokens = predictions[mask_positions].tolist()

    return true_tokens, predicted_tokens

from sklearn.metrics import classification_report

all_trues = []
all_preds = []

# Use same file path and device setup as before
for record in tqdm(SeqIO.parse(fasta_file, "fasta"), desc="Processing sequences"):
    seq = str(record.seq).strip()
    if not seq:
        continue
    trues, preds = compute_token_predictions_for_metrics(model_pretrained, tokenizer, seq, mask_prob=0.15, device=device)
    all_trues.extend(trues)
    all_preds.extend(preds)

# Optionally convert to token strings for readability
id2token = tokenizer.convert_ids_to_tokens
true_tokens = [id2token(t) for t in all_trues]
pred_tokens = [id2token(p) for p in all_preds]

# Print precision, recall, F1 for each amino acid
print(classification_report(true_tokens, pred_tokens, zero_division=0, digits=10))


Processing sequences: 1953it [00:51, 38.08it/s]


              precision    recall  f1-score   support

       <cls>  1.0000000000 1.0000000000 1.0000000000       290
       <eos>  1.0000000000 1.0000000000 1.0000000000       296
           A  0.5616942910 0.6894780107 0.6190607990      9732
           C  0.0000000000 0.0000000000 0.0000000000      1166
           D  0.3831498730 0.3662484824 0.3745085868      2471
           E  0.5674479167 0.7239202658 0.6362043796      3010
           F  0.3720298710 0.2870612886 0.3240685985      1909
           G  0.9081651895 0.8520877565 0.8792332268      5652
           H  0.0318471338 0.0097656250 0.0149476831       512
           I  0.5869186047 0.4847539016 0.5309664694      4165
           K  0.3742603550 0.3367346939 0.3545072396      2254
           L  0.4505844846 0.6184621796 0.5213419989      4799
           M  0.8672566372 0.5147058824 0.6460118655       952
           N  0.6695880806 0.6430976431 0.6560755689      1188
           P  0.6539114043 0.8523341523 0.7400533333      4070


# ESM-2 fine-tuned Recall, Precision, F1

In [6]:

all_trues = []
all_preds = []

# Use same file path and device setup as before
for record in tqdm(SeqIO.parse(fasta_file, "fasta"), desc="Processing sequences"):
    seq = str(record.seq).strip()
    if not seq:
        continue
    trues, preds = compute_token_predictions_for_metrics(model_finetuned, tokenizer, seq, mask_prob=0.15, device=device)
    all_trues.extend(trues)
    all_preds.extend(preds)

# Optionally convert to token strings for readability
id2token = tokenizer.convert_ids_to_tokens
true_tokens = [id2token(t) for t in all_trues]
pred_tokens = [id2token(p) for p in all_preds]

# Print precision, recall, F1 for each amino acid
print(classification_report(true_tokens, pred_tokens, zero_division=0, digits=10))


Processing sequences: 1953it [01:07, 28.76it/s]


              precision    recall  f1-score   support

       <cls>  1.0000000000 1.0000000000 1.0000000000       301
       <eos>  1.0000000000 0.9393939394 0.9687500000       297
       <unk>  1.0000000000 1.0000000000 1.0000000000         1
           A  0.9831342235 0.9894917652 0.9863027495      9897
           C  0.9927338783 0.9927338783 0.9927338783      1101
           D  0.9618627057 0.9827727646 0.9722053155      2438
           E  0.9808580858 0.9779532741 0.9794035261      3039
           F  0.9685777288 0.9436090226 0.9559303591      1862
           G  0.9952188305 0.9955849890 0.9954018760      5436
           H  0.9910514541 0.8618677043 0.9219562955       514
           I  0.9908844543 0.9816939224 0.9862677783      4097
           K  0.9795454545 0.9755545496 0.9775459288      2209
           L  0.9943181818 0.9913973982 0.9928556419      4766
           M  0.9894291755 0.9801047120 0.9847448711       955
           N  0.9803082192 0.9454995871 0.9625893232      1211
