In [1]:
import torch
import math
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# --- 1. Setup & model loading ----------------------------------------------

# Hub id of the base ProGen2 checkpoint and local path of the PEFT adapter
# produced by the fine-tuning run.
base_model_name    = "hugohrban/progen2-small"
adapter_checkpoint = "/home/sdowell/scratch/Thesis/ADP1/runs/progen2_dgoa_finetune_1/checkpoint-3000"

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The base repository ships custom modelling code. Without
# trust_remote_code=True every from_pretrained call stops at an interactive
# "[y/N]" prompt, which blocks non-interactive runs (scripts, batch jobs,
# "Run All"). NOTE: this flag executes code downloaded from the Hub
# repository -- review that code before enabling it for a new repository.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)

# Two independent copies of the base weights are loaded: one is scored as-is,
# the other is wrapped with the adapter so both can be compared side by side.
model_pretrained = AutoModelForCausalLM.from_pretrained(
    base_model_name, trust_remote_code=True
).to(device)
model_with_adapter = AutoModelForCausalLM.from_pretrained(
    base_model_name, trust_remote_code=True
).to(device)
model_finetuned = PeftModel.from_pretrained(model_with_adapter, adapter_checkpoint).to(device)

# --- 2. Sequence & mutations -----------------------------------------------

# Wild-type DgoA amino-acid sequence, stored in 60-residue chunks so that
# 1-based positions are easy to locate by eye.
DgoA_seq = "".join([
    "MQWQTKLPLIAILRGITPDEALAHVGAVIDAGFDAVEIPLNSPQWEQSIPAIVDAYGDKA",  # 1-60
    "LIGAGTVLKPEQVDALARMGCQLIVTPNIHSEVIRRAVGYGMTVCPGCATATEAFTALEA",  # 61-120
    "GAQALKIFPSSAFGPQYIKALKAVLPSDIAVFAVGGVTPENLAQWIDAGCAGAGLGSDLY",  # 121-180
    "RAGQSVERTAQQAAAFVKAYREAVQ",                                     # 181-205
])

# Point mutations to score, written as <wild type><1-based position><mutant>.
mutations = [
    "F33I",
    "D58N",
    "A75V",
    "Q72H",
    "V85A",
    "V154F",
    "Y180F",
]

# --- 3. Utility functions --------------------------------------------------

def parse_mutation(mut_str):
    """Parse a point mutation such as 'F33I' into (wt='F', pos=33, mut='I').

    Args:
        mut_str: Wild-type letter, position, mutant letter, with no
            separators (e.g. 'V154F').

    Returns:
        Tuple ``(wt, pos, mutant)`` where ``wt`` and ``mutant`` are the first
        and last characters and ``pos`` is the integer between them.

    Raises:
        ValueError: If the string is not letter + digits + letter, or the
            position is smaller than 1.
    """
    wt, digits, mutant = mut_str[:1], mut_str[1:-1], mut_str[-1:]

    # Without this check inputs such as 'F33' or '33I' would be sliced into a
    # plausible-looking but wrong tuple, and 'F0I' would later index the
    # sequence from the end.
    well_formed = (
        len(mut_str) >= 3
        and wt.isalpha()
        and mutant.isalpha()
        and digits.isascii()
        and digits.isdigit()
    )
    if not well_formed or int(digits) < 1:
        raise ValueError(
            f"Invalid mutation {mut_str!r}: expected "
            "<letter><position >= 1><letter>, e.g. 'F33I'"
        )

    return wt, int(digits), mutant

def compute_mutation_llr_autoregressive(model, tokenizer, sequence, mutation, device):
    """
    Score a point mutation with an autoregressive (left-to-right) model.

    The model is fed the residues that precede the mutated position, and the
    next-token distribution at the last prefix position is used to compare
    the mutant residue against the wild type.

    Args:
        model: Causal LM whose forward output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``; must map single residue
            letters to ids via ``convert_tokens_to_ids``.
        sequence: Wild-type sequence as a string of one-letter residues.
        mutation: Mutation string understood by ``parse_mutation``
            (e.g. 'F33I', 1-based position).
        device: Device the tokenized inputs are moved to.

    Returns:
        (llr, log_prob_wt, log_prob_mut) as Python floats, where
        ``llr = log_prob_mut - log_prob_wt`` (natural log).

    Raises:
        ValueError: If the position is outside the sequence, a residue is
            not in the tokenizer vocabulary, or the prefix tokenizes to
            nothing (e.g. a mutation at position 1).
    """
    wt, pos, mut = parse_mutation(mutation)
    if not 1 <= pos <= len(sequence):
        # A position <= 0 would silently wrap around via negative indexing.
        raise ValueError(
            f"Position {pos} in {mutation!r} is outside the sequence "
            f"(length {len(sequence)})"
        )
    idx = pos - 1  # zero-based
    if sequence[idx] != wt:
        # Best-effort: report the mismatch but still score the mutation.
        print(f"Warning: expected {wt} at {pos}, got {sequence[idx]}")

    # Convert single-letter tokens to ids and make sure both are real
    # vocabulary entries; an unknown token would otherwise be scored silently.
    wt_id  = tokenizer.convert_tokens_to_ids(wt)
    mut_id = tokenizer.convert_tokens_to_ids(mut)
    unk_id = getattr(tokenizer, "unk_token_id", None)
    for residue, token_id in ((wt, wt_id), (mut, mut_id)):
        if token_id is None or (unk_id is not None and token_id == unk_id):
            raise ValueError(f"Residue {residue!r} is not in the tokenizer vocabulary")

    # Prefix up to -- but not including -- the mutated position.
    # NOTE(review): the prefix is passed exactly as sliced. If the checkpoint
    # expects a leading start-of-sequence token, confirm whether the tokenizer
    # adds it or whether it has to be prepended here.
    prefix = sequence[:idx]
    inputs = tokenizer(prefix, return_tensors="pt").to(device)
    if inputs["input_ids"].shape[-1] == 0:
        raise ValueError(
            f"Cannot score {mutation!r}: the prefix before position {pos} "
            "produced no tokens, so there is no context to condition on"
        )

    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits  # presumably [1, L, V]
        # log_softmax works in log space directly, so very small
        # probabilities are neither clamped by an epsilon nor lost to
        # underflow; upcast in case the model runs in reduced precision.
        log_probs = torch.log_softmax(logits[0, -1, :].float(), dim=-1)  # [V]

    lp_wt  = log_probs[wt_id].item()
    lp_mut = log_probs[mut_id].item()

    llr = lp_mut - lp_wt
    return llr, lp_wt, lp_mut

# --- 4. Compute & report -----------------------------------

# Column order of the final table; also the key order of every record.
report_columns = ["Model", "Mutation", "logP(wt)", "logP(mut)", "LLR"]

# Models are scored in this order for every mutation.
models_to_score = [
    (model_pretrained, "ProGen2 (pretrained)"),
    (model_finetuned,  "ProGen2 (finetuned)"),
]

records = []
for mutation in mutations:
    for model, label in models_to_score:
        llr, lp_wt, lp_mut = compute_mutation_llr_autoregressive(
            model=model,
            tokenizer=tokenizer,
            sequence=DgoA_seq,
            mutation=mutation,
            device=device,
        )
        # Scores are stored as fixed six-decimal strings, in column order.
        formatted_scores = [f"{value:.6f}" for value in (lp_wt, lp_mut, llr)]
        records.append(dict(zip(report_columns, [label, mutation, *formatted_scores])))

df = pd.DataFrame(records, columns=report_columns)
print(df.to_markdown(index=False))



2025-04-18 18:22:40.378910: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-18 18:22:40.396502: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-18 18:22:40.396534: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-18 18:22:40.408757: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


The repository for hugohrban/progen2-small contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/hugohrban/progen2-small.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y
The repository for hugohrban/progen2-small contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/hugohrban/progen2-small.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y
The repository for hugohrban/progen2-small contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/hugohrban/progen2-small.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y
The repositor

ProGenForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


| Model                | Mutation   |   logP(wt) |   logP(mut) |       LLR |
|:---------------------|:-----------|-----------:|------------:|----------:|
| ProGen2 (pretrained) | F33I       |  -1.9043   |   -1.46858  |  0.435722 |
| ProGen2 (finetuned)  | F33I       |  -0.179315 |   -1.87186  | -1.69254  |
| ProGen2 (pretrained) | D58N       |  -1.77341  |   -2.95379  | -1.18038  |
| ProGen2 (finetuned)  | D58N       |  -0.198515 |   -1.74838  | -1.54987  |
| ProGen2 (pretrained) | A75V       |  -1.54974  |   -4.85515  | -3.30541  |
| ProGen2 (finetuned)  | A75V       |  -0.514432 |   -0.942083 | -0.42765  |
| ProGen2 (pretrained) | Q72H       |  -0.883275 |   -3.46503  | -2.58176  |
| ProGen2 (finetuned)  | Q72H       |  -0.248708 |   -1.52757  | -1.27886  |
| ProGen2 (pretrained) | V85A       |  -0.103677 |   -5.3091   | -5.20542  |
| ProGen2 (finetuned)  | V85A       |  -0.485737 |   -0.964421 | -0.478683 |
| ProGen2 (pretrained) | V154F      |  -0.33779  |   -5.13948  | -4.80169  |