# Fine-Tune an LLM for Antibody Sequence Generation

Model: meta-llama/Llama-3.1-8B-Instruct

In [18]:
# pip install -r ../requirements.txt

In [19]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
from trl import SFTTrainer
from peft import get_peft_model, LoraConfig, TaskType
import pandas as pd
import torch
import os, re
from transformers import TrainerCallback

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
torch.cuda.empty_cache()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Test your GPU setup
print(f"Number of GPUs: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    print(f"Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.1f} GB")

Using device: cuda
Number of GPUs: 1
GPU 0: NVIDIA H100 NVL
Memory: 99.9 GB


In [20]:
## Load dataset
df = pd.read_csv("../data/sabdab/sabdab_training_dataset.csv")

df.columns

Index(['pdb_id', 'h_chain_id', 'l_chain_id', 'antigen_ids', 'h_chain_seq',
       'l_chain_seq', 'antigen_seqs', 'antibody_seqs', 'h_chain_fv_seq',
       'l_chain_fv_seq', 'antibody_fv_seqs', 'highlighted_epitope_seqs',
       'epitope_residues'],
      dtype='object')

In [21]:
## Remove rows with missing sequences
df = df.dropna(subset=['h_chain_seq', 'l_chain_seq', 'antigen_seqs', 'highlighted_epitope_seqs'])

df.head()

Unnamed: 0,pdb_id,h_chain_id,l_chain_id,antigen_ids,h_chain_seq,l_chain_seq,antigen_seqs,antibody_seqs,h_chain_fv_seq,l_chain_fv_seq,antibody_fv_seqs,highlighted_epitope_seqs,epitope_residues
0,8xa4,C,D,A|B,QLQLQESGPGLVKPSETLSLTCTVSGGSISSNNDYWGWIRQPPGKG...,EIVLTQSPGTLSLSPGERVTLSCRASQRVSSTYLAWYQQKPGQAPR...,SCNGLYYQGSCYILHSDYKSFEDAKANCAAESSTLPNKSDVLTTWL...,QLQLQESGPGLVKPSETLSLTCTVSGGSISSNNDYWGWIRQPPGKG...,QLQLQESGPGLVKPSETLSLTCTVSGGSISSNNDYWGWIRQPPGKG...,EIVLTQSPGTLSLSPGERVTLSCRASQRVSSTYLAWYQQKPGQAPR...,QLQLQESGPGLVKPSETLSLTCTVSGGSISSNNDYWGWIRQPPGKG...,SCNGLYYQGSCYI[L]HSD[Y]KSFEDAKANCAAESSTLPNKSDVL...,A:ARG 176|A:ASP 146|A:ASP 150|A:ASP 170|A:GLN ...
1,9cph,H,L,A,EVQLVESGGGLVQPGGSLRLSCAASGFNLSSSSIHWVRQAPGKGLE...,AQMTQSPSSLSASVGDRVTITCRASQSVSSAVAWYQQKPGKAPKLL...,KIEEGKLVIWINGDKGYNGLAEVGKKFEKDTGIKVTVEHPDKLEEK...,EVQLVESGGGLVQPGGSLRLSCAASGFNLSSSSIHWVRQAPGKGLE...,EVQLVESGGGLVQPGGSLRLSCAASGFNLSSSSIHWVRQAPGKGLE...,AQMTQSPSSLSASVGDRVTITCRASQSVSSAVAWYQQKPGKAPKLL...,EVQLVESGGGLVQPGGSLRLSCAASGFNLSSSSIHWVRQAPGKGLE...,KIEEGKLVIWINGDKGYNGLAEVGKKFEKDTGIKVTVEHPDKLEEK...,A:ALA 1116|A:ALA 1122|A:ALA 1128|A:ALA 900|A:A...
2,9d7i,H,G,E,VQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLEW...,YELTQPPSVSVSPGQTATITCSGASTNVCWYQVKPGQSPEVVIFEN...,LWVTVYYGVPVWKDAETTLFCASDNVWATHACVPTDPNPQEIHLEN...,VQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLEW...,VQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLEW...,YELTQPPSVSVSPGQTATITCSGASTNVCWYQVKPGQSPEVVIFEN...,VQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLEW...,LWVTVYYGVPVWKDAETTLFCASDNVWATHACVPTDPNPQEIHLEN...,E:ARG 429|E:ARG 469|E:ASN 177|E:ASN 197|E:ASN ...
3,9d7i,J,I,C,VQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLEW...,YELTQPPSVSVSPGQTATITCSGASTNVCWYQVKPGQSPEVVIFEN...,LWVTVYYGVPVWKDAETTLFCASDNVWATHACVPTDPNPQEIHLEN...,VQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLEW...,VQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLEW...,YELTQPPSVSVSPGQTATITCSGASTNVCWYQVKPGQSPEVVIFEN...,VQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLEW...,LWVTVYYGVPVWKDAETTLFCASDNVWATHACVPTDPNPQEIHLEN...,C:ARG 469|C:ASN 197|C:ASN 280|C:ASN 425|C:ASP ...
4,9d7o,H,G,E,QVQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLE...,YELTQPPSVSVSPGQTATITCSGASTNVCWYQVKPGQSPEVVIFEN...,LWVTVYYGVPVWKDAETTLFCASDNVWATHACVPTDPNPQEIHLEN...,QVQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLE...,QVQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLE...,YELTQPPSVSVSPGQTATITCSGASTNVCWYQVKPGQSPEVVIFEN...,QVQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLE...,LWVTVYYGVPVWKDAETTLFCASDNVWATHACVPTDPNPQEIHLEN...,E:ARG 429|E:ARG 469|E:ASN 197|E:ASN 280|E:ASN ...


In [22]:
!export HF_HOME="/mnt/batch/tasks/shared/LS_root/mounts/clusters/colby-h100-01-ci/code/.cache/huggingface/"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [23]:
## Load base tokenizer and model FIRST
model_name = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16, # Load model in bfloat16 for better performance
    cache_dir="/mnt/batch/tasks/shared/LS_root/mounts/clusters/colby-h100-01-ci/code/.cache/huggingface/"
)

Loading checkpoint shards: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–

In [24]:
## Add epitope tokens
epitope_tokens = ["<epi>", "</epi>"]
tokenizer.add_special_tokens({"additional_special_tokens": epitope_tokens})

## Add amino acid tokens
amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
extra_tokens = amino_acids + ["|"]
new_tokens = [t for t in extra_tokens if t not in tokenizer.get_vocab()]
tokenizer.add_tokens(new_tokens)

## Add task-specific tokens
task_tokens = ["Antigen", "Antibody"]
tokenizer.add_tokens(task_tokens)

## Set pad token
tokenizer.pad_token = tokenizer.eos_token

## Resize model embeddings ONCE after adding all tokens
model.resize_token_embeddings(len(tokenizer))
model.train()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128260, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_

In [25]:
## Epitope and Prompt Formatter function
def format_prompt(example):
    epitope_seq = re.sub(r'\[([A-Z])\]', r'<epi>\1</epi>', example['highlighted_epitope_seqs'])
    return {
        "text": f"Antigen: {epitope_seq}<|im_end|>\nAntibody: {example['antibody_fv_seqs']}<|im_end|>\n"
    }

## Create dataset with all tokens available
dataset = Dataset.from_pandas(df)
dataset = dataset.map(format_prompt)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ

In [26]:
# Check truncation at 800
sequence_lengths = [len(tokenizer(example["text"], truncation=False)["input_ids"]) for example in dataset]
truncated_800 = sum(1 for length in sequence_lengths if length > 800)
print(f"Sequences truncated at max_length=800: {truncated_800}/{len(sequence_lengths)} ({100*truncated_800/len(sequence_lengths):.1f}%)")

Sequences truncated at max_length=800: 267/9523 (2.8%)


In [27]:
## Tokenize the dataset
def tokenize(example):
    encoded = tokenizer(example["text"], truncation=True, max_length=800)
    # Make sure labels are a proper list, not nested
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

tokenized_dataset = dataset.map(tokenize)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ

In [28]:
# Verify tokenization is working with epitope tokens
print("Sample tokenized text:")
sample_tokens = tokenizer.tokenize(dataset[0]['text'][:200])
print(sample_tokens)

Sample tokenized text:
['Antigen', ':', 'Ä SCN', 'GL', 'YY', 'Q', 'G', 'SC', 'Y', 'I', '<epi>', 'L', '</epi>', 'H', 'SD', '<epi>', 'Y', '</epi>', 'K', 'SF', 'ED', 'AK', 'AN', 'CAA', 'ES', 'ST', 'LP', 'NK', 'SD', 'VL', 'TT', 'W', 'LI', '<epi>', 'D', '</epi>', '<epi>', 'Y', '</epi>', 'V', '<epi>', 'E', '</epi>', '<epi>', 'D', '</epi>', '<epi>', 'T', '</epi>', 'WG', 'SD', 'GN', 'P', 'IT', 'K', 'TT', 'SD', '<epi>', 'Y', '</epi>', 'Q', 'DS', '<epi>', 'D', '</epi>', 'VS', '<epi>', 'Q', '</epi>', '<epi>', 'E']


In [29]:
# Remove unnecessary columns
tokenized_dataset = tokenized_dataset.remove_columns([
    'pdb_id', 'h_chain_id', 'l_chain_id', 'antigen_ids', 'antigen_seqs',
    'h_chain_seq', 'l_chain_seq', 'antibody_seqs',
    'highlighted_epitope_seqs', 'epitope_residues','h_chain_fv_seq',
       'l_chain_fv_seq', 'antibody_fv_seqs', 'text'
])
print("Columns after removal:", tokenized_dataset.column_names)
# Should show: ['input_ids', 'attention_mask', 'labels']


Columns after removal: ['input_ids', 'attention_mask', 'labels']


In [30]:
# Apply the gradient fix to your model
if hasattr(model, 'enable_input_require_grads'):
    model.enable_input_require_grads()
else:
    def make_inputs_require_grad(module, input, output):
        output.requires_grad_(True)
    model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

In [31]:
# # Create data collator
# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer,
#     mlm=False,
#     return_tensors="pt",
#     pad_to_multiple_of=8, # Pad to multiple of 8 for better performance on GPUs
# )
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    return_tensors="pt",
)

In [32]:
 # Configure LoRA
## PEFT configuration
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    # target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]
    target_modules=["o_proj", "qkv_proj"],
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)

# Print trainable parameters
model.print_trainable_parameters()

trainable params: 2,097,152 || all params: 8,032,391,168 || trainable%: 0.0261


In [33]:
## Training Arguments
training_args = TrainingArguments(
    # output_dir=f"../models/peleke-{model_name.split('/')[-1]}-0806025",
    output_dir=f"../models/peleke-llama-3.1-8b-instruct",
    per_device_train_batch_size=9,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=6,
    num_train_epochs=3,
    warmup_steps=25,
    weight_decay=0.01,
    learning_rate=2e-4,
    logging_dir="../logs",
    logging_steps=25,
    gradient_checkpointing=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none", #"wandb",  # Enable wandb reporting
    run_name=f"lora-epitope-{model_name.split('/')[-1]}",  # Run name for wandb
    # optim="adamw_torch",
    fp16=True,  # Enable mixed precision training
    dataloader_num_workers=8,  # Add parallel data loading
    dataloader_pin_memory=True,  # Pin memory for faster data loading
    remove_unused_columns=False,
    max_grad_norm=1.0,
)

In [34]:
import re

def convert_brackets_to_epi(sequence):
    """Convert [X] format to <epi>X</epi> format"""
    return re.sub(r'\[([A-Z])\]', r'<epi>\1</epi>', sequence)

# Convert your bracket sequences to the training format
sequences_with_brackets = [
    "KVFGRCELAAAM[K][R]HGL[D][N][Y]RG[Y][S]LG[N]WVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCA[K]KIVSDGNGMNAWVAWRNRCK[G][T][D]V[Q]AW[I][R]GCRL",
    "NLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKCYGVSPTKLNDLCFTNVYADSFVI[R]G[N]EV[S][Q]IAPGQ[T]GNIADYNYKLPDDFTGCVIAWNSN[K]LDSKPSGNYNYLYRLLRKSKLKPFERDISTEIYQAGNKPCNGVAGPNCYSPLQSYGF[R]P[T][Y][G][V]GH[Q]PYRVVVLSFELLHAPATVCGP",
]

# Convert to the exact training format
test_antigens = [convert_brackets_to_epi(seq) for seq in sequences_with_brackets]

# Verify the conversion
for i, (orig, conv) in enumerate(zip(sequences_with_brackets, test_antigens)):
    print(f"=== Sequence {i+1} ===")
    print(f"Original: {orig[:60]}...")
    print(f"Converted: {conv[:60]}...")
    print("-" * 60)

print(f"\nFinal test_antigens for training callback:")
for i, antigen in enumerate(test_antigens):
    print(f"Test {i+1}: {antigen[:80]}...")




=== Sequence 1 ===
Original: KVFGRCELAAAM[K][R]HGL[D][N][Y]RG[Y][S]LG[N]WVCAAKFESNFNTQATN...
Converted: KVFGRCELAAAM<epi>K</epi><epi>R</epi>HGL<epi>D</epi><epi>N</e...
------------------------------------------------------------
=== Sequence 2 ===
Original: NLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKCYGVSPTKLNDLCFT...
Converted: NLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKCYGVSPTKLNDLCFT...
------------------------------------------------------------

Final test_antigens for training callback:
Test 1: KVFGRCELAAAM<epi>K</epi><epi>R</epi>HGL<epi>D</epi><epi>N</epi><epi>Y</epi>RG<ep...
Test 2: NLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKCYGVSPTKLNDLCFTNVYADSFVI<epi>R</epi...


In [35]:
from transformers import TrainerCallback
import torch
from datetime import datetime
import os

class TestGenerationCallback(TrainerCallback):
    def __init__(self, model, tokenizer, test_antigens, log_every_n_steps=100, output_file="test_generations.txt"):
        self.model = model
        self.tokenizer = tokenizer
        self.test_antigens = test_antigens
        self.log_every_n_steps = log_every_n_steps
        self.output_file = output_file
        
        # Create/clear the output file
        with open(self.output_file, 'w') as f:
            f.write(f"Test Generation Log - Started: {datetime.now()}\n")
            f.write("="*80 + "\n\n")
    
    def create_test_prompt(self, antigen_with_epitopes):
        return f"Antigen: {antigen_with_epitopes}<|im_end|>\nAntibody:"
    
    def generate_antibody_test(self, antigen_with_epitopes, max_length=800):
        """Generate antibody for testing during training"""
        prompt = self.create_test_prompt(antigen_with_epitopes)
        
        # Tokenize
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_length)
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
        
        # Generate
        self.model.eval()
        try:
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=200,
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.convert_tokens_to_ids("<|im_end|>"),
                    repetition_penalty=1.1,
                )
                
                # Decode and extract antibody
                generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=False)
                if "Antibody:" in generated_text:
                    antibody_part = generated_text.split("Antibody:", 1)[1]
                    if "<|im_end|>" in antibody_part:
                        antibody_sequence = antibody_part.split("<|im_end|>", 1)[0].strip()
                    else:
                        antibody_sequence = antibody_part.strip()
                else:
                    antibody_sequence = "Generation failed"
                
                return antibody_sequence
                
        except Exception as e:
            return f"Error: {str(e)}"
        finally:
            self.model.train()  # Put model back in training mode
    
    def run_test_generation(self, state, phase="TRAINING"):
        """Run test generation and print/save results"""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        header = f"TEST GENERATION - {phase} - STEP {state.global_step} - {timestamp}"
        
        # Print to terminal
        print(f"\n{'='*80}")
        print(header)
        print(f"{'='*80}")
        
        # Write to file
        with open(self.output_file, 'a') as f:
            f.write(f"\n{'='*80}\n")
            f.write(f"{header}\n")
            f.write(f"{'='*80}\n")
        
        for i, test_antigen in enumerate(self.test_antigens):
            case_header = f"--- Test Case {i+1} ---"
            input_display = f"Input: {test_antigen[:60]}{'...' if len(test_antigen) > 60 else ''}"
            
            # Generate antibody
            antibody = self.generate_antibody_test(test_antigen)
            generated_display = f"Generated: {antibody}"
            
            # Print to terminal
            print(f"\n{case_header}")
            print(input_display)
            print(generated_display)
            
            # Write to file (with full input)
            with open(self.output_file, 'a') as f:
                f.write(f"\n{case_header}\n")
                f.write(f"Full Input: {test_antigen}\n")
                f.write(f"Generated: {antibody}\n")
                f.write(f"Length: {len(antibody)} characters\n")
        
        # Terminal footer
        print(f"{'='*80}\n")
        
        # File footer
        with open(self.output_file, 'a') as f:
            f.write(f"{'='*80}\n\n")
    
    def on_train_begin(self, args, state, control, **kwargs):
        """Test at the beginning of training"""
        print("ðŸ§¬ INITIAL GENERATION TEST (Before Training)")
        self.run_test_generation(state, "INITIAL")
    
    def on_log(self, args, state, control, **kwargs):
        """Test periodically during training"""
        if state.global_step % self.log_every_n_steps == 0 and state.global_step > 0:
            self.run_test_generation(state, "PERIODIC")
    
    def on_train_end(self, args, state, control, **kwargs):
        """Test at the end of training"""
        print("ðŸŽ‰ FINAL GENERATION TEST (After Training)")
        self.run_test_generation(state, "FINAL")
        
        # Add summary to file
        with open(self.output_file, 'a') as f:
            f.write(f"\nTraining completed: {datetime.now()}\n")
            f.write(f"Final step: {state.global_step}\n")


In [36]:
## Create the callback
test_callback = TestGenerationCallback(
    model=model, 
    tokenizer=tokenizer, 
    test_antigens=test_antigens,
    log_every_n_steps=50,  ## Test every 50 steps
    output_file="../logs/test_generations.txt"  ## Save to logs directory
)

In [37]:
## Save tokenizer
tokenizer.save_pretrained(training_args.output_dir)

('../models/peleke-llama-3.1-8b-instruct/tokenizer_config.json',
 '../models/peleke-llama-3.1-8b-instruct/special_tokens_map.json',
 '../models/peleke-llama-3.1-8b-instruct/chat_template.jinja',
 '../models/peleke-llama-3.1-8b-instruct/tokenizer.json')

In [None]:
## Set up SFTTrainer
from trl import SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    args=training_args,
    callbacks=[test_callback],  # Log every 50 steps
)

## Start training
print("Starting training...")
trainer.train()

# Finish wandb run
#wandb.finish()

## Save the trained model
trainer.save_model(training_args.output_dir)
print(f"Model saved to {training_args.output_dir}")

Truncating train dataset: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ

Starting training...
ðŸ§¬ INITIAL GENERATION TEST (Before Training)

TEST GENERATION - INITIAL - STEP 0 - 2025-08-15 01:20:45

--- Test Case 1 ---
Input: KVFGRCELAAAM<epi>K</epi><epi>R</epi>HGL<epi>D</epi><epi>N</e...
Generated: <|eot_id|>: KVFGRCELAAAM<|eot_id|>: KVFGRCELAAAM<|eot_id|>:<|eot_id|><|eot_id|>: KVFGRCELAAAM<|eot_id|><|start_header_id|>: KVFGRCELAAAM<|eot_id|>:<|eot_id|><|eot_id|>: KVFGRCELAAAM<|eot_id|>: KVFGRCELAAAM<|eot_id|>: KVFGRCELAAAM<|eot_id|>: KVFGRCELAAAM<|eot_id|>:<|eot_id|><|start_header_id|>: KVFGRCELAAAM<|eot_id|>:<|eot_id|><|eot_id|>: KVFGRCELAAAM<|eot_id|>: KVFGRCELAAAM<|eot_id|>: KVFGRCELAAAM<|eot_id|>: KVFGRCELAAAM<|eot_id|>: KVFGRCELAAAM<|eot_id|>:<|eot_id|><|start_header_id|>: KVFGRCELAAAM<|eot_id|>: KVFGRCELAAAM<|eot_id|>: KVFGRCELAAAM<|end_header_id|>: KVFGRCELAAAM<|end_header_id|>: KVFGRCELAAAM<|eot_id|>: KVFGRCELAAAM<|end_header_id|>: KVFGRCELAAAM<|eot_id|>:<|eot_id|><|start_header_id|>: KVFGRCELAAAM<|eot_id|>: KVFGR

--- Test Case 2 ---
Input: NLCP

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss
25,6.1476
50,4.6692
75,4.3261
100,3.9883
125,3.6971
150,3.6749
175,3.5866
200,3.6845
225,3.7904
250,3.414



TEST GENERATION - PERIODIC - STEP 50 - 2025-08-15 01:24:29

--- Test Case 1 ---
Input: KVFGRCELAAAM<epi>K</epi><epi>R</epi>HGL<epi>D</epi><epi>N</e...
Generated: VQLQQSGAEVVKPSSTISCVGGSHGWSWVRQAPGKGLEWIGGIYYDMHWVRQAPEGGVVSWVRQAPGKGLEWVGALYYCMVWVDYPIGQSRTLLSCTFGSGSTSVTDTSTAYMELSSLRSEDEADYYCQQGNPFYAAPFPATLTVSSGASTTGSIYSLQSLSITEDFATYFTWLNPGDSSTYSMSSTPLTTGVLTFLSSDVSNKTVPKKLVQCPAPGLEILGNVSWFAGHQPYTPTTFGTCEVQHVDTSKLGSNVTLCISCRNPKPFFYPGKPVTLDRFSCATSNNYRPDTRFSGSGTEFTDTSTYYMSWVRQAPGKGLEWVGVYYCHWGDGTDITLTFLSPPSSEDSVKNFK

--- Test Case 2 ---
Input: NLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKCYGVSPTKLNDLCFT...
Generated: QVQLVESGGGVVSGGSGLDIAGLTCTSSSLTWNSQSLQSPGKVKPGTSVKWSCTASGYTNVTWGPRGDITLGWASIGDISQSNNLEWTEDFGAQPKNMSLNSEDTAMYYCARGDGSSVTVSS


TEST GENERATION - PERIODIC - STEP 100 - 2025-08-15 01:27:57

--- Test Case 1 ---
Input: KVFGRCELAAAM<epi>K</epi><epi>R</epi>HGL<epi>D</epi><epi>N</e...
Generated: QSVLTQPPSLAVSGGARVTISCRASEDEADYYCLYWDLQSPGDPSRFSGSKSGNTAYLQLSSLTRPEDFASYFCQQVTVSSEDSALWTQRPE




TEST GENERATION - PERIODIC - STEP 550 - 2025-08-15 02:01:22

--- Test Case 1 ---
Input: KVFGRCELAAAM<epi>K</epi><epi>R</epi>HGL<epi>D</epi><epi>N</e...
Generated: VQLVESGGGVVQPGRSLRLSCAASGFNIKNYMNHWVRQAPGKGLEWMGISPYSGRTNYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYPGYAMDYWGQGTLVTVSA|EIVLTQSPLTLSVTIGQPASISCKSSQSLLHSNGNTYLHWYLQRPGQSPKLIYKVSNRFSGVPDRFSGSGSGTDFTLKISRVEAEDLGVYYCFQGSHWPPTFGQGTKVEIK

--- Test Case 2 ---
Input: NLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKCYGVSPTKLNDLCFT...
Generated: EVQLVESGGGLVQPGGSLRLSCAASGFTFSNNTMTWVRQSPGKGLEWVSGISTGRFGTRYNQKFKDKATLTADKSSSTAYMELKSRLTSRDVTSINTLYLRSEDTAVYYCARSSGYDRGVWGQGTLVTVSA|DIQMTQSPSTLSASVGDRVTITCRASESVGSYGMSWYQQKPGKAPKLLIYSASSLYSGVPSRFSGSRSGTDFTLTISSLQPEDFATYYCQQYNNYPITFGQGTKVEIK


TEST GENERATION - PERIODIC - STEP 600 - 2025-08-15 02:05:02

--- Test Case 1 ---
Input: KVFGRCELAAAM<epi>K</epi><epi>R</epi>HGL<epi>D</epi><epi>N</e...
Generated: QVQLVESGGGVVKPGGSLKLSCAASGFTFSNYGMHWVRQTPEKRLEWVAYISSGGSYYYSSTLQPPSRISLTNLRSVSDDTAVYLRYTTSRRGDGYWG




TEST GENERATION - PERIODIC - STEP 1050 - 2025-08-15 02:38:50

--- Test Case 1 ---
Input: KVFGRCELAAAM<epi>K</epi><epi>R</epi>HGL<epi>D</epi><epi>N</e...
Generated: EVQLVESGGGVVQPGRSLRLSCAASGFTFSNYGMHWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLKTEDTAVYYCARDGSGSYAMDYWGQGTLVTVSA|DIQMTQSPSSLSASVGDRVTITCRASQSISTWLAWYQQKPGKAPKLLIYKASTLKTGVPSRFSGSGSGTEFTLTISRLEPEDFAVYYCQQRSNLPLTFGGGTKVEIK

--- Test Case 2 ---
Input: NLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKCYGVSPTKLNDLCFT...
Generated: EVQLVESGGGLVKPGGSLRLSCAASGFTFSNYAMSWVRQTPEKRLEWVALSSGGSYTYYSDSVKGRFTISRDNARNILYLQMSSLKSEDTAMYYCARESGYYYDYWGQGTLVTVSA|DIVMTQSPLSLPVTPGEPASISCRSSQSLLHSNGYGTYYPDFSRAGVPDRFSGSGSGTDFTLKISRVEAEDLGVYYCMQALRSIPRTFGQGTKVDIK



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


TEST GENERATION - PERIODIC - STEP 1100 - 2025-08-15 02:42:34

--- Test Case 1 ---
Input: KVFGRCELAAAM<epi>K</epi><epi>R</epi>HGL<epi>D</epi><epi>N</e...
Generated: EVQLVESGGGVVQPGRSLRLSCAASEFTFSFYGMHWVRQAPGKGLEWVAYISSSGGSTYYADSVKGRFTISRDNSKNTLYLQMRAEDTAVYYCARAGDLFGAMDYWGQGTLVTVSS|DIQMTQSPSSLSASVGDRVTITCRASQSIYSYLAWYQQKPGKAPKLLIYDASSLESGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQHYSTPRTFGQGTKVEIK

--- Test Case 2 ---
Input: NLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKCYGVSPTKLNDLCFT...
Generated: EVQLVESGGGLIQPGGSLRLSCAASGFTFSNYAMSWVRQAPGKGLEWVSVINSGGSTYYADSVKGRFTISRDNAKKNTLYLQMSSLRAEDTAVYYCARGYGPDYWGQGTLVTVSS|DIQMTQSPSSLSASVGDRVTITCRASQSISSWLAWYQQKPGKAPKLLIYDASSLKTGVPSRFSGSRSGTDFTLTISSLQPEDFATYYCQQHYLTIPRTFGQGTKVEIK


TEST GENERATION - PERIODIC - STEP 1150 - 2025-08-15 02:46:16

--- Test Case 1 ---
Input: KVFGRCELAAAM<epi>K</epi><epi>R</epi>HGL<epi>D</epi><epi>N</e...
Generated: QVQLVESGGGVVQPGRSLRLSCAASGFDFDSYAIIHWVRQAPGKGLEWVASISSYYGYTSYADSVKGRFTISRDNSKNTLYLQMNSLRVEDTAVYYCARERDYDDYWGQGTLVTVS




TEST GENERATION - PERIODIC - STEP 1550 - 2025-08-15 03:16:06

--- Test Case 1 ---
Input: KVFGRCELAAAM<epi>K</epi><epi>R</epi>HGL<epi>D</epi><epi>N</e...
Generated: EVQLVESGGGLVKPGGSLKLSCAASGFTFSRYMYWVRPTKGLEWVARIHPDSGETAYADSVKGRFTISRDNSKNTLYLQMRAEDTAVYYCARSAAYYYYDYYYAMDYWGQGTSVTVSS|DIVMTQSPLSLPVTPGEPASISCRSSQSIKNYLHWLLQRPGQSPKRLIYKASSLESGVPARFSGSGSGTDFTLKISRVEAEDVGVYYCMQGSHWPRTFGQGTKLEIK

--- Test Case 2 ---
Input: NLCPFHEVFNATTFASVYAWNRKRISNCVADYSVIYNFAPFFAFKCYGVSPTKLNDLCFT...
Generated: QVQLQQSGPELVKPGASVKMSCKASGYTFTSDWIHWVKQRPGQGLEWIGEILPGSGSTNYNEKFKDKATLTADKSSNTAYMQLSSLTSEDSAVYYCAREGWYGPDYWGQGTTLTVSS|DIQMTQSPSTLSASVGDRVTITCRASQSISSWLAWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSNEDPYTFGGGTKVEIK


TEST GENERATION - PERIODIC - STEP 1600 - 2025-08-15 03:19:42

--- Test Case 1 ---
Input: KVFGRCELAAAM<epi>K</epi><epi>R</epi>HGL<epi>D</epi><epi>N</e...
Generated: QLVQSGAEVKKPGASVKVSCKASGYTFTSYGISWVRQAPGQGLEWMGGIIPIFGTANYAQKFQGRVTMTVDTSISTSTAYMELRSEDTAVYYCARLGDYSWGFDIWGQGTLVT

### This script is used to clear vram. For testing purposes only and when you want to clear the GPU memory.

In [22]:
import gc
import torch
# Clear any existing models from GPU memory
torch.cuda.empty_cache()
gc.collect()

# Check current GPU memory usage
print(f"GPU Memory before: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB allocated")
print(f"GPU Memory reserved: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB reserved")
# If you have a model loaded, delete it first
try:
    del model
    torch.cuda.empty_cache()
    gc.collect()
    print("Previous model cleared from memory")
except:
    print("No previous model to clear")

GPU Memory before: 30.16 GB allocated
GPU Memory reserved: 30.29 GB reserved
Previous model cleared from memory
