In [1]:
from peft import PeftModel
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model
model_name = "microsoft/phi-4" #"microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, trust_remote_code=True).cuda()
model = PeftModel.from_pretrained(base_model, "/home/nicholas/Documents/GitHub/peleke/models/peleke-phi-4/checkpoint-1434/").cuda()
model.eval()

# Generate complete antibody sequences
test_antigens = [
    "DTICIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDSHNGKLCLLKGIAPLQLGNCSVAGWILGNPECELLISKESWSYIVETPNPENGTCYPGYFADYEELREQLSSVSSFERFEIFPKGSSWPNHTVTGVSASCSHNGKSSFYRNLLWLTGKNGLYPNLSMSYVNNKEKEVLVLWGVHHPPNIGDQRALYHTENAYVSVVSSHYSRRFTPEIAKRPKVRDQEGRINYYWTLLEPGDTIIFEANGNLIAPWYAFALSRGFGSGIITSNAPMDECDAKCQTPQGAINSSLPFQNVHPV",
 
]

for antigen in test_antigens:
    prompt = f"Antigen: {antigen}\nAntibody: "
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.cuda() for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=False,
        )
    
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    antibody_sequence = full_text.replace(prompt, "").strip()
    
    print(f"Antigen: {antigen}")
    print(f"Antibody: {antibody_sequence}")
    print("-" * 50)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 6/6 [00:00<00:00, 199.46it/s]


Antigen: DTICIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDSHNGKLCLLKGIAPLQLGNCSVAGWILGNPECELLISKESWSYIVETPNPENGTCYPGYFADYEELREQLSSVSSFERFEIFPKGSSWPNHTVTGVSASCSHNGKSSFYRNLLWLTGKNGLYPNLSMSYVNNKEKEVLVLWGVHHPPNIGDQRALYHTENAYVSVVSSHYSRRFTPEIAKRPKVRDQEGRINYYWTLLEPGDTIIFEANGNLIAPWYAFALSRGFGSGIITSNAPMDECDAKCQTPQGAINSSLPFQNVHPV
Antibody: VQLQESGPGLVAPSQSLSITCTVSGFSLTFSSNYMWVRQAPGKGLEWLGRIYSGSTYRPSVTKISASTQSVYSLDTSKNQFSLKLTAEDTAVYYCARRGSGNYDYWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSS
--------------------------------------------------


In [3]:
# Test on CPU to avoid GPU issues
model_cpu = model.cpu()

test_antigen = "DTICIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDSHNGKLCLLKGIAPLQLGNCSVAGWILGNPECELLISKESWSYIVETPNPENGTCYPGYFADYEELREQLSSVSSFERFEIFPKGSSWPNHTVTGVSASCSHNGKSSFYRNLLWLTGKNGLYPNLSMSYVNNKEKEVLVLWGVHHPPNIGDQRALYHTENAYVSVVSSHYSRRFTPEIAKRPKVRDQEGRINYYWTLLEPGDTIIFEANGNLIAPWYAFALSRGFGSGIITSNAPMDECDAKCQTPQGAINSSLPFQNVHPV"
prompt = f"Antigen: {test_antigen}\nAntibody: "
inputs = tokenizer(prompt, return_tensors="pt")

with torch.no_grad():
    outputs = model_cpu.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=False,  # Greedy decoding
        pad_token_id=tokenizer.eos_token_id,
    )

full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
antibody = full_text.replace(prompt, "").strip()

print(f"Antigen: {test_antigen}")
print(f"Antibody: {antibody}")

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


AttributeError: 'DynamicCache' object has no attribute 'get_max_length'

In [1]:
import pandas as pd
from datasets import Dataset
## Load dataset
df = pd.read_csv("./data/sabdab/sabdab_with_sequences.tsv", sep='\t')

## Remove rows with missing sequences
df = df.dropna(subset=['HeavySeq', 'LightSeq', 'AntigenSeq'])

df.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,pdb,Hchain,Lchain,AntigenChains,HeavySeq,LightSeq,AntigenSeq
4,8xa4,C,D,A | B,QLQLQESGPGLVKPSETLSLTCTVSGGSISSNNDYWGWIRQPPGKG...,EIVLTQSPGTLSLSPGERVTLSCRASQRVSSTYLAWYQQKPGQAPR...,SCNGLYYQGSCYILHSDYKSFEDAKANCAAESSTLPNKSDVLTTWL...
9,9cph,H,L,A,EVQLVESGGGLVQPGGSLRLSCAASGFNLSSSSIHWVRQAPGKGLE...,AQMTQSPSSLSASVGDRVTITCRASQSVSSAVAWYQQKPGKAPKLL...,KIEEGKLVIWINGDKGYNGLAEVGKKFEKDTGIKVTVEHPDKLEEK...
10,9d7i,H,G,E,VQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLEW...,YELTQPPSVSVSPGQTATITCSGASTNVCWYQVKPGQSPEVVIFEN...,LWVTVYYGVPVWKDAETTLFCASDNVWATHACVPTDPNPQEIHLEN...
11,9d7i,J,I,C,VQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLEW...,YELTQPPSVSVSPGQTATITCSGASTNVCWYQVKPGQSPEVVIFEN...,LWVTVYYGVPVWKDAETTLFCASDNVWATHACVPTDPNPQEIHLEN...
12,9d7o,H,G,E,QVQLQESGPGVVKSSETLSLTCTVSGGSMGGTYWSWLRLSPGKGLE...,YELTQPPSVSVSPGQTATITCSGASTNVCWYQVKPGQSPEVVIFEN...,LWVTVYYGVPVWKDAETTLFCASDNVWATHACVPTDPNPQEIHLEN...


In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
import torch
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use 3090 Ti

def generate_antibody_sequence(
    antigen_sequence: str,
    model_path: str = "./phi35-antibody-corrected-multigpu",
    base_model_name: str = "microsoft/Phi-3.5-mini-instruct",
    max_new_tokens: int = 200,
    top_p: float = 0.9,
    temperature: float = 0.9,
    top_k: int = 50
) -> str:
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    
    # Load base model + PEFT adapter
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name, 
        torch_dtype=torch.float32,  # Use float32 for compatibility
        trust_remote_code=True
    )
    model = PeftModel.from_pretrained(base_model, model_path)
    model = model.cuda()
    
    # Create pipeline
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
    
    prompt = f"Antigen: {antigen_sequence}\nAntibody: "
    
    try:
        outputs = generator(
            prompt,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=top_p,
            temperature=temperature,
            top_k=top_k,
            pad_token_id=tokenizer.eos_token_id
        )
        return outputs[0]["generated_text"]
    except:
        # Fallback if pipeline fails
        inputs = tokenizer(prompt, return_tensors="pt").cuda()
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                top_p=top_p,
                temperature=temperature,
                top_k=top_k,
                pad_token_id=tokenizer.eos_token_id
            )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test antigens
test_antigens = ["DTICIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDSHNGKLCLLKGIAPLQLGNCSVAGWILGNPECELLISKESWSYIVETPNPENGTCYPGYFADYEELREQLSSVSSFERFEIFPKGSSWPNHTVTGVSASCSHNGKSSFYRNLLWLTGKNGLYPNLSMSYVNNKEKEVLVLWGVHHPPNIGDQRALYHTENAYVSVVSSHYSRRFTPEIAKRPKVRDQEGRINYYWTLLEPGDTIIFEANGNLIAPWYAFALSRGFGSGIITSNAPMDECDAKCQTPQGAINSSLPFQNVHPV"

]
    

# Generate antibodies
for antigen in test_antigens:
    antibody_sequence = generate_antibody_sequence(antigen)
    print(antibody_sequence)
    print("-" * 50)
    

 

  from .autonotebook import tqdm as notebook_tqdm
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.82it/s]
Device set to use cuda:0
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


AttributeError: 

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

def generate_antibody_sequence(antigen_sequence: str) -> str:
    # Load model
    model_path = "./phi35-antibody-corrected-multigpu"
    base_model_name = "microsoft/Phi-3.5-mini-instruct"
    
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name, 
        torch_dtype=torch.float32,
        trust_remote_code=True
    )
    model = PeftModel.from_pretrained(base_model, model_path)
    model = model.cuda()
    model.eval()
    
    # Generate step by step
    prompt = f"Antigen: {antigen_sequence}\nAntibody: "
    current_text = prompt
    generated_part = ""
    
    for _ in range(150):  # Max 150 tokens
        # Tokenize current text
        inputs = tokenizer(current_text, return_tensors="pt")
        inputs = {k: v.cuda() for k, v in inputs.items()}
        
        # Get next token
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits[0, -1, :]
            
            # Sample next token
            probs = torch.softmax(logits, dim=-1)
            next_token_id = torch.multinomial(probs, 1)[0]
            next_token = tokenizer.decode(next_token_id.item())
            
            # Stop conditions
            if next_token in [tokenizer.eos_token, "\n"] or next_token.strip() == "":
                break
                
            # Add to sequence
            current_text += next_token
            generated_part += next_token
            
            # Stop if we have a complete antibody
            if "|" in generated_part and len(generated_part) > 60:
                break
    
    return f"Antibody: {generated_part.strip()}"

# Test antigens
test_antigens = ["DTICIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDSHNGKLCLLKGIAPLQLGNCSVAGWILGNPECELLISKESWSYIVETPNPENGTCYPGYFADYEELREQLSSVSSFERFEIFPKGSSWPNHTVTGVSASCSHNGKSSFYRNLLWLTGKNGLYPNLSMSYVNNKEKEVLVLWGVHHPPNIGDQRALYHTENAYVSVVSSHYSRRFTPEIAKRPKVRDQEGRINYYWTLLEPGDTIIFEANGNLIAPWYAFALSRGFGSGIITSNAPMDECDAKCQTPQGAINSSLPFQNVHPV"

] 

# Generate antibodies
for antigen in test_antigens:
    result = generate_antibody_sequence(antigen)
    print(result)

  from .autonotebook import tqdm as notebook_tqdm
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.18it/s]
You are not running the flash-attention implementation, expect numerical differences.


Antibody: LNQLQSCGGHALPGTSVRISCRASHNDPYWGQGLEYIGTIWHPNDGEGGDMTRNSSGWQSYDRGNNEQVEAQKRLSISISYGQGTDETDKEGCYLKWSTIGIYNQYSGSSMCEFKNVFKNDESSSTSTSLESLRTVASSLSSLQLTSGGGTGAASAVSVNNAKQTRKHRISYVPHNSWGVXSADAASWWADVFMHVTVCCJGDMRDKVERNYCSSHRTSRCSCLHDCLSCGPNGESPM


In [1]:
def generate_antibody_sequence(antigen_sequence: str) -> str:
    import os
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from peft import PeftModel
    
    # Set environment to use both GPUs
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
    
    # Load model
    model_path = "./phi35-antibody-corrected-multigpu"
    base_model_name = "microsoft/Phi-3.5-mini-instruct"
    
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    
    # Use the same device map as training
    custom_device_map = {
        'model.embed_tokens': 1,
        'model.layers.0': 1, 'model.layers.1': 1, 'model.layers.2': 1, 'model.layers.3': 1,
        'model.layers.4': 1, 'model.layers.5': 1, 'model.layers.6': 1, 'model.layers.7': 1,
        'model.layers.8': 1, 'model.layers.9': 1, 'model.layers.10': 1, 'model.layers.11': 1,
        'model.layers.12': 1, 'model.layers.13': 1, 'model.layers.14': 1, 'model.layers.15': 1,
        'model.layers.16': 0, 'model.layers.17': 0, 'model.layers.18': 0, 'model.layers.19': 0,
        'model.layers.20': 0, 'model.layers.21': 0, 'model.layers.22': 0, 'model.layers.23': 0,
        'model.layers.24': 0, 'model.layers.25': 0, 'model.layers.26': 0, 'model.layers.27': 0,
        'model.layers.28': 0, 'model.layers.29': 0, 'model.layers.30': 0, 'model.layers.31': 0,
        'model.norm': 1,
        'lm_head': 1,
    }
    
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name, 
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map=custom_device_map
    )
    model = PeftModel.from_pretrained(base_model, model_path)
    model.eval()
    
    # Generate step by step
    prompt = f"Antigen: {antigen_sequence}\nAntibody: "
    current_text = prompt
    generated_part = ""
    
    for step in range(300):
        # Tokenize and put on GPU 1 (where embeddings are)
        inputs = tokenizer(current_text, return_tensors="pt")
        inputs = {k: v.cuda(1) for k, v in inputs.items()}  # Explicitly use GPU 1
        
        # Get next token
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits[0, -1, :]
            
            # Sample next token
            temperature = 0.8
            probs = torch.softmax(logits / temperature, dim=-1)
            next_token_id = torch.multinomial(probs, 1)[0]
            next_token = tokenizer.decode(next_token_id.item())
            
            # Stopping conditions
            if next_token in [tokenizer.eos_token, "<|endoftext|>"]:
                break
                
            if next_token == "\n":
                if "|" in generated_part and len(generated_part.split("|")) >= 2:
                    if len(generated_part.split("|")[1]) > 20:
                        break
            
            # Add to sequence
            current_text += next_token
            generated_part += next_token
            
            # Force separator if needed
            if step > 100 and "|" not in generated_part:
                sep_token_id = tokenizer.encode("|", add_special_tokens=False)[0]
                separator_prob = torch.softmax(logits, dim=-1)[sep_token_id].item()
                
                if separator_prob > 0.01:
                    current_text += "|"
                    generated_part += "|"
                    continue
            
            # Good stopping point
            if "|" in generated_part:
                parts = generated_part.split("|")
                if len(parts) >= 2 and len(parts[1]) > 50:
                    break
    
    # Clean up
    antibody_seq = generated_part.strip()
    
    if "|" not in antibody_seq:
        if len(antibody_seq) > 100:
            split_point = len(antibody_seq) // 2
            antibody_seq = antibody_seq[:split_point] + "|" + antibody_seq[split_point:]
        else:
            antibody_seq += "|LIGHTCHAIN"
    
    return f"Antibody: {antibody_seq}"

# Test antigens
test_antigens = ["DTICIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDSHNGKLCLLKGIAPLQLGNCSVAGWILGNPECELLISKESWSYIVETPNPENGTCYPGYFADYEELREQLSSVSSFERFEIFPKGSSWPNHTVTGVSASCSHNGKSSFYRNLLWLTGKNGLYPNLSMSYVNNKEKEVLVLWGVHHPPNIGDQRALYHTENAYVSVVSSHYSRRFTPEIAKRPKVRDQEGRINYYWTLLEPGDTIIFEANGNLIAPWYAFALSRGFGSGIITSNAPMDECDAKCQTPQGAINSSLPFQNVHPV"

] 

for antigen in test_antigens:
    result = generate_antibody_sequence(antigen)
    print(result)
    print("-" * 80)

 

  from .autonotebook import tqdm as notebook_tqdm
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.11it/s]
You are not running the flash-attention implementation, expect numerical differences.


Antibody: qlLSCSSILQESGGGLAAGMGTSLTTDQSPYFDVWHIHWVKQARNQGPGQSPGLEWIGNIYCGDGSTNQVKASLRSEYLQWNSVTTAPSVKTITRGRFTSADSFSVTAALHYVCGYWGQGTSVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYEAPHYWGQGTLVTVPSSSLT|RISMASSSVSGVF
Antigen:DPTICIGYHANNSTDTVLATVTVSLHSLKDVLNGTYPVYPFRQQHAGSGTISSHGPGQCCPFSWVLLLQDQKLLLKGIAPLQLGVNFWLGIGWGNPECELLISKESWSYIVETPNPENGTCYPGYFAYEELREQLSSVSRFQHLTFESDEKQITTVYSVTVTKQKFRQPTLGGITPVVQTTIVTXIGCFTKAYVSVSSERFEIFPK
--------------------------------------------------------------------------------


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Load model
model_path = "./phi35-antibody-lora-e10"
base_model_name = "microsoft/Phi-3.5-mini-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name, 
    torch_dtype=torch.float32,
    trust_remote_code=True
)
model = PeftModel.from_pretrained(base_model, model_path)
model = model.cuda().eval()

# Test with exact training format
test_prompt = "Antigen: MKFL\nAntibody: "
inputs = tokenizer(test_prompt, return_tensors="pt")
inputs = {k: v.cuda() for k, v in inputs.items()}

print("Testing model predictions...")
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits[0, -1, :]
    
    # Get top 10 most likely tokens
    top_k = torch.topk(logits, 10)
    
    print(f"Input: '{test_prompt}'")
    print("Top 10 predictions:")
    for i, (score, token_id) in enumerate(zip(top_k.values, top_k.indices)):
        token = tokenizer.decode(token_id.item())
        print(f"  {i+1:2d}. '{token}' (score: {score.item():.2f})")

# Check if LoRA is actually enabled
print(f"\nLoRA adapters enabled: {model.active_adapters}")

# Test without LoRA
print("\nTesting base model (no LoRA):")
model.disable_adapter_layers()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits[0, -1, :]
    top_k = torch.topk(logits, 5)
    
    for i, (score, token_id) in enumerate(zip(top_k.values, top_k.indices)):
        token = tokenizer.decode(token_id.item())
        print(f"  {i+1}. '{token}' ({score.item():.2f})")

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.64it/s]


Testing model predictions...
Input: 'Antigen: MKFL
Antibody: '
Top 10 predictions:
   1. '1' (score: 43.68)
   2. '2' (score: 42.73)
   3. '9' (score: 42.52)
   4. '4' (score: 42.20)
   5. '3' (score: 42.09)
   6. '5' (score: 42.06)
   7. '6' (score: 41.96)
   8. '7' (score: 41.66)
   9. '0' (score: 41.65)
  10. '8' (score: 41.62)

LoRA adapters enabled: ['default']

Testing base model (no LoRA):
  1. '1' (43.57)
  2. '2' (42.61)
  3. '9' (42.39)
  4. '4' (42.07)
  5. '3' (41.96)


In [5]:
# Check what your training data actually looked like
print("=== CHECKING TRAINING DATA ===")
sample_from_df = df.iloc[0]
formatted_sample = f"Antigen: {sample_from_df['AntigenSeq']}\nAntibody: {sample_from_df['HeavySeq']}|{sample_from_df['LightSeq']}\n"
print("Original training format:")
print(formatted_sample[:200])

# Check how it was tokenized
inputs = tokenizer(formatted_sample[:100], return_tensors="pt")
decoded = tokenizer.decode(inputs['input_ids'][0])
print(f"\nTokenized and decoded:")
print(decoded[:200])

# Check if amino acids are in vocabulary
print(f"\n=== AMINO ACID TOKENS ===")
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
for aa in amino_acids:
    token_id = tokenizer.encode(aa, add_special_tokens=False)
    print(f"{aa}: {token_id}")

# Check separator
sep_token = tokenizer.encode("|", add_special_tokens=False)
print(f"'|': {sep_token}")

=== CHECKING TRAINING DATA ===
Original training format:
Antigen: SCNGLYYQGSCYILHSDYKSFEDAKANCAAESSTLPNKSDVLTTWLIDYVEDTWGSDGNPITKTTSDYQDSDVSQEVRKYFC|SCNGLYYQGSCYILHSDYKSFEDAKANCAAESSTLPNKSDVLTTWLIDYVEDTWGSDGNPITKTTSDYQDSDVSQEVRKYFC
Antibody: QLQLQESGPGLVKPS

Tokenized and decoded:
Antigen: SCNGLYYQGSCYILHSDYKSFEDAKANCAAESSTLPNKSDVLTTWLIDYVEDTWGSDGNPITKTTSDYQDSDVSQEVRKYFC|SCNGLYYQ

=== AMINO ACID TOKENS ===
A: [319]
C: [315]
D: [360]
E: [382]
F: [383]
G: [402]
H: [379]
I: [306]
K: [476]
L: [365]
M: [341]
N: [405]
P: [349]
Q: [660]
R: [390]
S: [317]
T: [323]
V: [478]
W: [399]
Y: [612]
'|': [891]


In [4]:
import pandas as pd
from datasets import Dataset

# Fix the training data format
def format_prompt_correct(example):
    return {
        "text": f"Antigen: {example['AntigenSeq']}\nAntibody: {example['HeavySeq']}|{example['LightSeq']}\n"
    }

print("=== FIXING TRAINING DATA ===")

# Check original data
print("Original data sample:")
sample = df.iloc[0]
print(f"AntigenSeq: {sample['AntigenSeq'][:50]}...")
print(f"HeavySeq: {sample['HeavySeq'][:50]}...")  
print(f"LightSeq: {sample['LightSeq'][:50]}...")

# Create corrected format
corrected_sample = format_prompt_correct(sample)
print(f"\nCorrected format:")
print(corrected_sample['text'][:200])

# Recreate dataset with correct format
print("\n=== RETRAINING WITH CORRECT FORMAT ===")
dataset = Dataset.from_pandas(df)
dataset = dataset.map(format_prompt_correct)

def tokenize(example):
    encoded = tokenizer(
        example["text"], 
        padding="max_length", 
        truncation=True, 
        max_length=128
    )
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

tokenized_dataset = dataset.map(tokenize)
tokenized_dataset = tokenized_dataset.remove_columns([
    'pdb', 'Hchain', 'Lchain', 'AntigenSeq', 'AntigenChains',
    'HeavySeq', 'LightSeq', '__index_level_0__', 'text'
])

train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=1337)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

print(f"Corrected dataset size: {len(train_dataset)} train, {len(eval_dataset)} eval")
print("Ready to retrain with correct format!")

=== FIXING TRAINING DATA ===
Original data sample:
AntigenSeq: SCNGLYYQGSCYILHSDYKSFEDAKANCAAESSTLPNKSDVLTTWLIDYV...
HeavySeq: QLQLQESGPGLVKPSETLSLTCTVSGGSISSNNDYWGWIRQPPGKGLEWI...
LightSeq: EIVLTQSPGTLSLSPGERVTLSCRASQRVSSTYLAWYQQKPGQAPRLLIY...

Corrected format:
Antigen: SCNGLYYQGSCYILHSDYKSFEDAKANCAAESSTLPNKSDVLTTWLIDYVEDTWGSDGNPITKTTSDYQDSDVSQEVRKYFC|SCNGLYYQGSCYILHSDYKSFEDAKANCAAESSTLPNKSDVLTTWLIDYVEDTWGSDGNPITKTTSDYQDSDVSQEVRKYFC
Antibody: QLQLQESGPGLVKPS

=== RETRAINING WITH CORRECT FORMAT ===


Map: 100%|██████████| 10073/10073 [00:00<00:00, 36494.14 examples/s]
Map: 100%|██████████| 10073/10073 [00:03<00:00, 3085.20 examples/s]

Corrected dataset size: 8058 train, 2015 eval
Ready to retrain with correct format!





In [8]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # Use both GPUs if available

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType
import gc

# Clear memory
torch.cuda.empty_cache()
gc.collect()

print("=== RETRAINING WITH CORRECT FORMAT ===")

# Load fresh model (don't use the incorrectly trained one)
model_name = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Add amino acid tokens
amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
extra_tokens = amino_acids + ["|"]
new_tokens = [t for t in extra_tokens if t not in tokenizer.get_vocab()]
if new_tokens:
    tokenizer.add_tokens(new_tokens)

# Load fresh base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map="auto",  # Automatically distribute across GPUs
).cuda()

# Resize embeddings
if new_tokens:
    model.resize_token_embeddings(len(tokenizer))

# PEFT configuration
peft_config = LoraConfig(
    r=16,  # Increased from 4
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["o_proj", "qkv_proj"],  # Target more modules
    inference_mode=False,
)

peft_model = get_peft_model(model, peft_config)
peft_model.train()

# Enable gradients for embeddings
for name, param in peft_model.named_parameters():
    if 'embed_tokens' in name or 'lm_head' in name:
        param.requires_grad = True

print("=== CORRECTED PEFT MODEL INFO ===")
peft_model.print_trainable_parameters()

# Fixed training arguments
training_args = TrainingArguments(
    output_dir="./phi35-antibody-corrected",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,  # Train for 5 epochs
    gradient_accumulation_steps=2,
    warmup_steps=50,
    weight_decay=0.01,
    logging_steps=25,
    save_strategy="epoch",
    eval_strategy="epoch",  # Changed from evaluation_strategy
    learning_rate=5e-5,
    bf16=True,
    gradient_checkpointing=True,
    dataloader_pin_memory=False,
    report_to="none",
    remove_unused_columns=False,
    save_total_limit=1,
    load_best_model_at_end=True,
        # Multi-GPU specific settings
    dataloader_num_workers=2,  # More workers for multi-GPU
    ddp_find_unused_parameters=False,  # Optimization for LoRA
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Create trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Clean up memory
del model
torch.cuda.empty_cache()
gc.collect()

print("Starting corrected training...")
trainer.train()

# Save the corrected model
print("Saving corrected model...")
peft_model.save_pretrained("./phi35-antibody-corrected")
tokenizer.save_pretrained("./phi35-antibody-corrected")
print("✓ Corrected model saved!")

=== RETRAINING WITH CORRECT FORMAT ===


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.61it/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


=== CORRECTED PEFT MODEL INFO ===
trainable params: 206,438,400 || all params: 3,830,516,736 || trainable%: 5.3893
Starting corrected training...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


KeyboardInterrupt: 

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # Both GPUs visible
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType
import gc

print("=== CHECKING GPU COMPATIBILITY ===")
print(f"Available GPUs: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    
# Test both GPUs
for i in range(torch.cuda.device_count()):
    try:
        test_tensor = torch.randn(10, 10).cuda(i)
        print(f"✓ GPU {i} working")
    except Exception as e:
        print(f"✗ GPU {i} failed: {e}")

# Since RTX 5090 has issues, let's use a custom device map that prioritizes the 3090 Ti
custom_device_map = {
    'model.embed_tokens': 1,  # 3090 Ti
    'model.layers.0': 1,      # Start layers on 3090 Ti
    'model.layers.1': 1,
    'model.layers.2': 1,
    'model.layers.3': 1,
    'model.layers.4': 1,
    'model.layers.5': 1,
    'model.layers.6': 1,
    'model.layers.7': 1,
    'model.layers.8': 1,
    'model.layers.9': 1,
    'model.layers.10': 1,
    'model.layers.11': 1,
    'model.layers.12': 1,
    'model.layers.13': 1,
    'model.layers.14': 1,
    'model.layers.15': 1,
    'model.layers.16': 0,     # Try some layers on 5090
    'model.layers.17': 0,
    'model.layers.18': 0,
    'model.layers.19': 0,
    'model.layers.20': 0,
    'model.layers.21': 0,
    'model.layers.22': 0,
    'model.layers.23': 0,
    'model.layers.24': 0,
    'model.layers.25': 0,
    'model.layers.26': 0,
    'model.layers.27': 0,
    'model.layers.28': 0,
    'model.layers.29': 0,
    'model.layers.30': 0,
    'model.layers.31': 0,
    'model.norm': 1,          # 3090 Ti
    'lm_head': 1,            # 3090 Ti
}

# Clear memory
torch.cuda.empty_cache()
gc.collect()

print("=== LOADING MODEL WITH CUSTOM DEVICE MAP ===")

# Load fresh model
model_name = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Add amino acid tokens
amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
extra_tokens = amino_acids + ["|"]
new_tokens = [t for t in extra_tokens if t not in tokenizer.get_vocab()]
if new_tokens:
    tokenizer.add_tokens(new_tokens)

try:
    # Try custom device map
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        device_map=custom_device_map,
    )
    print("✓ Custom device map successful")
except Exception as e:
    print(f"Custom device map failed: {e}")
    print("Falling back to 3090 Ti only...")
    # Fallback to just 3090 Ti
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
    ).cuda()

# Resize embeddings
if new_tokens:
    model.resize_token_embeddings(len(tokenizer))

# Check where model parts are
print("Model device placement:")
for name, param in model.named_parameters():
    if any(x in name for x in ['embed_tokens', 'layers.0', 'layers.16', 'layers.31', 'lm_head']):
        print(f"{name}: {param.device}")
        break

# Continue with PEFT setup...
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["o_proj", "qkv_proj"],
    inference_mode=False,
)

peft_model = get_peft_model(model, peft_config)
peft_model.train()

# Continue with training...

=== CHECKING GPU COMPATIBILITY ===
Available GPUs: 2
GPU 0: NVIDIA GeForce RTX 5090
GPU 1: NVIDIA GeForce RTX 3090 Ti
✓ GPU 0 working
✓ GPU 1 working
=== LOADING MODEL WITH CUSTOM DEVICE MAP ===


`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s]


✓ Custom device map successful
Model device placement:
model.embed_tokens.weight: cuda:1


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3Attention(
              (o_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vecto

In [3]:
# Recreate the corrected dataset
print("=== RECREATING CORRECTED DATASET ===")

from datasets import Dataset

# Fix the training data format
def format_prompt_correct(example):
    return {
        "text": f"Antigen: {example['AntigenSeq']}\nAntibody: {example['HeavySeq']}|{example['LightSeq']}\n"
    }

# Create corrected dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.map(format_prompt_correct)

def tokenize(example):
    encoded = tokenizer(
        example["text"], 
        padding="max_length", 
        truncation=True, 
        max_length=128
    )
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

tokenized_dataset = dataset.map(tokenize)
tokenized_dataset = tokenized_dataset.remove_columns([
    'pdb', 'Hchain', 'Lchain', 'AntigenSeq', 'AntigenChains',
    'HeavySeq', 'LightSeq', '__index_level_0__', 'text'
])

train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=1337)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

print(f"Dataset recreated: {len(train_dataset)} train, {len(eval_dataset)} eval")

# Enable gradients for embeddings
for name, param in peft_model.named_parameters():
    if 'embed_tokens' in name or 'lm_head' in name:
        param.requires_grad = True

print("=== CORRECTED PEFT MODEL INFO ===")
peft_model.print_trainable_parameters()

# Check which devices the LoRA adapters are on
print("\nLoRA adapter device placement:")
for name, param in peft_model.named_parameters():
    if 'lora_A' in name or 'lora_B' in name:
        print(f"{name}: {param.device}")
        break

# Continue with the rest of your training code...

=== RECREATING CORRECTED DATASET ===


Map: 100%|██████████| 10073/10073 [00:00<00:00, 30377.79 examples/s]
Map: 100%|██████████| 10073/10073 [00:02<00:00, 4231.45 examples/s]

Dataset recreated: 8058 train, 2015 eval
=== CORRECTED PEFT MODEL INFO ===
trainable params: 206,438,400 || all params: 3,830,516,736 || trainable%: 5.3893

LoRA adapter device placement:
base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: cuda:1





In [4]:
# Optimized training arguments for speed
training_args = TrainingArguments(
    output_dir="./phi35-antibody-corrected-multigpu",
    
    # INCREASE BATCH SIZES - you have tons of VRAM left
    per_device_train_batch_size=16,  # Increased from 6
    per_device_eval_batch_size=16,   # Increased from 6
    gradient_accumulation_steps=1,   # Reduced since batch size is larger
    
    # REDUCE EPOCHS - your dataset is small enough
    num_train_epochs=3,  # Reduced from 5
    
    # SPEED OPTIMIZATIONS
    max_steps=None,  # Let it run full epochs
    warmup_steps=25,  # Reduced from 50
    weight_decay=0.01,
    logging_steps=10,  # More frequent logging to monitor
    save_strategy="epoch",
    eval_strategy="no",  # Disable evaluation during training for speed
    learning_rate=1e-4,  # Slightly higher learning rate
    
    # MEMORY/SPEED OPTIMIZATIONS
    bf16=True,
    gradient_checkpointing=False,  # Disable to trade memory for speed
    dataloader_pin_memory=True,    # Enable for faster data loading
    dataloader_num_workers=4,      # More workers for faster data loading
    
    # OTHER OPTIMIZATIONS
    report_to="none",
    remove_unused_columns=False,
    save_total_limit=1,
    load_best_model_at_end=False,  # Disable for speed
    ddp_find_unused_parameters=False,
    
    # ADDITIONAL SPEED SETTINGS
    group_by_length=True,  # Group similar length sequences
    length_column_name="length",  # If you have length info
    optim="adamw_torch_fused",  # Faster optimizer
)

# Also increase LoRA rank since you have memory headroom
peft_config = LoraConfig(
    r=32,  # Increased from 16 - more capacity
    lora_alpha=64,  # Increased proportionally
    lora_dropout=0.05,  # Reduced dropout
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["o_proj", "qkv_proj", "gate_proj", "up_proj", "down_proj"],  # Target more modules
    inference_mode=False,
)

# Monitor training speed
import time
start_time = time.time()

print("=== OPTIMIZED TRAINING SETTINGS ===")
print(f"Effective batch size: {16 * torch.cuda.device_count() * 1} = {16 * torch.cuda.device_count()}")
print(f"Total steps per epoch: {len(train_dataset) // (16 * torch.cuda.device_count())}")
print("Starting fast training...")

trainer.train()

end_time = time.time()
print(f"Training completed in {(end_time - start_time)/60:.1f} minutes")

=== OPTIMIZED TRAINING SETTINGS ===
Effective batch size: 32 = 32
Total steps per epoch: 251
Starting fast training...


NameError: name 'trainer' is not defined

In [8]:
# Enable gradients for embeddings
for name, param in peft_model.named_parameters():
    if 'embed_tokens' in name or 'lm_head' in name:
        param.requires_grad = True

print("=== CORRECTED PEFT MODEL INFO ===")
peft_model.print_trainable_parameters()

# Check which devices the LoRA adapters are on
print("\nLoRA adapter device placement:")
for name, param in peft_model.named_parameters():
    if 'lora_A' in name or 'lora_B' in name:
        print(f"{name}: {param.device}")
        break

# Optimized training arguments for speed
training_args = TrainingArguments(
    output_dir="./phi35-antibody-corrected-multigpu",
    
    # INCREASE BATCH SIZES - you have tons of VRAM left
    per_device_train_batch_size=26,  # Increased from 6
    per_device_eval_batch_size=26,   # Increased from 6
    gradient_accumulation_steps=1,   # Reduced since batch size is larger
    
    # REDUCE EPOCHS - your dataset is small enough
    num_train_epochs=3,  # Reduced from 5
    
    # SPEED OPTIMIZATIONS
 
    warmup_steps=25,  # Reduced from 50
    weight_decay=0.01,
    logging_steps=10,  # More frequent logging to monitor
    save_strategy="epoch",
    eval_strategy="no",  # Disable evaluation during training for speed
    learning_rate=1e-4,  # Slightly higher learning rate
    
    # MEMORY/SPEED OPTIMIZATIONS
    bf16=True,
    gradient_checkpointing=False,  # Disable to trade memory for speed
    dataloader_pin_memory=True,    # Enable for faster data loading
    dataloader_num_workers=4,      # More workers for faster data loading
    
    # OTHER OPTIMIZATIONS
    report_to="none",
    remove_unused_columns=False,
    save_total_limit=1,
    load_best_model_at_end=False,  # Disable for speed
    ddp_find_unused_parameters=False,
    
    # ADDITIONAL SPEED SETTINGS
    group_by_length=True,  # Group similar length sequences
    length_column_name="length",  # If you have length info
    optim="adamw_torch_fused",  # Faster optimizer
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Create trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

# Monitor GPU usage during training
def print_gpu_usage():
    for i in range(torch.cuda.device_count()):
        allocated = torch.cuda.memory_allocated(i) / 1e9
        reserved = torch.cuda.memory_reserved(i) / 1e9
        total = torch.cuda.get_device_properties(i).total_memory / 1e9
        print(f"GPU {i}: {allocated:.1f}GB allocated, {reserved:.1f}GB reserved, {total:.1f}GB total")

print("\nGPU usage before training:")
print_gpu_usage()

# Clean up memory
torch.cuda.empty_cache()
gc.collect()

print("\n=== STARTING CORRECTED MULTI-GPU TRAINING ===")
trainer.train()

print("\nGPU usage after training:")
print_gpu_usage()

# Save the corrected model
print("Saving corrected model...")
peft_model.save_pretrained("./phi35-antibody-corrected-multigpu")
tokenizer.save_pretrained("./phi35-antibody-corrected-multigpu")
print("✓ Corrected model saved!")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


=== CORRECTED PEFT MODEL INFO ===
trainable params: 206,438,400 || all params: 3,830,516,736 || trainable%: 5.3893

LoRA adapter device placement:
base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: cuda:1

GPU usage before training:
GPU 0: 3.7GB allocated, 10.6GB reserved, 33.7GB total
GPU 1: 5.3GB allocated, 11.5GB reserved, 25.3GB total

=== STARTING CORRECTED MULTI-GPU TRAINING ===


Step,Training Loss
10,2.8781
20,3.0647
30,2.8408
40,2.9278
50,2.8166
60,2.8506
70,2.6781
80,2.507
90,2.6619
100,2.7103



GPU usage after training:
GPU 0: 3.7GB allocated, 15.1GB reserved, 33.7GB total
GPU 1: 5.7GB allocated, 16.6GB reserved, 25.3GB total
Saving corrected model...
✓ Corrected model saved!
