
# BigProtein-Qwen2.5 — Step‑by‑Step Test Notebook (Colab)
This notebook lets you **test each component** of the protein‑conditioned Qwen2.5 pipeline *before* running full training.  
It mirrors the main script logic, but runs **function‑by‑function** so you can see errors early with clear tracebacks.

> **Files expected in the working directory** (upload or mount a folder containing them):  
> - `bigmodel_joint_train.py`  
> - `protein_encoder.py`  
> - `structure_encoder.py`


In [1]:
#@title Mount Google Drive
from pathlib import Path
from huggingface_hub import snapshot_download
import os, json, pickle, pandas as pd
from tqdm import tqdm
from rich import print as rprint

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

%cd /content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines

from pathlib import Path
BASE_DIR = Path("/content/drive/MyDrive/LLM/Bioreasoner/data/hf/proteinDT")
OUT_DIR  = BASE_DIR / "sft_test_demo"
print(f"Using Google Drive folder as BASE_DIR: {BASE_DIR}")


Mounted at /content/drive
/content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines
Using Google Drive folder as BASE_DIR: /content/drive/MyDrive/LLM/Bioreasoner/data/hf/proteinDT



## 0) Runtime & Installs
If you're on Google Colab, run this cell to install dependencies.


In [3]:
# Check GPU
!nvidia-smi

# Fresh pip + libs (PyTorch CUDA 12.1 build + matching libs)
%pip -q install --upgrade pip
%pip install -q --index-url https://download.pytorch.org/whl/cu126 \
  torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0
%pip -q install transformers==4.56.1 huggingface_hub==0.35.0 tqdm safetensors

Thu Oct  2 15:05:25 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   31C    P0             49W /  400W |       0MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
# --- Version & import sanity checks ---
import torch, transformers, huggingface_hub
print("torch            :", torch.__version__)
print("transformers     :", transformers.__version__)
print("huggingface_hub  :", huggingface_hub.__version__)

# Top-level ESM import should work on 4.56.1
try:
    from transformers import AutoTokenizer, EsmForMaskedLM
    print("✅ Top-level EsmForMaskedLM import OK")
except Exception as e:
    print("❌ Top-level EsmForMaskedLM import failed:", repr(e))
    # Fallback check (direct module path)
    try:
        from transformers.models.esm.modeling_esm import EsmForMaskedLM as _E
        print("✅ Direct modeling_esm import OK (fallback)")
    except Exception as ee:
        print("❌ Direct modeling_esm import failed too:", repr(ee))

torch            : 2.8.0+cu126
transformers     : 4.56.1
huggingface_hub  : 0.35.0
✅ Top-level EsmForMaskedLM import OK


torch            : 2.8.0+cu126

transformers     : 4.56.1

huggingface_hub  : 0.35.0

✅ Top-level EsmForMaskedLM import OK


## 1) Loading Encoder Checkpoints

In [5]:

# === LLM & Encoders ===
MODEL_NAME         = "Qwen/Qwen2.5-3B-Instruct"   # Small-ish for Colab testing
PROTEIN_CONFIG = "/content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/esm2_t12_35M_UR50D"
STRUCTURE_CONFIG = "/content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/foldseek_t12_35M"
PROTREK_CKPT    = "/content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/ProTrek_35M.pt"
PROJECT_DIR = "/content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines"
DATA_JSONL = "/content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines/protein2desc_sft_ALLFOUR_c000-009_fullcot.jsonl"
OUT_DIR = "/content/drive/MyDrive/LLM/Bioreasoner/testing_notebooks/runs_colab_test"

for p in [PROJECT_DIR, DATA_JSONL, PROTEIN_CONFIG, STRUCTURE_CONFIG, PROTREK_CKPT, OUT_DIR]:
    print("✓ exists:", os.path.exists(p), p)



✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines
✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines/protein2desc_sft_ALLFOUR_c000-009_fullcot.jsonl
✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/esm2_t12_35M_UR50D
✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/foldseek_t12_35M
✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/ProTrek_35M.pt
✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/testing_notebooks/runs_colab_test


In [6]:
# === Prefix/Proj ===
SINGLE_TOKEN_PREFIX = False     # True -> 1 token; False -> soft prefix of length PREFIX_LEN
PREFIX_LEN          = 4
PROJ_HID            = 1024
DROPOUT             = 0.10

# === Training toggles ===
USE_LORA            = False
TRAIN_ENCODERS      = False    # True = end-to-end; False = freeze encoders
FREEZE_PROTEIN      = False    # only used if TRAIN_ENCODERS=True
FREEZE_STRUCTURE    = False    # only used if TRAIN_ENCODERS=True
GRAD_CHECKPOINT     = False

# === Misc ===
DEVICE              = "cuda" if torch.cuda.is_available() else "cpu"
MAX_LEN             = 512
BSZ                 = 2
ACCUM               = 1
LR                  = 5e-5
WARMUP_RATIO        = 0.03
EPOCHS              = 1
OUTPUT_DIR          = "runs/colab_smoketest"
LOG_EVERY           = 1

print("Device:", DEVICE)

Device: cuda


In [None]:
# SUBSET_JSONL = os.path.join(PROJECT_DIR, "train_subset_100.jsonl")

# # Write first 1000 non-empty lines to subset
# count = 0
# with open(DATA_JSONL, "r", encoding="utf-8") as fin, open(SUBSET_JSONL, "w", encoding="utf-8") as fout:
#     for line in fin:
#         if not line.strip():
#             continue
#         fout.write(line)
#         count += 1
#         if count >= 100:
#             break

# print("Wrote subset lines:", count, "->", SUBSET_JSONL)

In [None]:
from train_prefix_qwen import train, parse_args
SUBSET_JSONL = os.path.join(PROJECT_DIR, "train_subset_100.jsonl")

In [None]:
import types, os

SAVE_DIR = os.path.join(PROJECT_DIR, "runs_colab_test")

args = types.SimpleNamespace(
    # Data
    train_file   = DATA_JSONL,
    val_file     = None,
    batch_size   = 1,         # adjust if you want
    accum_steps  = 2,
    max_len      = 1024,       # keep modest for speed
    # Model
    model_name   = MODEL_NAME,
    dtype        = "fp32",    # or "bf16" on A100 for speed
    prefix_len   = 4,         # try 1 or 4+
    prefix_gate  = 0.5,       # stabilizer on the soft prefix
    learnable_gate = False,
    freeze_llm   = False,     # True = projector-only
    train_encoders = True,   # keep ESM encoders frozen for speed
    # Encoders
    protein_config = PROTEIN_CONFIG,
    structure_config = STRUCTURE_CONFIG,
    protrek_ckpt  = PROTREK_CKPT,
    prot_slot     = 1,
    stru_slot     = 3,
    # Optim
    epochs      = 3,
    lr          = 3e-5,       # projector+LLM small LR
    weight_decay= 0.05,
    # Save/eval
    save_dir    = OUT_DIR,
    save_every  = 1500,
    eval_every  = 0,
    # Misc
    seed        = 42,
)

# Kick off training
train(args)


# 🔧 FSDP Test Add-on — Import & Unit-Test `train_prefix_qwen_fsdp.py`

This section adds **unit tests** for the multi‑GPU training script so you can see **clear tracebacks** directly in Colab, cell-by-cell.
It **does not** spawn multi-process; it runs pieces in‑notebook to validate shapes, masking, and forward loss.
When you're ready for multi‑GPU, use `accelerate launch` externally.



## 0) Install/Check Dependencies
Run this if your Colab doesn't already have the right versions.


In [7]:

# If needed, upgrade pip
# %pip -q install --upgrade pip

# Core deps (Torch build here is CUDA 12.1; adjust per your Colab runtime/GPU)
%pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip -q install transformers>=4.43.0 peft accelerate datasets tqdm



## 1) Ensure files are present & on `sys.path`
Expected files in the current working directory:
- `protein_encoder.py`
- `structure_encoder.py`
- `train_prefix_qwen_fsdp.py`  ← new multi-GPU ready trainer


In [8]:
os.getcwd()

'/content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines'

In [9]:

import os, sys, glob, textwrap

need = ["protein_encoder.py", "structure_encoder.py", "train_prefix_qwen_fsdp.py"]
missing = [p for p in need if not os.path.exists(p)]
print("CWD:", os.getcwd())
print("Missing files:", missing)

# Add CWD to path for `from train_prefix_qwen_fsdp import ...`
if os.getcwd() not in sys.path:
    sys.path.insert(0, os.getcwd())

if missing:
    print("\n⚠️ Some files are missing. Upload them via the Colab file browser or mount Google Drive and %cd there.")
else:
    print("✅ All required files found.")


CWD: /content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines
Missing files: []
✅ All required files found.



## 2) Import from `train_prefix_qwen_fsdp.py`
We import classes/functions so we can call them directly and see tracebacks.


In [10]:

from train_prefix_qwen_fsdp import (
    BigProteinQwen,
    CollateCfg,
    PadAndMaskCollator,
    JsonlStream,
    train as fsdp_train,
)

print("Imported:", BigProteinQwen, CollateCfg, PadAndMaskCollator, JsonlStream, fsdp_train)


Imported: <class 'train_prefix_qwen_fsdp.BigProteinQwen'> <class 'train_prefix_qwen_fsdp.CollateCfg'> <class 'train_prefix_qwen_fsdp.PadAndMaskCollator'> <class 'train_prefix_qwen_fsdp.JsonlStream'> <function train at 0x7c18ffd01260>



## 3) Quick Configs (small models for unit tests)
Use a tiny Qwen and small ESM so everything runs fast on Colab.


In [11]:

MODEL_NAME       = "Qwen/Qwen2.5-0.5B-Instruct"     # small LLM for smoke tests
PROTEIN_CONFIG   = "facebook/esm2_t12_35M_UR50D"    # small ESM
STRUCTURE_CONFIG = "facebook/esm2_t12_35M_UR50D"

SINGLE_TOKEN_PREFIX = False    # switch to True to test single-token prefix
PREFIX_LEN          = 4
PROJ_HID            = 1024
DROPOUT             = 0.10

MAX_LEN    = 256
BATCH_SIZE = 2

print("Configs ready.")

PROTREK_CKPT   = "/content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/ProTrek_35M.pt"
PROT_SLOT = 1
STRU_SLOT = 3


Configs ready.



## 4) Create Tiny JSONL Data (seq-only and seq+structure)


In [17]:

import json, os
os.makedirs("sft_data", exist_ok=True)

toy = [
    {
        "prompt": "Describe the likely function of this protein.",
        "response": "This appears to be an enzyme with possible hydrolase activity.",
        "aa_seq": "MKTFFVAIATGAFSATA",
        "stru_str": "ACDEFGHIKLMNPQRSTVWY"
    },
    {
        "prompt": "What domain might this protein contain?",
        "response": "Likely contains a Rossmann-like fold domain.",
        "aa_seq": "MGDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLHGLFGRKTGQAP",
        "stru_str": "ACDEFGHIKLMNPQRSTVWY"
    }
]

with open("sft_data/train_tiny.jsonl", "w", encoding="utf-8") as f:
    for ex in toy:
        f.write(json.dumps(ex) + "\n")

with open("sft_data/val_tiny.jsonl", "w", encoding="utf-8") as f:
    for ex in toy:
        f.write(json.dumps(ex) + "\n")

print("Wrote sft_data/train_tiny.jsonl & val_tiny.jsonl")


Wrote sft_data/train_tiny.jsonl & val_tiny.jsonl



## 5) Collator sanity check (masking & shapes)


In [18]:

from transformers import AutoTokenizer
import torch

tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

collate = PadAndMaskCollator(CollateCfg(tokenizer=tok, max_len=MAX_LEN))

# Load two rows manually (not streaming) to see clear errors if any
rows = []
with open("sft_data/train_tiny.jsonl", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 2: break
        rows.append(json.loads(line))

batch = collate(rows)
for k, v in batch.items():
    if isinstance(v, torch.Tensor):
        print(k, tuple(v.shape), v.dtype)
    else:
        if isinstance(v, list):
            print(k, type(v), len(v))
        else:
            print(k, type(v))

print("Sample labels (first 40):", batch["labels"][0][:40].tolist())


You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids (2, 24) torch.int64
attention_mask (2, 24) torch.int64
labels (2, 24) torch.int64
aa_seq <class 'list'> 2
stru_str <class 'list'> 2
Sample labels (first 40): [-100, -100, -100, -100, -100, -100, -100, -100, -100, 1986, 7952, 311, 387, 458, 48142, 448, 3204, 6275, 67, 1080, 519, 5702, 13, 151645]



## 6) Model forward pass (no FSDP wrapping here; pure unit test)


In [19]:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
big = BigProteinQwen(
    model_name=MODEL_NAME,
    protein_config=PROTEIN_CONFIG,
    structure_config=STRUCTURE_CONFIG,
    protrek_ckpt=PROTREK_CKPT,
    prot_slot=PROT_SLOT,
    stru_slot=STRU_SLOT,
    single_token_prefix=SINGLE_TOKEN_PREFIX,
    prefix_len=PREFIX_LEN,
    proj_hid=PROJ_HID,
    dropout=DROPOUT,
    train_encoders=True,      # freeze encoders in quick test
).to(device)

for k in ("input_ids","attention_mask","labels"):
    batch[k] = batch[k].to(device)

with torch.no_grad():
    out = big(**batch)
print("Forward OK. loss:", float(out.loss.detach().cpu()))


RuntimeError: mat1 and mat2 must have the same dtype, but got BFloat16 and Float


## 7) One optimizer step (unit test)


In [None]:

import torch
params = [p for p in big.parameters() if p.requires_grad]
opt = torch.optim.AdamW(params, lr=5e-5)

out = big(**batch)
loss = out.loss
loss.backward()
torch.nn.utils.clip_grad_norm_(params, 1.0)
opt.step(); opt.zero_grad(set_to_none=True)
print("1 step OK. loss:", float(loss.detach().cpu()))



## 8) End-to-end smoke test of `train()` on tiny data (single process)
This calls `train()` directly (so you can see tracebacks in‑notebook). It uses Accelerate but **won't** spawn multi‑process here.
For real multi‑GPU runs, use `accelerate launch` outside the notebook.


In [None]:

from argparse import Namespace

args = Namespace(
    train_file="sft_data/train_tiny.jsonl",
    val_file="sft_data/val_tiny.jsonl",
    max_len=MAX_LEN,
    batch_size=1,
    accum_steps=2,
    model_name=MODEL_NAME,
    protein_config=PROTEIN_CONFIG,
    structure_config=STRUCTURE_CONFIG,
    single_token_prefix=SINGLE_TOKEN_PREFIX,
    prefix_len=PREFIX_LEN,
    proj_hid=PROJ_HID,
    dropout=DROPOUT,
    protrek_ckpt=PROTREK_CKPT,
    prot_slot=PROT_SLOT,
    stru_slot=STRU_SLOT,
    train_encoders=False,
    epochs=1,
    lr=5e-5,
    warmup_ratio=0.03,
    weight_decay=0.05,
    save_dir="./runs_colab_fsdp_smoke",
    save_every=0,
    log_every=1,
    seed=42,
    dtype="bf16",
)

# This will run a tiny epoch in the notebook so you can see errors inline.
# It's a quick sanity check before using accelerate launch with multiple GPUs.
fsdp_train(args)
print("train() smoke test complete.")
