
# BigProtein-Qwen2.5 — Step‑by‑Step Test Notebook (Colab)
This notebook lets you **test each component** of the protein‑conditioned Qwen2.5 pipeline *before* running full training.  
It mirrors the main script logic, but runs **function‑by‑function** so you can see errors early with clear tracebacks.

> **Files expected in the working directory** (upload or mount a folder containing them):  
> - `bigmodel_joint_train.py`  
> - `protein_encoder.py`  
> - `structure_encoder.py`


In [1]:
#@title Imports
from pathlib import Path
from huggingface_hub import snapshot_download
import os, json, pickle, pandas as pd
from tqdm import tqdm
from rich import print as rprint

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

WORKDIR = Path("/content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines")
%cd {WORKDIR}

from pathlib import Path
BASE_DIR = Path("/content/drive/MyDrive/LLM/Bioreasoner/data/hf/proteinDT")
OUT_DIR  = BASE_DIR / "sft_test_demo"
print(f"Using Google Drive folder as BASE_DIR: {BASE_DIR}")


Mounted at /content/drive
/content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines
Using Google Drive folder as BASE_DIR: /content/drive/MyDrive/LLM/Bioreasoner/data/hf/proteinDT



## 0) Runtime & Installs
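If you're on Google Colab, run the install cell below to install dependencies. (The first cell in this section is a fully commented-out full reinstall, kept for reference.)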


In [3]:
# # Check GPU
# !nvidia-smi

# #@title Install dependencies (Torch + requirements_offload)
# import subprocess, sys, os, json, textwrap
# from pathlib import Path

# REQ = Path("requirements_offload.txt")
# if not REQ.exists():
#     print("requirements_offload.txt not found here. Listing directory:")
#     print(os.listdir("."))

# # Fresh pip + libs (PyTorch CUDA 12.6 build + matching libs)
# %pip -q install --upgrade pip
# %pip install -q --index-url https://download.pytorch.org/whl/cu126 \
#   torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0
# %pip -q install transformers==4.56.1 huggingface_hub==0.35.0 tqdm safetensors

# # Core deps
# !pip -q install -r requirements_offload.txt

# import torch, transformers, accelerate, huggingface_hub, tqdm as _tqdm
# print("torch:", torch.__version__)
# print("transformers:", transformers.__version__)
# print("accelerate:", accelerate.__version__)
# print("huggingface_hub:", huggingface_hub.__version__)

In [4]:
# If you already pinned these in the runtime, you can skip reinstalling.
!pip -q install --upgrade pip
!pip -q install transformers==4.56.1 huggingface_hub==0.35.0 accelerate>=0.33.0 safetensors tqdm bitsandbytes

import os, torch, subprocess, json, math, re, time, sys
from pathlib import Path

# Helpful for CUDA fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512,expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Show GPU
!nvidia-smi || true
print("torch:", torch.__version__)

Thu Oct  2 13:41:26 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   32C    P0             51W /  400W |       0MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

torch            : 2.8.0+cu126

transformers     : 4.56.1

huggingface_hub  : 0.35.0

✅ Top-level EsmForMaskedLM import OK


## 1) Loading Encoder Checkpoints

In [5]:

# === LLM & Encoders ===
# Model identifier plus Drive locations of the encoder configs, the ProTrek
# checkpoint, and the training data. All of these are forwarded to the
# training script as command-line arguments by the launch cell.
MODEL_NAME         = "Qwen/Qwen2.5-0.5B-Instruct"   # Small-ish for Colab testing
PROTEIN_CONFIG = "/content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/esm2_t12_35M_UR50D"
STRUCTURE_CONFIG = "/content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/foldseek_t12_35M"
PROTREK_CKPT    = "/content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/ProTrek_35M.pt"
PROJECT_DIR = "/content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines"
TRAIN_FILE = "/content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines/protein2desc_sft_ALLFOUR_c000-009_fullcot.jsonl"
VAL_FILE   = None  # or path
# NOTE(review): OUT_DIR is also assigned in the Drive-mount cell
# (BASE_DIR / "sft_test_demo"); this string value replaces it from here on.
OUT_DIR = "/content/drive/MyDrive/LLM/Bioreasoner/testing_notebooks/runs_colab_test"

# Training entry point, resolved relative to the working directory.
SCRIPT = WORKDIR / "train_prefix_qwen_fsdp_offload1.py"

# Report whether each configured path exists. The marker reflects the result,
# so a missing path is shown with a cross instead of a misleading check mark.
for p in [PROJECT_DIR, TRAIN_FILE, PROTEIN_CONFIG, STRUCTURE_CONFIG, PROTREK_CKPT, OUT_DIR]:
    found = os.path.exists(p)
    print("✓" if found else "✗", "exists:", found, p)



✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines
✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines/protein2desc_sft_ALLFOUR_c000-009_fullcot.jsonl
✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/esm2_t12_35M_UR50D
✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/foldseek_t12_35M
✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/ProTrek_35M.pt
✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/testing_notebooks/runs_colab_test


In [6]:
# Slot indices forwarded to the training script as --prot-slot / --stru-slot.
PROT_SLOT, STRU_SLOT = 1, 3
# Training knobs (friendly defaults for single-GPU offload on Colab)
BATCH_SIZE   = 1
ACCUM_STEPS  = 8
MAX_LEN      = 1024
PREFIX_LEN   = 4
PREFIX_GATE  = 1.0
EPOCHS       = 1
LR           = 1e-4
OPTIMIZER    = "adamw"   # "adamw" | "adam8bit" | "adafactor"
TRAIN_ENCODS = True         # backprop encoders (slower; set False to freeze)
FREEZE_LLM   = False

# Single definition of the checkpoint directory. This cell previously assigned
# SAVE_DIR twice; the first value
# ("/content/drive/MyDrive/LLM/Bioreasoner/colab_runs/qwen05b_prefix_test")
# was dead code because it was immediately overwritten by the one kept here.
# NOTE(review): this path is outside the mounted Drive (/content/drive), so it
# presumably does not survive a Colab runtime reset, and its name says "qwen7b"
# although MODEL_NAME is a 0.5B model — confirm which location is intended.
SAVE_DIR     = "/content/checkpoints/qwen7b_bf16_offload"
SAVE_EVERY   = 0            # 0 = only save final
EVAL_EVERY   = 0            # 0 = no mid-epoch eval

# Check files exist
for p in [SCRIPT, TRAIN_FILE, PROTEIN_CONFIG, STRUCTURE_CONFIG, PROTREK_CKPT]:
    print("OK:" if Path(p).exists() else "MISSING:", p)

OK: /content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines/train_prefix_qwen_fsdp_offload1.py
OK: /content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines/protein2desc_sft_ALLFOUR_c000-009_fullcot.jsonl
OK: /content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/esm2_t12_35M_UR50D
OK: /content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/foldseek_t12_35M
OK: /content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/ProTrek_35M.pt


In [7]:
# SUBTRAIN_FILE = os.path.join(PROJECT_DIR, "train_subset_100.jsonl")

# # Write first 100 non-empty lines to subset
# count = 0
# with open(TRAIN_FILE, "r", encoding="utf-8") as fin, open(SUBTRAIN_FILE, "w", encoding="utf-8") as fout:
#     for line in fin:
#         if not line.strip():
#             continue
#         fout.write(line)
#         count += 1
#         if count >= 100:
#             break

# print("Wrote subset lines:", count, "->", SUBTRAIN_FILE)

In [8]:
import json
from statistics import mean

# Scan the training JSONL once and record, for every usable record, the
# character length of its prompt and of its response. Blank lines and records
# lacking either key are ignored.
prompt_chars, resp_chars = [], []
with open(TRAIN_FILE, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            ex = json.loads(line)
            if "prompt" in ex and "response" in ex:
                prompt_chars.append(len(ex["prompt"]))
                resp_chars.append(len(ex["response"]))

# Number of usable samples (one entry is appended to each list per sample).
N = len(prompt_chars)

def pct(x, p):
    """Return the value found at fraction ``p`` of the sorted copy of ``x``.

    The position is ``int(p * len(x))``, clamped into ``[0, len(x) - 1]``;
    no interpolation is performed.
    """
    last = len(x) - 1
    idx = int(p * len(x))
    if idx > last:
        idx = last
    if idx < 0:
        idx = 0
    return sorted(x)[idx]

# Summarise the collected lengths: one line per field, then a rough token
# count that assumes 4 characters per token and no length cap.
print(f"Samples: {N}")
if N:
    for label, chars in (("Prompt", prompt_chars), ("Response", resp_chars)):
        print(f"{label} chars — mean={mean(chars):.0f}, min={min(chars)}, p50={pct(chars,0.5)}, p90={pct(chars,0.9)}, p95={pct(chars,0.95)}, max={max(chars)}")
    approx_tokens = sum((p+r)/4 for p,r in zip(prompt_chars, resp_chars))
    print(f"~Estimated tokens/epoch (capless, 4 chars/token): {int(approx_tokens):,}")

Samples: 7782
Prompt chars — mean=337, min=337, p50=337, p90=337, p95=337, max=337
Response chars — mean=1530, min=601, p50=1453, p90=1992, p95=2282, max=10615
~Estimated tokens/epoch (capless, 4 chars/token): 3,632,726


In [9]:
# Accelerate launch configuration, written next to the training script and
# passed to `accelerate launch --config_file` by the launch cell.
# NOTE(review): confirm the fsdp_* key names below (in particular
# `fsdp_backward_prefetch_policy` and `fsdp_cpu_offload`) against the
# installed accelerate version — not verifiable from this notebook.
_ACCEL_YAML = """\
compute_environment: LOCAL_MACHINE
distributed_type: FSDP
mixed_precision: bf16
num_processes: 1
downcast_bf16: false
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_backward_prefetch_policy: BACKWARD_POST
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_use_orig_params: true
  fsdp_offload_params: true
  fsdp_cpu_offload: true
"""

ACCEL_CFG = WORKDIR.joinpath("accelerate_cpu_offload_bf16.yaml")
ACCEL_CFG.write_text(_ACCEL_YAML)
print("Wrote:", ACCEL_CFG)

Wrote: /content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines/accelerate_cpu_offload_bf16.yaml


In [10]:
import accelerate, sys
# Show which accelerate build the kernel actually imported.
print(f"accelerate: {accelerate.__version__}")

accelerate: 1.10.1


In [11]:
import math, re, shlex, subprocess, sys, time
from tqdm import tqdm

# Estimate optimizer steps for the progress bar (world_size=1 on Colab):
# micro-batches per epoch, grouped by gradient accumulation. The per-epoch and
# whole-run figures are kept separate so the printed label is accurate when
# EPOCHS > 1.
steps_per_rank = math.ceil(N / max(1, BATCH_SIZE))
opt_steps_per_epoch = math.ceil(steps_per_rank / max(1, ACCUM_STEPS))
opt_steps_total = opt_steps_per_epoch * max(1, EPOCHS)
print(
    f"~Optimizer steps/epoch: ≈{opt_steps_per_epoch} "
    f"(B={BATCH_SIZE}, accum={ACCUM_STEPS}, N={N}); "
    f"total over {max(1, EPOCHS)} epoch(s): ≈{opt_steps_total}"
)

# Command line for the training script; every value comes from the
# configuration cells above.
cmd = [
    "accelerate", "launch", "--config_file", str(ACCEL_CFG),
    str(SCRIPT),
    "--train-file", TRAIN_FILE,
    "--model-name", MODEL_NAME,
    "--protein-config", PROTEIN_CONFIG,
    "--structure-config", STRUCTURE_CONFIG,
    "--protrek-ckpt", PROTREK_CKPT,
    "--prot-slot", str(PROT_SLOT), "--stru-slot", str(STRU_SLOT),
    "--batch-size", str(BATCH_SIZE),
    "--accum-steps", str(ACCUM_STEPS),
    "--max-len", str(MAX_LEN),
    "--prefix-len", str(PREFIX_LEN),
    "--prefix-gate", str(PREFIX_GATE),
    "--epochs", str(EPOCHS),
    "--lr", str(LR),
    "--optimizer", OPTIMIZER,
    "--save-dir", SAVE_DIR,
    "--save-every", str(SAVE_EVERY),
]

# Optional flags / arguments.
if TRAIN_ENCODS:
    cmd.append("--train-encoders")
if FREEZE_LLM:
    cmd.append("--freeze-llm")
if VAL_FILE:
    cmd += ["--val-file", VAL_FILE]
if EVAL_EVERY:
    cmd += ["--eval-every", str(EVAL_EVERY)]

# shlex.join quotes any argument containing spaces or shell metacharacters, so
# the echoed command can be copy-pasted into a terminal as-is.
print("Launching:\n", shlex.join(cmd))

# Run and live-parse "step ..." lines for progress
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=1, text=True)

step_re = re.compile(r"step\s+(\d+)\s*\|")  # matches: step 50 | loss=...
pbar = tqdm(total=opt_steps_total, desc="optimizer steps", dynamic_ncols=True)
last_step_seen = 0
interrupted = False
returncode = None
start = time.time()

try:
    for line in proc.stdout:
        # Show raw logs
        sys.stdout.write(line)
        sys.stdout.flush()
        # Try to parse "step N"
        m = step_re.search(line)
        if m:
            s = int(m.group(1))
            if s > last_step_seen:
                pbar.update(s - last_step_seen)
                last_step_seen = s
except KeyboardInterrupt:
    interrupted = True
    proc.terminate()
finally:
    returncode = proc.wait()
    proc.stdout.close()
    # Only top the bar up to 100% when the run really succeeded (e.g. a very
    # small run that printed no step lines). A crashed or interrupted run must
    # not be displayed as complete.
    if returncode == 0 and not interrupted and last_step_seen < opt_steps_total:
        pbar.update(opt_steps_total - last_step_seen)
    pbar.close()

elapsed = int(time.time() - start)
if interrupted:
    print(f"Interrupted after ~{elapsed}s (training process terminated, exit code {returncode}).")
elif returncode != 0:
    # Surface the failure so "Run All" stops here instead of reporting success.
    print(f"Training FAILED with exit code {returncode} after ~{elapsed}s; see the log above.")
    raise subprocess.CalledProcessError(returncode, cmd)
else:
    print(f"Done. Elapsed ~{elapsed}s")

~Optimizer steps/epoch: ≈973 (B=1, accum=8, N=7782)
Launching:
 accelerate launch --config_file /content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines/accelerate_cpu_offload_bf16.yaml /content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines/train_prefix_qwen_fsdp_offload1.py --train-file /content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines/protein2desc_sft_ALLFOUR_c000-009_fullcot.jsonl --model-name Qwen/Qwen2.5-0.5B-Instruct --protein-config /content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/esm2_t12_35M_UR50D --structure-config /content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/foldseek_t12_35M --protrek-ckpt /content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/ProTrek_35M.pt --prot-slot 1 --stru-slot 3 --batch-size 1 --accum-steps 8 --max-len 1024 --prefix-len 4 --prefix-gate 1.0 --epochs 1 --lr 0.0001 --optimizer adam8bit --save-dir /content/checkpoints/qwen7b_bf16_offload --save-every 0 --train-encoders


optimizer steps:   0%|          | 0/973 [00:00<?, ?it/s]

2025-10-02 13:41:43.790264: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-02 13:41:43.809257: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759412503.831102   21964 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759412503.838160   21964 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1759412503.855389   21964 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

optimizer steps: 100%|██████████| 973/973 [00:58<00:00, 16.66it/s]

Done. Elapsed ~58s





In [12]:
import os, glob
# List every *.pt file directly inside the checkpoint directory, in sorted order.
print("Checkpoints in:", SAVE_DIR)
ckpt_paths = glob.glob(os.path.join(SAVE_DIR, "*.pt"))
ckpt_paths.sort()
for ckpt in ckpt_paths:
    print(" -", ckpt)

Checkpoints in: /content/checkpoints/qwen7b_bf16_offload
