
# BigProtein-Qwen2.5 — Step‑by‑Step Test Notebook (Colab)
This notebook lets you **test each component** of the protein‑conditioned Qwen2.5 pipeline *before* running full training.  
It mirrors the main script logic, but runs **function‑by‑function** so you can see errors early with clear tracebacks.

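> **Files expected in the working directory** (upload or mount a folder containing them):  
> - `bigmodel_joint_train.py`  
> - `protein_encoder.py`  
> - `structure_encoder.py`  
> - `train_prefix_qwen.py` (or another `train_prefix_*.py`), which the training cells import `train` and `parse_args` from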


In [1]:
#@title Imports
from pathlib import Path
from huggingface_hub import snapshot_download
import os, json, pickle, pandas as pd
from tqdm import tqdm
from rich import print as rprint

In [3]:
# Mount Google Drive at /content/drive; force_remount=True remounts even if it is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Make the testing_pipelines folder the working directory for the rest of the notebook.
%cd /content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines

from pathlib import Path
# Data root on Drive and a demo output folder beneath it.
# NOTE(review): OUT_DIR is reassigned to a plain string (a different folder) in the
# "LLM & Encoders" configuration cell further down, so this Path value does not survive.
BASE_DIR = Path("/content/drive/MyDrive/LLM/Bioreasoner/data/hf/proteinDT")
OUT_DIR  = BASE_DIR / "sft_test_demo"
print(f"Using Google Drive folder as BASE_DIR: {BASE_DIR}")


Mounted at /content/drive
/content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines
Using Google Drive folder as BASE_DIR: /content/drive/MyDrive/LLM/Bioreasoner/data/hf/proteinDT



## 0) Runtime & Installs
If you're on Google Colab, run this cell to install dependencies.


In [4]:
# Check GPU
!nvidia-smi

# Fresh pip + libs (PyTorch CUDA 12.6 build, per the cu126 wheel index, + matching libs)
%pip -q install --upgrade pip
%pip install -q --index-url https://download.pytorch.org/whl/cu126 \
  torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0
# Hugging Face stack: transformers and huggingface_hub are pinned; tqdm and safetensors are not.
%pip -q install transformers==4.56.1 huggingface_hub==0.35.0 tqdm safetensors

Thu Oct  2 14:44:02 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   31C    P0             52W /  400W |       0MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
# --- Version & import sanity checks ---
import torch, transformers, huggingface_hub

# Print the installed version of each core library, one per line.
for label, lib in (
    ("torch            :", torch),
    ("transformers     :", transformers),
    ("huggingface_hub  :", huggingface_hub),
):
    print(label, lib.__version__)

# Top-level ESM import should work on 4.56.1
try:
    from transformers import AutoTokenizer, EsmForMaskedLM
except Exception as top_level_err:
    print("❌ Top-level EsmForMaskedLM import failed:", repr(top_level_err))
    # Fallback check (direct module path)
    try:
        from transformers.models.esm.modeling_esm import EsmForMaskedLM as _E
    except Exception as fallback_err:
        print("❌ Direct modeling_esm import failed too:", repr(fallback_err))
    else:
        print("✅ Direct modeling_esm import OK (fallback)")
else:
    print("✅ Top-level EsmForMaskedLM import OK")

torch            : 2.8.0+cu126
transformers     : 4.56.1
huggingface_hub  : 0.35.0
✅ Top-level EsmForMaskedLM import OK


torch            : 2.8.0+cu126

transformers     : 4.56.1

huggingface_hub  : 0.35.0

✅ Top-level EsmForMaskedLM import OK


## 1) Loading Encoder Checkpoints

In [6]:

# === LLM & Encoders ===
# Hugging Face model id for the LLM, plus Google Drive paths for the encoder
# configs, the ProTrek checkpoint, the project folder, the training data and
# the run output folder.
MODEL_NAME         = "Qwen/Qwen2.5-0.5B-Instruct"   # Small-ish for Colab testing
PROTEIN_CONFIG = "/content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/esm2_t12_35M_UR50D"
STRUCTURE_CONFIG = "/content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/foldseek_t12_35M"
PROTREK_CKPT    = "/content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/ProTrek_35M.pt"
PROJECT_DIR = "/content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines"
DATA_JSONL = "/content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines/protein2desc_sft_ALLFOUR_c000-009_fullcot.jsonl"
# NOTE(review): this rebinds OUT_DIR (a Path set in the Drive-mount cell) to a plain
# string under testing_notebooks/, whereas PROJECT_DIR is testing_pipelines/ —
# confirm the folder is intended.
OUT_DIR = "/content/drive/MyDrive/LLM/Bioreasoner/testing_notebooks/runs_colab_test"


def exists_label(path):
    """Return the status prefix for `path`.

    The check mark is used only when the path exists; a missing path gets a
    cross, so the report cannot show a success mark next to `False`.
    """
    return "✓ exists:" if os.path.exists(path) else "✗ exists:"


# Report every configured path up front so a missing file is spotted before training starts.
for p in [PROJECT_DIR, DATA_JSONL, PROTEIN_CONFIG, STRUCTURE_CONFIG, PROTREK_CKPT, OUT_DIR]:
    print(exists_label(p), os.path.exists(p), p)



✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines
✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/testing_pipelines/protein2desc_sft_ALLFOUR_c000-009_fullcot.jsonl
✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/esm2_t12_35M_UR50D
✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/foldseek_t12_35M
✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/protrek/weights/ProTrek_35M/ProTrek_35M.pt
✓ exists: True /content/drive/MyDrive/LLM/Bioreasoner/testing_notebooks/runs_colab_test


In [7]:
# --- Prefix / projector settings ---
# SINGLE_TOKEN_PREFIX: True -> 1 token; False -> soft prefix of length PREFIX_LEN.
SINGLE_TOKEN_PREFIX = False
PREFIX_LEN = 4
PROJ_HID = 1024
DROPOUT = 0.10

# --- Training toggles ---
USE_LORA = False
# TRAIN_ENCODERS: True = end-to-end; False = freeze encoders.
TRAIN_ENCODERS = False
# The two FREEZE_* flags are only used if TRAIN_ENCODERS=True.
FREEZE_PROTEIN = False
FREEZE_STRUCTURE = False
GRAD_CHECKPOINT = False

# --- Misc ---
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"
MAX_LEN = 512
BSZ = 2
ACCUM = 1
LR = 5e-5
WARMUP_RATIO = 0.03
EPOCHS = 1
OUTPUT_DIR = "runs/colab_smoketest"
LOG_EVERY = 1

print("Device:", DEVICE)

Device: cuda


In [None]:
# SUBSET_JSONL = os.path.join(PROJECT_DIR, "train_subset_100.jsonl")

# # Write first 100 non-empty lines to subset
# count = 0
# with open(DATA_JSONL, "r", encoding="utf-8") as fin, open(SUBSET_JSONL, "w", encoding="utf-8") as fout:
#     for line in fin:
#         if not line.strip():
#             continue
#         fout.write(line)
#         count += 1
#         if count >= 100:
#             break

# print("Wrote subset lines:", count, "->", SUBSET_JSONL)

In [8]:
# Training entry point and argument parser from the project-local train_prefix_qwen module.
from train_prefix_qwen import train, parse_args
# Path of the subset file under PROJECT_DIR.
# NOTE(review): the only visible code that writes this file is the commented-out cell above,
# and the training cell that follows passes DATA_JSONL instead — confirm SUBSET_JSONL is still needed.
SUBSET_JSONL = os.path.join(PROJECT_DIR, "train_subset_100.jsonl")

In [None]:
# Build the argument namespace by hand (no CLI parsing) and launch a training run
# on the full DATA_JSONL file with the Drive-hosted encoder configs and checkpoint.
import types, os

# NOTE(review): SAVE_DIR is defined here but never used below; save_dir is set to OUT_DIR.
SAVE_DIR = os.path.join(PROJECT_DIR, "runs_colab_test")

args = types.SimpleNamespace(
    # Data
    train_file   = DATA_JSONL,
    val_file     = None,
    batch_size   = 1,         # adjust if you want
    accum_steps  = 2,
    max_len      = 1024,       # keep modest for speed
    # Model
    model_name   = MODEL_NAME,
    dtype        = "fp32",    # or "bf16" on A100 for speed
    prefix_len   = 4,         # try 1 or 4+
    prefix_gate  = 0.5,       # stabilizer on the soft prefix
    learnable_gate = False,
    freeze_llm   = False,     # True = projector-only
    train_encoders = True,   # NOTE(review): True presumably trains the encoders (this line used to say "frozen"); set False to freeze them — confirm
    # Encoders
    protein_config = PROTEIN_CONFIG,
    structure_config = STRUCTURE_CONFIG,
    protrek_ckpt  = PROTREK_CKPT,
    prot_slot     = 1,
    stru_slot     = 3,
    # Optim
    epochs      = 3,
    lr          = 3e-5,       # projector+LLM small LR
    weight_decay= 0.05,
    # Save/eval
    save_dir    = OUT_DIR,
    save_every  = 1500,
    eval_every  = 0,
    # Misc
    seed        = 42,
)

# Kick off training
train(args)

## 🧪 Test training with `train_prefix_*` (imports `train`, `parse_args`)

In [None]:
%pip -q install --upgrade accelerate transformers peft datasets

In [None]:
# Locate a train_prefix_*.py script in the current directory, import it as a module
# and expose its `train` and `parse_args` callables as notebook globals.
import glob, importlib, sys
from importlib import reload
from pathlib import Path

# Optionally set MODULE_NAME manually, e.g.:
# MODULE_NAME = 'train_prefix_qwen_fsdp'
MODULE_NAME = None

candidates = sorted(glob.glob('train_prefix_*.py'))
print('Found candidates:', candidates)
if MODULE_NAME is None:
    if not candidates:
        raise FileNotFoundError('Put your train_prefix_*.py in the current directory.')
    # Last entry of the sorted list, i.e. the lexicographically greatest file name
    # (not the most recently modified file).
    MODULE_NAME = Path(candidates[-1]).stem
print('Importing from module:', MODULE_NAME)

# Reload only when the module is already loaded, so edits are picked up on a re-run.
# On a first import the module body runs once rather than twice.
if MODULE_NAME in sys.modules:
    mod = reload(sys.modules[MODULE_NAME])
else:
    mod = importlib.import_module(MODULE_NAME)

# Raises AttributeError if the chosen module lacks either entry point.
train = getattr(mod, 'train')
parse_args = getattr(mod, 'parse_args')
print('Imported: train, parse_args from', MODULE_NAME)

In [None]:
import os, json

# Two hand-written records with the fields prompt / response / aa_seq / stru_str.
# The first record has no structure string (None); the second has one.
toy = [
    {
        'prompt': 'Describe the likely function of this protein.',
        'response': 'This appears to be an enzyme with possible hydrolase activity.',
        'aa_seq': 'MKTFFVAIATGAFSATA',
        'stru_str': None
    },
    {
        'prompt': 'What domain might this protein contain?',
        'response': 'Likely contains a Rossmann-like fold domain.',
        'aa_seq': 'MGDVEKGKKIFIMKCSQCHTVEKGGKHKTGPNLHGLFGRKTGQAP',
        'stru_str': 'ACDEFGHIKLMNPQRSTVWY'
    }
]

# The same records are written, one JSON object per line, to both the train and the val file.
os.makedirs('sft_data', exist_ok=True)
for split_path in ('sft_data/train_tiny.jsonl', 'sft_data/val_tiny.jsonl'):
    with open(split_path, 'w', encoding='utf-8') as f:
        for ex in toy:
            f.write(json.dumps(ex) + '\n')
print('Wrote sft_data/train_tiny.jsonl and sft_data/val_tiny.jsonl')

In [None]:
# Command-line options for a one-epoch smoke test on the tiny JSONL files,
# kept as an ordered flag -> value mapping and flattened into an argv list.
cli_options = {
    '--train-file': 'sft_data/train_tiny.jsonl',
    '--val-file': 'sft_data/val_tiny.jsonl',
    '--model-name': 'Qwen/Qwen2.5-0.5B-Instruct',
    '--protein-config': 'facebook/esm2_t12_35M_UR50D',
    '--structure-config': 'facebook/esm2_t12_35M_UR50D',
    '--prefix-len': '2',
    '--batch-size': '1',
    '--accum-steps': '1',
    '--max-len': '256',
    '--epochs': '1',
    '--save-dir': 'runs/test',
    '--log-every': '1',
}
# Dicts preserve insertion order, so argv comes out as flag, value, flag, value, ...
argv = [token for flag_and_value in cli_options.items() for token in flag_and_value]
print('Args:', argv)

# Parse exactly as the script would from the command line, show the result, then train.
args = parse_args(argv)
print(args)
train(args)


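> **Multi-GPU tip (run from a terminal, not inside this notebook cell; `${MODULE_NAME}` is a shell variable there, so export it first or substitute your script name, e.g. `train_prefix_qwen_fsdp`):**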
> ```bash
> accelerate launch --config_file accelerate_fsdp_bf16.yaml ${MODULE_NAME}.py \
>   --train-file sft_data/train_tiny.jsonl \
>   --val-file   sft_data/val_tiny.jsonl \
>   --model-name Qwen/Qwen2.5-7B-Instruct \
>   --protein-config facebook/esm2_t12_35M_UR50D \
>   --structure-config facebook/esm2_t12_35M_UR50D \
>   --prefix-len 4 --batch-size 1 --accum-steps 8 --max-len 2048 \
>   --dtype bf16 --lr 1e-5 --weight-decay 0.05 \
>   --save-dir ./runs --save-every 1000 --eval-every 0
> ```
