# MALTbot — Daily Colab Batch Runner (canonical)

## 원하는 UX (사용자 기준)
1) Colab에서 `notebooks/MALTbot_2.ipynb` 열기
2) 맨 위 **CONFIG 셀만** 수정
   - `BATCH_RUN_NAME`: 예) `2026-02-21_batch1`
   - `EXPERIMENTS`: 예) `["baseline_chgnet", "chgnet_seed43", "chgnet_lr_schedule", ...]`
   - (옵션) `GH_PUSH=True/False`
3) **Run all**

실행 후 자동으로:
- `results/daily/<DATE>/<BATCH_RUN_NAME>/<exp_name>/results.json` 생성 (실험별 1개)
- `RESULTS.md`에 실험별 1줄 append
- GitHub에 브랜치 `colab-<DATE>-<SAFE_BATCH>-<RUN_ID>`로 한 번에 push + PR 링크 출력 (`<RUN_ID>`는 CONFIG 셀에서 실행마다 새로 생성되는 8자리 ID)

> 원칙: `main`에 직접 push하지 않고, PR로만 반영.


In [3]:

import platform

# Report the interpreter version first so it is visible even when torch is broken.
print("Python:", platform.python_version())
try:
    import torch

    print("Torch:", torch.__version__)
    cuda_ok = torch.cuda.is_available()
    print("CUDA available:", cuda_ok)
    # Only query the device name when CUDA is usable; otherwise print the literal "None".
    if cuda_ok:
        gpu_label = torch.cuda.get_device_name(0)
    else:
        gpu_label = "None"
    print("GPU:", gpu_label)
except Exception as e:
    # Any failure while importing or probing torch is reported, not raised,
    # so the remaining cells can still run.
    print("[warn] torch import failed:", repr(e))
    print("[action] Run the torch repair/install cell below, then re-run this cell.")


Python: 3.12.12
Torch: 2.10.0+cu128
CUDA available: True
GPU: NVIDIA L4


In [7]:
# Mount Google Drive and verify that the expected MyDrive directory is present.
# On any failure, print recovery instructions and re-raise the original error.
try:
    from google.colab import drive
    from pathlib import Path

    mydrive_path = Path('/content/drive/MyDrive')
    drive.mount('/content/drive', force_remount=False)
    if not mydrive_path.exists():
        raise RuntimeError("Drive mount path not found.")
    print("Drive mounted: /content/drive/MyDrive")
except Exception as e:
    print(f"[ERROR] Drive mount failed: {e}")
    print("--------------------------------------------------")
    print("[FIX] If you see 'credential propagation' error:")
    print("1. Refresh this page (F5).")
    print("2. Go to Runtime -> Disconnect and delete runtime.")
    print("3. Click 'Run all' again and accept the popup.")
    print("--------------------------------------------------")
    # Bare `raise` re-raises the active exception without adding a re-raise frame.
    raise


MessageError: Error: credential propagation was unsuccessful

In [None]:

from google.colab import userdata
import os

# GH_PUSH policy upfront (change to "0" to disable push workflow)
os.environ.setdefault("MALTBOT_GH_PUSH", "1")
print(f"GH_PUSH(default): {os.environ.get('MALTBOT_GH_PUSH')} (1=enabled, 0=disabled)")

if os.environ.get("MALTBOT_GH_PUSH") == "1":
    missing_token_help = (
        "GH_TOKEN not found in Colab Secrets.\n"
        "Steps: Left sidebar -> Secrets -> Add key 'GH_TOKEN' with repo push scope -> rerun this cell."
    )
    # Colab's userdata.get raises (instead of returning None) when the secret does not
    # exist or notebook access to it was not granted. Convert that into the same
    # actionable RuntimeError, keeping the original error as the cause.
    try:
        raw = userdata.get("GH_TOKEN")
    except Exception as lookup_error:
        raise RuntimeError(missing_token_help) from lookup_error

    # Treat non-string or whitespace-only values as "no token".
    token = raw.strip() if isinstance(raw, str) else ""
    if not token:
        raise RuntimeError(missing_token_help)
    os.environ["GH_TOKEN"] = token
    print("GH_TOKEN loaded from Colab Secrets (hidden)")
else:
    print("GH_PUSH=0: token check skipped")


In [4]:
%%bash
# Ensure /content/MALTbot is a checkout of the repo at main:
# clone when missing, otherwise hard-reset the existing clone to origin/main.
set -euo pipefail
CHECKOUT_DIR="/content/MALTbot"
ORIGIN_URL="https://github.com/seanwoory/MALTbot.git"

if [ ! -d "${CHECKOUT_DIR}/.git" ]; then
  echo "[info] Cloning repo..."
  git clone "${ORIGIN_URL}" "${CHECKOUT_DIR}"
  cd "${CHECKOUT_DIR}"
  # Best effort: a failed checkout must not abort the cell.
  git checkout main || true
else
  echo "[info] Repo exists. Force sync to origin/main..."
  cd "${CHECKOUT_DIR}"
  git fetch origin
  # Best effort: a failed checkout must not abort the cell.
  git checkout main || true
  # Discards local commits and uncommitted changes on the current branch.
  git reset --hard origin/main
fi


[info] Cloning repo...
f422ccf


Cloning into '/content/MALTbot'...


In [2]:

# --- Quick UX ---
# Edit only the variables below, then "Run all":
#   - BATCH_RUN_NAME: e.g. 2026-02-21_batch1
#   - EXPERIMENTS: e.g. ["baseline_chgnet", "chgnet_seed43", ...]
#   - GH_PUSH: when True, results are pushed automatically and a PR link is printed
# After the run, the notebook automatically:
#   - creates results/daily/<DATE>/<SAFE_BATCH>/<exp_name>/results.json
#   - appends one line per experiment to RESULTS.md
#   - pushes to the GitHub branch colab-<DATE>-<SAFE_BATCH>-<RUN_ID>
# ----------------

# CONFIG (edit only this cell)
from datetime import datetime
from zoneinfo import ZoneInfo
import os, re, uuid

# Run date in Korea Standard Time; used in result paths and the branch name.
DATE = datetime.now(ZoneInfo("Asia/Seoul")).strftime("%Y-%m-%d")
BATCH_RUN_NAME = "daily11_alignn"
TASK = "matbench_mp_e_form"
EXPERIMENTS = [
    "alignn_fold0_agile",
]
GH_PUSH = True

# Two-tier run mode: "SMOKE" (ultra-fast) or "FULL" (deeper)
# NOTE(review): RUN_MODE is exported as-is and not validated here; presumably
# scripts/run_experiment.py interprets it -- confirm before adding new modes.
RUN_MODE = "FULL"
SMOKE_FRACTION = 0.0001
FULL_FRACTION = 0.1
SMOKE_EPOCHS = 1
FULL_EPOCHS = 3
SMOKE_PATIENCE = 1
FULL_PATIENCE = 3
SMOKE_BATCH_SIZE = 4
FULL_BATCH_SIZE = 8
# Cache fraction can be decoupled from data fraction (e.g., 0.001 agile cache, 1.0 official cache)
CACHE_FRACTION = 0.001
FOLDS = ["fold_0"]
# Short random id; downstream cells append it to the branch name.
RUN_ID = uuid.uuid4().hex[:8]

# Collapse every run of characters outside [A-Za-z0-9._-] into "-" so the name is
# safe in paths and branch names; fall back to "batch" if nothing is left.
SAFE_BATCH = re.sub(r"[^A-Za-z0-9._-]+", "-", BATCH_RUN_NAME).strip("-") or "batch"

# Export the configuration through environment variables so later Python and
# %%bash cells read the same values.
os.environ["MALTBOT_DATE"] = DATE
os.environ["MALTBOT_BATCH_RUN_NAME"] = SAFE_BATCH
os.environ["MALTBOT_TASK"] = TASK
os.environ["MALTBOT_EXPERIMENTS"] = ",".join(EXPERIMENTS)
os.environ["MALTBOT_GH_PUSH"] = "1" if GH_PUSH else "0"
os.environ["MALTBOT_RUN_MODE"] = RUN_MODE
os.environ["MALTBOT_SMOKE_FRACTION"] = str(SMOKE_FRACTION)
os.environ["MALTBOT_FULL_FRACTION"] = str(FULL_FRACTION)
os.environ["MALTBOT_SMOKE_EPOCHS"] = str(SMOKE_EPOCHS)
os.environ["MALTBOT_FULL_EPOCHS"] = str(FULL_EPOCHS)
os.environ["MALTBOT_SMOKE_PATIENCE"] = str(SMOKE_PATIENCE)
os.environ["MALTBOT_FULL_PATIENCE"] = str(FULL_PATIENCE)
os.environ["MALTBOT_SMOKE_BATCH_SIZE"] = str(SMOKE_BATCH_SIZE)
os.environ["MALTBOT_FULL_BATCH_SIZE"] = str(FULL_BATCH_SIZE)
os.environ["MALTBOT_CACHE_FRACTION"] = str(CACHE_FRACTION)
os.environ["MALTBOT_FOLDS"] = ",".join(FOLDS)
os.environ["MALTBOT_RUN_ID"] = RUN_ID

# Human-readable echo of everything exported above. The patience settings are
# included so the printed summary matches the exported environment.
CONFIG_SUMMARY = {
    "DATE": DATE,
    "BATCH_RUN_NAME(raw)": BATCH_RUN_NAME,
    "SAFE_BATCH": SAFE_BATCH,
    "TASK": TASK,
    "EXPERIMENTS": EXPERIMENTS,
    "GH_PUSH": GH_PUSH,
    "RUN_MODE": RUN_MODE,
    "SMOKE_FRACTION": SMOKE_FRACTION,
    "FULL_FRACTION": FULL_FRACTION,
    "SMOKE_EPOCHS": SMOKE_EPOCHS,
    "FULL_EPOCHS": FULL_EPOCHS,
    "SMOKE_PATIENCE": SMOKE_PATIENCE,
    "FULL_PATIENCE": FULL_PATIENCE,
    "SMOKE_BATCH_SIZE": SMOKE_BATCH_SIZE,
    "FULL_BATCH_SIZE": FULL_BATCH_SIZE,
    "CACHE_FRACTION": CACHE_FRACTION,
    "FOLDS": FOLDS,
    "RUN_ID": RUN_ID,
}
print(CONFIG_SUMMARY)


{'DATE': '2026-02-23', 'BATCH_RUN_NAME(raw)': 'daily10', 'SAFE_BATCH': 'daily10', 'TASK': 'matbench_mp_e_form', 'EXPERIMENTS': ['chgnet_pretrained_infer', 'chgnet_head_finetune_freeze', 'chgnet_full_finetune', 'chgnet_ensemble3', 'mlp_pretrained_infer_fallback', 'mlp_head_finetune_freeze', 'chgnet_seed43', 'chgnet_seed44', 'chgnet_lr_schedule', 'chgnet_target_transform', 'chgnet_ema', 'chgnet_epochs80_seed43'], 'GH_PUSH': True}


In [1]:
# [FIX] Force reinstall torch to resolve Colab Python 3.12 circular import error
!pip install --upgrade torch --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121


In [5]:
%%bash
# Install the pinned dependency recipe one group at a time, logging pip output to
# a file and stopping at the first failure.
# NOTE(review): an identical install cell appears again later in this notebook.
set -euo pipefail
PIP_LOG=/tmp/maltbot_pip_install.log
printf '%s\n' "[info] Starting FIXED GOLDEN recipe install..." > "${PIP_LOG}"

# install_pkg ARGS...
#   Runs `python3 -m pip install --no-cache-dir ARGS...`, appending all pip output
#   to the log. On failure, prints the last 20 log lines and exits the cell with 1.
install_pkg() {
  printf '%s\n' "[installing] $*" >> "${PIP_LOG}"
  python3 -m pip install --no-cache-dir "$@" >> "${PIP_LOG}" 2>&1 && return 0
  printf '%s\n' "[ERROR] Failed to install: $*"
  tail -n 20 "${PIP_LOG}"
  exit 1
}

install_pkg "pip>=24.0" "setuptools>=69.0" "wheel"
install_pkg "numpy==1.26.4"
install_pkg "scikit-learn==1.4.1.post1"
install_pkg "matminer==0.9.2"
# Pinned to 0.6; the original note says an earlier pin of 0.1.6 was wrong.
install_pkg "matbench==0.6"
install_pkg "dgl==2.1.0+cu121" "-f" "https://data.dgl.ai/wheels/cu121/repo.html"
install_pkg "alignn==2024.3.21" "jarvis-tools" "chgnet"

printf '%s\n' "[ok] FINAL FIXED RECIPE applied"


Process is terminated.


In [None]:
import collections, collections.abc, sys, importlib, subprocess

# Comprehensive collections patch: re-expose ABC names on the top-level
# `collections` module when they are missing, by aliasing collections.abc.
for abc_name in ('Mapping', 'MutableMapping', 'Iterable', 'Sequence', 'Callable', 'Iterator'):
    if hasattr(collections, abc_name):
        continue
    setattr(collections, abc_name, getattr(collections.abc, abc_name))


def ensure_final(pkg, install_cmd):
    """Verify that `pkg` imports; on failure run `install_cmd` once and retry.

    Args:
        pkg: Module name passed to importlib.import_module.
        install_cmd: Argument list passed to subprocess.check_call when the
            first import attempt fails.

    Raises:
        subprocess.CalledProcessError: If the install command exits non-zero.
        Exception: Whatever the second import attempt raises if it still fails.
    """
    try:
        importlib.import_module(pkg)
    except Exception as import_error:
        print(f"[critical] {pkg} verification failed: {import_error!r}")
        print(f"[action] Attempting emergency install for {pkg}...")
        subprocess.check_call(install_cmd)
        # Refresh the import system's finder caches before retrying the import.
        importlib.invalidate_caches()
        importlib.import_module(pkg)
        print(f"[ok] {pkg} is now ready after emergency install")
    else:
        print(f"[ok] {pkg} is ready")

# Final import verification for the critical packages. The emergency install
# commands use the same version pins as the notebook's install recipe cell, so a
# fallback install cannot pull in a different release.
ensure_final("matbench", [sys.executable, "-m", "pip", "install", "matbench==0.6"])
ensure_final("dgl", [sys.executable, "-m", "pip", "install", "dgl==2.1.0+cu121", "-f", "https://data.dgl.ai/wheels/cu121/repo.html"])
ensure_final("alignn", [sys.executable, "-m", "pip", "install", "alignn==2024.3.21"])


In [6]:
%%bash
# Install the pinned dependency recipe one group at a time, logging pip output to
# a file and stopping at the first failure.
# NOTE(review): this cell is identical to the install cell earlier in the
# notebook; it truncates and rewrites the same log file. Confirm whether both are needed.
set -euo pipefail
PIP_LOG=/tmp/maltbot_pip_install.log
printf '%s\n' "[info] Starting FIXED GOLDEN recipe install..." > "${PIP_LOG}"

# install_pkg ARGS...
#   Runs `python3 -m pip install --no-cache-dir ARGS...`, appending all pip output
#   to the log. On failure, prints the last 20 log lines and exits the cell with 1.
install_pkg() {
  printf '%s\n' "[installing] $*" >> "${PIP_LOG}"
  python3 -m pip install --no-cache-dir "$@" >> "${PIP_LOG}" 2>&1 && return 0
  printf '%s\n' "[ERROR] Failed to install: $*"
  tail -n 20 "${PIP_LOG}"
  exit 1
}

install_pkg "pip>=24.0" "setuptools>=69.0" "wheel"
install_pkg "numpy==1.26.4"
install_pkg "scikit-learn==1.4.1.post1"
install_pkg "matminer==0.9.2"
# Pinned to 0.6; the original note says an earlier pin of 0.1.6 was wrong.
install_pkg "matbench==0.6"
install_pkg "dgl==2.1.0+cu121" "-f" "https://data.dgl.ai/wheels/cu121/repo.html"
install_pkg "alignn==2024.3.21" "jarvis-tools" "chgnet"

printf '%s\n' "[ok] FINAL FIXED RECIPE applied"


[ok] Install debug skipped (no failure marker).


In [None]:

import os, subprocess, sys, time

# Local clone that every git/experiment command in this cell runs inside.
repo = "/content/MALTbot"

# Experiment names come from the comma-separated MALTBOT_EXPERIMENTS variable;
# surrounding whitespace is stripped and empty entries are dropped.
experiments = [
    name
    for name in (piece.strip() for piece in os.environ.get("MALTBOT_EXPERIMENTS", "").split(","))
    if name
]
if len(experiments) == 0:
    raise ValueError("EXPERIMENTS is empty")

gh_push = os.environ.get("MALTBOT_GH_PUSH") == "1"
date = os.environ.get("MALTBOT_DATE")
batch = os.environ.get("MALTBOT_BATCH_RUN_NAME")
run_id = os.environ.get("MALTBOT_RUN_ID", "run")
# The branch name needs both the date and the batch name; without them it stays None.
if date and batch:
    branch = f"colab-{date}-{batch}-{run_id}"
else:
    branch = None
# Defaults to stopping the batch when a push cannot be recovered.
stop_on_push_fail = os.environ.get("MALTBOT_STOP_ON_PUSH_FAIL", "1") == "1"


def run_cmd(cmd, check=False, capture=False):
    """Run `cmd` inside the repo directory and return the CompletedProcess.

    Args:
        cmd: Argument list for the command.
        check: When True, a non-zero exit raises subprocess.CalledProcessError.
        capture: When True, stdout/stderr are captured on the result as text.
    """
    run_options = {
        "cwd": repo,
        "check": check,
        "text": True,
        "capture_output": capture,
    }
    return subprocess.run(cmd, **run_options)


def _redact_secret(text: str) -> str:
    """Return `text` with the GH_TOKEN value (if set) replaced by '***'."""
    secret = os.environ.get("GH_TOKEN", "")
    return text.replace(secret, "***") if secret else text


def _print_git_output(label: str, text, limit: int = 1200) -> None:
    """Print the last `limit` characters of captured git output, token-redacted."""
    if text:
        # Redact before slicing so a token cut in half by the slice cannot leak.
        print(label, _redact_secret(text)[-limit:])


def push_with_recovery(base_branch: str):
    """Push `base_branch` to origin, escalating through two recovery paths.

    Order of attempts:
      1. plain `git push -u origin <base_branch>`
      2. recovery A: fetch + rebase onto origin/<base_branch>, then push again
      3. recovery B: create a uniquely named fallback branch and push that

    Captured git output is redacted before printing, because the origin URL is
    configured with GH_TOKEN embedded and git may echo that URL in its messages.

    Returns:
        (ok, branch): `ok` is True when some push succeeded; `branch` is the branch
        that was pushed. On total failure returns (False, base_branch) after
        checking `base_branch` out again, so HEAD matches the returned branch.
    """
    # Attempt 1: normal push.
    push = run_cmd(["git", "push", "-u", "origin", base_branch], capture=True)
    if push.returncode == 0:
        print(f"[push] success: {base_branch}")
        return True, base_branch

    print(f"[push] first attempt failed for {base_branch}")
    _print_git_output("[push stdout]", push.stdout)
    _print_git_output("[push stderr]", push.stderr)

    # Recovery A: fetch + rebase + retry.
    print(f"[push] recovery path A: fetch+rebase+retry ({base_branch})")
    run_cmd(["git", "fetch", "origin", base_branch], capture=True)
    rb = run_cmd(["git", "rebase", f"origin/{base_branch}"], capture=True)
    if rb.returncode != 0:
        # Leave the repo out of the mid-rebase state before any further git command.
        run_cmd(["git", "rebase", "--abort"], capture=True)
        print("[push] rebase failed; trying fallback branch")

    # The retry is attempted even after a failed rebase (same sequence as before).
    push2 = run_cmd(["git", "push", "-u", "origin", base_branch], capture=True)
    if push2.returncode == 0:
        print(f"[push] recovery A success: {base_branch}")
        return True, base_branch

    # Recovery B: fallback branch made unique with the current epoch seconds.
    fallback = f"{base_branch}-r{int(time.time())}"
    print(f"[push] recovery path B: fallback branch {fallback}")
    run_cmd(["git", "checkout", "-B", fallback], capture=True)
    push3 = run_cmd(["git", "push", "-u", "origin", fallback], capture=True)
    if push3.returncode == 0:
        print(f"[push] recovery B success: {fallback}")
        print(f"[push] PR URL: https://github.com/seanwoory/MALTbot/compare/main...{fallback}?expand=1")
        return True, fallback

    print("[push] recovery failed")
    _print_git_output("[push3 stdout]", push3.stdout)
    _print_git_output("[push3 stderr]", push3.stderr)
    # HEAD is on the unpushed fallback branch here. Return to base_branch so later
    # commits land on the branch this function reports back to the caller.
    run_cmd(["git", "checkout", base_branch], capture=True)
    return False, base_branch


# Prepare the clone for pushing: validate inputs, switch to the results branch,
# set the commit identity, and point origin at a token-authenticated URL.
if gh_push:
    token = os.environ.get("GH_TOKEN", "")
    if not token:
        raise ValueError("GH_PUSH=True but GH_TOKEN is not set")
    # `branch` is None when the date or batch name is missing; fail with a clear
    # message instead of handing None to git.
    if not branch:
        raise ValueError("GH_PUSH=True but MALTBOT_DATE / MALTBOT_BATCH_RUN_NAME is not set")
    # check=True: if the branch cannot be created, stop here rather than
    # committing results onto whatever branch happens to be checked out.
    run_cmd(["git", "checkout", "-B", branch], check=True)
    run_cmd(["git", "config", "user.name", "colab-bot"], check=True)
    run_cmd(["git", "config", "user.email", "colab-bot@users.noreply.github.com"], check=True)
    # NOTE(review): this stores the token in the clone's git config (remote URL).
    run_cmd(["git", "remote", "set-url", "origin", f"https://{token}@github.com/seanwoory/MALTbot.git"], check=True)

# Run each experiment as a subprocess, streaming its output live, then (when
# pushing is enabled) commit and push that experiment's results.
for exp in experiments:
    print(f"\n===== RUN {exp} =====")
    cmd = [sys.executable, "scripts/run_experiment.py", "--exp-name", exp]

    # The context manager closes the pipe once streaming is done. Output is echoed
    # line by line and not accumulated, so long runs do not grow memory.
    with subprocess.Popen(
        cmd,
        cwd=repo,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr with stdout in one stream
        text=True,
        bufsize=1,  # line-buffered
    ) as proc:
        assert proc.stdout is not None
        for line in proc.stdout:
            print(line, end="")
        return_code = proc.wait()

    # A non-zero status is reported but does not stop the batch.
    print(f"status code: {return_code}")

    if gh_push and branch:
        exp_path = f"results/daily/{date}/{batch}/{exp}/results.json"
        run_cmd(["git", "add", exp_path, "RESULTS.md"], check=False)
        # `git diff --cached --quiet` exits non-zero when something is staged.
        diff = run_cmd(["git", "diff", "--cached", "--quiet"], check=False)
        if diff.returncode != 0:
            msg = f"results: {exp}"
            run_cmd(["git", "commit", "-m", msg], check=False)
            ok, used_branch = push_with_recovery(branch)
            if not ok and stop_on_push_fail:
                raise RuntimeError("Push failed after all recovery paths and STOP_ON_PUSH_FAIL=True")
            # Follow a fallback branch if recovery had to create one.
            branch = used_branch
        else:
            print(f"[no changes] {exp}")

print("Batch finished")


In [None]:

%%bash
# List this batch's results.json files (sorted) and show the tail of RESULTS.md.
set -euo pipefail
cd /content/MALTbot
BATCH_RESULTS_DIR="results/daily/${MALTBOT_DATE}/${MALTBOT_BATCH_RUN_NAME}"
# Best effort: a missing results directory must not abort the cell.
{ find "${BATCH_RESULTS_DIR}" -type f -name "results.json" | sort; } || true
tail -n 20 RESULTS.md


In [None]:

# Informational only: GH_TOKEN is loaded by the Colab Secrets cell near the top
# of the notebook, so this cell does not prompt again.
token_notice = "Token already checked upfront. Skipping duplicate token prompt."
print(token_notice)


In [None]:

%%bash
# Preflight for the push workflow: verify the required variables, switch to the
# results branch, and point origin at a token-authenticated URL.
set -euo pipefail
cd /content/MALTbot

if [ "${MALTBOT_GH_PUSH}" != "1" ]; then
  echo "GH_PUSH=False; skip preflight"
  exit 0
fi

# Abort with a clear message when a required variable is unset or empty.
: "${GH_TOKEN:?GH_TOKEN is not set}"
: "${MALTBOT_DATE:?MALTBOT_DATE missing}"
: "${MALTBOT_BATCH_RUN_NAME:?MALTBOT_BATCH_RUN_NAME missing}"

# Use the same colab-<DATE>-<BATCH>-<RUN_ID> name as the runner and final push
# cells, so preflight does not switch to a differently named branch.
RUN_ID="${MALTBOT_RUN_ID:-run}"
BRANCH="colab-${MALTBOT_DATE}-${MALTBOT_BATCH_RUN_NAME}-${RUN_ID}"

git checkout -B "${BRANCH}" >/dev/null 2>&1 || git checkout "${BRANCH}" >/dev/null 2>&1

git remote set-url origin "https://${GH_TOKEN}@github.com/seanwoory/MALTbot.git"

echo "Preflight OK: token set, origin updated, branch=${BRANCH}"


In [None]:

%%bash
# Final push: commit any remaining batch results and push the results branch,
# recovering via rebase and then a uniquely named fallback branch if needed.
set -euo pipefail
cd /content/MALTbot

if [ "${MALTBOT_GH_PUSH}" != "1" ]; then
  echo "GH_PUSH=False; skip git push"
  exit 0
fi

# Abort with a clear message when a required variable is unset or empty.
: "${GH_TOKEN:?GH_TOKEN is not set}"
: "${MALTBOT_DATE:?MALTBOT_DATE missing}"
: "${MALTBOT_BATCH_RUN_NAME:?MALTBOT_BATCH_RUN_NAME missing}"

RUN_ID="${MALTBOT_RUN_ID:-run}"
BRANCH="colab-${MALTBOT_DATE}-${MALTBOT_BATCH_RUN_NAME}-${RUN_ID}"

git config user.name "colab-bot"
git config user.email "colab-bot@users.noreply.github.com"
git checkout -B "${BRANCH}"
git remote set-url origin "https://${GH_TOKEN}@github.com/seanwoory/MALTbot.git"

# Stage the batch directory and RESULTS.md; commit only when something is staged.
git add "results/daily/${MALTBOT_DATE}/${MALTBOT_BATCH_RUN_NAME}" RESULTS.md || true
if ! git diff --cached --quiet; then
  git commit -m "results: ${MALTBOT_DATE} ${MALTBOT_BATCH_RUN_NAME}" || true
fi

# idempotent non-interactive push with recovery
if git push -u origin "${BRANCH}"; then
  echo "[ok] pushed ${BRANCH}"
else
  echo "[warn] push rejected for ${BRANCH}; trying fetch+rebase+retry"
  git fetch origin "${BRANCH}" || true
  if ! git pull --rebase --autostash origin "${BRANCH}"; then
    # A failed rebase can leave the repo mid-rebase. Abort it so the retry and the
    # fallback checkout below start from a clean state. The abort itself is best
    # effort, since there may be no rebase in progress.
    git rebase --abort >/dev/null 2>&1 || true
  fi
  if git push -u origin "${BRANCH}"; then
    echo "[ok] pushed after rebase ${BRANCH}"
  else
    # Fallback branch made unique with the current epoch seconds.
    FALLBACK="${BRANCH}-r$(date +%s)"
    echo "[warn] retry failed; fallback branch ${FALLBACK}"
    git checkout -B "${FALLBACK}"
    git push -u origin "${FALLBACK}"
    BRANCH="${FALLBACK}"
    echo "[ok] pushed fallback branch ${BRANCH}"
  fi
fi

echo "Create PR: https://github.com/seanwoory/MALTbot/compare/main...${BRANCH}?expand=1"
