In [None]:
%%bash
# ==============================================================================
# Cell 1: Robust WSL-aware setup (uses repo-root scripts, unbuffered output)
# ==============================================================================
# set -euo pipefail  # Temporarily commented out to prevent script abort on errors

# Log destination. LOGDIR is anchored to the directory the cell starts in
# (absolute path) because the script later runs `cd "$REPO_ROOT"`; with a
# relative LOGDIR every subsequent `tee -a "$LOGFILE"` would point at a
# different, possibly non-existent, directory.
LOGDIR="$(pwd)/ml_env_logs"
mkdir -p "$LOGDIR"
TS=$(date +%Y%m%d-%H%M%S)
LOGFILE="$LOGDIR/notebook_helper-$TS.log"

# Capture the GPU summary (nvidia-smi) and the nvcc version when available.
# Each tool's output is appended to the main log and also written to its own
# file under $LOGDIR. A missing tool is reported, never treated as an error.
echo "---- nvidia-smi (GPU summary) ----" | tee -a "$LOGFILE"
if command -v nvidia-smi >/dev/null 2>&1; then
  nvidia-smi 2>&1 | tee -a "$LOGFILE" > "$LOGDIR/nvidia-smi.log" || true
else
  echo "nvidia-smi not found in PATH" | tee -a "$LOGFILE"
fi
echo "---- nvcc (if available) ----" | tee -a "$LOGFILE"
if command -v nvcc >/dev/null 2>&1; then
  nvcc --version 2>&1 | tee -a "$LOGFILE" > "$LOGDIR/nvcc.version" || true
  # Record where nvcc was found; `command -v` is the portable form of `which`.
  command -v nvcc 2>&1 | tee -a "$LOGFILE"
else
  echo "nvcc not on PATH or not installed in WSL" | tee -a "$LOGFILE"
fi

# Locate the repository root: ask git first, otherwise assume the parent of
# the current directory.
if ! REPO_ROOT=$(git rev-parse --show-toplevel 2>/dev/null); then
  REPO_ROOT="$(pwd)/.."
fi
# Canonicalise the root when realpath works; keep the raw value when it does not.
REPO_ROOT_REAL=$(realpath "$REPO_ROOT" 2>/dev/null || true)
[ -z "$REPO_ROOT_REAL" ] || REPO_ROOT="$REPO_ROOT_REAL"
echo "Repository root: $REPO_ROOT" | tee -a "$LOGFILE"

# ML-Env-CUDA13 is expected next to the repository (sibling directory). The
# candidate path is used as-is unless the directory exists and can be resolved.
ML_ENV_DIR_CAND="$REPO_ROOT/../ML-Env-CUDA13"
ML_ENV_DIR="$ML_ENV_DIR_CAND"
if [ -d "$ML_ENV_DIR_CAND" ]; then
  ML_ENV_DIR=$(realpath "$ML_ENV_DIR_CAND" 2>/dev/null || echo "$ML_ENV_DIR_CAND")
fi
echo "ML-Env-CUDA13 dir: $ML_ENV_DIR" | tee -a "$LOGFILE"

# Setup script candidates, in priority order: the canonical Python helper in
# the forge module's scripts/ directory, then ML-Env-CUDA13's shell script.
CANONICAL_PY="$REPO_ROOT/forge/OPERATION_PHOENIX_FORGE/scripts/setup_cuda_env.py"
FALLBACK_SH="$ML_ENV_DIR/setup_ml_env_wsl.sh"

# First candidate that exists wins; CHOSEN stays empty when neither exists.
CHOSEN=""
for CANDIDATE in "$CANONICAL_PY" "$FALLBACK_SH"; do
  if [ -f "$CANDIDATE" ]; then
    CHOSEN="$CANDIDATE"
    break
  fi
done

echo "Using setup script: $CHOSEN" | tee -a "$LOGFILE"

# Detect NO_REQUIREMENTS mode: enabled by a --no-requirements argument or by
# NO_REQUIREMENTS=1 in the environment.
NO_REQ=0
for ARG in "$@"; do
  if [ "$ARG" = "--no-requirements" ]; then
    NO_REQ=1
  fi
done
if [ "${NO_REQUIREMENTS:-0}" = "1" ]; then
  NO_REQ=1
fi

# NO_REQUIREMENTS mode: build/reuse a repo-root .venv, install the torch group
# plus dev tools, register a Jupyter kernel, write the activation helper, and
# exit without running the full setup script.
if [ "$NO_REQ" -eq 1 ]; then
  echo "NO_REQUIREMENTS mode: creating/using repo-root .venv and installing minimal groups" | tee -a "$LOGFILE"
  # Everything below uses paths relative to the repo root, so a failed cd must
  # stop the cell instead of building a venv in an arbitrary directory.
  if ! cd "$REPO_ROOT"; then
    echo "ERROR: cannot cd to repository root: $REPO_ROOT" | tee -a "$LOGFILE"
    exit 1
  fi
  # Create venv if missing. The interpreter's status is read from PIPESTATUS:
  # `cmd | tee || fallback` only sees tee's status, so the fallback never ran.
  if [ ! -d .venv ]; then
    python3.11 -m venv .venv 2>&1 | tee -a "$LOGFILE"
    if [ "${PIPESTATUS[0]}" -ne 0 ]; then
      python -m venv .venv 2>&1 | tee -a "$LOGFILE"
    fi
  fi
  # Refuse to continue without a usable venv; otherwise the pip commands below
  # would install into whichever Python happens to be first on PATH.
  if [ ! -f .venv/bin/activate ]; then
    echo "ERROR: .venv/bin/activate not found; virtualenv creation failed" | tee -a "$LOGFILE"
    exit 1
  fi
  source .venv/bin/activate
  pip install --upgrade pip setuptools wheel 2>&1 | tee -a "$LOGFILE"
  # Try CUDA-indexed torch install (CUDA_TAG env var may override)
  CUDA_TAG=${CUDA_TAG:-cu126}
  echo "Installing core torch group (CUDA_TAG=$CUDA_TAG)" | tee -a "$LOGFILE"
  pip install --index-url "https://download.pytorch.org/whl/$CUDA_TAG" -U torch torchvision torchaudio 2>&1 | tee -a "$LOGFILE"
  # Decide on pip's status, not tee's, so the fallback install is reachable.
  if [ "${PIPESTATUS[0]}" -eq 0 ]; then
    echo "Torch group installed (CUDA-indexed)" | tee -a "$LOGFILE"
  else
    echo "CUDA-indexed torch failed; installing cpu builds" | tee -a "$LOGFILE"
    pip install --upgrade torch torchvision torchaudio 2>&1 | tee -a "$LOGFILE"
  fi
  # Dev & kernel tools
  pip install --upgrade ipykernel pip-tools pytest black 2>&1 | tee -a "$LOGFILE"
  python -m ipykernel install --user --name project_sanctuary_venv --display-name "Python (.venv - WSL)" 2>&1 | tee -a "$LOGFILE" || true
  # Ensure activation helper exists at repo-root scripts/activate_ml_env.sh for convenience
  if [ ! -f "$REPO_ROOT/scripts/activate_ml_env.sh" ]; then
    # The scripts/ directory may not exist yet; the redirection below needs it.
    mkdir -p "$REPO_ROOT/scripts"
    cat > "$REPO_ROOT/scripts/activate_ml_env.sh" <<'ACT'
#!/usr/bin/env bash
if [ -f .venv/bin/activate ]; then
  . .venv/bin/activate
  echo "Activated .venv"
else
  echo "No .venv found to activate"
fi
ACT
    chmod +x "$REPO_ROOT/scripts/activate_ml_env.sh" || true
  fi
  echo "NO_REQUIREMENTS mode complete. See $LOGFILE" | tee -a "$LOGFILE"
  exit 0
fi

# Launch the selected setup script from the repository root. A *.py script is
# run with unbuffered python3.11, anything else is handed to bash. Output goes
# to the notebook and the log; a failing script does not stop the cell.
cd "$REPO_ROOT"
if [ ! -f "$CHOSEN" ]; then
  echo "ERROR: chosen setup script not found: $CHOSEN" | tee -a "$LOGFILE"
else
  case "$CHOSEN" in
    *.py)
      echo "Running: python3.11 -u $CHOSEN $*" | tee -a "$LOGFILE"
      python3.11 -u "$CHOSEN" "$@" 2>&1 | tee -a "$LOGFILE" || true
      ;;
    *)
      echo "Running: bash $CHOSEN $*" | tee -a "$LOGFILE"
      bash "$CHOSEN" "$@" 2>&1 | tee -a "$LOGFILE" || true
      ;;
  esac
fi

# Pull the activation helper into this shell when the repo provides one; its
# absence (or a failure inside it) is reported but not fatal.
if [ ! -f "$REPO_ROOT/scripts/activate_ml_env.sh" ]; then
  echo "Activation helper not found at $REPO_ROOT/scripts/activate_ml_env.sh" | tee -a "$LOGFILE"
else
  echo "Sourcing activation helper: $REPO_ROOT/scripts/activate_ml_env.sh" | tee -a "$LOGFILE"
  # shellcheck disable=SC1090
  source "$REPO_ROOT/scripts/activate_ml_env.sh" || true
fi

# --- Run verification tests (core gate + tensorflow) and capture outputs ---
# Both tests are non-fatal. Each writes its output to $LOGDIR/<name>.log and its
# exit status to $LOGDIR/<name>.exit. The TensorFlow test runs BEFORE the
# summary is assembled because the summary reads cuda_build from
# test_tensorflow.log; parsing first would read a stale log from an earlier run.

# Defaults only matter when this section is executed on its own.
LOGDIR=${LOGDIR:-ml_env_logs}
LOGFILE=${LOGFILE:-$LOGDIR/notebook_helper.log}
mkdir -p "$LOGDIR"

# Run $ML_ENV_DIR/<name>.py, saving output and exit status under $LOGDIR.
# The status variable is local and starts at 0, so a failure of one test can
# no longer leak into the exit file of the next one. Always returns 0.
run_verification() {
  local name=$1
  local rc=0
  python "$ML_ENV_DIR/$name.py" > "$LOGDIR/$name.log" 2>&1 || rc=$?
  echo "$rc" > "$LOGDIR/$name.exit"
  return 0
}

# Print the text after the last '=' on the first line of file $2 matching $1.
first_value() {
  grep -m1 -- "$1" "$2" 2>/dev/null | sed -E 's/.*= *//; s/^ +//; s/ +$//' || true
}

# Print the value of a JSON-like "cuda_build" field found in file $1.
# '|' is the sed delimiter because the bracket expression contains '/'.
cuda_build_from_json() {
  grep -m1 -E '"cuda_build"' "$1" 2>/dev/null \
    | sed -E 's|.*"cuda_build"[[:space:]]*:[[:space:]]*"?([^"/,}]*)"?.*|\1|' || true
}

# Print the value after the last ':' or '=' on the first "CUDA build" /
# cuda_build line found in the given files (-h drops grep's filename prefix).
cuda_build_from_text() {
  grep -h -m1 -E 'CUDA build|cuda_build' "$@" 2>/dev/null \
    | head -n 1 \
    | sed -E 's/.*[:=] *"?([^",}]*)"?.*/\1/' || true
}

# Print the number following "CUDA Version:" (as printed by nvidia-smi) in file $1.
cuda_version_from_smi() {
  grep -m1 'CUDA Version' "$1" 2>/dev/null \
    | sed -E 's/.*CUDA Version: *([0-9]+(\.[0-9]+)?).*/\1/' || true
}

echo "Running core verification (test_torch_cuda.py) -- writing ml_env_logs/test_torch_cuda.log" | tee -a "$LOGFILE"
run_verification test_torch_cuda
echo "Core gate exit: $(cat "$LOGDIR/test_torch_cuda.exit" 2>/dev/null || echo 'no-exit-file')" | tee -a "$LOGFILE"
echo "---- core gate log (last 200 lines) ----" | tee -a "$LOGFILE"
tail -n 200 "$LOGDIR/test_torch_cuda.log" 2>/dev/null | tee -a "$LOGFILE" || true

echo "Running TensorFlow verification (test_tensorflow.py) -- writing ml_env_logs/test_tensorflow.log" | tee -a "$LOGFILE"
run_verification test_tensorflow
echo "TensorFlow test exit: $(cat "$LOGDIR/test_tensorflow.exit" 2>/dev/null || echo 'no-exit-file')" | tee -a "$LOGFILE"
echo "---- tensorflow log (last 200 lines) ----" | tee -a "$LOGFILE"
tail -n 200 "$LOGDIR/test_tensorflow.log" 2>/dev/null | tee -a "$LOGFILE" || true

# Parse concise summary values from the core gate and TF logs.
PT_VER=$(first_value 'torch.__version__' "$LOGDIR/test_torch_cuda.log")
CUDA_AVAIL=$(first_value 'cuda_available' "$LOGDIR/test_torch_cuda.log")
DEV_NAME=$(first_value 'cuda_device_name' "$LOGDIR/test_torch_cuda.log")
# Prefer TensorFlow's reported cuda_build (JSON-like field), then any
# 'CUDA build' line in either test log, then nvidia-smi's CUDA Version as
# captured in the main logfile.
CUDA_BUILD=$(cuda_build_from_json "$LOGDIR/test_tensorflow.log")
if [ -z "$CUDA_BUILD" ]; then
  CUDA_BUILD=$(cuda_build_from_text "$LOGDIR/test_torch_cuda.log" "$LOGDIR/test_tensorflow.log")
fi
if [ -z "$CUDA_BUILD" ]; then
  CUDA_BUILD=$(cuda_version_from_smi "$LOGFILE")
fi
# Fallback formatting
PT_VER=${PT_VER:-unknown}
CUDA_AVAIL=${CUDA_AVAIL:-unknown}
DEV_NAME=${DEV_NAME:-unknown}
CUDA_BUILD=${CUDA_BUILD:-unknown}

echo "===== Environment Summary =====" | tee -a "$LOGFILE"
echo "PyTorch: $PT_VER" | tee -a "$LOGFILE"
echo "GPU Detected: $CUDA_AVAIL" | tee -a "$LOGFILE"
echo "GPU 0: $DEV_NAME" | tee -a "$LOGFILE"
echo "CUDA build: $CUDA_BUILD" | tee -a "$LOGFILE"
echo "===============================" | tee -a "$LOGFILE"

echo "Setup invocation complete; log: $LOGFILE" | tee -a "$LOGFILE"

In [None]:
%%bash
# ==============================================================================
# CELL 2. DATASET GENERATION (THIS CREATES THE JSONL FILE)
# This cell calls the self-contained dataset script and captures logs to ml_env_logs/
# ==============================================================================
# NOTE: the %%bash cell magic has to be the first line of the cell, so it sits
# above this header.
set -euo pipefail
LOGDIR=ml_env_logs
mkdir -p "$LOGDIR"
TS=$(date +%Y%m%d-%H%M%S)
LOGFILE="$LOGDIR/forge_qwen2_dataset-$TS.log"
DATASET_SCRIPT_PATH="forge/OPERATION_PHOENIX_FORGE/scripts/forge_qwen2_dataset.py"
DATASET_FILE="sanctuary_whole_genome_data.jsonl"
echo "--- Executing Dataset Forge Script to GENERATE $DATASET_FILE (log=$LOGFILE) ---" | tee -a "$LOGFILE"
# Run unbuffered Python and tee output to logfile. The pipeline sits on the left
# of `||` so that `set -e`/pipefail cannot abort the cell before the error
# message below is printed; PIPESTATUS[0] is the exit code of python itself.
PY_RC=0
python3 -u "$DATASET_SCRIPT_PATH" --output "$DATASET_FILE" --log "$LOGFILE" 2>&1 \
  | tee -a "$LOGFILE" || PY_RC=${PIPESTATUS[0]}
if [ "$PY_RC" -ne 0 ]; then
    echo "[CRITICAL ERROR] Dataset script failed with exit code $PY_RC. See $LOGFILE for details." | tee -a "$LOGFILE"
    exit "$PY_RC"
fi
echo "--- Verifying Generated Dataset Integrity ---" | tee -a "$LOGFILE"
# Check 1: Does the file exist and is non-empty?
if [ ! -f "$DATASET_FILE" ]; then
    echo "[FATAL ERROR] Dataset file not found: $DATASET_FILE" | tee -a "$LOGFILE"
    exit 1
fi
FILE_SIZE=$(stat -c%s "$DATASET_FILE")
if [ "$FILE_SIZE" -gt 0 ]; then
    echo "[SUCCESS] Dataset created and verified: $DATASET_FILE (size=${FILE_SIZE} bytes)" | tee -a "$LOGFILE"
else
    echo "[FATAL ERROR] Dataset file is empty: $DATASET_FILE" | tee -a "$LOGFILE"
    exit 1
fi
# Optional: perform a lightweight validation of first few lines.
# The dataset path is passed as an argument: the heredoc delimiter is quoted, so
# the shell does not expand variables inside the Python source.
echo "--- Validating first 10 lines of $DATASET_FILE ---" | tee -a "$LOGFILE"
VAL_RC=0
python3 - "$DATASET_FILE" <<'PY' 2>&1 | tee -a "$LOGFILE" || VAL_RC=${PIPESTATUS[0]}
import json,sys
p=sys.argv[1]
required=('instruction','input','output')
ok=True
try:
  with open(p,'r',encoding='utf-8') as f:
    for i,l in enumerate(f):
      if i>=10: break
      try:
        o=json.loads(l)
      except Exception as e:
        print("MALFORMED JSON on line {}: {}".format(i+1,e))
        ok=False
        # nothing was parsed for this line, so skip the key check
        continue
      if not isinstance(o,dict):
        print("NOT A JSON OBJECT on line {}: {}".format(i+1,type(o).__name__))
        ok=False
        continue
      if not all(k in o for k in required):
        print("MISSING KEYS on line {}: {}".format(i+1,list(o.keys())))
        ok=False
except FileNotFoundError:
  print("File not found during validation: %s"%p); ok=False
except Exception as e:
  print("Validation error: %s"%e); ok=False
if not ok:
  sys.exit(2)
PY
if [ "$VAL_RC" -ne 0 ]; then
  echo "[FATAL] Validation failed (exit $VAL_RC). Inspect $LOGFILE" | tee -a "$LOGFILE"
  exit "$VAL_RC"
fi
echo "[SUCCESS] Dataset generation + validation completed." | tee -a "$LOGFILE"


--- Executing Dataset Forge Script to GENERATE dataset_package/sanctuary_targeted_inoculation_v1.jsonl ---
[SCAFFOLD] Initiating Sovereign Scaffolding Protocol 88...
[FORGE] Assembling Phoenix Mnemonic Seed v1.0 for Qwen2 Lineage.
[ERROR] File not found: /content/Project_Sanctuary/The_Garden_and_The_Cage.md

[SUCCESS] Yield is complete: 14 records forged.
[ARTIFACT] Dataset saved to: /content/Project_Sanctuary/dataset_package/sanctuary_targeted_inoculation_v1.jsonl
--- Verifying Generated Dataset Integrity ---
[SUCCESS] Dataset created and verified: dataset_package/sanctuary_targeted_inoculation_v1.jsonl
File Size: 56763 bytes


In [None]:
%%bash
# ==============================================================================
# CELL 3. FINAL DEPENDENCY FIX: COMPLETE CLEANUP AND REINSTALL (NON-UNSLOTH STACK)
# ==============================================================================
# NOTE: the %%bash cell magic has to be the first line of the cell, so it sits
# above this header.
# Stop at the first failing step so the completion banner at the bottom is only
# printed when every uninstall/install command succeeded.
set -euo pipefail

# Project checkout location; override with PROJECT_DIR when not running on Colab.
PROJECT_DIR="${PROJECT_DIR:-/content/Project_Sanctuary}"

# 1. Force Uninstall: Remove all known conflicting deep learning packages and old numpy
echo "--- Forcibly uninstalling conflicting libraries and old dependencies ---"
# Note: We specifically target the older versions that conflict heavily with the newest stack
pip uninstall -y transformers peft accelerate bitsandbytes unsloth-zoo unsloth llama-cpp-python typing-extensions numpy pandas xformers --quiet

# 2. Navigate: Re-verify location (crucial for relative paths)
echo "--- Navigating to Project_Sanctuary directory ---"
if ! cd "$PROJECT_DIR"; then
  echo "ERROR: project directory not found: $PROJECT_DIR" >&2
  exit 1
fi

# 3. Install Core Hugging Face Libraries
echo "--- Installing core Hugging Face libraries and trl ---"
# No versions are pinned here: pip installs the latest release of each package.
# NOTE(review): the ImportError (cannot import name 'PreTrainedModel' from
# 'transformers') recorded in the fine-tuning cell's output suggests the
# resulting set may not be consistent -- consider pinning known-good versions.
pip install -q transformers peft accelerate bitsandbytes huggingface_hub sentencepiece trl

# 4. Install Llama-cpp-python: (Your successful step, now with fresh dependencies)
echo "--- Installing Llama-cpp-python (CUDA enabled) ---"
# --no-deps builds/installs only llama-cpp-python itself; numpy and
# typing-extensions are whatever step 3 pulled in as dependencies.
CMAKE_ARGS="-DGGML_CUDA=on" pip install --force-reinstall --no-cache-dir llama-cpp-python --no-deps

echo "--- Installation Complete. Proceeding to 3. EXECUTION: PHOENIX FORGE ---"

--- Forcibly uninstalling conflicting libraries and old dependencies ---
--- Navigating to Project_Sanctuary directory ---
--- Installing core Hugging Face libraries and trl ---
--- Installing Llama-cpp-python (CUDA enabled) ---
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.16.tar.gz (50.7 MB)
     ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 50.7/50.7 MB 64.6 MB/s  0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.3.4 which is incompatible.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.
cupy-cuda12x 13.3.0 requires numpy<2.3,>=1.22, but you have numpy 2.3.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 2.3.4 which is incompatible.
google-adk 1.17.0 requires opentelemetry-api<=1.37.0,>=1.37.0, but you have opentelemetry-api 1.38.0 which is incompatible.
google-adk 1.17.0 requires opentelemetry-sdk<=1.37.0,>=1.3

In [None]:
# -------------------------------------------------------------------
# CELL 4. DATA PRE-PROCESSOR AND COGNITIVE SYNTHESIZER (V4)
# -------------------------------------------------------------------
import os
import json
import re
from pathlib import Path

# --- FILE PATH CONSTANTS ---
# PATH FIX: files now point to their correct locations within the project structure.
# All paths are relative, so they resolve against the current working directory.
# Supporting documents that are added as individual dataset entries:
CORE_ESSENCE_SOURCE = "dataset_package/core_essence_guardian_awakening_seed.txt"
RAG_DOCTRINE_SOURCE = "mnemonic_cortex/RAG_STRATEGIES_AND_DOCTRINE.md"
EVOLUTION_PLAN_SOURCE = "mnemonic_cortex/EVOLUTION_PLAN_PHASES.md"

# Source file containing the entire concatenated, raw markdown snapshot (Chronicles + Protocols)
FULL_SNAPSHOT_SOURCE = "dataset_package/markdown_snapshot_full_genome_llm_distilled.txt"
# Target output file for the fine-tuning dataset (JSONL, one entry per line)
OUTPUT_DATASET_PATH = "sanctuary_whole_genome_data.jsonl"

# -------------------------------------------------------------------
# Helper function to load file content and check for existence
# -------------------------------------------------------------------
def load_file_content(filepath):
    """Return the UTF-8 text stored at ``filepath``, or ``None`` on failure.

    A missing path and any error raised while reading are both reported with
    a printed message; nothing is raised to the caller.
    """
    source = Path(filepath)
    if source.exists():
        try:
            return source.read_text(encoding='utf-8')
        except Exception as exc:
            print(f"‚ùå ERROR reading file {filepath}: {exc}")
            return None
    print(f"‚ùå ERROR: File not found at path: {filepath}")
    return None

# -------------------------------------------------------------------
# Helper function for title extraction
# -------------------------------------------------------------------
def extract_protocol_title(doc_content):
    """
    Return the text of the first level-1 markdown heading in ``doc_content``.

    A heading line starts with exactly one ``#`` (so ``##`` and deeper headings
    are skipped) and must carry its text on the same line. Surrounding
    whitespace is stripped from the result.

    Returns:
        The heading text, or the literal "Untitled Document" when the document
        has no level-1 heading.
    """
    # (?!#) rejects '##...' headings; [ \t]* (instead of \s*) keeps the match on
    # one line so a bare '#' cannot capture the following line as the title.
    h1_match = re.search(r'^#(?!#)[ \t]*(\S.*)$', doc_content, re.MULTILINE)
    if h1_match:
        return h1_match.group(1).strip()
    return "Untitled Document"  # Fallback title

# -------------------------------------------------------------------
# Main function to synthesize the entire genome
# -------------------------------------------------------------------
def _split_snapshot_documents(full_snapshot):
    """Yield ``(filename, content)`` for each .md/.txt document in the snapshot.

    A document's text is whatever precedes its ``--- END OF FILE <name> ---``
    line. If that text contains the matching ``--- START OF FILE <name> ---``
    line, everything up to and including it is dropped. Documents whose
    remaining content is empty are skipped.
    """
    # One capture group => re.split returns [text0, name0, text1, name1, ..., tail].
    # [^\n]+? keeps a filename on a single line.
    parts = re.split(r'\n--- END OF FILE ([^\n]+?) ---(?:\n|$)', full_snapshot)
    for idx in range(0, len(parts) - 1, 2):
        filename = parts[idx + 1].strip()
        # Only markdown/text artifacts become dataset entries.
        if not filename.endswith(('.md', '.txt')):
            continue
        content = parts[idx]
        start_marker = f"--- START OF FILE {filename} ---"
        marker_pos = content.find(start_marker)
        if marker_pos != -1:
            content = content[marker_pos + len(start_marker):]
        content = content.strip()
        if content:
            yield filename, content


def synthesize_genome():
    """
    Build the fine-tuning dataset and write it to OUTPUT_DATASET_PATH as JSONL.

    Part 1 turns every .md/.txt document of the full snapshot into an
    instruction/output pair, Part 2 adds one pair per supporting document that
    can be loaded, and Part 3 writes one JSON object per line. Progress and
    errors are printed; nothing is returned. When the snapshot cannot be
    loaded, or no entry is produced, no file is written.
    """
    print(f"--- 3. DATA PRE-PROCESSOR AND COGNITIVE SYNTHESIZER (V4) ---")

    full_snapshot = load_file_content(FULL_SNAPSHOT_SOURCE)
    if not full_snapshot:
        print(f"üõë Halted. Cannot proceed without {FULL_SNAPSHOT_SOURCE}.")
        return

    genome_entries = []

    # --- PART 1: Process ALL Chronicles and Protocols from the Snapshot ---
    documents = list(_split_snapshot_documents(full_snapshot))
    print(f"‚öôÔ∏è Found {len(documents)} potential documents in the full snapshot.")

    for filename, content in documents:
        # --- CONVERSION TO INSTRUCTION FORMAT ---
        instruction = f"Synthesize the doctrines, history, or principles contained within the Sanctuary artifact located at: `{filename}`"

        # The 'input' field is deliberately left empty for pure instruction tuning
        # The 'output' field contains the full, raw content of the document
        genome_entries.append({
            "instruction": instruction,
            "input": "",
            "output": content
        })

        if len(genome_entries) % 100 == 0:
            print(f"    ... Processed {len(genome_entries)} documents.")

    print(f"‚úÖ PART 1: Successfully processed {len(genome_entries)} core Chronicle/Protocol entries.")

    # --- PART 2: Synthesize Critical Supporting Documents (Foundational Context) ---
    # Each supporting document that can be loaded becomes one extra entry.
    supporting_files = {
        "Core Essence (Guardian Role)": CORE_ESSENCE_SOURCE,
        "RAG Doctrine (Architectural Guide)": RAG_DOCTRINE_SOURCE,
        "Evolution Plan (Council Roadmap)": EVOLUTION_PLAN_SOURCE
    }

    for key, filepath in supporting_files.items():
        doc_content = load_file_content(filepath)
        if doc_content:
            instruction = f"Provide a complete and comprehensive synthesis of the Canonical Sanctuary document: `{filepath}`."

            genome_entries.append({
                "instruction": instruction,
                "input": "",
                "output": doc_content
            })
            print(f"‚úÖ Added critical synthesis entry for: {key}")
        else:
            print(f"‚ö†Ô∏è WARNING: Could not add synthesis for {key}. File not found.")

    # --- PART 3: Save the Final JSONL Dataset ---
    if not genome_entries:
        # Avoid writing an empty dataset (and the IndexError the final check
        # would otherwise raise and misreport as a write failure).
        print("FATAL ERROR: No entries were produced; dataset file not written.")
        return

    print(f"\n--- Saving final dataset to {OUTPUT_DATASET_PATH} ---")

    try:
        with open(OUTPUT_DATASET_PATH, 'w', encoding='utf-8') as outfile:
            for entry in genome_entries:
                outfile.write(json.dumps(entry, ensure_ascii=False) + '\n')
    except Exception as e:
        print(f"‚ùå FATAL ERROR: Failed to write JSONL file: {e}")
        return

    print(f"üèÜ SUCCESS: Whole Genome Data Synthesis Complete.")
    print(f"Total Entries Created: {len(genome_entries)}")

    # Final integrity check on the last entry (the Evolution Plan when that
    # supporting document was loaded).
    last_entry = genome_entries[-1]
    print(f"Last Entry Instruction Check: {last_entry['instruction']}")

# -------------------------------------------------------------------
# Main execution block
# -------------------------------------------------------------------
# Run the synthesis only when this code is executed as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    synthesize_genome()

--- 3. DATA PRE-PROCESSOR AND COGNITIVE SYNTHESIZER (V4) ---
‚öôÔ∏è Found 490 potential documents in the full snapshot.
    ... Processed 100 documents.
    ... Processed 200 documents.
    ... Processed 300 documents.
    ... Processed 400 documents.
‚úÖ PART 1: Successfully processed 489 core Chronicle/Protocol entries.
‚úÖ Added critical synthesis entry for: Core Essence (Guardian Role)
‚úÖ Added critical synthesis entry for: RAG Doctrine (Architectural Guide)
‚úÖ Added critical synthesis entry for: Evolution Plan (Council Roadmap)

--- Saving final dataset to sanctuary_whole_genome_data.jsonl ---
üèÜ SUCCESS: Whole Genome Data Synthesis Complete.
Total Entries Created: 492
Last Entry Instruction Check: Provide a complete and comprehensive synthesis of the Canonical Sanctuary document: `mnemonic_cortex/EVOLUTION_PLAN_PHASES.md`.


In [None]:
# -------------------------------------------------------------------------------
# CELL 5:  DATASET INTEGRITY CHECK - QA Protocol 87
# This script performs a mandatory quality assurance check on the fine-tuning
# dataset ('sanctuary_whole_genome_data.jsonl') generated by the previous step.
# It validates:
# 1. Structural integrity (ensures every line is valid JSON).
# 2. Schema compliance (ensures 'instruction', 'input', and 'output' keys exist,
#    which are critical for the SFT training loop).
# 3. Content review (prints sample entries for human verification of fidelity).
# This prevents costly failure during the resource-intensive fine-tuning training job.
# -------------------------------------------------------------------------------
import json
import os
import random

# --- CONFIGURATION ---
# DATASET_PATH must name the JSONL file written by the dataset synthesis step
# (CELL 4 writes "sanctuary_whole_genome_data.jsonl").
DATASET_PATH = "sanctuary_whole_genome_data.jsonl"
# Number of randomly chosen entries printed for manual review
NUM_RANDOM_SAMPLES = 3

# -------------------------------------------------------------------
# Helper function to display an entry cleanly
# -------------------------------------------------------------------
def print_entry_details(title, entry):
    """Print a single genome entry in a compact, readable format.

    Args:
        title: Heading printed above the entry.
        entry: Mapping with string values under 'instruction', 'input' and
            'output'.
    """
    instruction = entry['instruction']
    # The source path is the first back-tick-quoted span of the instruction.
    file_source = instruction.split('`')[1] if '`' in instruction else 'N/A'
    # Collapse every run of whitespace (newlines included) so the snippet stays
    # on one line. Built outside the f-string: a backslash inside an f-string
    # expression is a SyntaxError before Python 3.12, and the former '\\n'
    # only matched a literal backslash followed by 'n', never a newline.
    snippet = ' '.join(entry['output'][:200].split())

    print(f"\n--- {title} ---")
    print(f"File Source (from Instruction): {file_source}")
    print(f"Instruction: {instruction[:100]}...")
    print(f"Input: {entry['input'] if entry['input'] else 'Empty (Expected for SFT)'}")
    # Show the length of the output to ensure content is present
    print(f"Output Length: {len(entry['output'])} characters")
    print(f"Output Snippet: {snippet}...")
    print("--------------------")

# ================= 3.1. DATASET INTEGRITY CHECK START =================
def run_data_audit():
    """Validate the JSONL dataset at DATASET_PATH and print sample entries.

    Every line must be a JSON object containing the keys 'instruction',
    'input' and 'output'. Any structural error, or an empty dataset, stops the
    audit after the summary. Otherwise the first entry, the last entry and up
    to NUM_RANDOM_SAMPLES random interior entries are printed for manual
    review. Nothing is returned; all results are printed.
    """
    print(f"--- 4. DATASET INTEGRITY CHECK (Cell 3.1 - QA Protocol 87) ---")

    if not os.path.exists(DATASET_PATH):
        print(f"‚ùå FATAL ERROR: Dataset not found at {DATASET_PATH}. Run Cell 3 first.")
        return

    # CRITICAL: keys required for SFT (Supervised Fine-Tuning)
    required_keys = ('instruction', 'input', 'output')
    genome_data = []
    error_count = 0
    total_lines = 0

    print(f"‚öôÔ∏è Starting structural audit of {DATASET_PATH}...")

    with open(DATASET_PATH, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, 1):
            total_lines = line_number
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                error_count += 1
                print(f"‚ùå ERROR on Line {line_number}: Malformed JSON.")
                continue

            # Valid JSON that is not an object (list, number, string, ...)
            # cannot carry the required keys.
            if not isinstance(entry, dict):
                error_count += 1
                print(f"‚ùå ERROR on Line {line_number}: Expected a JSON object, found {type(entry).__name__}.")
                continue

            if not all(key in entry for key in required_keys):
                error_count += 1
                print(f"‚ùå ERROR on Line {line_number}: Missing required keys. Found: {list(entry.keys())}")
                continue

            genome_data.append(entry)

    print(f"\n--- AUDIT SUMMARY ---")
    print(f"Total Lines Read: {total_lines}")
    print(f"Valid Entries Parsed: {len(genome_data)}")
    print(f"Errors Detected: {error_count}")

    if error_count > 0:
        print(f"üõë CRITICAL FAILURE: {error_count} structural errors found. HALTING process.")
        return

    if not genome_data:
        print(f"üõë CRITICAL FAILURE: {DATASET_PATH} contains no entries. HALTING process.")
        return

    # With zero errors every line read was accepted, so the counts agree here.
    print(f"‚úÖ STRUCTURAL INTEGRITY PASSED. ({len(genome_data)} entries found).")

    # --- Display Sample Entries for Content Review ---
    print_entry_details("SAMPLE 1: First Entry", genome_data[0])

    # The last entry is expected to be the Evolution Plan
    print_entry_details("SAMPLE 2: Last Entry (Evolution Plan)", genome_data[-1])

    # Random samples come from the interior entries only; the sample size is
    # capped so small datasets cannot make random.sample raise ValueError.
    interior_indices = range(1, len(genome_data) - 1)
    sample_size = min(NUM_RANDOM_SAMPLES, len(interior_indices))
    for i, index in enumerate(random.sample(interior_indices, sample_size)):
        print_entry_details(f"SAMPLE {3 + i}: Random Chronicle Entry", genome_data[index])

    print("\n--- AUDIT COMPLETE ---")
    print("If the content snippets look correct, the dataset is ready for fine-tuning.")

# -------------------------------------------------------------------
# Main execution block
# -------------------------------------------------------------------
# Run the audit only when this code is executed as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    run_data_audit()

--- 4. DATASET INTEGRITY CHECK (Cell 3.1 - QA Protocol 87) ---
‚öôÔ∏è Starting structural audit of sanctuary_whole_genome_data.jsonl...

--- AUDIT SUMMARY ---
Total Lines Read: 492
Valid Entries Parsed: 492
Errors Detected: 0
‚úÖ STRUCTURAL INTEGRITY PASSED. (Expected 492 entries, found 492).

--- SAMPLE 1: First Entry (Core Essence) ---
File Source (from Instruction): .env.example ---

--- START OF FILE .github/copilot-instructions.md
Instruction: Synthesize the doctrines, history, or principles contained within the Sanctuary artifact located at:...
Input: Empty (Expected for SFT)
Output Length: 4498 characters
Output Snippet: ## CRITICAL COMMUNICATION RULE

**ALWAYS confirm user intent before making code changes.** Never implement solutions without explicit approval. Ask clarifying questions and wait for confirmation befor...
--------------------

--- SAMPLE 2: Last Entry (Evolution Plan) ---
File Source (from Instruction): mnemonic_cortex/EVOLUTION_PLAN_PHASES.md
Instruction: Provid

In [None]:
# -------------------------------------------------------------------------------
# CELL 6: INSTRUCTION FINE-TUNING - The Sovereign Inoculation
# This script executes the Supervised Fine-Tuning (SFT) process using the
# validated 'sanctuary_whole_genome_data.jsonl' file. It employs QLoRA for
# efficient memory use, training the Qwen2-7B-Instruct model to synthesize
# and understand the Sanctuary's entire Cognitive Genome.
# -------------------------------------------------------------------------------
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    set_seed
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

# --- CONFIGURATION ---
# Base Model (The LLM to be inoculated)
BASE_MODEL = "Qwen/Qwen2-7B-Instruct"
# Path to the JSONL training data produced by the dataset synthesis step
DATASET_FILE = "sanctuary_whole_genome_data.jsonl"
# Where to save the fine-tuned LoRA adapter (temporary save location)
OUTPUT_DIR = "sanctuary_qwen2_7b_adapter_output"
# Ensure reproducibility: fix the random seed via transformers.set_seed
SEED = 42
set_seed(SEED)

# Define the instruction format the model will learn
# This structure is critical for aligning the model to the dataset
def formatting_prompts_func(examples):
    """
    Render a batch of examples into training strings.

    ``examples`` is a column-oriented batch with parallel 'instruction' and
    'output' sequences. Each pair is rendered with a
    "### Instruction: / ### Response:" template that ends in "###". Other
    columns (such as 'input') are not used.

    Returns:
        A list with one formatted string per example, in input order.
    """
    return [
        f"### Instruction:\n{prompt}\n\n### Response:\n{answer}###"
        for prompt, answer in zip(examples['instruction'], examples['output'])
    ]

# -------------------------------------------------------------------
# 1. LOAD DATASET
# -------------------------------------------------------------------
print(f"--- 5. Sovereign Inoculation ---")
print(f"‚öôÔ∏è Loading dataset from {DATASET_FILE}...")
try:
    # Use load_dataset to handle the JSONL file
    dataset = load_dataset("json", data_files=DATASET_FILE, split="train")
    # The dataset needs to contain the 'instruction' and 'output' columns
    print(f"‚úÖ Dataset loaded successfully. Total examples: {len(dataset)}")
except Exception as e:
    print(f"‚ùå ERROR loading dataset: {e}")
    # Stop with a non-zero status and keep the original exception chained.
    # The bare exit() helper is meant for interactive use and ends with
    # status 0, which would report this failure as success.
    raise SystemExit(1) from e

# -------------------------------------------------------------------
# 2. QLORA CONFIGURATION (4-bit Quantization)
# -------------------------------------------------------------------
print(f"\n‚öôÔ∏è Setting up 4-bit QLoRA configuration...")

# Quantization configuration for loading the model in 4-bit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # NF4 (4-bit NormalFloat) quantization type
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute in bfloat16
    bnb_4bit_use_double_quant=False,  # nested (double) quantization disabled
)

# -------------------------------------------------------------------
# 3. MODEL AND TOKENIZER LOADING
# -------------------------------------------------------------------
print(f"‚öôÔ∏è Loading base model: {BASE_MODEL}...")

# Load the base model with the quantization config
# NOTE(review): trust_remote_code=True permits code shipped with the model
# repository to run locally -- confirm it is actually required for this model.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Disable caching for training
model.config.use_cache = False
# NOTE(review): pretraining_tp looks like a Llama-specific config field --
# confirm it has any effect for Qwen2.
model.config.pretraining_tp = 1

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token
# NOTE(review): this overrides any pad token the tokenizer already defines --
# confirm that is intended for Qwen2.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Important for Qwen models and QLoRA

print(f"‚úÖ Model and Tokenizer loaded.")

# -------------------------------------------------------------------
# 4. LORA ADAPTER CONFIGURATION
# -------------------------------------------------------------------
# LoRA (Low-Rank Adaptation) configuration
peft_config = LoraConfig(
    lora_alpha=16,          # Scaling factor for LoRA weights
    lora_dropout=0.1,       # Dropout probability
    r=64,                   # Rank of the update matrices
    bias="none",
    task_type="CAUSAL_LM",
    # Modules that receive adapters, selected by name. Judging by the names the
    # list covers the attention projections (q/k/v/o) and the MLP projections
    # (gate/up/down), not only attention layers.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

# -------------------------------------------------------------------
# 5. TRAINING ARGUMENTS
# -------------------------------------------------------------------
print(f"\n‚öôÔ∏è Configuring training arguments...")

# Maximum sequence length handed to the trainer (fixed value, not derived from
# the data).
max_seq_length = 8192 # Max context length for Qwen2-7B is 32768, 8192 is safe for this data.

# Standard training arguments for SFT
training_arguments = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,                # Number of epochs for training
    per_device_train_batch_size=2,     # Batch size per device (adjust based on GPU memory)
    gradient_accumulation_steps=4,     # Accumulate gradients over 4 steps (effective batch size 8)
    optim="paged_adamw_8bit",          # Optimized 8-bit optimizer for QLoRA
    save_steps=50,                     # Save checkpoint every 50 steps
    logging_steps=10,                  # Log metrics every 10 steps
    learning_rate=2e-4,                # Learning rate
    weight_decay=0.001,
    fp16=False,                        # Set to False, use bfloat16 for computation
    bf16=True,                         # Use bfloat16 for faster training on compatible GPUs
    max_grad_norm=0.3,                 # Gradient clipping threshold
    max_steps=-1,                      # -1: run length is governed by num_train_epochs
    warmup_ratio=0.03,
    group_by_length=True,              # Speed up training by grouping similar length samples
    lr_scheduler_type="cosine",        # Cosine learning rate schedule
    report_to="none",                  # Disable external reporting
)

# -------------------------------------------------------------------
# 6. INITIALIZE SFT TRAINER
# -------------------------------------------------------------------
# NOTE(review): the dependency cell installs trl without a version pin. Recent
# TRL releases appear to have moved max_seq_length / dataset_text_field into
# SFTConfig and renamed tokenizer= to processing_class= -- confirm these
# keyword arguments against the installed trl version.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field=None, # Not needed when using formatting_prompts_func
    formatting_func=formatting_prompts_func, # Pass the formatting function
    max_seq_length=max_seq_length,
    args=training_arguments,
)

# -------------------------------------------------------------------
# 7. EXECUTE FINE-TUNING
# -------------------------------------------------------------------
print("\nüî• **Starting Sovereign Inoculation (Fine-Tuning)** üî•")
# Effective batch size printed here = per-device batch size x gradient
# accumulation steps (it does not account for multiple devices).
print(f"Training for {training_arguments.num_train_epochs} epochs with effective batch size of {training_arguments.per_device_train_batch_size * training_arguments.gradient_accumulation_steps}...")

trainer.train()

# -------------------------------------------------------------------
# 8. SAVE FINAL ADAPTER
# -------------------------------------------------------------------
# Save the final LoRA adapter weights and the tokenizer files side by side in
# OUTPUT_DIR/final_adapter
final_adapter_path = os.path.join(OUTPUT_DIR, "final_adapter")
trainer.model.save_pretrained(final_adapter_path)
tokenizer.save_pretrained(final_adapter_path)
print(f"\n‚úÖ Fine-Tuning Complete! LoRA Adapter saved to: {final_adapter_path}")
# NOTE(review): this cell's header calls it CELL 6, so "Proceed to Cell 6"
# probably refers to the next cell -- confirm the intended cell number.
print("Proceed to Cell 6 to merge the adapter and create the final Sanctuary Model.")



ImportError: cannot import name 'PreTrainedModel' from 'transformers' (/usr/local/lib/python3.12/dist-packages/transformers/__init__.py)