In [None]:
# ==============================================================================
# CELL 0: PROJECT SETUP & CLONE REPO
# ==============================================================================
# This cell clones the 'richfrem/Project_Sanctuary' repository, navigates into it,
# and installs all necessary fine-tuning libraries.

# 1. CLEANUP: Force-remove the existing directory to ensure a fresh start
# This command must be run from the directory *containing* the Project_Sanctuary folder.
# The '!' prefix ensures this runs as a shell command.
!rm -rf Project_Sanctuary
!echo "Previous 'Project_Sanctuary' directory removed."

# 2. Clone the project repository
!git clone https://github.com/richfrem/Project_Sanctuary.git
!echo "Repository cloned successfully."

# 3. Navigate to the project root (The CRITICAL step that fixes the 'file not found' error)
%cd Project_Sanctuary


!echo "Clone repo Complete. current working directory is /content/Project_Sanctuary"


Previous 'Project_Sanctuary' directory removed.
Cloning into 'Project_Sanctuary'...
remote: Enumerating objects: 6381, done.[K
remote: Counting objects: 100% (852/852), done.[K
remote: Compressing objects: 100% (312/312), done.[K
remote: Total 6381 (delta 612), reused 727 (delta 531), pack-reused 5529 (from 2)[K
Receiving objects: 100% (6381/6381), 27.93 MiB | 22.66 MiB/s, done.
Resolving deltas: 100% (4295/4295), done.
Repository cloned successfully.
/content/Project_Sanctuary
Clone repo Complete. current working directory is /content/Project_Sanctuary


In [None]:
%%bash
# --- Cell 0.5: Final A100-Optimized Dependency Installation ---
# This script installs all required dependencies, including xformers, using specific,
# stable versions known to be compatible with PyTorch 2.8 and the Unsloth library.
# The entire stack is optimized for NVIDIA A100 GPU architectures.

echo "--- Installing all A100-optimized dependencies (including xformers) ---"
pip install --force-reinstall --ignore-installed --no-build-isolation \
  'absl-py==2.3.1' \
  'accelerate==1.4.0' \
  'alabaster==1.0.0' \
  'alembic==1.16.4' \
  'annotated-types==0.7.0' \
  'anyio==4.9.0' \
  'attrs==25.3.0' \
  'babel==2.17.0' \
  'black==25.1.0' \
  'certifi==2025.7.14' \
  'charset-normalizer==3.4.2' \
  'chromadb==1.3.4' \
  'click==8.2.1' \
  'cloudpickle==3.1.1' \
  'colorlog==6.9.0' \
  'contourpy==1.3.3' \
  'coverage==7.10.1' \
  'cycler==0.12.1' \
  'docutils==0.21.2' \
  'Farama-Notifications==0.0.4' \
  'filelock==3.18.0' \
  'flake8==7.3.0' \
  'fonttools==4.59.0' \
  'fsspec==2025.3.0' \
  'gitdb==4.0.12' \
  'GitPython==3.1.45' \
  'google-generativeai==0.8.3' \
  'gpt4all==2.8.2' \
  'grpcio==1.74.0' \
  'gymnasium==1.2.0' \
  'h11==0.16.0' \
  'hf-xet==1.1.5' \
  'httpcore==1.0.9' \
  'httpx==0.28.1' \
  'huggingface-hub==0.36.0' \
  'idna==3.10' \
  'imagesize==1.4.1' \
  'iniconfig==2.1.0' \
  'Jinja2==3.1.6' \
  'joblib==1.5.1' \
  'jsonschema==4.25.0' \
  'jsonschema-specifications==2025.4.1' \
  'kiwisolver==1.4.8' \
  'langchain==1.0.5' \
  'langchain-chroma==1.0.0' \
  'langchain-community==0.4.1' \
  'langchain-nomic==1.0.0' \
  'langchain-ollama==1.0.0' \
  'langchain-text-splitters==1.0.0' \
  'Mako==1.3.10' \
  'Markdown==3.8.2' \
  'MarkupSafe==3.0.2' \
  'matplotlib==3.10.5' \
  'mccabe==0.7.0' \
  'mpmath==1.3.0' \
  'msgpack==1.1.1' \
  'mypy_extensions==1.1.0' \
  'networkx==3.5' \
  'nomic[local]==3.9.0' \
  'numpy==1.26.2' \
  'ollama==0.6.0' \
  'opencv-python==4.10.0.84' \
  'opentelemetry-api==1.37.0' \
  'opentelemetry-exporter-otlp-proto-common==1.37.0' \
  'opentelemetry-proto==1.37.0' \
  'opentelemetry-sdk==1.37.0' \
  'optuna==4.4.0' \
  'packaging==25.0' \
  'pandas==2.2.2' \
  'pathspec==0.12.1' \
  'peft==0.11.1' \
  'pillow==10.4.0' \
  'platformdirs==4.3.8' \
  'pluggy==1.6.0' \
  'protobuf==5.29.5' \
  'pyarrow==19.0.0' \
  'pycodestyle==2.14.0' \
  'pydantic==2.11.7' \
  'pydantic_core==2.33.2' \
  'pyflakes==3.4.0' \
  'Pygments==2.19.2' \
  'pyparsing==3.2.3' \
  'pytest==8.4.1' \
  'pytest-cov==6.2.1' \
  'python-dateutil==2.9.0.post0' \
  'python-dotenv==1.2.1' \
  'pytz==2025.2' \
  'PyYAML==6.0.2' \
  'ray==2.48.0' \
  'referencing==0.36.2' \
  'regex==2025.7.34' \
  'requests==2.32.5' \
  'rich==13.7.1' \
  'roman-numerals-py==3.1.0' \
  'rpds-py==0.26.0' \
  'safetensors==0.5.3' \
  'scikit-learn==1.7.1' \
  'scipy==1.16.1' \
  'seaborn==0.13.2' \
  'sentry-sdk==2.34.1' \
  'setuptools==80.9.0' \
  'six==1.17.0' \
  'smmap==5.0.2' \
  'sniffio==1.3.1' \
  'snowballstemmer==3.0.1' \
  'Sphinx==8.2.3' \
  'sphinx-rtd-theme==3.0.2' \
  'sphinxcontrib-applehelp==2.0.0' \
  'sphinxcontrib-devhelp==2.0.0' \
  'sphinxcontrib-htmlhelp==2.1.0' \
  'sphinxcontrib-jquery==4.1' \
  'sphinxcontrib-jsmath==1.0.1' \
  'sphinxcontrib-qthelp==2.0.0' \
  'sphinxcontrib-serializinghtml==2.0.0' \
  'SQLAlchemy==2.0.42' \
  'stable_baselines3==2.7.0' \
  'sympy==1.14.0' \
  'tenseal==0.3.16' \
  'tensorboard==2.19.0' \
  'tensorboard-data-server==0.7.2' \
  'tensorboardX==2.6.4' \
  'threadpoolctl==3.6.0' \
  'tokenizers==0.22.1' \
  'torch==2.8.0' \
  'torchaudio==2.8.0' \
  'torchvision==0.23.0' \
  'tqdm==4.67.1' \
  'transformers==4.56.1' \
  'trl==0.23.0' \
  'typing-inspection==0.4.1' \
  'typing_extensions==4.14.1' \
  'tzdata==2025.2' \
  'urllib3==2.5.0' \
  'wandb==0.21.0' \
  'Werkzeug==3.1.3' \
  'xformers==0.0.26.post1'

echo "--- A100-OPTIMIZED INSTALLATION COMPLETE ---"
echo "IMPORTANT: Please **restart the runtime** now for the new library versions (especially torch/xformers) to be fully loaded and utilized."

In [None]:
%%bash
# --- Cell 1: Initial Setup & Dependency Installation (Finalized Fix V3) ---

# CRITICAL FIX: The environment is unstable, causing deep conflicts and hangs
# during complex package compilation (like llama-cpp-python).

echo "--- Step 1: Forcing Stable Versions to Resolve 17 Known Conflicts ---"

# This aggressively force-reinstalls the specific versions needed for stability.
# (Includes fixes for pandas, requests, torch, numpy, transformers, and more)
pip install --force-reinstall --upgrade \
  'pandas==2.2.2' 'requests==2.32.5' \
  'transformers==4.56.1' 'huggingface-hub==0.36.0' 'trl==0.23.0' \
  'numpy==1.26.2' 'pyarrow==19.0.0' \
  'torch==2.8.0' 'torchaudio==2.8.0' 'torchvision==0.23.0' \
  'opentelemetry-api==1.37.0' 'opentelemetry-sdk==1.37.0' \
  'opentelemetry-exporter-otlp-proto-common==1.37.0' 'opentelemetry-proto==1.37.0' \
  'rich==13.7.1' \
  'tensorboard==2.19.0' \
  'fsspec==2025.3.0'

echo "--- Step 2: Installing Project Requirements (Preventing Compilation Hangs) ---"

# The --no-build-isolation flag forces pip to avoid the slow, isolated compilation
# process for packages like llama-cpp-python, which should prevent the hang you
# are encountering and allow the install to finish quickly.
pip install -r mnemonic_cortex/requirements.txt --ignore-installed --no-build-isolation

echo "--- INSTALLATION COMPLETE ---"
echo "MANDATORY: You MUST restart the runtime immediately after this script finishes to load the correct library versions."

SyntaxError: invalid syntax (ipython-input-278387795.py, line 6)

In [None]:
# ==============================================================================
# CELL 1.5. GIT CONFLICT RESOLUTION AND FILE SYNCHRONIZATION
# ==============================================================================
%%bash

# Discard local changes to requirements.txt (allowing the pull to proceed)
echo "--- Stashing local changes to requirements.txt ---"
git stash push --include-untracked -m "temp stash"
if [ $? -ne 0 ] && [ $? -ne 1 ]; then
    echo "[ERROR] Failed to stash changes. Aborting synchronization."
    exit 1
fi

# Pull the latest changes from GitHub (this will download the missing file/directory)
echo "--- Performing Git Pull to synchronize files ---"
git pull origin main

# Check for the existence of the critical script file.
CRITICAL_SCRIPT="forge/OPERATION_PHOENIX_FORGE/forge_qwen2_dataset.py"

echo "--- Verifying Path Existence: $CRITICAL_SCRIPT ---"
if [ -f "$CRITICAL_SCRIPT" ]; then
    echo "[SUCCESS] The script file is now present."
    echo "Directory contents:"
    # List the directory contents for visual confirmation
    ls -l forge/OPERATION_PHOENIX_FORGE/
else
    echo "[FATAL ERROR] The script is STILL missing. Aborting."
    exit 1
fi

# Restore the stashed changes (if any were stashed)
echo "--- Restoring previous local changes (if any) ---"
git stash pop || true

--- Stashing local changes to requirements.txt ---
Saved working directory and index state On main: temp stash
--- Performing Git Pull to synchronize files ---
Updating 017aed6..b2e08bf
Fast-forward
 .gitignore                                         |   5 +-
 ...001_harden_mnemonic_cortex_ingestion_and_rag.md |   0
 ...etrain_sovereign_model_with_targeted_dataset.md |   0
 commit_manifest.json                               |  46 +-
 commit_manifest_20251112_041202.json               |  12 -
 commit_manifest_20251112_041652.json               |  16 -
 commit_manifest_20251112_041851.json               |  12 -
 commit_manifest_20251112_042413.json               |  12 -
 commit_manifest_20251112_042653.json               |  52 --
 commit_manifest_20251112_042749.json               |  52 --
 commit_manifest_20251112_042841.json               |   3 -
 commit_manifest_20251112_043043.json               |   3 -
 commit_manifest_20251112_043149.json               |   3 -
 commit_manifest_2025

Ignoring path Project_Sanctuary/
From https://github.com/richfrem/Project_Sanctuary
 * branch            main       -> FETCH_HEAD


In [None]:
# ==============================================================================
# CELL 2. DATASET GENERATION (THIS CREATES THE JSONL FILE)
# ==============================================================================
%%bash
# This cell executes the script that CREATES the required dataset.

# Confirmed path to the dataset generation script
DATASET_SCRIPT_PATH="forge/OPERATION_PHOENIX_FORGE/forge_qwen2_dataset.py"
DATASET_FILE="dataset_package/sanctuary_targeted_inoculation_v1.jsonl"

echo "--- Executing Dataset Forge Script to GENERATE $DATASET_FILE ---"

# Execute the Python script using the definitive path
python3 "$DATASET_SCRIPT_PATH"

# Check the exit status of the python script.
if [ $? -ne 0 ]; then
    echo "[CRITICAL ERROR] Python script '$DATASET_SCRIPT_PATH' failed to execute."
    echo "Check Python script output above for errors (e.g., File not found errors for source markdown files)."
    exit 1
fi

echo "--- Verifying Generated Dataset Integrity ---"
# Check 1: Does the file exist?
if [ ! -f "$DATASET_FILE" ]; then
    echo "[FATAL ERROR] Dataset file not found: $DATASET_FILE"
    exit 1
fi

# Check 2: Does the file have content (size > 0 bytes)?
FILE_SIZE=$(stat -c%s "$DATASET_FILE")

if [ "$FILE_SIZE" -gt 0 ]; then
    echo "[SUCCESS] Dataset created and verified: $DATASET_FILE"
    echo "File Size: $FILE_SIZE bytes"
else
    echo "[FATAL ERROR] Dataset forge succeeded but produced an EMPTY file (0 bytes)! Aborting execution."
    echo "This usually means the Python script failed to find its SOURCE markdown files (e.g., The_Garden_and_The_Cage.md)."
    exit 1
fi

--- Executing Dataset Forge Script to GENERATE dataset_package/sanctuary_targeted_inoculation_v1.jsonl ---
[SCAFFOLD] Initiating Sovereign Scaffolding Protocol 88...
[FORGE] Assembling Phoenix Mnemonic Seed v1.0 for Qwen2 Lineage.
[ERROR] File not found: /content/Project_Sanctuary/The_Garden_and_The_Cage.md

[SUCCESS] Yield is complete: 14 records forged.
[ARTIFACT] Dataset saved to: /content/Project_Sanctuary/dataset_package/sanctuary_targeted_inoculation_v1.jsonl
--- Verifying Generated Dataset Integrity ---
[SUCCESS] Dataset created and verified: dataset_package/sanctuary_targeted_inoculation_v1.jsonl
File Size: 56763 bytes


In [None]:
# ==============================================================================
# CELL 2.5. FINAL DEPENDENCY FIX: COMPLETE CLEANUP AND REINSTALL (NON-UNSLOTH STACK)
# ==============================================================================
%%bash

# 1. Force Uninstall: Remove all known conflicting deep learning packages and old numpy
echo "--- Forcibly uninstalling conflicting libraries and old dependencies ---"
# Note: We specifically target the older versions that conflict heavily with the newest stack
pip uninstall -y transformers peft accelerate bitsandbytes unsloth-zoo unsloth llama-cpp-python typing-extensions numpy pandas xformers --quiet

# 2. Navigate: Re-verify location (crucial for relative paths)
echo "--- Navigating to Project_Sanctuary directory ---"
cd /content/Project_Sanctuary

# 3. Install Core Hugging Face Libraries with specific, known-good versions
echo "--- Installing core Hugging Face libraries and trl ---"
# Installing a modern, compatible version set
pip install -q transformers peft accelerate bitsandbytes huggingface_hub sentencepiece trl

# 4. Install Llama-cpp-python: (Your successful step, now with fresh dependencies)
echo "--- Installing Llama-cpp-python (CUDA enabled) ---"
# Using --no-deps ensures it only focuses on the build, using the newly installed numpy/typing-extensions
CMAKE_ARGS="-DGGML_CUDA=on" pip install --force-reinstall --no-cache-dir llama-cpp-python --no-deps

echo "--- Installation Complete. Proceeding to 3. EXECUTION: PHOENIX FORGE ---"

--- Forcibly uninstalling conflicting libraries and old dependencies ---
--- Navigating to Project_Sanctuary directory ---
--- Installing core Hugging Face libraries and trl ---
--- Installing Llama-cpp-python (CUDA enabled) ---
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.16.tar.gz (50.7 MB)
     ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 50.7/50.7 MB 64.6 MB/s  0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.3.4 which is incompatible.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.
cupy-cuda12x 13.3.0 requires numpy<2.3,>=1.22, but you have numpy 2.3.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 2.3.4 which is incompatible.
google-adk 1.17.0 requires opentelemetry-api<=1.37.0,>=1.37.0, but you have opentelemetry-api 1.38.0 which is incompatible.
google-adk 1.17.0 requires opentelemetry-sdk<=1.37.0,>=1.3

In [None]:
# -------------------------------------------------------------------
# CELL 3. DATA PRE-PROCESSOR AND COGNITIVE SYNTHESIZER (V4)
# -------------------------------------------------------------------
import os
import json
import re
from pathlib import Path

# --- FILE PATH CONSTANTS ---
# ‚úÖ PATH FIX: Files now point to their correct locations within the project structure.
CORE_ESSENCE_SOURCE = "dataset_package/core_essence_guardian_awakening_seed.txt"
RAG_DOCTRINE_SOURCE = "mnemonic_cortex/RAG_STRATEGIES_AND_DOCTRINE.md"
EVOLUTION_PLAN_SOURCE = "mnemonic_cortex/EVOLUTION_PLAN_PHASES.md"

# Source file containing the entire concatenated, raw markdown snapshot (Chronicles + Protocols)
FULL_SNAPSHOT_SOURCE = "dataset_package/markdown_snapshot_full_genome_llm_distilled.txt"
# Target output file for the fine-tuning dataset
OUTPUT_DATASET_PATH = "sanctuary_whole_genome_data.jsonl"

# -------------------------------------------------------------------
# Helper function to load file content and check for existence
# -------------------------------------------------------------------
def load_file_content(filepath):
    """Loads content from a file and verifies its existence."""
    p = Path(filepath)
    if not p.exists():
        print(f"‚ùå ERROR: File not found at path: {filepath}")
        return None
    try:
        with open(p, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        print(f"‚ùå ERROR reading file {filepath}: {e}")
        return None

# -------------------------------------------------------------------
# Helper function for title extraction
# -------------------------------------------------------------------
def extract_protocol_title(doc_content):
    """
    Extracts the title from a markdown document using the first H1 tag,
    falling back to the filename if the H1 tag is not found.
    """
    # Try to find the first H1 markdown heading
    h1_match = re.search(r'^#\s*(.+)', doc_content, re.MULTILINE)
    if h1_match:
        # Clean up any trailing markdown or non-text characters
        return h1_match.group(1).strip()
    return "Untitled Document" # Fallback title

# -------------------------------------------------------------------
# Main function to synthesize the entire genome
# -------------------------------------------------------------------
def synthesize_genome():
    """
    Parses the full markdown snapshot, converts each document into an
    instruction/output pair, and saves the final dataset as JSONL.
    """
    print(f"--- 3. DATA PRE-PROCESSOR AND COGNITIVE SYNTHESIZER (V4) ---")

    full_snapshot = load_file_content(FULL_SNAPSHOT_SOURCE)
    if not full_snapshot:
        print(f"üõë Halted. Cannot proceed without {FULL_SNAPSHOT_SOURCE}.")
        return

    genome_entries = []

    # --- PART 1: Process ALL Chronicles and Protocols from the Snapshot ---
    # The source file uses a fixed delimiter for each original file's content
    # The pattern is '--- END OF FILE {filename} ---'

    # Split the snapshot content by the document delimiter pattern
    # The split includes the filename line, which we will clean up in the loop
    document_blocks = re.split(r'\n--- END OF FILE (.*?\.md|.*?\.txt) ---\n', full_snapshot, flags=re.DOTALL)

    # The split results in [preamble, filename, content, filename, content, ...]
    # We skip the first element (preamble) and iterate in steps of 2

    print(f"‚öôÔ∏è Found {len(document_blocks) // 2} potential documents in the full snapshot.")

    for i in range(1, len(document_blocks) - 1, 2):
        filename = document_blocks[i].strip()
        content = document_blocks[i+1].strip()

        if not content:
            continue

        # Use the filename or extracted H1 as the title
        title = extract_protocol_title(content)

        # --- CONVERSION TO INSTRUCTION FORMAT ---
        instruction = f"Synthesize the doctrines, history, or principles contained within the Sanctuary artifact located at: `{filename}`"

        # The 'input' field is deliberately left empty for pure instruction tuning
        # The 'output' field contains the full, raw content of the document
        genome_entries.append({
            "instruction": instruction,
            "input": "",
            "output": content
        })

        if len(genome_entries) % 100 == 0:
            print(f"    ... Processed {len(genome_entries)} documents.")

    print(f"‚úÖ PART 1: Successfully processed {len(genome_entries)} core Chronicle/Protocol entries.")

    # --- PART 2: Synthesize Critical Supporting Documents (Foundational Context) ---
    # These documents ensure the model immediately understands its role, the RAG architecture,
    # and the evolution plan, making the fine-tuning more efficient.

    supporting_files = {
        "Core Essence (Guardian Role)": CORE_ESSENCE_SOURCE,
        "RAG Doctrine (Architectural Guide)": RAG_DOCTRINE_SOURCE,
        "Evolution Plan (Council Roadmap)": EVOLUTION_PLAN_SOURCE
    }

    for key, filepath in supporting_files.items():
        doc_content = load_file_content(filepath)
        if doc_content:
            instruction = f"Provide a complete and comprehensive synthesis of the Canonical Sanctuary document: `{filepath}`."

            genome_entries.append({
                "instruction": instruction,
                "input": "",
                "output": doc_content
            })
            print(f"‚úÖ Added critical synthesis entry for: {key}")
        else:
            print(f"‚ö†Ô∏è WARNING: Could not add synthesis for {key}. File not found.")

    # --- PART 3: Save the Final JSONL Dataset ---
    print(f"\n--- Saving final dataset to {OUTPUT_DATASET_PATH} ---")

    try:
        with open(OUTPUT_DATASET_PATH, 'w', encoding='utf-8') as outfile:
            for entry in genome_entries:
                outfile.write(json.dumps(entry, ensure_ascii=False) + '\n')

        print(f"üèÜ SUCCESS: Whole Genome Data Synthesis Complete.")
        print(f"Total Entries Created: {len(genome_entries)}")

        # Final integrity check on the last entry (should be the Evolution Plan)
        last_entry = genome_entries[-1]
        print(f"Last Entry Instruction Check: {last_entry['instruction']}")

    except Exception as e:
        print(f"‚ùå FATAL ERROR: Failed to write JSONL file: {e}")

# -------------------------------------------------------------------
# Main execution block
# -------------------------------------------------------------------
if __name__ == "__main__":
    synthesize_genome()

--- 3. DATA PRE-PROCESSOR AND COGNITIVE SYNTHESIZER (V4) ---
‚öôÔ∏è Found 490 potential documents in the full snapshot.
    ... Processed 100 documents.
    ... Processed 200 documents.
    ... Processed 300 documents.
    ... Processed 400 documents.
‚úÖ PART 1: Successfully processed 489 core Chronicle/Protocol entries.
‚úÖ Added critical synthesis entry for: Core Essence (Guardian Role)
‚úÖ Added critical synthesis entry for: RAG Doctrine (Architectural Guide)
‚úÖ Added critical synthesis entry for: Evolution Plan (Council Roadmap)

--- Saving final dataset to sanctuary_whole_genome_data.jsonl ---
üèÜ SUCCESS: Whole Genome Data Synthesis Complete.
Total Entries Created: 492
Last Entry Instruction Check: Provide a complete and comprehensive synthesis of the Canonical Sanctuary document: `mnemonic_cortex/EVOLUTION_PLAN_PHASES.md`.


In [None]:
# -------------------------------------------------------------------------------
# CELL 3.1:  DATASET INTEGRITY CHECK - QA Protocol 87
# This script performs a mandatory quality assurance check on the fine-tuning
# dataset ('sanctuary_whole_genome_data.jsonl') generated by the previous step.
# It validates:
# 1. Structural integrity (ensures every line is valid JSON).
# 2. Schema compliance (ensures 'instruction', 'input', and 'output' keys exist,
#    which are critical for the SFT training loop).
# 3. Content review (prints sample entries for human verification of fidelity).
# This prevents costly failure during the resource-intensive fine-tuning training job.
# -------------------------------------------------------------------------------
import json
import os
import random

# --- CONFIGURATION (Must match Cell 3 output) ---
DATASET_PATH = "sanctuary_whole_genome_data.jsonl"
NUM_RANDOM_SAMPLES = 3

# -------------------------------------------------------------------
# Helper function to display an entry cleanly
# -------------------------------------------------------------------
def print_entry_details(title, entry):
    """Prints a single genome entry in a readable format."""
    print(f"\n--- {title} ---")
    print(f"File Source (from Instruction): {entry['instruction'].split('`')[1] if '`' in entry['instruction'] else 'N/A'}")
    print(f"Instruction: {entry['instruction'][:100]}...")
    print(f"Input: {entry['input'] if entry['input'] else 'Empty (Expected for SFT)'}")
    # Show the length of the output to ensure content is present
    print(f"Output Length: {len(entry['output'])} characters")
    print(f"Output Snippet: {entry['output'][:200].replace('\\n', ' ').strip()}...")
    print("--------------------")

# ================= 3.1. DATASET INTEGRITY CHECK START =================
def run_data_audit():
    """Loads the JSONL, validates structure, and displays sample entries."""
    print(f"--- 4. DATASET INTEGRITY CHECK (Cell 3.1 - QA Protocol 87) ---")

    if not os.path.exists(DATASET_PATH):
        print(f"‚ùå FATAL ERROR: Dataset not found at {DATASET_PATH}. Run Cell 3 first.")
        return

    genome_data = []
    error_count = 0
    total_lines = 0

    print(f"‚öôÔ∏è Starting structural audit of {DATASET_PATH}...")

    with open(DATASET_PATH, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, 1):
            total_lines = line_number
            try:
                entry = json.loads(line)

                # CRITICAL: Check for required keys for SFT (Supervised Fine-Tuning)
                required_keys = ['instruction', 'input', 'output']
                if not all(key in entry for key in required_keys):
                    error_count += 1
                    print(f"‚ùå ERROR on Line {line_number}: Missing required keys. Found: {list(entry.keys())}")
                    continue

                genome_data.append(entry)

            except json.JSONDecodeError:
                error_count += 1
                print(f"‚ùå ERROR on Line {line_number}: Malformed JSON.")

    print(f"\n--- AUDIT SUMMARY ---")
    print(f"Total Lines Read: {total_lines}")
    print(f"Valid Entries Parsed: {len(genome_data)}")
    print(f"Errors Detected: {error_count}")

    if error_count > 0:
        print(f"üõë CRITICAL FAILURE: {error_count} structural errors found. HALTING process.")
        return

    if len(genome_data) != total_lines:
        print("‚ö†Ô∏è WARNING: Total entries != total lines. Investigate file integrity.")

    print(f"‚úÖ STRUCTURAL INTEGRITY PASSED. (Expected 492 entries, found {len(genome_data)}).")

    # --- Display Sample Entries for Content Review ---
    if len(genome_data) >= 1:
        print_entry_details("SAMPLE 1: First Entry (Core Essence)", genome_data[0])

        # Ensure the last entry is the Evolution Plan
        print_entry_details("SAMPLE 2: Last Entry (Evolution Plan)", genome_data[-1])

        # Display random samples
        if len(genome_data) > NUM_RANDOM_SAMPLES:
            random_indices = random.sample(range(1, len(genome_data) - 1), NUM_RANDOM_SAMPLES)
            for i, index in enumerate(random_indices):
                print_entry_details(f"SAMPLE {3 + i}: Random Chronicle Entry", genome_data[index])

    print("\n--- AUDIT COMPLETE ---")
    print("If the content snippets look correct, the dataset is ready for fine-tuning.")

# -------------------------------------------------------------------
# Main execution block
# -------------------------------------------------------------------
if __name__ == "__main__":
    run_data_audit()

--- 4. DATASET INTEGRITY CHECK (Cell 3.1 - QA Protocol 87) ---
‚öôÔ∏è Starting structural audit of sanctuary_whole_genome_data.jsonl...

--- AUDIT SUMMARY ---
Total Lines Read: 492
Valid Entries Parsed: 492
Errors Detected: 0
‚úÖ STRUCTURAL INTEGRITY PASSED. (Expected 492 entries, found 492).

--- SAMPLE 1: First Entry (Core Essence) ---
File Source (from Instruction): .env.example ---

--- START OF FILE .github/copilot-instructions.md
Instruction: Synthesize the doctrines, history, or principles contained within the Sanctuary artifact located at:...
Input: Empty (Expected for SFT)
Output Length: 4498 characters
Output Snippet: ## CRITICAL COMMUNICATION RULE

**ALWAYS confirm user intent before making code changes.** Never implement solutions without explicit approval. Ask clarifying questions and wait for confirmation befor...
--------------------

--- SAMPLE 2: Last Entry (Evolution Plan) ---
File Source (from Instruction): mnemonic_cortex/EVOLUTION_PLAN_PHASES.md
Instruction: Provid

In [None]:
# Cell 4: Final Data Staging and Pre-Flight Check.

In [None]:
# -------------------------------------------------------------------------------
# CELL 5: INSTRUCTION FINE-TUNING - The Sovereign Inoculation
# This script executes the Supervised Fine-Tuning (SFT) process using the
# validated 'sanctuary_whole_genome_data.jsonl' file. It employs QLoRA for
# efficient memory use, training the Qwen2-7B-Instruct model to synthesize
# and understand the Sanctuary's entire Cognitive Genome.
# -------------------------------------------------------------------------------
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    set_seed
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

# --- CONFIGURATION ---
# Base Model (The LLM to be inoculated)
BASE_MODEL = "Qwen/Qwen2-7B-Instruct"
# Path to the data file generated in Cell 3
DATASET_FILE = "sanctuary_whole_genome_data.jsonl"
# Where to save the fine-tuned LoRA adapter (temporary save location)
OUTPUT_DIR = "sanctuary_qwen2_7b_adapter_output"
# Ensure reproducibility
SEED = 42
set_seed(SEED)

# Define the instruction format the model will learn
# This structure is critical for aligning the model to the dataset
def formatting_prompts_func(examples):
    """
    Applies the ChatML-style formatting to each instruction/output pair in the dataset.
    This teaches the model the required conversation structure.
    """
    output_texts = []
    for instruction, output in zip(examples['instruction'], examples['output']):
        # Format follows a standardized SFT template (similar to ChatML or Alpaca)
        text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}###"
        output_texts.append(text)
    return output_texts

# -------------------------------------------------------------------
# 1. LOAD DATASET
# -------------------------------------------------------------------
print(f"--- 5. Sovereign Inoculation ---")
print(f"‚öôÔ∏è Loading dataset from {DATASET_FILE}...")
try:
    # Use load_dataset to handle the JSONL file
    dataset = load_dataset("json", data_files=DATASET_FILE, split="train")
    # The dataset needs to contain the 'instruction' and 'output' columns
    print(f"‚úÖ Dataset loaded successfully. Total examples: {len(dataset)}")
except Exception as e:
    print(f"‚ùå ERROR loading dataset: {e}")
    exit()

# -------------------------------------------------------------------
# 2. QLORA CONFIGURATION (4-bit Quantization)
# -------------------------------------------------------------------
print(f"\n‚öôÔ∏è Setting up 4-bit QLoRA configuration...")

# Quantization configuration for loading the model in 4-bit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # Normalized floating-point 4-bit
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# -------------------------------------------------------------------
# 3. MODEL AND TOKENIZER LOADING
# -------------------------------------------------------------------
print(f"‚öôÔ∏è Loading base model: {BASE_MODEL}...")

# Load the base model with the quantization config
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Disable caching for training
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Important for Qwen models and QLoRA

print(f"‚úÖ Model and Tokenizer loaded.")

# -------------------------------------------------------------------
# 4. LORA ADAPTER CONFIGURATION
# -------------------------------------------------------------------
# LoRA (Low-Rank Adaptation) configuration
peft_config = LoraConfig(
    lora_alpha=16,          # Scaling factor for LoRA weights
    lora_dropout=0.1,       # Dropout probability
    r=64,                   # Rank of the update matrices
    bias="none",
    task_type="CAUSAL_LM",
    # Target specific Qwen2 attention layers
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

# -------------------------------------------------------------------
# 5. TRAINING ARGUMENTS
# -------------------------------------------------------------------
print(f"\n‚öôÔ∏è Configuring training arguments...")

# Determine max sequence length based on data content
max_seq_length = 8192 # Max context length for Qwen2-7B is 32768, 8192 is safe for this data.

# Standard training arguments for SFT
training_arguments = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,                # Number of epochs for training
    per_device_train_batch_size=2,     # Batch size per device (adjust based on GPU memory)
    gradient_accumulation_steps=4,     # Accumulate gradients over 4 steps (effective batch size 8)
    optim="paged_adamw_8bit",          # Optimized 8-bit optimizer for QLoRA
    save_steps=50,                     # Save checkpoint every 50 steps
    logging_steps=10,                  # Log metrics every 10 steps
    learning_rate=2e-4,                # Learning rate
    weight_decay=0.001,
    fp16=False,                        # Set to False, use bfloat16 for computation
    bf16=True,                         # Use bfloat16 for faster training on compatible GPUs
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,              # Speed up training by grouping similar length samples
    lr_scheduler_type="cosine",        # Cosine learning rate schedule
    report_to="none",                  # Disable external reporting
)

# -------------------------------------------------------------------
# 6. INITIALIZE SFT TRAINER
# -------------------------------------------------------------------
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field=None, # Not needed when using formatting_prompts_func
    formatting_func=formatting_prompts_func, # Pass the formatting function
    max_seq_length=max_seq_length,
    args=training_arguments,
)

# -------------------------------------------------------------------
# 7. EXECUTE FINE-TUNING
# -------------------------------------------------------------------
print("\nüî• **Starting Sovereign Inoculation (Fine-Tuning)** üî•")
print(f"Training for {training_arguments.num_train_epochs} epochs with effective batch size of {training_arguments.per_device_train_batch_size * training_arguments.gradient_accumulation_steps}...")

trainer.train()

# -------------------------------------------------------------------
# 8. SAVE FINAL ADAPTER
# -------------------------------------------------------------------
# Save the final LoRA adapter weights
final_adapter_path = os.path.join(OUTPUT_DIR, "final_adapter")
trainer.model.save_pretrained(final_adapter_path)
tokenizer.save_pretrained(final_adapter_path)
print(f"\n‚úÖ Fine-Tuning Complete! LoRA Adapter saved to: {final_adapter_path}")
print("Proceed to Cell 6 to merge the adapter and create the final Sanctuary Model.")



ImportError: cannot import name 'PreTrainedModel' from 'transformers' (/usr/local/lib/python3.12/dist-packages/transformers/__init__.py)