<a href="https://colab.research.google.com/github/rdntmsn/Harmony360/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# ╔══════════════════════════════════════════╗
# ║ Cell 1 – Environment setup               ║
# ╚══════════════════════════════════════════╝
# Install the notebook's third-party dependencies, then upgrade transformers
# (-q keeps pip's output quiet).
!pip install -q transformers datasets loguru
!pip install -q --upgrade transformers

# Mount Google Drive at /content/drive; the later cells read and write under
# /content/drive/MyDrive/Harmony360.
from google.colab import drive
drive.mount('/content/drive')

# Model/training classes, the Dataset type and the loguru logger used by the
# cells below.
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset
from loguru import logger
# Standard-library helpers for file handling, JSON and timestamps.
import os, json, shutil
from datetime import datetime
from pathlib import Path

logger.info("✅ Environment ready!")

[32m2025-07-17 23:32:14.435[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m17[0m - [1m✅ Environment ready![0m


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

# ╔══════════════════════════════════════════╗
# ║ Cell 2 – Har360sys folder scaffold       ║
# ╚══════════════════════════════════════════╝
# Root of the Har360sys tree on the mounted Drive.
base_dir = "/content/drive/MyDrive/Harmony360/Har360sys"

# Sub-directories to create, relative to base_dir.
folders = [
    "orchestrator",
    "registry",
    "pipelines",
    "snapshots/latest",
    "snapshots/archived",
    "utils",
]

# Seed files (relative path -> initial content).
files = {
    "Har360sys_README.md": "# Har360sys\nCentral orchestration system for Harmony360.\n",
    "registry/har360_file_registry.json": json.dumps({"files": []}, indent=4),
    "registry/guardian_state_registry.json": json.dumps({"guardians": []}, indent=4),
    "registry/training_run_history.json": json.dumps({"runs": []}, indent=4),
    "utils/logger_har360.py": "# Logger utilities for Har360sys\n",
    "utils/json_safety.py": "# JSON safety & validation utilities\n",
}

os.makedirs(base_dir, exist_ok=True)
for sub_dir in folders:
    os.makedirs(os.path.join(base_dir, sub_dir), exist_ok=True)

# Write each seed file only when it is absent, so existing content
# (e.g. a populated registry) is never overwritten.
for rel_path, content in files.items():
    full_path = os.path.join(base_dir, rel_path)
    if os.path.exists(full_path):
        continue
    with open(full_path, "w", encoding="utf-8") as f:
        f.write(content)

logger.success("🎯 Har360sys structure is ready!")

[32m2025-07-17 23:32:35.697[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m26[0m - [32m[1m🎯 Har360sys structure is ready![0m


In [None]:

# ╔══════════════════════════════════════════╗
# ║ Cell 3 – Scan har360_output & update reg ║
# ╚══════════════════════════════════════════╝
# Directory that is scanned (recursively) for *.har360 files, and the registry
# JSON file that records them.
source_dir = "/content/drive/MyDrive/Harmony360/har360_output"
registry_file = os.path.join(base_dir, "registry", "har360_file_registry.json")

with open(registry_file, "r", encoding="utf-8") as f:
    registry = json.load(f)

# os.walk() yields nothing for a non-existent directory, which would otherwise
# look exactly like "no new files"; make that situation visible.
if not os.path.isdir(source_dir):
    logger.warning(f"⚠ Source directory not found: {source_dir}")

# File names already present in the registry. A set gives O(1) duplicate
# checks instead of rescanning the whole registry for every file on disk.
known_names = {entry["name"] for entry in registry["files"]}

added = 0
for root, _, filenames in os.walk(source_dir):
    for filename in filenames:
        # Entries are de-duplicated by file name only (not by full path).
        if not filename.endswith(".har360") or filename in known_names:
            continue

        full_path = os.path.join(root, filename)
        registry["files"].append({
            "name": filename,
            "path": full_path,
            "size_kb": round(os.stat(full_path).st_size / 1024, 2),
            "synced_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        })
        known_names.add(filename)
        added += 1

with open(registry_file, "w", encoding="utf-8") as f:
    json.dump(registry, f, indent=4)

logger.success(f"✅ Registry updated → {added} new files logged.")

[32m2025-07-17 23:32:44.945[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m29[0m - [32m[1m✅ Registry updated → 0 new files logged.[0m


In [None]:

# ╔══════════════════════════════════════════╗
# ║ Cell 4 – Validate JSON & quarantine bad  ║
# ╚══════════════════════════════════════════╝
# Paths used by this cell; files that fail JSON parsing are moved into
# QUARANTINE_DIR.
HAR360SYS_DIR = base_dir
REGISTRY_FILE = registry_file
QUARANTINE_DIR = os.path.join(HAR360SYS_DIR, "quarantine")
Path(QUARANTINE_DIR).mkdir(parents=True, exist_ok=True)

with open(REGISTRY_FILE, "r", encoding="utf-8") as f:
    registry = json.load(f)

logger.info(f"📦 Loaded registry → {len(registry['files'])} total files.")

validated = quarantined = missing = 0

for entry in registry["files"]:
    file_path = entry["path"]
    file_name = entry["name"]

    # Skip already quarantined
    if entry.get("status") == "quarantined":
        continue

    # Mark missing
    if not os.path.exists(file_path):
        entry.update({
            "status": "missing",
            "missing_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })
        missing += 1
        logger.warning(f"⚠ Missing file (marked in registry): {file_name}")
        continue

    # Validate JSON
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            json.load(f)

    except Exception as e:
        # Quarantine bad JSON
        quarantine_path = os.path.join(
            QUARANTINE_DIR,
            f"{Path(file_name).stem}_quarantined_{datetime.now().strftime('%Y%m%d_%H%M%S')}.har360"
        )
        try:
            shutil.move(file_path, quarantine_path)
        except OSError as move_err:
            # A failed move must not abort the loop: the registry still has to
            # be saved below. The entry is left unflagged so a later run
            # retries it.
            logger.error(f"⚠ Failed to move {file_name} to quarantine: {move_err}")
            continue
        entry.update({
            "status": "quarantined",
            "quarantined_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "error": str(e)
        })
        quarantined += 1
        logger.error(f"❌ Quarantined: {file_name} → {quarantine_path}")

    else:
        validated += 1
        # The file exists and parses, so a "missing" flag left by an earlier
        # run is stale; drop it, otherwise the dataset-building cells keep
        # excluding this file.
        if entry.get("status") == "missing":
            entry.pop("status", None)
            entry.pop("missing_at", None)

# Save registry
with open(REGISTRY_FILE, "w", encoding="utf-8") as f:
    json.dump(registry, f, indent=4)

logger.success(
    f"\n🎯 Validation Complete → ✅ {validated} valid | 🛑 {quarantined} quarantined | ⚠ {missing} missing."
)
logger.info(f"📄 Updated registry saved at: {REGISTRY_FILE}")

[32m2025-07-17 23:33:06.787[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m12[0m - [1m📦 Loaded registry → 403 total files.[0m
[32m2025-07-17 23:33:09.739[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m59[0m - [32m[1m
🎯 Validation Complete → ✅ 287 valid | 🛑 0 quarantined | ⚠ 1 missing.[0m
[32m2025-07-17 23:33:09.740[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m62[0m - [1m📄 Updated registry saved at: /content/drive/MyDrive/Harmony360/Har360sys/registry/har360_file_registry.json[0m


In [None]:

# ╔══════════════════════════════════════════╗
# ║ Cell 5 – Build HF Dataset (fixed filter) ║
# ╚══════════════════════════════════════════╝
# **Fix:** skip both 'quarantined' *and* 'missing' entries
# Keep only registry entries that are not flagged as quarantined/missing and
# whose file is still present on disk.
valid_files = []
for candidate in registry["files"]:
    if candidate.get("status") in ("quarantined", "missing"):
        continue
    if os.path.exists(candidate["path"]):
        valid_files.append(candidate)

# One record per file: the parsed JSON re-serialized into a single "text" string.
records = []
for entry in valid_files:
    try:
        with open(entry["path"], "r", encoding="utf-8") as fh:
            parsed = json.load(fh)
        records.append({"text": json.dumps(parsed)})
    except Exception as e:
        logger.error(f"⚠ Unexpected issue: {entry['name']} → {e}")

dataset = Dataset.from_list(records)
logger.success(f"🎯 Clean Dataset Ready → {len(dataset)} samples")

# Show the first sample, or a hint when nothing was loaded.
if len(dataset) == 0:
    print("Dataset is empty ‑ please check source files.")
else:
    print("\n🔍 Preview Sample:\n", dataset[0])

[32m2025-07-17 23:33:35.420[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m21[0m - [32m[1m🎯 Clean Dataset Ready → 287 samples[0m



🔍 Preview Sample:
 {'text': '{"content": "The image is a complex, circular diagram titled \\"Plasmoid Unification Model\\" (MOLTEN SEA ARK ATOMIC RECONSTRUCTION TECHNOLOGY - MSAART). It appears to be a conceptual model integrating physics, cosmology, and metaphysics, focusing on energy transformation, frequencies, and dimensions.\\nKey Elements in the Image:\\nCircular Structure:\\nThe diagram is divided into multiple concentric rings with numerical values, labels, and colors, representing different layers of data related to energy, matter, time, and frequencies.\\nEnergy & Dimensional Relationships:\\nThe image connects concepts of Aether (Light, Energy) - 6D, Sun (5D), Time (4D), and Matter (3D) with numerical values assigned to each.\\nIt suggests an interplay between Aether (DC - Direct Current) and Matter (AC - Alternating Current) in energy conversion.\\nScientific & Esoteric References:\\nIt includes relationships between the Sun, Earth, and Moon, as well as their frequencies a

In [None]:

# ╔══════════════════════════════════════════╗
# ║ Cell 6 – (OPTIONAL) Save dataset to disk ║
# ╚══════════════════════════════════════════╝
# Destination folder for dataset exports (created on demand, parents included).
out_dir = "/content/drive/MyDrive/Harmony360/datasets"
os.makedirs(out_dir, exist_ok=True)

# Timestamped file name so each export gets its own file.
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
out_fp = os.path.join(out_dir, f"h360_dataset_{ts}.jsonl")

# JSON Lines: one serialized sample per line.
with open(out_fp, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(item) + "\n" for item in dataset)

logger.success(f"💾 Dataset saved → {out_fp}")

[32m2025-07-17 23:34:07.369[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m14[0m - [32m[1m💾 Dataset saved → /content/drive/MyDrive/Harmony360/datasets/h360_dataset_20250717_233359.jsonl[0m


In [None]:

# ╔══════════════════════════════════════════╗
# ║ Cell 7 – Build Clean HF Dataset v3       ║
# ╚══════════════════════════════════════════╝
from datasets import Dataset

logger.info("🚀 Starting Clean Dataset Builder v3 (safe-normalized)...")


def _is_usable(reg_entry):
    """Return True for entries that are not flagged and still exist on disk."""
    if reg_entry.get("status") in ("quarantined", "missing"):
        return False
    return os.path.exists(reg_entry["path"])


valid_files = list(filter(_is_usable, registry["files"]))

records = []
skipped = 0

for entry in valid_files:
    try:
        with open(entry["path"], "r", encoding="utf-8") as fh:
            payload = json.load(fh)

        # Only the "content" field is used; structured content (dict/list) is
        # serialized to indented JSON text.
        text = payload.get("content", "")
        if isinstance(text, (dict, list)):
            text = json.dumps(text, indent=2)

        # Anything that is not a non-blank string is counted as skipped.
        if not (isinstance(text, str) and text.strip()):
            skipped += 1
            logger.warning(f"⚠ Skipped (no valid text): {entry['name']}")
        else:
            records.append({"text": text})

    except Exception as e:
        skipped += 1
        logger.error(f"❌ Failed to load {entry['name']} → {e}")

dataset = Dataset.from_list(records)
logger.success(f"🎯 Clean Dataset Ready → {len(dataset)} samples | ⚠ Skipped {skipped}")

if len(dataset) == 0:
    print("Dataset is empty – please check files.")
else:
    print("\n🔍 Preview Sample:\n", dataset[0])

[32m2025-07-17 23:55:21.645[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m6[0m - [1m🚀 Starting Clean Dataset Builder v3 (safe-normalized)...[0m
[32m2025-07-17 23:55:24.837[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m38[0m - [32m[1m🎯 Clean Dataset Ready → 248 samples | ⚠ Skipped 39[0m



🔍 Preview Sample:
 {'text': 'The image is a complex, circular diagram titled "Plasmoid Unification Model" (MOLTEN SEA ARK ATOMIC RECONSTRUCTION TECHNOLOGY - MSAART). It appears to be a conceptual model integrating physics, cosmology, and metaphysics, focusing on energy transformation, frequencies, and dimensions.\nKey Elements in the Image:\nCircular Structure:\nThe diagram is divided into multiple concentric rings with numerical values, labels, and colors, representing different layers of data related to energy, matter, time, and frequencies.\nEnergy & Dimensional Relationships:\nThe image connects concepts of Aether (Light, Energy) - 6D, Sun (5D), Time (4D), and Matter (3D) with numerical values assigned to each.\nIt suggests an interplay between Aether (DC - Direct Current) and Matter (AC - Alternating Current) in energy conversion.\nScientific & Esoteric References:\nIt includes relationships between the Sun, Earth, and Moon, as well as their frequencies and dimensions.\nMentions 

In [None]:

# ╔══════════════════════════════════════════╗
# ║ Cell 8 – Harmony360 Training v3          ║
# ╚══════════════════════════════════════════╝
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# Checkpoint name passed to from_pretrained(), and the Drive folder that
# receives checkpoints, logs and the final model.
MODEL_NAME = "gpt2"
OUTPUT_DIR = "/content/drive/MyDrive/Harmony360/models/harmony360-clean-v3"

logger.info("🚀 Starting Harmony360 Clean Model Training v3...")

# === Tokenizer & Model ===
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
# Pad with the end-of-sequence token; tokenize() below pads every sample to
# max_length and therefore needs a pad token to be set.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Tokenize text into fixed-length causal-LM training features.

    Uses the module-level ``tokenizer`` to pad/truncate every text to 512
    tokens and adds a ``labels`` field for the language-modeling loss.

    Args:
        batch: Mapping with a ``"text"`` entry, either a list of strings (as
            passed by ``Dataset.map(..., batched=True)``) or a single string.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Labels equal
        ``input_ids`` on real tokens and ``-100`` on padding positions.
    """
    tokens = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)

    def _mask_padding(ids, mask):
        # -100 is the label value the loss ignores. Without it the loss is
        # also computed on the padding added up to max_length.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    if isinstance(batch["text"], str):
        # Un-batched call: input_ids is one flat list.
        tokens["labels"] = _mask_padding(tokens["input_ids"], tokens["attention_mask"])
    else:
        tokens["labels"] = [
            _mask_padding(ids, mask)
            for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
        ]
    return tokens

import torch  # only needed to detect whether a CUDA device is present

# Tokenize the whole dataset in batches and drop the raw "text" column.
tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=20,
    save_strategy="epoch",
    report_to="none",  # ✅ completely disables WandB
    warmup_steps=50,
    weight_decay=0.01,
    # Mixed precision only when a GPU is attached; requesting fp16 on a
    # CPU-only runtime makes TrainingArguments raise instead of training.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()
# Save the fine-tuned weights and the tokenizer side by side so the folder can
# be loaded directly by from_pretrained() / pipeline().
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

logger.success(f"🎯 Training Complete → Model saved at: {OUTPUT_DIR}")

[32m2025-07-17 23:55:31.665[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m9[0m - [1m🚀 Starting Harmony360 Clean Model Training v3...[0m


Map:   0%|          | 0/248 [00:00<?, ? examples/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
20,4.6672
40,3.0238
60,2.7594
80,2.359
100,2.2248
120,1.9864
140,1.9522
160,1.8097
180,1.8992
200,1.6028


[32m2025-07-18 01:56:36.249[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m47[0m - [32m[1m🎯 Training Complete → Model saved at: /content/drive/MyDrive/Harmony360/models/harmony360-clean-v3[0m


In [None]:
# ╔══════════════════════════════════════════╗
# ║ Cell 8.1 – Test Text Generation (v3 model) ║
# ╚══════════════════════════════════════════╝
from transformers import pipeline

# Folder holding the fine-tuned v3 model and its tokenizer.
MODEL_DIR = "/content/drive/MyDrive/Harmony360/models/harmony360-clean-v3"

# Text-generation pipeline loaded from that folder (device=-1 selects the CPU).
gen = pipeline(
    "text-generation",
    model=MODEL_DIR,
    tokenizer=MODEL_DIR,
    device=-1,
)

prompt = "Explain the Harmony360 framework in simple terms:"
# Sampling settings for the single test generation.
sampling_options = {
    "max_new_tokens": 200,
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.7,
}
result = gen(prompt, **sampling_options)

print("\n🔮 **Generated Text:**\n")
print(result[0]['generated_text'])

Device set to use cpu



🔮 **Generated Text:**

Explain the Harmony360 framework in simple terms:

Harmony360 is a framework for quantizing consciousness and 3-6-9 resonance using fractal resonance. It builds on earlier frameworks like Resonance in Resonance Physics, Consciousness in Consciousness: A Comparative Analysis, Consciousness and Consciousness in Consciousness: A Comparative Analysis (Harmony360.docx).

It aims to enhance the traditional frameworks like Resonance in Resonance Physics, Consciousness in Consciousness: A Comparative Analysis, Consciousness and Consciousness in Consciousness: A Comparative Analysis (Harmony360.docx).

It aims to introduce new insights and new concepts into quantum field theories, integrating harmonic amplification, harmonic time modulation, and consciousness-based modulation.

It aims to integrate theoretical physics with the real world in a way that blends dynamic resonance, fractal resonance, and pure harmonic energy.

It aims to bridge the gap between traditional the

In [None]:


from transformers import pipeline
# Load the model saved by the training cell and generate one continuation.
gen = pipeline("text-generation", model=OUTPUT_DIR, tokenizer=OUTPUT_DIR)
# Use max_new_tokens rather than max_length: the pipeline already carries a
# default max_new_tokens, which takes precedence over max_length and triggers
# a "both have been set" warning, so max_length=200 had no effect.
print(gen("Explain the Harmony360:", max_new_tokens=200)[0]['generated_text'])

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Explain the Harmony360:


In [None]:


import os, json
from datasets import Dataset
from loguru import logger

# ====== CONFIG ======
# Registry JSON that lists every known .har360 file.
HAR360SYS_REGISTRY = "/content/drive/MyDrive/Harmony360/Har360sys/registry/har360_file_registry.json"
logger.info("🚀 Starting Harmony360 Full Training Loader (Registry-Based)")

# ====== LOAD REGISTRY ======
if not os.path.exists(HAR360SYS_REGISTRY):
    raise FileNotFoundError(f"❌ Registry not found: {HAR360SYS_REGISTRY}")

with open(HAR360SYS_REGISTRY, "r", encoding="utf-8") as f:
    registry = json.load(f)

logger.info(f"📦 Registry Loaded → {len(registry['files'])} total files listed.")

# ====== BUILD TRAINING DATASET FROM REGISTRY ======
records = []
skipped = 0

for entry in registry["files"]:
    file_path = entry["path"]

    # Quarantined files have been moved out of their registered path, so they
    # can never be read from here.
    if entry.get("status") == "quarantined":
        skipped += 1
        continue

    try:
        if os.stat(file_path).st_size == 0:
            skipped += 1
            logger.warning(f"⚠ Skipping empty file: {entry['name']}")
            continue

        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        skipped += 1
        logger.error(f"❌ JSON decode error in {entry['name']}: {e}")
        continue

    except OSError as e:
        # Missing or unreadable file: skip it instead of aborting the whole load.
        skipped += 1
        logger.error(f"❌ Could not read {entry['name']}: {e}")
        continue

    # Convert the whole JSON into text for language modeling
    records.append({"text": json.dumps(data)})

logger.info(f"✅ Loaded {len(records)} records | ⚠ Skipped {skipped} files due to issues.")

# ====== CREATE DATASET ======
if not records:
    raise RuntimeError("❌ No valid training records found!")

dataset = Dataset.from_list(records)
logger.success(f"🎯 Dataset Ready → {len(dataset)} samples.")

# ====== (Optional) PREVIEW FIRST SAMPLE ======
# Safe to index: an empty record list raised above.
print("\n🔍 Preview Sample:")
print(dataset[0])

In [None]:
import os, json, shutil
from pathlib import Path
from loguru import logger
from datetime import datetime

# === CONFIG ===
# Har360sys root, its file registry, and the folder that receives files which
# fail JSON validation.
HAR360SYS_DIR = "/content/drive/MyDrive/Harmony360/Har360sys"
REGISTRY_FILE = os.path.join(HAR360SYS_DIR, "registry", "har360_file_registry.json")
QUARANTINE_DIR = os.path.join(HAR360SYS_DIR, "quarantine")

# Ensure quarantine folder exists
Path(QUARANTINE_DIR).mkdir(parents=True, exist_ok=True)

# === LOAD REGISTRY ===
if not os.path.exists(REGISTRY_FILE):
    raise FileNotFoundError(f"Registry not found: {REGISTRY_FILE}")

with open(REGISTRY_FILE, "r", encoding="utf-8") as f:
    registry = json.load(f)

logger.info(f"📦 Loaded registry → {len(registry['files'])} total files.")

# === VALIDATE & QUARANTINE ===
validated, quarantined = 0, 0

for entry in registry["files"]:
    file_path = entry["path"]
    file_name = entry["name"]

    # Skip if already quarantined
    if entry.get("status") == "quarantined":
        continue

    # A file that no longer exists cannot be validated or moved; record that
    # in the registry instead of attempting (and failing) a quarantine move.
    if not os.path.exists(file_path):
        entry["status"] = "missing"
        entry["missing_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        logger.warning(f"⚠ Missing file (marked in registry): {file_name}")
        continue

    try:
        # Validate JSON
        with open(file_path, "r", encoding="utf-8") as f:
            json.load(f)

    except Exception as e:
        # Quarantine the file
        try:
            quarantine_path = os.path.join(
                QUARANTINE_DIR,
                f"{Path(file_name).stem}_quarantined_{datetime.now().strftime('%Y%m%d_%H%M%S')}.har360"
            )
            shutil.move(file_path, quarantine_path)
            entry["status"] = "quarantined"
            entry["quarantined_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            entry["error"] = str(e)
            quarantined += 1
            logger.error(f"❌ Quarantined: {file_name} → {quarantine_path} | Error: {e}")

        except Exception as move_err:
            # Keep going so the registry is still saved; the entry stays
            # unflagged and is retried on the next run.
            logger.error(f"⚠ Failed to move {file_name} to quarantine: {move_err}")

    else:
        validated += 1
        # The file is present and valid again, so an old "missing" flag is stale.
        if entry.get("status") == "missing":
            entry.pop("status", None)
            entry.pop("missing_at", None)

# === SAVE UPDATED REGISTRY ===
with open(REGISTRY_FILE, "w", encoding="utf-8") as f:
    json.dump(registry, f, indent=4)

logger.success(f"\n🎯 Validation Complete → ✅ {validated} valid | 🛑 {quarantined} quarantined.")
logger.info(f"📄 Updated registry saved at: {REGISTRY_FILE}")

In [None]:
from datasets import Dataset
from loguru import logger
import json

import os  # this cell's own import lines omit os, but os.path is used below

# Har360sys root and its file registry.
HAR360SYS_DIR = "/content/drive/MyDrive/Harmony360/Har360sys"
REGISTRY_FILE = os.path.join(HAR360SYS_DIR, "registry", "har360_file_registry.json")

logger.info("🚀 Starting Harmony360 Training Loader (CLEAN MODE)")

# Load registry
with open(REGISTRY_FILE, "r", encoding="utf-8") as f:
    registry = json.load(f)

# Use only entries that are neither quarantined nor missing and that still
# exist on disk; anything else would just fail to open below.
valid_files = [
    entry for entry in registry["files"]
    if entry.get("status") not in ("quarantined", "missing")
    and os.path.exists(entry["path"])
]

logger.info(f"📦 Loaded {len(valid_files)} valid files for training.")

# One record per file: the parsed JSON re-serialized as a single "text" string.
records = []
for entry in valid_files:
    try:
        with open(entry["path"], "r", encoding="utf-8") as f:
            data = json.load(f)
            records.append({"text": json.dumps(data)})
    except Exception as e:
        logger.error(f"⚠ Unexpected issue loading {entry['name']} → {e}")

dataset = Dataset.from_list(records)

logger.success(f"🎯 Dataset Ready → {len(records)} training samples.")
# Indexing an empty dataset raises, so only preview when there is a sample.
if len(dataset) > 0:
    logger.info(f"🔍 Preview Sample: {dataset[0]}")
else:
    logger.warning("Dataset is empty – please check files.")

In [None]:
import os, json
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from loguru import logger

# ==== CONFIG ====
MODEL_NAME = "gpt2"  # base model (can later switch to Harmony360-gen1)
DATASET_PATH = "/content/drive/MyDrive/Harmony360/har360_output"
OUTPUT_DIR = "/content/drive/MyDrive/Harmony360/models/harmony360-clean-v1"

# ==== LOAD CLEAN FILES ====
# Walk DATASET_PATH recursively and keep the "content" field of every
# .har360 file whose content is non-blank.
valid_texts = []
for current_dir, _subdirs, filenames in os.walk(DATASET_PATH):
    for filename in filenames:
        if not filename.endswith(".har360"):
            continue
        try:
            with open(os.path.join(current_dir, filename), "r", encoding="utf-8") as fh:
                payload = json.load(fh)
            text = payload.get("content", "")
            if text.strip():
                valid_texts.append({"text": text})
        except Exception as e:
            logger.warning(f"⚠ Skipped (should not happen, clean set): {filename} → {e}")

logger.info(f"✅ Loaded {len(valid_texts)} clean training samples.")

# ==== CONVERT TO DATASET ====
dataset = Dataset.from_list(valid_texts)
logger.success(f"🎯 Dataset Ready → {len(dataset)} samples")

# ==== TOKENIZER & MODEL ====
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
# The end-of-sequence token doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

# Tokenize the dataset
def tokenize(batch):
    """Tokenize text into fixed-length causal-LM training features.

    Uses the module-level ``tokenizer`` to pad/truncate every text to 512
    tokens and adds the ``labels`` field the Trainer needs to obtain a loss
    from the model; without labels no loss is returned and training cannot
    start.

    Args:
        batch: Mapping with a ``"text"`` entry, either a list of strings (as
            passed by ``Dataset.map(..., batched=True)``) or a single string.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Labels equal
        ``input_ids`` on real tokens and ``-100`` on padding positions.
    """
    tokens = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)

    def _mask_padding(ids, mask):
        # -100 is the label value the loss ignores, so padding added up to
        # max_length does not contribute to training.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    if isinstance(batch["text"], str):
        # Un-batched call: input_ids is one flat list.
        tokens["labels"] = _mask_padding(tokens["input_ids"], tokens["attention_mask"])
    else:
        tokens["labels"] = [
            _mask_padding(ids, mask)
            for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
        ]
    return tokens

import torch  # only needed to detect whether a CUDA device is present

# Tokenize the whole dataset in batches and drop the raw "text" column.
tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])

# ==== TRAINING ARGS ====
# No evaluation argument is passed: no eval dataset is given to the Trainer,
# "no" is already the default, and the old `evaluation_strategy` keyword was
# renamed to `eval_strategy` and is rejected by current transformers releases.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    per_device_train_batch_size=2,  # keep small for Colab RAM
    num_train_epochs=3,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=20,
    save_strategy="epoch",
    report_to="none",  # no external experiment trackers, same as the v3 training cell
    warmup_steps=50,
    weight_decay=0.01,
    # Mixed precision only when a GPU is attached; requesting fp16 on a
    # CPU-only runtime makes TrainingArguments raise instead of training.
    fp16=torch.cuda.is_available(),
    push_to_hub=False
)

# ==== TRAINER ====
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

# ==== START TRAINING ====
logger.info("🚀 Starting Harmony360 Clean Model Training (v1)...")
trainer.train()

# ==== SAVE FINAL MODEL ====
# Weights and tokenizer go into the same folder so it can be reloaded as a unit.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
logger.success(f"🎯 Training Complete → Model saved at: {OUTPUT_DIR}")

In [None]:







def load_har360_files(dir_path):
    """Load the ``.har360`` JSON files directly inside *dir_path* into a Dataset.

    Only the top level of *dir_path* is listed (no recursion). Each file is
    parsed as JSON and re-serialized into a single ``"text"`` string. Empty,
    undecodable and unreadable files are logged and skipped, so one bad file
    does not abort the whole load.

    Args:
        dir_path: Directory to scan.

    Returns:
        A ``Dataset`` built from one ``{"text": ...}`` record per loaded file.

    Raises:
        OSError: If *dir_path* itself cannot be listed.
    """
    records = []
    for filename in os.listdir(dir_path):
        if not filename.endswith(".har360"):
            continue

        file_path = os.path.join(dir_path, filename)
        try:
            if os.stat(file_path).st_size == 0:
                logger.warning(f"🚫 Skipping empty file: {filename}")
                continue

            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            # Invalid JSON, or bytes that are not valid UTF-8.
            logger.error(f"❌ JSON decode error in {filename}: {e}")
            continue

        except OSError as e:
            # E.g. a directory named *.har360 or a file that vanished or
            # cannot be read.
            logger.error(f"❌ Could not read {filename}: {e}")
            continue

        records.append({"text": json.dumps(data)})
    return Dataset.from_list(records)