# Digital Twin v6.0: The Lexical Hybrid Ensemble (Definitive)

**NOTE:** This is V2 of the notebook. It includes a diagnostic cell to solve any `ModuleNotFoundError` issues.

### 1. Diagnostic & Path Validation

**Instructions:** Run this cell first. It will validate the project paths and ensure the necessary files are visible to the Jupyter kernel. If this cell succeeds, you can proceed to run the rest of the notebook.

In [3]:
import sys
from pathlib import Path
import os

# --- Definitive Path Validation ---
PROJECT_ROOT = Path('.').resolve().parent
# Insert at the front of the path to guarantee it's checked first
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print(f"Calculated Project Root: {PROJECT_ROOT}")
print(f"Python Executable:       {sys.executable}")

# --- Check for the existence of the critical file ---
foundry_dir = PROJECT_ROOT / 'cosmos' / 'foundry'
target_file = foundry_dir / 'feature_extractor_lexical.py'
print(f"--- Checking for Target File ---")
print(f"Checking in directory: {foundry_dir}")
print(f"Target file to import: {target_file}")
print(f"Target file exists:    {target_file.exists()}")

print("--- Contents of cosmos/foundry/ ---")
try:
    print(os.listdir(foundry_dir))
    print("\n[SUCCESS] Paths appear correct. You may now proceed.")
except FileNotFoundError:
    print("[ERROR] The 'cosmos/foundry' directory was not found!")

Calculated Project Root: /home/kian/Desktop/Clean_Projects/ForgeX4-COSMOS-Omega
Python Executable:       /home/kian/miniconda3/envs/cosmos/bin/python3.10
--- Checking for Target File ---
Checking in directory: /home/kian/Desktop/Clean_Projects/ForgeX4-COSMOS-Omega/cosmos/foundry
Target file to import: /home/kian/Desktop/Clean_Projects/ForgeX4-COSMOS-Omega/cosmos/foundry/feature_extractor_lexical.py
Target file exists:    False
--- Contents of cosmos/foundry/ ---
['titans.py', 'eature_extractor_lexical.py', 'foundry.py', 'titans_sentinel.py', 'mutations', 'foundry_sentinel.py', '__pycache__', 'feature_extractor.py', 'uranus_evolver.py', 'fitness.py']

[SUCCESS] Paths appear correct. You may now proceed.


### 2. Imports and Setup

In [5]:
import pandas as pd
import numpy as np
import glob
import joblib
import copy
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from lightgbm import LGBMRegressor

# --- THE DEFINITIVE FIX: Import the new, robust lexical extractor ---
from cosmos.foundry.feature_extractor_lexical import extract_lexical_features
from cosmos.foundry.feature_extractor import create_time_series_features # We still use this for dynamic features

TELEMETRY_DIR = PROJECT_ROOT / 'data' / 'telemetry_v2'
TARGET_FILE_PATH = PROJECT_ROOT / 'data' / 'genomes' / 'cjson' / 'cJSON.c'
ARTIFACT_DIR = PROJECT_ROOT / 'artifacts' / 'phase2'

ENSEMBLE_DIR = ARTIFACT_DIR / 'digital_twin_v6.0_lexical_ensemble'
ENSEMBLE_DIR.mkdir(exist_ok=True)
print(f"Definitive v6.0 model artifacts will be saved to: {ENSEMBLE_DIR}")

Definitive v6.0 model artifacts will be saved to: /home/kian/Desktop/Clean_Projects/ForgeX4-COSMOS-Omega/artifacts/phase2/digital_twin_v6.0_lexical_ensemble


### 3. Definitive Lexical Hybrid Data Generation

In [6]:
print(f"Generating lexical fingerprint for {TARGET_FILE_PATH.name}...")
with open(TARGET_FILE_PATH, 'r') as f:
    cjson_source_code = f.read()
static_lexical_features = extract_lexical_features(cjson_source_code)
print("Lexical features extracted successfully.")

df_list = []
print(f"Loading telemetry data from: {TELEMETRY_DIR}")
for parquet_file in TELEMETRY_DIR.glob('*.parquet'):
    df_workload = pd.read_parquet(parquet_file)
    df_ts_features = create_time_series_features(df_workload)
    
    for feature_name, value in static_lexical_features.items():
        df_ts_features[feature_name] = value
    df_list.append(df_ts_features)

df_final = pd.concat(df_list, ignore_index=True).drop(columns=['workload_type'], errors='ignore').fillna(0)
print(f"Final lexical hybrid training dataframe created. Shape: {df_final.shape}")

target = 'cpu_util_core_0'
features = [col for col in df_final.columns if col != target and col != 'timestamp']
X = df_final[features]
y = df_final[target]

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_full_indices = X_train_full.index
fold_A_indices, fold_B_indices = train_test_split(train_full_indices, test_size=0.5, random_state=42)
print(f"Data prepared. X_train shape: {X_train_full.shape}")

Generating lexical fingerprint for cJSON.c...
Lexical features extracted successfully.
Loading telemetry data from: /home/kian/Desktop/Clean_Projects/ForgeX4-COSMOS-Omega/data/telemetry_v2


  return df_f.fillna(0)
  return df_f.fillna(0)
  return df_f.fillna(0)
  return df_f.fillna(0)
  return df_f.fillna(0)
  return df_f.fillna(0)
  return df_f.fillna(0)


Final lexical hybrid training dataframe created. Shape: (6928, 76)
Data prepared. X_train shape: (5542, 74)


### 4. Training the Definitive v6.0 Lexical Ensemble

In [None]:
base_models = [
    LGBMRegressor(random_state=42),
    RandomForestRegressor(random_state=42, n_jobs=-1),
    ExtraTreesRegressor(random_state=42, n_jobs=-1),
    LGBMRegressor(n_estimators=200, random_state=123)
]
print("--- Training Base Models for Stacking ---")
trained_on_A = [copy.deepcopy(m).fit(X_train_full.loc[fold_A_indices], y_train_full.loc[fold_A_indices]) for m in base_models]
preds_on_B = np.array([m.predict(X_train_full.loc[fold_B_indices]) for m in trained_on_A]).T
trained_on_B = [copy.deepcopy(m).fit(X_train_full.loc[fold_B_indices], y_train_full.loc[fold_B_indices]) for m in base_models]
preds_on_A = np.array([m.predict(X_train_full.loc[fold_A_indices]) for m in trained_on_B]).T
print("Base models trained.")

print("\n--- Creating Augmented Meta-Features ---")
X_meta_train_preds = np.vstack([preds_on_A, preds_on_B])
X_meta_train_orig = X_train_full.loc[np.concatenate([fold_A_indices, fold_B_indices])].values
X_meta_train = np.hstack([X_meta_train_orig, X_meta_train_preds])
y_meta_train = y_train_full.loc[np.concatenate([fold_A_indices, fold_B_indices])]
print(f"Meta-training set shape: {X_meta_train.shape}")

print("\n--- Training Meta-Model ---")
meta_model = RidgeCV()
meta_model.fit(X_meta_train, y_meta_train)
print("Meta-model trained.")

print("\n--- Training Final Base Models for Deployment ---")
final_base_models = [copy.deepcopy(m).fit(X_train_full, y_train_full) for m in base_models]
print("Final base models trained.")

print("\n--- Evaluating on Test Set ---")
base_test_preds = np.array([m.predict(X_test) for m in final_base_models]).T
X_meta_test = np.hstack([X_test.values, base_test_preds])
final_predictions = meta_model.predict(X_meta_test)
r2 = r2_score(y_test, final_predictions)
print(f"\n--- Final v6.0 LEXICAL Hybrid Ensemble Performance ---")
print(f"R-squared (R²): {r2:.4f}")

print("\n--- Saving Artifacts ---")
joblib.dump(final_base_models[0], ENSEMBLE_DIR / 'base_model_A.joblib')
joblib.dump(final_base_models[1], ENSEMBLE_DIR / 'base_model_B.joblib')
joblib.dump(final_base_models[2], ENSEMBLE_DIR / 'base_model_C.joblib')
joblib.dump(final_base_models[3], ENSEMBLE_DIR / 'base_model_D.joblib')
joblib.dump(meta_model, ENSEMBLE_DIR / 'meta_model.joblib')
print(f"All 5 model artifacts for the v6.0 Lexical Hybrid Ensemble saved to: {ENSEMBLE_DIR}")