# Digital Twin v5.3: The Hybrid Ensemble (Definitive Retraining)

**Author:** Kian Mansouri Jamshidi
**Project Director:** Kian Mansouri Jamshidi
**Date:** 2025-09-27

## Objective
This notebook creates the definitive v5.3 Hybrid model. It uses the centralized `feature_extractor` utility to guarantee perfect consistency between the training environment and the production `PerformanceTitan`. It combines real telemetry data with real AST features to produce the final, scientifically valid, state-of-the-art model.

### 1. Imports and Setup

In [1]:
import pandas as pd
import numpy as np
import glob
import joblib
from pathlib import Path
import copy
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from lightgbm import LGBMRegressor
from pycparser import c_ast

# --- Path Setup & Import Single Source of Truth ---
# The parent of the current working directory is treated as the project root,
# so this cell must be run from a direct sub-directory of the project.
PROJECT_ROOT = Path('.').resolve().parent
# Guarded so that re-running the cell does not keep appending duplicates.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))
from cosmos.parser import parser
from cosmos.foundry.feature_extractor import AstFeatureVisitor, create_time_series_features, AST_FEATURES_NAMES

TELEMETRY_DIR = PROJECT_ROOT / 'data' / 'telemetry_v2'
GENOME_DIR = PROJECT_ROOT / 'data' / 'genomes' / 'cronos'
ARTIFACT_DIR = PROJECT_ROOT / 'artifacts' / 'phase2'
ENSEMBLE_DIR = ARTIFACT_DIR / 'digital_twin_v5.3_hybrid_ensemble'
# parents=True: on a fresh checkout 'artifacts/phase2' may not exist yet, and
# without it mkdir() raises FileNotFoundError.
ENSEMBLE_DIR.mkdir(parents=True, exist_ok=True)
print(f"Artifacts will be saved to: {ENSEMBLE_DIR}")

Artifacts will be saved to: /home/kian/Desktop/ForgeX4-COSMOS-Omega/artifacts/phase2/digital_twin_v5.3_hybrid_ensemble


### 2. Definitive Hybrid Data Generation

In [2]:
# 1. Extract REAL AST features using the centralized utility
ast_low = parser.parse_c_file_to_ast(str(GENOME_DIR / 'cronos_v1.0.c'))
ast_high = parser.parse_c_file_to_ast(str(GENOME_DIR / 'cronos_heavy_compute.c'))
# Use a fresh visitor per genome instead of re-invoking __init__() on a used
# instance: a manual __init__() call only resets whatever the constructor
# happens to reassign, so any other accumulated state could bleed from the
# first extraction into the second.
features_low_dict = AstFeatureVisitor().extract(ast_low)
# `visitor` stays bound to the most recently used instance, as before.
visitor = AstFeatureVisitor()
features_high_dict = visitor.extract(ast_high)
print("AST features extracted.")

# 2. Load REAL telemetry and create the final hybrid dataframe
# Path.glob() yields entries in arbitrary (filesystem-dependent) order. Sorting
# fixes the row order of the concatenated frame, so the seeded splits that
# follow select the same rows on every machine and every run.
parquet_files = sorted(TELEMETRY_DIR.glob('*.parquet'))
if not parquet_files:
    # Fail with an actionable message rather than pandas' generic
    # "No objects to concatenate" (also a ValueError).
    raise ValueError(f"No telemetry parquet files found in {TELEMETRY_DIR}")

df_list = []
for parquet_file in parquet_files:
    df_workload = pd.read_parquet(parquet_file)
    # Create time-series features using the centralized utility
    df_ts_features = create_time_series_features(df_workload)

    # The first row's label is taken as the label for the whole file
    # (assumes one workload type per parquet file -- TODO confirm).
    workload_type = df_workload['workload_type'].iloc[0]
    # Workloads whose label contains 'cpu_bound' get the heavy-compute genome's
    # AST features; every other workload gets the v1.0 genome's features.
    ast_features_to_assign = features_high_dict if 'cpu_bound' in workload_type else features_low_dict
    # Each AST feature is a per-file constant broadcast to every row.
    for feature_name, value in ast_features_to_assign.items():
        df_ts_features[feature_name] = value
    df_list.append(df_ts_features)

# Drop the label column and zero-fill any remaining missing values.
df_final = pd.concat(df_list, ignore_index=True).drop(columns=['workload_type']).fillna(0)
print(f"Final hybrid training dataframe created. Shape: {df_final.shape}")

# 3. Split data for training
target = 'cpu_util_core_0'
# Everything except the target and the raw timestamp is a model input.
excluded_columns = (target, 'timestamp')
features = [col for col in df_final.columns if col not in excluded_columns]
X, y = df_final[features], df_final[target]

# 80/20 hold-out split with a fixed seed.
# NOTE(review): this is a shuffled split over rows that carry time-series
# features; if those features include lags/rolling windows, neighbouring rows
# can land on both sides of the split -- TODO confirm this is acceptable.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The training portion is halved into folds A and B (by index label).
train_full_indices = X_train_full.index
fold_A_indices, fold_B_indices = train_test_split(
    train_full_indices, test_size=0.5, random_state=42
)
print(f"Data prepared. X_train shape: {X_train_full.shape}")

AST features extracted.


  return df_f.fillna(0)
  return df_f.fillna(0)
  return df_f.fillna(0)
  return df_f.fillna(0)
  return df_f.fillna(0)
  return df_f.fillna(0)
  return df_f.fillna(0)


Final hybrid training dataframe created. Shape: (6928, 69)
Data prepared. X_train shape: (5542, 67)


### 3. Training the Definitive Hybrid Ensemble

In [3]:
# Level-0 learners: two LightGBM configurations, a random forest and an
# extra-trees regressor, all seeded.
base_models = [
    LGBMRegressor(random_state=42),
    RandomForestRegressor(random_state=42, n_jobs=-1),
    ExtraTreesRegressor(random_state=42, n_jobs=-1),
    LGBMRegressor(n_estimators=200, random_state=123),
]
print("--- Training Base Models ---")

# Materialise the two folds once so each comprehension below reads cleanly.
X_fold_A, y_fold_A = X_train_full.loc[fold_A_indices], y_train_full.loc[fold_A_indices]
X_fold_B, y_fold_B = X_train_full.loc[fold_B_indices], y_train_full.loc[fold_B_indices]

# Cross-fitting: models fitted on one fold predict the other, so every training
# row gets a prediction from models that never saw it. Each fit works on a deep
# copy, leaving the templates in `base_models` unfitted.
trained_on_A = [copy.deepcopy(model).fit(X_fold_A, y_fold_A) for model in base_models]
preds_on_B = np.column_stack([model.predict(X_fold_B) for model in trained_on_A])
trained_on_B = [copy.deepcopy(model).fit(X_fold_B, y_fold_B) for model in base_models]
preds_on_A = np.column_stack([model.predict(X_fold_A) for model in trained_on_B])
print("Base models trained.")

print("\n--- Creating Augmented Meta-Features ---")
# Row order is fold A followed by fold B; the same index sequence selects the
# original features and the targets so they stay aligned with the stacked
# out-of-fold predictions.
stacked_indices = np.concatenate([fold_A_indices, fold_B_indices])
X_meta_train_preds = np.vstack([preds_on_A, preds_on_B])
X_meta_train_orig = X_train_full.loc[stacked_indices].values
# Meta-features = original features, then one column per base model prediction.
X_meta_train = np.hstack([X_meta_train_orig, X_meta_train_preds])
y_meta_train = y_train_full.loc[stacked_indices]
print(f"Meta-training set shape: {X_meta_train.shape}")

print("\n--- Training Meta-Model ---")
# fit() returns the estimator itself, so construction and fitting can be chained.
meta_model = RidgeCV().fit(X_meta_train, y_meta_train)
print("Meta-model trained.")

print("\n--- Training Final Base Models for Deployment ---")
# Refit a deep copy of every base model on the whole training portion.
final_base_models = []
for base_model in base_models:
    final_base_models.append(copy.deepcopy(base_model).fit(X_train_full, y_train_full))
print("Final base models trained.")

print("\n--- Evaluating on Test Set ---")
# Build the test meta-features with the same column layout used for training:
# original features first, then one prediction column per base model.
base_test_preds = np.column_stack(
    [fitted_model.predict(X_test) for fitted_model in final_base_models]
)
X_meta_test = np.hstack([X_test.values, base_test_preds])
final_predictions = meta_model.predict(X_meta_test)
r2 = r2_score(y_test, final_predictions)
print(f"\n--- Final v5.3 HYBRID Ensemble Performance ---")
print(f"R-squared (R²): {r2:.4f}")

print("\n--- Saving Artifacts ---")
# Filename -> fitted estimator, in the order the files are written
# (base models A-D, then the meta-model).
artifacts_to_save = {
    'base_model_A.joblib': final_base_models[0],
    'base_model_B.joblib': final_base_models[1],
    'base_model_C.joblib': final_base_models[2],
    'base_model_D.joblib': final_base_models[3],
    'meta_model.joblib': meta_model,
}
for artifact_filename, fitted_estimator in artifacts_to_save.items():
    joblib.dump(fitted_estimator, ENSEMBLE_DIR / artifact_filename)
print(f"All 5 model artifacts for the v5.3 Hybrid Ensemble saved to: {ENSEMBLE_DIR}")

--- Training Base Models ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.209446 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2031
[LightGBM] [Info] Number of data points in the train set: 2771, number of used features: 58
[LightGBM] [Info] Start training from score 4.888488
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000805 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2031
[LightGBM] [Info] Number of data points in the train set: 2771, number of used features: 58
[LightGBM] [Info] Start training from score 4.888488
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000601 seconds.
You can set `force_row_wise=true` to remove the overhead.