# world_model_v7 (with XGBoost)

## 1.1 Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge # Keep for comparison or if needed later
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
import joblib
import os
import random

In [2]:
# Seed Python's and NumPy's global RNGs so steps that draw from them are repeatable.
random.seed(42)
np.random.seed(42)
# XGBoost is not seeded here: the estimators built later in this notebook
# (XGBRegressor, Ridge) and the train/test split receive random_state=42 directly.

## 1.2 Load Dataset

In [3]:
def load_data(filepath="../dataset/dataset_v5.txt"):
    """Read the dataset file with pandas and discard rows containing NaNs.

    :param filepath: Path of the CSV-formatted dataset file.
    :return: The NaN-free DataFrame, or None when the file is missing or
             any other error occurs while loading (the reason is printed).
    """
    result = None
    try:
        loaded = pd.read_csv(filepath)
        print(f"Dataset loaded successfully. Shape: {loaded.shape}")
        cleaned = loaded.dropna()
        print(f"Shape after dropping NaNs: {cleaned.shape}")
        # Only publish the frame once every step above has succeeded.
        result = cleaned
    except FileNotFoundError:
        print(f"Error: Dataset file not found at {filepath}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
    return result

In [4]:
# 1. Load the dataset from load_data()'s default path.
# `dataframe` is None when loading failed; the following cells check for that.
dataframe = load_data()

Dataset loaded successfully. Shape: (3276, 14)
Shape after dropping NaNs: (3276, 14)


In [5]:
# Print the column / dtype / non-null summary, unless loading failed.
if dataframe is None:
    print("DataFrame is None, skipping info().")
else:
    dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   distance_red_init     3276 non-null   float64
 1   angle_red_init        3276 non-null   float64
 2   distance_green_init   3276 non-null   float64
 3   angle_green_init      3276 non-null   float64
 4   distance_blue_init    3276 non-null   float64
 5   angle_blue_init       3276 non-null   float64
 6   rSpeed                3276 non-null   int64  
 7   lSpeed                3276 non-null   int64  
 8   distance_red_final    3276 non-null   float64
 9   angle_red_final       3276 non-null   float64
 10  distance_green_final  3276 non-null   float64
 11  angle_green_final     3276 non-null   float64
 12  distance_blue_final   3276 non-null   float64
 13  angle_blue_final      3276 non-null   float64
dtypes: float64(12), int64(2)
memory usage: 358.4 KB


## 1.3 Preprocess Dataset

In [6]:
def add_lagged_features(df, lag=1, columns_to_lag=None):
    """
    Append previous-row copies of selected columns as extra features.

    For every step s in 1..lag, each selected column `c` gains a companion
    column `c_lag<s>` holding the value of `c` from s rows earlier. Rows that
    contain NaN afterwards (including the leading rows that have no
    predecessor) are dropped and the index is renumbered from 0.

    :param df: Original DataFrame (left unmodified); None is passed through
    :param lag: Number of lagged steps to include
    :param columns_to_lag: Columns to lag; every column of `df` when None
    :return: New DataFrame with the lagged columns appended, or None if `df` is None
    """
    if df is None:
        print("DataFrame is None, skipping lagged feature creation.")
        return None

    if columns_to_lag is None:
        columns_to_lag = df.columns  # Default to all columns if none specified

    # One shifted, renamed block per lag step; all blocks are joined in a single concat.
    shifted_blocks = [
        df[columns_to_lag]
        .shift(step)
        .rename(columns={name: f"{name}_lag{step}" for name in columns_to_lag})
        for step in range(1, lag + 1)
    ]
    combined = pd.concat([df.copy(), *shifted_blocks], axis=1)

    # Shifting leaves NaNs in the first rows; remove them and renumber.
    lagged_df = combined.dropna().reset_index(drop=True)
    print(f"Lagged features added for columns {columns_to_lag} with lag={lag}. New shape: {lagged_df.shape}")
    return lagged_df

In [7]:
def prepare_data(df):
    """
    Split a DataFrame into a feature matrix and a target matrix.

    The six `*_final` columns are the targets; every other column of `df`
    (in its existing order) is used as a feature.

    :param df: DataFrame with lagged features, or None
    :return: (X, Y) as NumPy arrays, or (None, None) when `df` is None
    """
    if df is None:
        return None, None

    target_columns = [
        "distance_red_final", "angle_red_final",
        "distance_green_final", "angle_green_final",
        "distance_blue_final", "angle_blue_final"
    ]

    # Features are "whatever is not a target", so lagged columns are picked up automatically.
    excluded = set(target_columns)
    feature_columns = [name for name in df.columns if name not in excluded]

    X = df[feature_columns].to_numpy()
    Y = df[target_columns].to_numpy()

    print(f"Features (X) shape: {X.shape}")
    print(f"Targets (Y) shape: {Y.shape}")
    return X, Y

In [8]:
def split_data(X, Y, test_size=0.2, random_state=42):
    """Partition X and Y into training and testing subsets.

    :param X: Feature array, or None
    :param Y: Target array, or None
    :param test_size: Forwarded to train_test_split
    :param random_state: Forwarded to train_test_split
    :return: (X_train, X_test, Y_train, Y_test); four Nones if X or Y is None
    """
    if X is None or Y is None:
        return None, None, None, None
    parts = train_test_split(X, Y, test_size=test_size, random_state=random_state)
    X_train, X_test, Y_train, Y_test = parts
    for label, subset in (("Training", X_train), ("Testing", X_test)):
        print(f"{label} set size: {subset.shape[0]} samples")
    return X_train, X_test, Y_train, Y_test

In [9]:
# 2. Prepare Data
columns_to_lag = [
    "distance_red_init", "angle_red_init",
    "distance_green_init", "angle_green_init",
    "distance_blue_init", "angle_blue_init",
    "rSpeed", "lSpeed"
]

if dataframe is not None:
    dataframe = add_lagged_features(dataframe, lag=1, columns_to_lag=columns_to_lag)  # Add lagged features
    display(dataframe.info())
    X, Y = prepare_data(dataframe)
else:
    X, Y = None, None
    print("Skipping data preparation as dataframe is None.")

Lagged features added for columns ['distance_red_init', 'angle_red_init', 'distance_green_init', 'angle_green_init', 'distance_blue_init', 'angle_blue_init', 'rSpeed', 'lSpeed'] with lag=1. New shape: (3275, 22)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3275 entries, 0 to 3274
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   distance_red_init         3275 non-null   float64
 1   angle_red_init            3275 non-null   float64
 2   distance_green_init       3275 non-null   float64
 3   angle_green_init          3275 non-null   float64
 4   distance_blue_init        3275 non-null   float64
 5   angle_blue_init           3275 non-null   float64
 6   rSpeed                    3275 non-null   int64  
 7   lSpeed                    3275 non-null   int64  
 8   distance_red_final        3275 non-null   float64
 9   angle_red_final           3275 non-null   float64
 10  distance_green_f

None

Features (X) shape: (3275, 16)
Targets (Y) shape: (3275, 6)


In [10]:
def scale_features(X_train, X_test):
    """Standardise the features with a StandardScaler fitted on the training set.

    :param X_train: Training feature array, or None
    :param X_test: Testing feature array, or None
    :return: (X_train_scaled, X_test_scaled, scaler); three Nones if either
             input is None. The fitted scaler is returned so it can be saved.
    """
    if X_train is None or X_test is None:
        return None, None, None
    # The scaler learns its statistics from the training data only;
    # the test data is merely transformed with them.
    scaler = StandardScaler().fit(X_train)
    scaled_train = scaler.transform(X_train)
    scaled_test = scaler.transform(X_test)
    print("Features scaled.")
    return scaled_train, scaled_test, scaler

In [11]:
# 3. Split Data
if X is not None and Y is not None:
    X_train, X_test, Y_train, Y_test = split_data(X, Y)
    # 4. Scale Features (Important!)
    X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test)
else:
    X_train, X_test, Y_train, Y_test = None, None, None, None
    X_train_scaled, X_test_scaled, scaler = None, None, None
    print("Skipping data splitting and scaling as X or Y is None.")

Training set size: 2620 samples
Testing set size: 655 samples
Features scaled.


## 1.4 Train model

### Lazy Predict

In [None]:
# Imported here rather than in the imports cell: lazypredict is only needed
# for this optional exploratory comparison.
from lazypredict.Supervised import LazyRegressor
import time

# --- Quick Model Comparison with LazyPredict ---

print("\n--- Running LazyPredict ---")

# Initialize LazyRegressor for regression tasks
# NOTE(review): assumed to cope with the 6-column target matrix — confirm against
# the lazypredict version in use.
reg = LazyRegressor(
    verbose=0,
    ignore_warnings=True, # Suppress convergence/other warnings for a cleaner overview
    custom_metric=None,   # Use the library's default metrics
    predictions=False,     # We don't need the raw predictions for this overview
    random_state=42       # For reproducibility of models that use randomness
)

start_time_lazy = time.time()

# Fit and evaluate all models using the SCALED data.
# All four arrays are handed to reg.fit(), so all four must exist and be non-None;
# globals().get() also covers the case where the split/scale cell has not been run.
if all(
    globals().get(name) is not None
    for name in ("X_train_scaled", "X_test_scaled", "Y_train", "Y_test")
):
    try:
        models, predictions = reg.fit(X_train_scaled, X_test_scaled, Y_train, Y_test)
        end_time_lazy = time.time()
        print(f"LazyPredict finished in {end_time_lazy - start_time_lazy:.2f} seconds.")

        # Display the results DataFrame, sorted by a relevant metric
        print("\nLazyPredict Results (Sorted by RMSE):")
        print(models.sort_values(by='RMSE', ascending=True)) # Lower RMSE is better

        # You could also sort by R-Squared (higher is better):
        # print("\nLazyPredict Results (Sorted by R-Squared):")
        # print(models.sort_values(by='R-Squared', ascending=False))

    except Exception as e:
         # Best-effort exploratory step: report the failure and let the notebook continue.
         print(f"\nAn error occurred during LazyPredict execution: {e}")
         print("Ensure input data shapes and types are correct.")

else:
    print("Skipping LazyPredict because scaled data (X_train_scaled, etc.) is not available.")

### Original Ridge Regression (for reference)

In [12]:
def train_ridge_regression(X_train, Y_train):
    """Fit a Ridge regressor, choosing alpha and solver with a 5-fold GridSearchCV.

    :param X_train: Training features, or None
    :param Y_train: Training targets, or None
    :return: The best estimator found by the search, or None when either
             input is None
    """
    if X_train is None or Y_train is None:
        print("Skipping Ridge training as data is None.")
        return None

    print("Training Ridge Regression model with GridSearchCV...")

    alphas = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0, 1000000.0]
    solvers = ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']

    # Candidates are ranked by negative MSE (higher is better).
    search = GridSearchCV(
        estimator=Ridge(random_state=42),
        param_grid={'alpha': alphas, 'solver': solvers},
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, Y_train)

    print("\nGridSearchCV Complete for Ridge.")
    print(f"Best parameters found: {search.best_params_}")
    print(f"Best cross-validation score (negative MSE): {search.best_score_:.4f}")
    return search.best_estimator_

# # Example usage for Ridge (commented out if focusing on XGBoost)
# if X_train_scaled is not None and Y_train is not None:
#     print("\n--- Training Ridge Model ---")
#     world_model_ridge = train_ridge_regression(X_train_scaled, Y_train)
# else:
#     world_model_ridge = None

### XGBoost Regressor

In [14]:
def train_xgboost_regression(X_train, Y_train):
    """Fit an XGBoost regressor, tuning its hyperparameters with a 5-fold GridSearchCV.

    :param X_train: Training features, or None
    :param Y_train: Training targets, or None
    :return: The best estimator found by the search (refit by GridSearchCV),
             or None when either input is None
    """
    if X_train is None or Y_train is None:
        print("Skipping XGBoost training as data is None.")
        return None

    print("Training XGBoost Regressor model with GridSearchCV...")

    # Define the parameter grid for XGBoost
    # This is a starting grid. You might want to expand or refine it.
    # For faster initial runs, you can reduce the number of options or use a smaller cv.
    param_grid_xgb = {
        'n_estimators': [100, 200, 300], # Number of boosting rounds
        'learning_rate': [0.01, 0.05, 0.1], # Step size shrinkage
        'max_depth': [3, 5, 7], # Maximum depth of a tree
        'subsample': [0.7, 0.8, 1.0], # Subsample ratio of the training instance
        'colsample_bytree': [0.7, 0.8, 1.0], # Subsample ratio of columns when constructing each tree
        'gamma': [0, 0.1], # Minimum loss reduction required to make a further partition
        # 'reg_alpha': [0, 0.01, 0.1], # L1 regularization
        'reg_lambda': [0.1, 1, 10] # L2 regularization (XGBoost's default is 1 for lambda)
    }

    # Create the XGBoost regressor model
    # objective='reg:squarederror' is common for regression
    # n_jobs=1 on the estimator: the grid search below already runs its fits on all
    # cores, and letting every fit also spawn one thread per core causes thread
    # contention that slows both layers down. Parallelise at one level only.
    # NOTE(review): Y_train has several columns; this relies on the installed
    # XGBoost version accepting multi-output targets — confirm.
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=1)

    # Set up GridSearchCV
    grid_search_xgb = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid_xgb,
        scoring='neg_mean_squared_error', # Optimize for lower MSE
        cv=5, # 5 folds for more robust tuning; lower to 3 for a quicker search
        n_jobs=-1, # Use all available CPU cores for GridSearchCV's parallel fits
        verbose=2 # Higher verbosity to see progress
    )

    # Fit GridSearchCV on the training data
    grid_search_xgb.fit(X_train, Y_train)

    # Print the best parameters and corresponding score
    print("\nGridSearchCV Complete for XGBoost.")
    print(f"Best parameters found: {grid_search_xgb.best_params_}")
    print(f"Best cross-validation score (negative MSE): {grid_search_xgb.best_score_:.4f}")

    # Return the best model found by GridSearchCV
    return grid_search_xgb.best_estimator_

In [15]:
# Train the XGBoost model
# Run the XGBoost grid search when scaled training data is available.
if X_train_scaled is None or Y_train is None:
    world_model_xgb = None
    print("Skipping XGBoost model training call.")
else:
    print("\n--- Training XGBoost Model ---")
    world_model_xgb = train_xgboost_regression(X_train_scaled, Y_train)


--- Training XGBoost Model ---
Training XGBoost Regressor model with GridSearchCV...
Fitting 5 folds for each of 1458 candidates, totalling 7290 fits
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, reg_lambda=0.1, subsample=0.7; total time=   0.2s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, reg_lambda=0.1, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, reg_lambda=0.1, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, reg_lambda=0.1, subsample=0.7; total time=   0.3s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, reg_lambda=0.1, subsample=0.7; total time=   0.2s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, reg_lambda=0.1, subsample=0.7; total time=   0.3s
[CV

## 1.5 Evaluate

In [16]:
def evaluate_model(model, X_test, Y_test, model_name="Model"):
    """Evaluates the model using MAE, MSE, and RMSE.

    Prints the aggregate metrics followed by one MAE line per output column.

    :param model: Fitted estimator exposing ``predict``, or None
    :param X_test: Test features (already scaled if the model expects that), or None
    :param Y_test: True targets, 1-D or (n_samples, n_outputs), or None
    :param model_name: Label used in the printed report
    :return: (mae, mse) over all outputs, or (None, None) when the model or
             the data is None
    """
    if model is None or X_test is None or Y_test is None:
        print(f"Skipping evaluation for {model_name} as model or data is None.")
        return None, None

    Y_pred = model.predict(X_test)

    mae = mean_absolute_error(Y_test, Y_pred)
    mse = mean_squared_error(Y_test, Y_pred)
    rmse = np.sqrt(mse) # Root Mean Squared Error

    print(f"\n--- {model_name} Evaluation ---")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

    # Optional: Print metrics per output feature
    print("\nMAE per output feature:")
    output_features = [
        'dist_red_final', 'angle_red_final', 'dist_green_final',
        'angle_green_final', 'dist_blue_final', 'angle_blue_final'
    ]

    # View both arrays as (n_samples, n_outputs) so the loop follows the real
    # number of outputs. Previously it always indexed six columns, which raised
    # IndexError for fewer outputs and printed the same value six times for 1-D data.
    y_true_2d = np.asarray(Y_test)
    if y_true_2d.ndim == 1:
        y_true_2d = y_true_2d.reshape(-1, 1)
    y_pred_2d = np.asarray(Y_pred)
    if y_pred_2d.ndim == 1:
        y_pred_2d = y_pred_2d.reshape(-1, 1)

    n_outputs = y_true_2d.shape[1]
    if n_outputs == len(output_features):
        names = output_features
    else:
        # The fixed labels only describe the six-output layout; use neutral ones otherwise.
        names = [f"output_{i}" for i in range(n_outputs)]

    for i, name in enumerate(names):
        mae_feature = mean_absolute_error(y_true_2d[:, i], y_pred_2d[:, i])
        print(f"  {name}: {mae_feature:.4f}")

    return mae, mse

In [17]:
# Evaluate the XGBoost model
# Report test-set metrics for the tuned XGBoost model when everything it needs exists.
if world_model_xgb is None or X_test_scaled is None or Y_test is None:
    print("Skipping XGBoost model evaluation.")
else:
    print("\n--- Evaluating XGBoost Model ---")
    xgb_mae, xgb_mse = evaluate_model(world_model_xgb, X_test_scaled, Y_test, model_name="XGBoost")
    if xgb_mae is not None:
        print(f"XGBoost Test RMSE: {np.sqrt(xgb_mse):.4f}")


--- Evaluating XGBoost Model ---

--- XGBoost Evaluation ---
Mean Absolute Error (MAE): 26.5363
Mean Squared Error (MSE): 2226.2277
Root Mean Squared Error (RMSE): 47.1829

MAE per output feature:
  dist_red_final: 14.8164
  angle_red_final: 33.4181
  dist_green_final: 15.9752
  angle_green_final: 40.8843
  dist_blue_final: 16.0813
  angle_blue_final: 38.0424
XGBoost Test RMSE: 47.1829


In [None]:
# Evaluate the XGBoost model (old)
# Same evaluation as the preceding cell, kept for its earlier recorded output.
# Re-running it scores whatever model is currently held in `world_model_xgb`.
inputs_ready = not (world_model_xgb is None or X_test_scaled is None or Y_test is None)
if inputs_ready:
    print("\n--- Evaluating XGBoost Model ---")
    xgb_mae, xgb_mse = evaluate_model(world_model_xgb, X_test_scaled, Y_test, model_name="XGBoost")
    if xgb_mae is not None:
        print(f"XGBoost Test RMSE: {np.sqrt(xgb_mse):.4f}")
else:
    print("Skipping XGBoost model evaluation.")


--- Evaluating XGBoost Model ---

--- XGBoost Evaluation ---
Mean Absolute Error (MAE): 24.9971
Mean Squared Error (MSE): 2175.8930
Root Mean Squared Error (RMSE): 46.6465

MAE per output feature:
  dist_red_final: 14.8359
  angle_red_final: 31.9380
  dist_green_final: 16.4533
  angle_green_final: 38.0901
  dist_blue_final: 15.6444
  angle_blue_final: 33.0206
XGBoost Test RMSE: 46.6465


## 1.6 Example Prediction

In [18]:
# Example prediction (how you'd use it later)
def example_prediction(model, X_test_original, Y_test_original, scaler_obj, model_name="Model"):
    """Print a worked prediction for the first row of the unscaled test set.

    The row is scaled with `scaler_obj`, passed to `model.predict`, and the
    original input, scaled input, actual target and predicted target are printed.

    :param model: Fitted estimator exposing ``predict``
    :param X_test_original: Unscaled test features (first row is used)
    :param Y_test_original: Test targets (first row is used)
    :param scaler_obj: Fitted scaler exposing ``transform``
    :param model_name: Label used in the printed messages
    :return: None
    """
    required = (model, X_test_original, Y_test_original, scaler_obj)
    if any(item is None for item in required):
        print(f"Skipping example prediction for {model_name} as essential components are missing.")
        return

    print(f"\n--- Example Prediction ({model_name}) ---")

    # First test row as a (1, n_features) batch, plus its true target
    first_input = X_test_original[0].reshape(1, -1)
    first_target = Y_test_original[0]

    # Apply the same scaling the model was trained with, then predict
    scaled_input = scaler_obj.transform(first_input)
    predicted = model.predict(scaled_input)

    print(f"Input State + Action (Original): {first_input[0]}")
    print(f"Input State + Action (Scaled):   {scaled_input[0]}")
    print(f"Actual Final State:              {first_target}")
    print(f"Predicted Final State:           {predicted[0]}")

In [19]:
# Example prediction with XGBoost model
# Test the model with an explicit `is not None`, like every other guard in this
# notebook, instead of relying on the estimator object's truthiness.
if world_model_xgb is not None and X_test is not None and Y_test is not None and scaler is not None:
    example_prediction(world_model_xgb, X_test, Y_test, scaler, model_name="XGBoost")
else:
    print("Skipping XGBoost example prediction due to missing components.")


--- Example Prediction (XGBoost) ---
Input State + Action (Original): [ 2.40447212e+02  2.24474152e+01  1.22459073e+03  8.72418891e+01
  1.08392337e+03  3.70584400e+01 -1.20000000e+01 -1.60000000e+01
  2.53098419e+02  1.39212699e+01  1.22599230e+03  7.91048203e+01
  1.09502619e+03  2.91509349e+01 -1.00000000e+00  5.00000000e+00]
Input State + Action (Scaled):   [-1.4440972  -1.55527371  1.36453177 -0.87776177  0.80226429 -1.41680449
 -0.67912385 -0.9435609  -1.40695855 -1.64067227  1.36368323 -0.93364881
  0.83231492 -1.47058995 -0.04940402  0.28918407]
Actual Final State:              [ 323.71655664    9.16729444 1237.80996378   74.82542519 1157.44558751
   26.28745497]
Predicted Final State:           [ 284.73975   246.66615  1232.974      62.054188 1135.8422     69.61528 ]


## 1.7 Save model

In [24]:
def save_model_and_scaler(model, scaler, model_filename="world_model_v5_XGB.joblib", scaler_filename="scaler_v5_XGB.joblib", model_dir="../src/models/"):
    """Saves the trained model and scaler to disk with joblib.

    :param model: Trained model to persist; nothing is saved if None
    :param scaler: Fitted scaler to persist; nothing is saved if None
    :param model_filename: File name for the model inside `model_dir`
    :param scaler_filename: File name for the scaler inside `model_dir`
    :param model_dir: Target directory, created if missing. A relative path is
                      resolved against the current working directory (not the
                      location of this notebook/script).
    :return: None. Failures while saving are caught and printed.
    """
    if model is None or scaler is None:
        print("Skipping saving model/scaler as one of them is None.")
        return

    try:
        os.makedirs(model_dir, exist_ok=True)

        model_path = os.path.join(model_dir, model_filename)
        scaler_path = os.path.join(model_dir, scaler_filename)

        joblib.dump(model, model_path)
        print(f"Model saved to {model_path}")
        joblib.dump(scaler, scaler_path)
        print(f"Scaler saved to {scaler_path}")
    except Exception as e:
        # Best effort: report the problem instead of aborting the notebook run.
        print(f"Error saving model/scaler: {e}")

In [25]:
# Save the XGBoost model and scaler
# Persist the tuned XGBoost model together with the scaler it was trained with.
if world_model_xgb is None or scaler is None:
    print("Skipping XGBoost model saving.")
else:
    print("\n--- Saving XGBoost Model and Scaler ---")
    save_model_and_scaler(
        world_model_xgb,
        scaler,
        model_filename="xgb_world_model_v71.joblib",
        scaler_filename="xgb_scaler_v71.joblib",  # Use distinct names
    )


--- Saving XGBoost Model and Scaler ---
Model saved to ../src/models/xgb_world_model_v71.joblib
Scaler saved to ../src/models/xgb_scaler_v71.joblib
