<a href="https://colab.research.google.com/github/sheebz/Amanda/blob/master/S05E05_Predict_Calorie_Expenditure_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Version tracking
VERSION = "v40"
#!pip install catboost
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from itertools import product
import kagglehub
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam


# Fitted estimators and hyper-parameter searches start out unset; the cells
# further down fill them in.
xgb_model = cb_model = lgb_model = None
xgb_grid = cb_grid = lgb_grid = None

# Never truncate DataFrame output when printing.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

def add_categorical_aggregations(df, train_aggs=None):
    """Attach per-``Sex`` summary statistics of selected numeric columns.

    Args:
        df: Frame containing ``Sex`` and, when ``train_aggs`` is None, the
            numeric columns that are summarised.
        train_aggs: Previously computed statistics table. When omitted it is
            computed from ``df`` itself.

    Returns:
        ``(merged_df, train_aggs)`` where ``merged_df`` is ``df`` left-joined
        with the statistics on ``Sex`` and ``train_aggs`` is the table used.
    """
    if train_aggs is None:
        value_cols = ['BMI', 'Heart_Rate', 'Body_Temp', 'MET_approx', 'Duration']
        stat_names = ['min', 'max', 'mean', 'std', 'var']
        train_aggs = df.groupby('Sex')[value_cols].agg(stat_names)
        # Flatten the (column, statistic) MultiIndex into single names.
        train_aggs.columns = ['Sex_' + '_'.join(pair) for pair in train_aggs.columns]
    merged = df.merge(train_aggs, on='Sex', how='left')
    return merged, train_aggs


def add_interactions_onehot(df, features, gender_col='Sex'):
    """Add per-sex interaction columns for each feature, in place.

    For every name in ``features`` two columns are written to ``df``:
    ``{feat}_x_Male`` (the feature where the row is male, 0 otherwise) and
    ``{feat}_x_Female`` (the feature where the row is female, 0 otherwise).

    The gender column is expected to use this file's binary encoding, where
    ``get_data`` maps 'female' to 1 and everything else to 0. The previous
    implementation labelled the columns the other way round.

    Args:
        df: Frame to extend; it is modified in place.
        features: Iterable of column names to interact with sex.
        gender_col: Name of the binary sex column (1 = female, 0 = male).

    Returns:
        The same ``df`` object with the interaction columns added.
    """
    # Keep the indicators as local Series instead of temporary 'Male' /
    # 'Female' columns, so pre-existing columns with those names survive.
    is_female = df[gender_col]
    is_male = 1 - is_female

    for feat in features:
        df[f'{feat}_x_Male'] = df[feat] * is_male
        df[f'{feat}_x_Female'] = df[feat] * is_female

    return df

# RMSLE scorer
def rmsle_scorer(y_true, y_pred):
    """Return the RMSLE of two arrays given on the log1p scale.

    Both inputs are mapped back with ``expm1`` and clipped at zero before
    ``mean_squared_log_error`` is applied.
    """
    actual = np.clip(np.expm1(y_true), 0, None)
    predicted = np.clip(np.expm1(y_pred), 0, None)
    return np.sqrt(mean_squared_log_error(actual, predicted))


# Lower RMSLE is better, hence greater_is_better=False.
rmsle_fn = make_scorer(rmsle_scorer, greater_is_better=False)

# BMR calculation
def get_bmr(row):
    """Return the basal metabolic rate for one row.

    ``row`` must provide 'Sex', 'Weight', 'Height' and 'Age'. A ``Sex`` of 0
    selects the male coefficients; any other value selects the female ones.
    """
    is_male = row['Sex'] == 0
    weight, height, age = row['Weight'], row['Height'], row['Age']
    if not is_male:
        return 655.1 + (9.563 * weight) + (1.850 * height) - (4.676 * age)
    return 66.5 + (13.75 * weight) + (5.003 * height) - (6.75 * age)


def categorize_bmi(bmi):
    """Map a BMI value to 'Underweight', 'Normal', 'Overweight' or 'Obese'.

    Cut-offs are 18.5, 25 and 30; each lower bound is inclusive. A value that
    compares below none of the cut-offs ends up as 'Obese'.
    """
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Normal'
    if bmi < 30:
        return 'Overweight'
    return 'Obese'


def calculate_bmi(height_cm, weight_kg):
    """Return body-mass index: weight in kg over the square of height in metres."""
    metres = height_cm / 100
    return weight_kg / (metres ** 2)

# MET estimation (adjusted duration factor)
def estimate_met(weight, hr, hr_rest, max_hr, duration):
    """Estimate METs from heart rate above rest, scaled by session duration.

    ``weight`` and ``max_hr`` are accepted but not used in the calculation.
    The result is limited to the range 1 to 20.
    """
    duration_scale = 1 + (duration / 60) * 0.05
    mets_before_scaling = 0.05 * (hr - hr_rest) + 2
    return max(1, min(mets_before_scaling * duration_scale, 20))

def add_interaction(df, prefix, features):
    """Multiply each feature with every numeric column whose name starts with ``prefix``.

    A new column ``{feature}_x_{col}`` is written to ``df`` (in place) for
    each such pair. Non-numeric columns matching the prefix are skipped.

    Returns:
        The same ``df`` object.
    """
    from pandas.api.types import is_numeric_dtype

    for feature in features:
        feature_values = df[feature]
        # Take the column list up front: columns are appended while looping.
        prefixed = [name for name in df.columns.to_list() if name.startswith(prefix)]
        for name in prefixed:
            if not is_numeric_dtype(df[name].dtype):
                continue
            df[f'{feature}_x_{name}'] = feature_values * df[name]

    return df

# Data preprocessing
def get_data(datafile):
    """Load a competition CSV and build the engineered feature frame.

    Args:
        datafile: Anything ``pd.read_csv`` accepts. The file must contain the
            columns 'Sex', 'Age', 'Height', 'Weight', 'Heart_Rate',
            'Body_Temp' and 'Duration'; 'Calories' is optional.

    Returns:
        A DataFrame with the derived features added and the raw 'Age',
        'Height' and 'Weight' columns removed.
    """
    data = pd.read_csv(datafile)
    # Binary encoding used throughout this file: 1 = 'female', 0 = anything else.
    data['Sex'] = (data['Sex'] == 'female').astype('int8')

    # Outlier capping: clip to [Q1 - 1.0*IQR, Q3 + 1.0*IQR]. The quantiles are
    # taken from the file being loaded, so each file is capped with its own bounds.
    for col in ['Heart_Rate', 'Body_Temp']:
        q1, q3 = data[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower, upper = q1 - 1.0 * iqr, q3 + 1.0 * iqr
        data[col] = data[col].clip(lower, upper)

    # Features

    # Both are computed row by row via DataFrame.apply.
    data['BMI'] = data.apply(lambda x: calculate_bmi(height_cm=x['Height'].astype('float32'),
                                                     weight_kg=x['Weight'].astype('float32')), axis=1)
    data['BMR'] = data.apply(get_bmr, axis=1)

    # Heart-rate derived features
    data['HR_Max'] = (220 - data['Age'])
    data['HR_Ratio'] = (data['Heart_Rate'] / data['HR_Max'])
    # Resting heart rate estimate: (base - 0.6 * Age) scaled by
    # (1 - 0.2 * (1 - HR_Ratio)). The base is 74 (Sex == 0) or 78 (otherwise),
    # raised to 76 / 80 respectively when Age >= 61.
    data['HR_Rest'] = data.apply(
        lambda x: (76 - 0.6 * x['Age']) * (1 - 0.2 * (1 - x['HR_Ratio'])) if (x['Sex'] == 0 and x['Age'] >= 61)
        else (80 - 0.6 * x['Age']) * (1 - 0.2 * (1 - x['HR_Ratio'])) if (x['Sex'] == 1 and x['Age'] >= 61)
        else (74 - 0.6 * x['Age']) * (1 - 0.2 * (1 - x['HR_Ratio'])) if x['Sex'] == 0
        else (78 - 0.6 * x['Age']) * (1 - 0.2 * (1 - x['HR_Ratio'])), axis=1
    )

    # Age Groups, expanded to three indicator columns.
    # NOTE(review): ages outside 20-79 fall outside the bins; presumably all
    # three indicators are then 0 - confirm this is acceptable.
    data['Age_Group'] = pd.cut(data['Age'], bins=[20, 40, 60, 79], labels=['Young', 'Middle', 'Senior'], include_lowest=True)
    data['Age_Young'] = (data['Age_Group'] == 'Young').astype('int8')
    data['Age_Middle'] = (data['Age_Group'] == 'Middle').astype('int8')
    data['Age_Senior'] = (data['Age_Group'] == 'Senior').astype('int8')
    data.drop('Age_Group', axis=1, inplace=True)

    # BMI Groups, expanded to four indicator columns.
    data['BMI_Class'] = data['BMI'].apply(categorize_bmi)
    data['BMI_Class'] = pd.Categorical(data['BMI_Class'], categories=[
        'Underweight', 'Normal', 'Overweight', 'Obese'
    ])
    data['BMI_Underweight'] = (data['BMI_Class'] == 'Underweight').astype('int8')
    data['BMI_Normalweight'] = (data['BMI_Class'] == 'Normal').astype('int8')
    data['BMI_Overweight'] = (data['BMI_Class'] == 'Overweight').astype('int8')
    data['BMI_Obese'] = (data['BMI_Class'] == 'Obese').astype('int8')
    data.drop('BMI_Class', axis=1, inplace=True)

    # Pairwise interactions, all stored as float32.
    data['HR_Age'] = (data['Heart_Rate'] * data['Age']).astype('float32')
    data['Body_Temp_HR_Ratio'] = (data['Body_Temp'] * data['HR_Ratio']).astype('float32')
    #data['Body_Temp_Age_Interaction'] = (data['Body_Temp'] * data['Age_Group']).astype('float32')
    data['Heart_Rate_Body_Temp_Interaction'] = (data['Heart_Rate'] * data['Body_Temp']).astype('float32')
    data['Duration_Heart_Rate_Interaction'] = (data['Duration'] * data['Heart_Rate']).astype('float32')
    data['Sex_Heart_Rate_Interaction'] = (data['Sex'] * data['Heart_Rate']).astype('float32')
    data['MET_approx'] = data.apply(lambda x: estimate_met(x['Weight'], x['Heart_Rate'], x['HR_Rest'], x['HR_Max'], x['Duration']), axis=1).astype('float32')
    #data['Estimated_Calories'] = (data['BMR'] / 1440 * data['Duration'] * data['MET_approx']).astype('float32')
    data['Duration_MET_Interaction'] = (data['Duration'] * data['MET_approx']).astype('float32')  # New feature
    data['BMR_Duration'] = (data['BMR'] * data['Duration']).astype('float32')
    # Duration where the row matches the given sex, 0 otherwise.
    data['Female_Duration'] = data['Duration'].where(data['Sex'] == 1, 0).astype('float32')
    data['Male_Duration'] = data['Duration'].where(data['Sex'] == 0, 0).astype('float32')
    data['Duration_Body_Temp'] = (data['Duration'] * data['Body_Temp']).astype('float32')
    # NOTE(review): a Duration of 0 would yield inf here - confirm the data never contains it.
    data['HR_Duration_Ratio'] = (data['Heart_Rate'] / data['Duration']).astype('float32')
    data['Age_Duration'] = (data['Age'] * data['Duration']).astype('float32')
    # v32
    data['Age_BMI_Interaction'] = (data['Age'] * data['BMI']).astype('float32')
    data['Heart_Rate_BMI_Interaction'] = (data['Heart_Rate'] * data['BMI']).astype('float32')
    data['Body_Temp_BMI_Interaction'] = (data['Body_Temp'] * data['BMI']).astype('float32')
    #data['Calories_per_Minute'] = (data['Estimated_Calories'] / data['Duration']).replace([np.inf, -np.inf], 0).astype('float32')
    #data['MET_BMI_Interaction'] = (data['MET_approx'] * data['BMI']).astype('float32')
    # Same product as 'BMR_Duration' above, stored under a second name.
    data['Duration_BMR_Interaction'] = (data['Duration'] * data['BMR']).astype('float32')
    #data['Heart_Rate_Estimated_Calories_Interaction'] = (data['Heart_Rate'] * data['Estimated_Calories']).astype('float32')

    # 'Calories' is only cast when the column is present.
    for col in ['Age', 'Weight', 'Heart_Rate', 'Calories', 'Duration']:
        if col in data:
            data[col] = data[col].astype('float32')

    # The prefix match is on column names, so 'Age_' also picks up
    # 'Age_Duration' and 'Age_BMI_Interaction' created above, not just the
    # three age-group indicators.
    data = add_interaction(data, 'Age_', ['Duration', 'HR_Ratio', 'Body_Temp', 'MET_approx'])
    data = add_interaction(data, 'BMI_', ['Duration', 'HR_Ratio', 'Body_Temp', 'MET_approx'])

    # The raw columns are dropped only after every derived feature has used them.
    data = data.drop(['Age', 'Height', 'Weight'], axis=1)
    return data

# Train Models

In [None]:
# Download dataset using kagglehub
playground_series_s5e5_path = kagglehub.competition_download('playground-series-s5e5')

# Per-sex aggregate statistics are computed on the training frame and reused
# unchanged for the test frame.
train_data = get_data(os.path.join(playground_series_s5e5_path, 'train.csv'))
train_data, aggs = add_categorical_aggregations(train_data)

test_data = get_data(os.path.join(playground_series_s5e5_path, 'test.csv'))
test_data, _ = add_categorical_aggregations(test_data, aggs)
X_test_final = test_data.drop(['id'], axis=1)

# All models are trained on log1p(Calories); predictions are mapped back with expm1.
X = train_data.drop(['id', 'Calories'], axis=1)
y = train_data['Calories']
y_log = np.log1p(y)
X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=42)
y_test = np.expm1(y_test_log)
y_train = np.expm1(y_train_log)

# Subsample data for CV
X_cv, _, y_log_cv, _ = train_test_split(X, y_log, train_size=0.2, random_state=42)

# A single definition per model. The seed is passed as random_state where the
# estimator is built, so it must not also appear here as random_seed.
BEST_CB_PARAMS = {
    'learning_rate': 0.03,
    'l2_leaf_reg': 7,
    'iterations': 600,
    'depth': 9,
    'bagging_temperature': 0,  # do not change, caused increase
}
# NOTE(review): 'reg:squaredlogerror' is applied to a target that is already
# log1p-transformed, so the log is effectively taken twice;
# 'reg:squarederror' may be what is intended - confirm before re-tuning.
BEST_XGB_PARAMS = {'reg_lambda': 1.5, 'reg_alpha': 1, 'objective': 'reg:squaredlogerror', 'n_estimators': 600,
                   'max_depth': 8, 'learning_rate': 0.05, 'gamma': 0}
BEST_LGBM_PARAMS = {'reg_alpha': 0.5, 'reg_lambda': 0.5, 'num_leaves': 60, 'min_child_samples': 30,
                    'max_depth': 20, 'learning_rate': 0.05}


# CatBoost (always retrained)
print("Starting CatBoost training...")
cb_model = CatBoostRegressor(**BEST_CB_PARAMS, random_state=42, early_stopping_rounds=20, silent=True)
cb_model.fit(X_train, y_train_log, eval_set=[(X_test, y_test_log)])
y_pred_cb_log = cb_model.predict(X_test)
y_pred_cb = np.expm1(y_pred_cb_log)  # back to the original Calories scale

y_pred_final_cb_log = cb_model.predict(X_test_final)
y_pred_final_cb = np.expm1(y_pred_final_cb_log)
submission_cb = pd.DataFrame({'id': test_data['id'], 'Calories': np.clip(y_pred_final_cb, 0, None)})
submission_cb.to_csv(f'submission_cb_{VERSION}.csv', index=False)
print(f"CatBoost Test RMSLE ({VERSION}):", np.sqrt(mean_squared_log_error(y_test, np.clip(y_pred_cb, 0, None))))
print("Submission head:", submission_cb.head())
print("Submission stats:", submission_cb['Calories'].describe())
importance_cb = pd.Series(cb_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(f"Catboost Feature Importance ({VERSION}): {importance_cb}")

# XGBoost (skipped when a model from an earlier run of this cell is still set)
if xgb_model is None:
    print("Starting XGBoost training...")
    xgb_model = XGBRegressor(**BEST_XGB_PARAMS, early_stopping_rounds=20, random_state=42)
    xgb_model.fit(X_train, y_train_log, eval_set=[(X_test, y_test_log)], verbose=False)
    y_pred_xgb_log = xgb_model.predict(X_test)
    y_pred_xgb = np.expm1(y_pred_xgb_log)

    y_pred_final_xgb_log = xgb_model.predict(X_test_final)
    y_pred_final_xgb = np.expm1(y_pred_final_xgb_log)
    submission_xgb = pd.DataFrame({'id': test_data['id'], 'Calories': np.clip(y_pred_final_xgb, 0, None)})
    submission_xgb.to_csv(f'submission_xgb_{VERSION}.csv', index=False)
    print(f"XGBoost Test RMSLE ({VERSION}):", np.sqrt(mean_squared_log_error(y_test, np.clip(y_pred_xgb, 0, None))))
    importance_xgb = pd.Series(xgb_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
    print(f"XGBoost Feature Importance ({VERSION}) : {importance_xgb}")

# LightGBM (skipped when a model from an earlier run of this cell is still set)
if lgb_model is None:
    print("Starting LightGBM training...")
    # `silent` is no longer a LightGBM estimator argument; verbose=-1 is the
    # supported way to quieten training output.
    lgb_model = lgb.LGBMRegressor(**BEST_LGBM_PARAMS, random_state=42, verbose=-1)
    lgb_model.fit(X_train, y_train_log, eval_set=[(X_test, y_test_log)], callbacks=[lgb.early_stopping(stopping_rounds=10)])
    y_pred_lgb_log = lgb_model.predict(X_test)
    y_pred_lgb = np.expm1(y_pred_lgb_log)

    y_pred_final_lgb_log = lgb_model.predict(X_test_final)
    y_pred_final_lgb = np.expm1(y_pred_final_lgb_log)
    submission_lbg = pd.DataFrame({'id': test_data['id'], 'Calories': np.clip(y_pred_final_lgb, 0, None)})
    submission_lbg.to_csv(f'submission_lbg_{VERSION}.csv', index=False)
    print(f"LightGBM Test RMSLE ({VERSION}):", np.sqrt(mean_squared_log_error(y_test, np.clip(y_pred_lgb, 0, None))))
    importance_lgb = pd.Series(lgb_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
    print(f"LightGBM Feature Importance ({VERSION}): {importance_lgb}")

# Weighted blend of the existing predictions, averaged on the log scale.
weights = {'xgboost': 0.05, 'lightgbm': 0.15, 'catboost': 0.8}
y_pred_final_ensemble_log = (weights['xgboost'] * y_pred_final_xgb_log +
                             weights['lightgbm'] * y_pred_final_lgb_log +
                             weights['catboost'] * y_pred_final_cb_log)
y_pred_ensemble_log = (weights['xgboost'] * y_pred_xgb_log +
                       weights['lightgbm'] * y_pred_lgb_log +
                       weights['catboost'] * y_pred_cb_log)

# Generate new ensemble submission
y_pred_final_ensemble = np.expm1(y_pred_final_ensemble_log)
submission_ensemble = pd.DataFrame({
    'id': test_data['id'],
    'Calories': np.clip(y_pred_final_ensemble, 0, None)
})
submission_ensemble.to_csv(f'submission_ensemble_{VERSION}.csv', index=False)
print(f"New ensemble submission head: {submission_ensemble.head()}")
print(f"New ensemble Calories stats : {submission_ensemble['Calories'].describe()}")

# Standard CV for all models
models = [(xgb_model, "XGBoost"), (lgb_model, "LightGBM"), (cb_model, "CatBoost")]
print("Starting CV...")
scores_xgb, scores_lgb, scores_cb = None, None, None
for model, name in models:
    # For XGBoost, create a new instance without early stopping
    # (cross_val_score does not supply an eval_set).
    if name == "XGBoost":
        model = XGBRegressor(
            learning_rate=0.05,
            max_depth=7,
            n_estimators=500,
            reg_alpha=3.0,
            reg_lambda=2.0,
            device='gpu',
            random_state=42,
        )
    scores = cross_val_score(model, X_cv, y_log_cv, cv=5, scoring=rmsle_fn)
    if name == "XGBoost":
        scores_xgb = scores
    elif name == "LightGBM":
        scores_lgb = scores
    elif name == "CatBoost":
        scores_cb = scores
    # The scorer returns negated RMSLE, hence the sign flip on the mean.
    print(f"{name} 5-Fold CV RMSLE ({VERSION}): {-scores.mean():.6f} ± {scores.std():.6f}")
print("Finished CV...")

# Validate locally
y_pred_ensemble = np.expm1(y_pred_ensemble_log)
ensemble_rmsle = np.sqrt(mean_squared_log_error(y_test, np.clip(y_pred_ensemble, 0, None)))
print(f"New Ensemble Test RMSLE ({VERSION}):", ensemble_rmsle)


Starting CatBoost training...
CatBoost Test RMSLE (v38): 0.06021179069084656
Submission head:        id    Calories
0  750000   26.994362
1  750001  108.398871
2  750002   88.706049
3  750003  124.624074
4  750004   77.184430
Submission stats: count    250000.000000
mean         88.175968
std          62.255702
min           1.094434
25%          34.006268
50%          76.387600
75%         135.325510
max         289.725654
Name: Calories, dtype: float64
Catboost Feature Importance (v38): Duration_MET_Interaction            41.327692
Duration_Heart_Rate_Interaction     18.311058
Duration_Body_Temp                   6.623186
Duration_x_Age_Duration              6.429951
Heart_Rate_Body_Temp_Interaction     4.198497
Duration                             4.168359
Heart_Rate                           3.171085
Female_Duration                      1.949248
Body_Temp_HR_Ratio                   1.657421
HR_Ratio                             1.544641
Sex_Heart_Rate_Interaction           1.340992


# Tuning

In [None]:
def finetune_xgboost():
    """Run a randomized hyper-parameter search for XGBoost.

    The search uses a 10% sample of the module-level ``X`` / ``y_log``,
    20 candidates and 3-fold CV scored with ``rmsle_fn``.

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    X_sample, _, y_sample, _ = train_test_split(X, y_log, train_size=0.1, random_state=42)
    search_space = {
        'objective': ['reg:squaredlogerror'],
        'learning_rate': [0.05, 0.1, 0.3],
        'n_estimators': [600],
        'reg_alpha': [0.5, 1, 2],
        'reg_lambda': [0, 1, 2],
        'gamma': [0, 1, 2],
        'max_depth': [6, 8, 10],
    }
    estimator = XGBRegressor(device='cuda', random_state=42, n_jobs=-1, verbosity=2)
    search = RandomizedSearchCV(
        estimator,
        search_space,
        n_iter=20,
        cv=3,
        scoring=rmsle_fn,
        random_state=42,
    )
    print("Starting XGBoost tuning...")
    search.fit(X_sample, y_sample)
    print("Finished XGBoost tuning...")
    print(f"Best XGBoost parameters ({VERSION}):", search.best_params_)
    # best_score_ is the negated RMSLE, so flip the sign for display.
    print(f"Best XGBoost CV RMSLE ({VERSION}):", -search.best_score_)
    return search

# Tune CatBoost
def finetune_catboost():
    """Run a randomized hyper-parameter search for CatBoost.

    The search uses a 10% sample of the module-level ``X`` / ``y_log``,
    10 candidates and 3-fold CV scored with ``rmsle_fn``.

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    X_sample, _, y_sample, _ = train_test_split(X, y_log, train_size=0.1, random_state=42)
    search_space = {
        'iterations': [400, 600, 800],
        'depth': [9, 11, 13],
        'learning_rate': [0.03, 0.05, 0.07],
        'l2_leaf_reg': [3, 5, 7],
        'bagging_temperature': [0, 0.5, 1.0],
    }
    estimator = CatBoostRegressor(random_seed=42, verbose=0, task_type='GPU', devices='0')
    search = RandomizedSearchCV(
        estimator,
        search_space,
        n_iter=10,
        cv=3,
        scoring=rmsle_fn,
        random_state=42,
    )
    print("Starting CatBoost tuning...")
    # The extra keyword is forwarded to the estimator's fit() call.
    search.fit(X_sample, y_sample, early_stopping_rounds=10)
    print("Finished CatBoost tuning...")
    print(f"Best CatBoost parameters ({VERSION}):", search.best_params_)
    # best_score_ is the negated RMSLE, so flip the sign for display.
    print(f"Best CatBoost CV RMSLE ({VERSION}):", -search.best_score_)
    return search

# Tune LightGBM
def finetune_lightgbm():
    """Run a randomized hyper-parameter search for LightGBM.

    The search uses a 20% sample of the module-level ``X`` / ``y_log``,
    10 candidates and 3-fold CV scored with ``rmsle_fn``.

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    X_sample, _, y_sample, _ = train_test_split(X, y_log, train_size=0.2, random_state=42)
    search_space = {
        'num_leaves': [60, 90, 120, 150],
        'max_depth': [15, 20, 25, 30],
        'learning_rate': [0.01, 0.03, 0.05],
        'reg_alpha': [1.0, 1.5, 2.0],
        'reg_lambda': [0.3, 0.5, 0.7],
        'min_child_samples': [10, 20, 30],
    }
    estimator = lgb.LGBMRegressor(n_estimators=300, device='gpu', random_state=42, force_row_wise=True)
    search = RandomizedSearchCV(
        estimator,
        search_space,
        n_iter=10,
        cv=3,
        scoring=rmsle_fn,
        random_state=42,
    )
    print("Starting LightGBM tuning...")
    search.fit(X_sample, y_sample)
    print("Finished LightGBM tuning...")
    print(f"Best LightGBM parameters ({VERSION}):", search.best_params_)
    # best_score_ is the negated RMSLE, so flip the sign for display.
    print(f"Best LightGBM CV RMSLE ({VERSION}):", -search.best_score_)
    return search

# Run each search only once per session. The handles start as None, so the
# check is an explicit identity test instead of relying on the truthiness of
# a search object.
if cb_grid is None:
    cb_grid = finetune_catboost()
    BEST_CB_PARAMS = cb_grid.best_params_

if xgb_grid is None:
    xgb_grid = finetune_xgboost()
    BEST_XGB_PARAMS = xgb_grid.best_params_

if lgb_grid is None:
    lgb_grid = finetune_lightgbm()
    BEST_LGBM_PARAMS = lgb_grid.best_params_


In [None]:


# Hand-picked blend weights for the three base models.
weights = {'xgboost': 0.35, 'lightgbm': 0.30, 'catboost': 0.35}
print(f"Manual Ensemble Weights ({VERSION}):", weights)

# Score the competition test set with every base model. Predictions come out
# on the log1p scale and are mapped back with expm1.
X_test_final = test_data.drop(['id'], axis=1)

y_pred_final_xgb_log = xgb_model.predict(X_test_final)
y_pred_final_lgb_log = lgb_model.predict(X_test_final)
y_pred_final_cb_log = cb_model.predict(X_test_final)

y_pred_final_xgb = np.expm1(y_pred_final_xgb_log)
y_pred_final_lgb = np.expm1(y_pred_final_lgb_log)
y_pred_final_cb = np.expm1(y_pred_final_cb_log)

# This blend is taken on the original Calories scale.
y_pred_final_ensemble = (
    weights['lightgbm'] * y_pred_final_lgb
    + weights['xgboost'] * y_pred_final_xgb
    + weights['catboost'] * y_pred_final_cb
)
submission_ensemble = pd.DataFrame({
    'id': test_data['id'],
    'Calories': np.clip(y_pred_final_ensemble, 0, None),
})
submission_ensemble.to_csv(f'submission_optimized_ensemble_{VERSION}.csv', index=False)

# Tune XGBoost meta-learner
def tune_meta_learner():
    """Tune an XGBoost meta-learner on the base models' hold-out predictions.

    The three module-level log-scale prediction arrays (XGBoost, LightGBM,
    CatBoost) form the feature matrix and ``y_test_log`` is the target.
    Five candidates are sampled and scored with 5-fold CV using ``rmsle_fn``.

    Returns:
        The best estimator found by the search.
    """
    meta_X = np.column_stack((y_pred_xgb_log, y_pred_lgb_log, y_pred_cb_log))
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.05, 0.1],
        'reg_alpha': [0, 0.1],
        'reg_lambda': [0, 0.1]
    }
    meta_model = XGBRegressor(random_state=42, device='gpu')
    # Seed the sampler like the other searches in this file so the chosen
    # candidates are reproducible between runs.
    grid = RandomizedSearchCV(meta_model, param_grid, n_iter=5, cv=5, scoring=rmsle_fn, random_state=42)
    print("Starting meta-learner tuning...")
    grid.fit(meta_X, y_test_log)
    print("Finished meta-learner tuning...")
    print(f"Best Meta-Learner Parameters ({VERSION}):", grid.best_params_)
    return grid.best_estimator_

meta_model_xgb = tune_meta_learner()

# Stacking with the tuned XGBoost meta-learner: fit on the hold-out
# predictions of the base models, then score their test-set predictions.
meta_X = np.column_stack((y_pred_xgb_log, y_pred_lgb_log, y_pred_cb_log))
meta_model_xgb.fit(meta_X, y_test_log)

meta_X_final = np.column_stack((y_pred_final_xgb_log, y_pred_final_lgb_log, y_pred_final_cb_log))
# The meta-learner predicts on the log1p scale; convert back before writing.
y_pred_final_ensemble_stack_xgb = np.expm1(meta_model_xgb.predict(meta_X_final))
submission_stack_xgb = pd.DataFrame({
    'id': test_data['id'],
    'Calories': np.clip(y_pred_final_ensemble_stack_xgb, 0, None),
})
submission_stack_xgb.to_csv(f'submission_stacking_xgb_tuned_{VERSION}.csv', index=False)

# Neural network meta-learner
def create_nn_meta_learner():
    """Build and compile the dense network used as a stacking meta-learner.

    The network takes 3 inputs, passes them through ReLU layers of 128, 64
    and 32 units (each followed by dropout of 0.3, 0.2 and 0.2) and ends in a
    single linear output. It is compiled with Adam (learning rate 0.001) and
    MSE loss.
    """
    layers = [Input(shape=(3,))]
    for units, drop_rate in ((128, 0.3), (64, 0.2), (32, 0.2)):
        layers.append(Dense(units, activation='relu'))
        layers.append(Dropout(drop_rate))
    layers.append(Dense(1))

    network = Sequential(layers)
    network.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return network

print("Starting neural network training...")
meta_model_nn = create_nn_meta_learner()
# Features are the three base models' hold-out predictions on the log scale.
meta_X = np.column_stack((y_pred_xgb_log, y_pred_lgb_log, y_pred_cb_log))
meta_model_nn.fit(meta_X, y_test_log, epochs=50, batch_size=512, verbose=0)
print("Finished neural network training...")

y_pred_final_ensemble_stack_nn = meta_model_nn.predict(
    np.column_stack((y_pred_final_xgb_log, y_pred_final_lgb_log, y_pred_final_cb_log)),
    verbose=0,
)
# Flatten the network output and map it back from the log1p scale.
y_pred_final_ensemble_stack_nn = np.expm1(y_pred_final_ensemble_stack_nn.flatten())
submission_stack_nn = pd.DataFrame({
    'id': test_data['id'],
    'Calories': np.clip(y_pred_final_ensemble_stack_nn, 0, None),
})
submission_stack_nn.to_csv(f'submission_stacking_nn_{VERSION}.csv', index=False)

# Combine ensemble and stacking predictions (70% blend, 30% NN stack)
y_pred_final_combined = 0.7 * y_pred_final_ensemble + 0.3 * y_pred_final_ensemble_stack_nn
submission_combined = pd.DataFrame({
    'id': test_data['id'],
    'Calories': np.clip(y_pred_final_combined, 0, None),
})
submission_combined.to_csv(f'submission_combined_ensemble_stack_{VERSION}.csv', index=False)

# Feature importance
importance_xgb = pd.Series(xgb_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
importance_lgb = pd.Series(lgb_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(f"XGBoost Feature Importance ({VERSION}):\n", importance_xgb)
print(f"LightGBM Feature Importance ({VERSION}):\n", importance_lgb)

# Residual plot of the weighted blend on the hold-out split
y_pred_ensemble = (
    weights['lightgbm'] * y_pred_lgb
    + weights['xgboost'] * y_pred_xgb
    + weights['catboost'] * y_pred_cb
)
residuals = y_test - y_pred_ensemble
plt.scatter(y_test, residuals, alpha=0.5)
plt.xlabel('True Calories')
plt.ylabel('Residuals')
plt.axhline(0, color='red', linestyle='--')
plt.savefig(f'residuals_{VERSION}.png')
plt.close()

Starting CV...
XGBoost 5-Fold CV RMSLE (v32): 0.061711 ± 0.001715
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 5157
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 31
[LightGBM] [Info] Using GPU Device: NVIDIA L4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 30 dense feature groups (3.66 MB) transferred to GPU in 0.003600 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 4.136164
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 5158
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 31
[LightGBM] [Info] Using GPU Device: NVIDIA L4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Inf

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Finished meta-learner tuning...
Best Meta-Learner Parameters (v32): {'reg_lambda': 0, 'reg_alpha': 0.1, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1}
Starting neural network training...
Finished neural network training...
XGBoost Feature Importance (v32):
 Duration_MET_Interaction            0.843303
Duration_Heart_Rate_Interaction     0.119072
Heart_Rate_Body_Temp_Interaction    0.009336
Sex                                 0.008075
Female_Duration                     0.004009
Sex_Heart_Rate_Interaction          0.003950
Body_Temp_HR_Ratio                  0.003177
HR_Ratio                            0.003029
Male_Duration                       0.002557
Heart_Rate                          0.000670
Age_Duration                        0.000447
Weight                              0.000430
MET_approx                          0.000420
Estimated_Calories                  0.000326
BMR                                 0.000189
HR_Age                              0.000175
Body_Temp_

In [None]:
# Tuned parameter sets, kept as a history: each name is rebound in turn and
# only the LAST assignment of a name is the one in effect after this cell.

# XGBoost - initial
BEST_XGB_PARAMS = {
    'learning_rate': 0.05,
    'max_depth': 7,
    'n_estimators': 400,
    'reg_alpha': 3.0,
    'reg_lambda': 2.0,
    "objective": 'reg:squaredlogerror',
    'device': 'gpu',
}
# XGBoost - superseded
BEST_XGB_PARAMS = {
    'gamma': 0,
    'learning_rate': 0.1,
    'n_estimators': 600,
    'objective': 'reg:squaredlogerror',
    'reg_alpha': 0.5,
    'reg_lambda': 0,
}
# XGBoost - in effect
BEST_XGB_PARAMS = {
    'reg_lambda': 5,
    'reg_alpha': 0.5,
    'objective': 'reg:squaredlogerror',
    'n_estimators': 600,
    'learning_rate': 0.1,
    'gamma': 0,
}

# LightGBM - in effect
BEST_LGBM_PARAMS = {
    'reg_lambda': 0.5,
    'reg_alpha': 1.5,
    'num_leaves': 60,
    'n_estimators': 300,
    'min_child_samples': 30,
    'max_depth': 15,
    'learning_rate': 0.05,
    'device': 'gpu',
}

# CatBoost - superseded (400 iterations)
BEST_CB_PARAMS = {
    'learning_rate': 0.03,
    'l2_leaf_reg': 7,
    'iterations': 400,
    'depth': 9,
    'bagging_temperature': 0,
}
# CatBoost - superseded (600 iterations)
BEST_CB_PARAMS = {
    'learning_rate': 0.03,
    'l2_leaf_reg': 7,
    'iterations': 600,
    'depth': 9,
    'bagging_temperature': 0,
}
# CatBoost - in effect
BEST_CB_PARAMS = {
    'learning_rate': 0.03,
    'l2_leaf_reg': 7,
    'iterations': 600,
    'depth': 13,
    'bagging_temperature': 0.5,
}

