In [1]:
import pandas as pd
import numpy as np
train = pd.read_csv("/Users/radhikapanchal/Downloads/playground-series-s5e5/train.csv")
test= pd.read_csv('/Users/radhikapanchal/Downloads/playground-series-s5e5/test.csv')
train_df = pd.read_csv("/Users/radhikapanchal/Downloads/playground-series-s5e5/train.csv")
test_df= pd.read_csv('/Users/radhikapanchal/Downloads/playground-series-s5e5/test.csv')

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import lightgbm as lgb

# ---------------------------
# Load Data


# ---------------------------
# Feature Engineering
# ---------------------------
def feature_engineering(df):
    df['Intensity_Index'] = df['Heart_Rate'] / df['Duration']
    df['Age'] = np.log1p(df['Age'])
    df['Body_Temp'] = np.log1p(df['Body_Temp'])

    df['BMR'] = (
        10 * df['Weight'] +
        6.25 * df['Height'] -
        5 * df['Age'] +
        np.where(df['Sex'] == 'male', 5, -161)
    )

    df['HR_Temp_Interaction'] = df['Heart_Rate'] * df['Body_Temp']
    df['HR_Duration_Interaction'] = df['Heart_Rate'] * df['Duration']
    df['Metabolic_Load'] = df['Heart_Rate'] * df['Body_Temp'] * df['Duration']
    df['Age_Duration'] = df['Age'] * df['Duration']
    df['Age_Body_Temp'] = df['Age'] * df['Body_Temp']
    df['Duration_Body_Temp'] = df['Duration'] * df['Body_Temp']
    df['Age_Duration_Temp'] = df['Age'] * df['Duration'] * df['Body_Temp']
    df['Height_Weight'] = df['Height'] * df['Weight']
    df['Height_Duration'] = df['Height'] * df['Duration']
    df['Weight_Duration'] = df['Weight'] * df['Duration']
    df['Weight_HeartRate'] = df['Weight'] * df['Heart_Rate']
    df['Weight_BodyTemp'] = df['Weight'] * df['Body_Temp']
    df['Height_Temp_Interaction'] = df['Height'] * df['Body_Temp']
    df['Weight_Duration_Temp'] = df['Weight'] * df['Duration'] * df['Body_Temp']
    df['Height_Duration_Temp'] = df['Height'] * df['Duration'] * df['Body_Temp']
    df['Weight_HR_Duration'] = df['Weight'] * df['Heart_Rate'] * df['Duration']
    df['Height_HR_Duration'] = df['Height'] * df['Heart_Rate'] * df['Duration']
    df['Weight_Intensity_Index'] = df['Weight'] * df['Intensity_Index']
    df['Height_Intensity_Index'] = df['Height'] * df['Intensity_Index']
    df['Weight_HR_Temp_Interaction'] = df['Weight'] * df['HR_Temp_Interaction']
    df['Height_HR_Temp_Interaction'] = df['Height'] * df['HR_Temp_Interaction']

    if 'Calories' in df.columns:
        df['Calories'] = np.log1p(df['Calories'])

    return df

train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

# ---------------------------
# Encode categorical
# ---------------------------
le = LabelEncoder()
train_df['Sex'] = le.fit_transform(train_df['Sex'])
test_df['Sex'] = le.transform(test_df['Sex'])

features = [col for col in train_df.columns if col not in ['id', 'Calories']]
X = train_df[features]
y = train_df['Calories']
X_test = test_df[features]

# ---------------------------
# Model Parameters
# ---------------------------
cat_params = {
    'max_depth': 6, 'l2_leaf_reg': 2, 'learning_rate': 0.03,
    'bagging_temperature': 0.08, 'border_count': 222,
    'loss_function': 'RMSE', 'random_state': 42, 'iterations': 4000
}


xgb_params = {
    'max_depth': 10, 'n_estimators': 3000, 'learning_rate': 0.07,
    'gamma': 0.01, 'max_delta_step': 2, 'eval_metric': 'rmse',
    'enable_categorical': False, 'random_state': 42,
    'tree_method': 'hist', 'n_jobs': -1
}

lgbm_params = {
    'objective': 'regression_l1', 'metric': 'rmse', 'n_estimators': 3000,
    'learning_rate': 0.05, 'feature_fraction': 0.8, 'bagging_fraction': 0.8,
    'bagging_freq': 1, 'lambda_l1': 0.1, 'lambda_l2': 0.1,
    'num_leaves': 64, 'verbose': -1, 'n_jobs': -1, 'seed': 42
}


# ---------------------------
# Cross-Validation and Stacking
# ---------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)

oof_cat = np.zeros(len(X))
oof_xgb = np.zeros(len(X))
oof_lgbm = np.zeros(len(X))
test_cat = np.zeros(len(X_test))
test_xgb = np.zeros(len(X_test))
test_lgbm = np.zeros(len(X_test))

for train_idx, val_idx in kf.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # CatBoost
    cat = CatBoostRegressor(**cat_params)
    cat.fit(X_train, y_train, cat_features=['Sex'], eval_set=(X_val, y_val), verbose=0)
    oof_cat[val_idx] = cat.predict(X_val)
    test_cat += cat.predict(X_test) / kf.n_splits

    # XGBoost
    xgb = XGBRegressor(**xgb_params)
    xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=200, verbose=0)
    oof_xgb[val_idx] = xgb.predict(X_val)
    test_xgb += xgb.predict(X_test) / kf.n_splits

    # LightGBM
    lgbm = lgb.LGBMRegressor(**lgbm_params)
    lgbm.fit(X_train, y_train, eval_set=[(X_val, y_val)],
             callbacks=[lgb.early_stopping(200), lgb.log_evaluation(0)])
    oof_lgbm[val_idx] = lgbm.predict(X_val)
    test_lgbm += lgbm.predict(X_test) / kf.n_splits

# ---------------------------
# Meta-Model: Ridge Regression
# ---------------------------
meta_X = np.vstack([oof_cat, oof_xgb, oof_lgbm]).T
meta_test = np.vstack([test_cat, test_xgb, test_lgbm]).T

meta_model = Ridge(alpha=0.01)       # ✅ often best
meta_model.fit(meta_X, y)
final_preds = meta_model.predict(meta_test)
final_preds = np.expm1(final_preds.clip(min=0))

# ---------------------------
# Save submission
# ---------------------------
submission = pd.DataFrame({'id': test_df['id'], 'Calories': final_preds})
submission.to_csv("submission.csv", index=False)
print("✅ Saved 'submission.csv'")




Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[2924]	valid_0's rmse: 0.0600593




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2509]	valid_0's rmse: 0.0619918




Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[2999]	valid_0's rmse: 0.0607605




Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[3000]	valid_0's rmse: 0.0609457




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2196]	valid_0's rmse: 0.0605047
✅ Saved 'submission.csv'
