In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
%load_ext cudf.pandas
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import cuml
import json
import joblib

from sklearn.model_selection import KFold, StratifiedKFold
from cuml.metrics import mean_squared_log_error
from xgboost import XGBRegressor

from catboost import CatBoostRegressor
from sklearn.preprocessing import KBinsDiscretizer

# Setting Matplotlib defaults
plt.style.use('seaborn-v0_8')
# `autolayout=True` applies tight-layout to every figure created afterwards.
# A bare `plt.tight_layout()` call here would only create (and lay out) an
# empty figure and would not set any default.
plt.rc('figure', autolayout=True, figsize=(8,5), dpi=150)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=15, titlepad=10)
plt.rc('animation', html='html5')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
for root, _dirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
        
import warnings
# Install a global "ignore" filter so no Python warnings are shown from here on
warnings.simplefilter('ignore')

# Let pandas display up to 500 columns before truncating wide frames
pd.set_option('display.max_columns', 500)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Competition train/test splits, indexed by their `id` column
train = pd.read_csv(
    '/kaggle/input/playground-series-s5e5/train.csv',
    index_col='id',
)
test = pd.read_csv(
    '/kaggle/input/playground-series-s5e5/test.csv',
    index_col='id',
)

# External calories dataset, indexed by `User_ID`; its `Gender` column is
# renamed to `Sex` right after loading
org = pd.read_csv(
    '/kaggle/input/calories-burnt-prediction/calories.csv',
    index_col='User_ID',
).rename(columns={'Gender': 'Sex'})

In [None]:
# Stack the external rows underneath the competition rows and renumber the index.
# NOTE: re-running this cell appends `org` again; reload `train` first when re-executing.
train = pd.concat((train, org), axis=0, ignore_index=True)

# Data Understanding

In [None]:
# (rows, columns) of the training frame after the external data was appended
train.shape

In [None]:
# Preview the first 10 rows
train.head(10)

In [None]:
# Summary statistics of the training columns
train.describe()

In [None]:
# Number of missing values per column
train.isnull().sum()

In [None]:
# Column dtypes, non-null counts and memory footprint before any downcasting
train.info()

# Data Preprocessing

## Reducing memory usage

In [None]:
# Store Age as int8 in both frames to reduce memory usage
for frame in (train, test):
    frame['Age'] = frame['Age'].astype('int8')

In [None]:
# Downcast float64 columns to float32 in both frames.
# The columns are selected from `test`, so a float64 column that exists only
# in `train` is left untouched.
num_cols = test.select_dtypes(include='float64').columns

# One cast per frame is enough: the previous per-column loop never used its
# loop variable and re-cast every column on each iteration.
float32_cast = dict.fromkeys(num_cols, 'float32')
train = train.astype(float32_cast)
test = test.astype(float32_cast)

In [None]:
# Re-inspect dtypes and memory footprint after the downcasting cells above
train.info()

In [None]:
# Binary encoding for the Sex column: male -> 1, female -> 0
mapping = {'male': 1, 'female': 0}

# Apply the same encoding to both frames and store the result as int8
for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(mapping).astype('int8')

In [None]:
def add_statistical_features(df, features):
    """Return a copy of ``df`` extended with per-row summary statistics.

    Parameters
    ----------
    df : DataFrame
        Input frame; it is not modified.
    features : list of column labels
        Columns over which each row-wise statistic is computed.

    Returns
    -------
    DataFrame
        Copy of ``df`` with the columns ``row_mean``, ``row_max``, ``row_min``,
        ``row_median``, ``row_skew`` and ``row_unique`` set, in that order.
    """
    selected = df[features]

    # Every statistic is taken across columns (axis=1), i.e. one value per row
    row_stats = {
        "row_mean": selected.mean(axis=1),
        "row_max": selected.max(axis=1),
        "row_min": selected.min(axis=1),
        "row_median": selected.median(axis=1),
        "row_skew": selected.skew(axis=1),
        "row_unique": selected.nunique(axis=1),
    }

    enriched = df.copy()
    for column_name, values in row_stats.items():
        enriched[column_name] = values
    return enriched

In [None]:
# Base input columns; used for the pairwise interaction features and passed to
# the row-wise statistic features
features = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

In [None]:
def add_features(df, cols=None):
    """Add pairwise arithmetic interaction columns to ``df`` in place.

    For every unordered pair ``(c1, c2)`` of the selected columns, five new
    columns are written: ``c1*c2``, ``c1+c2``, ``c1-c2``, ``c1/c2`` and
    ``c2/c1``.

    Parameters
    ----------
    df : DataFrame
        Frame to extend. It is modified in place and also returned.
    cols : sequence of column labels, optional
        Columns to combine. Defaults to the module-level ``features`` list,
        which keeps existing ``add_features(df)`` calls working unchanged.

    Returns
    -------
    DataFrame
        The same ``df`` object, with the interaction columns added.
    """
    if cols is None:
        cols = features
    cols = list(cols)

    for i, c1 in enumerate(cols):
        # Only pair c1 with the columns after it so each pair is visited once
        for c2 in cols[i + 1:]:
            df[f'{c1}*{c2}'] = df[c1] * df[c2]
            df[f'{c1}+{c2}'] = df[c1] + df[c2]
            df[f'{c1}-{c2}'] = df[c1] - df[c2]
            # The small constant keeps the ratio finite when the denominator is 0
            df[f'{c1}/{c2}'] = df[c1] / (df[c2] + 1e-3)
            df[f'{c2}/{c1}'] = df[c2] / (df[c1] + 1e-3)
    return df

In [None]:
def feature_engineer(df):
    """Add hand-crafted exercise features to ``df`` in place and return it.

    Reads the columns ``Sex``, ``Age``, ``Height``, ``Weight``, ``Duration``,
    ``Heart_Rate`` and ``Body_Temp`` and writes a set of derived columns
    (BMI, heart-rate ratios, MET-style estimates, exertion scores, square
    roots, and a sex-specific calorie proxy).

    ``Sex`` may be either the encoded value (0 = female, 1 = male) or the raw
    string label; both are recognised when selecting the calorie-proxy formula.

    Parameters
    ----------
    df : DataFrame
        Frame to extend. It is modified in place.

    Returns
    -------
    DataFrame
        The same ``df`` object with the new columns added.
    """
    # Height is divided by 100 before squaring (presumably cm -> m; confirm units)
    df['BMI'] = df['Weight'] / (df['Height']/100) ** 2

    # Age-based maximum heart-rate estimate, used to normalise Heart_Rate
    max_heart_rate = 207 - (0.7 * df['Age'])
    df['HR_Ratio'] = df['Heart_Rate'] / max_heart_rate
    df['HR_Reserve'] = df['HR_Ratio'] * df['Duration']
    df['Thermal_Load'] = df['Body_Temp'] * df['Duration'] * df['HR_Ratio']

    # Duration is capped at 180 for this feature only
    df['Anaerobic_Contribution'] = df['HR_Ratio']**2 * np.minimum(df['Duration'], 180) * df['Weight'] * 0.05

    met_estimate = (df['Heart_Rate'] / max_heart_rate) * 15
    df['MET_Calories_Rate'] = met_estimate * 3.5 * df['Weight'] / 200

    df['EPOC_Factor'] = df['HR_Ratio']**2 * np.log1p(df['Duration']/10)
    df['MET_Calories_Total'] = df['MET_Calories_Rate'] * df['Duration']

    df['Exercise_Intensity_Index'] = df['Heart_Rate'] * df['Body_Temp'] / (df['Weight'] * df['Age'])
    df['Exercise_Economy'] = df['Heart_Rate'] / (df['Weight']**0.75)

    df['exertion_score'] = df['Duration'] * df['Heart_Rate'] * df['Body_Temp']
    df['Age_Adjusted_Exertion'] = df['exertion_score'] / (1 + 0.01 * (df['Age'] - 30))

    for col in ['Duration', 'Heart_Rate', 'Body_Temp']:
        df[f'Sqrt_{col}'] = np.sqrt(df[col])

    df['body_theta'] = np.arctan2(df['Weight'], df['Height'])

    # Sex is label-encoded (female -> 0) before this function is called, so a
    # comparison against the string 'female' alone never matches and every row
    # would get the male formula. Accept both the encoded and the raw label.
    is_female = (df['Sex'] == 0) | (df['Sex'] == 'female')

    # NOTE(review): the coefficients look like a heart-rate based energy
    # expenditure equation, but both branches share the same intercept and a
    # negative weight term -- confirm against the intended reference formula.
    df['CB_Proxy'] = np.where(
        is_female,
        df['Duration'] * (0.4472 * df['Heart_Rate'] - 0.1263 * df['Weight'] + 0.074 * df['Age'] -  55.0969) / 4.184,
        df['Duration'] * (0.6309 * df['Heart_Rate'] - 0.1988 * df['Weight'] + 0.2017 * df['Age'] -  55.0969) / 4.184,
    )

    return df

In [None]:
# Split the target off a copy of the training frame and model it on the log1p scale
X = train.copy()
y = np.log1p(X.pop('Calories'))

# Training features: pairwise interactions -> hand-crafted features -> row statistics
X = (
    X.pipe(add_features)
     .pipe(feature_engineer)
     .pipe(add_statistical_features, features=features)
)

# The test frame goes through exactly the same three steps, in the same order
X_test = (
    test.copy()
        .pipe(add_features)
        .pipe(feature_engineer)
        .pipe(add_statistical_features, features=features)
)

In [None]:
# K-fold cross-validated XGBoost on the log1p-transformed target.
# Produces out-of-fold predictions (`oof_xgb`, log scale) and fold-averaged
# test predictions (`test_xgb`, converted back to the original scale at the end).
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=100)

oof_xgb = np.zeros(len(y))
test_xgb = np.zeros(len(X_test))

params = {
    'n_estimators': 2000, 
    'learning_rate': 0.012429613949090264, 
    'max_depth': 9, 
    'min_child_weight': 11, 
    'alpha': 0.031799792818970524, 
    'lambda': 0.3580931274078332, 
    'subsample': 0.8416119186946701, 
    'colsample_bytree': 0.5066402912386329, 
    'gamma': 1.037011350094906e-08,
    'grow_policy': 'lossguide'
}

for fold, (train_index, valid_index) in enumerate(kf.split(X, y), start=1):
    X_train, X_valid = X.iloc[train_index].copy(), X.iloc[valid_index].copy()
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    xgb = XGBRegressor(
        **params,
        early_stopping_rounds=100,
        # 'gpu_hist' is deprecated; 'hist' together with device='cuda' is the
        # supported way to request GPU training.
        tree_method='hist',
        device='cuda',
        seed=100,
        objective='reg:squarederror',
        # The target is already log1p-transformed, so plain RMSE on this scale
        # is the RMSLE on the original scale. 'rmsle' here would apply the log
        # a second time and drive early stopping with the wrong metric.
        eval_metric='rmse',
    ).fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0)

    valid_pred = xgb.predict(X_valid)
    oof_xgb[valid_index] = valid_pred
    # Average the fold models' test predictions on the log scale
    test_xgb += xgb.predict(X_test) / n_folds

    # Score on the original scale: undo log1p on both target and prediction
    fold_rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(valid_pred)))
    print(f"Fold {fold} RMSLE: {fold_rmsle:.6f}")

# Back to the original scale, then clip predictions to the range [1, 314]
test_xgb = np.expm1(test_xgb)
test_xgb = np.clip(test_xgb, 1, 314)

overall_rmsle = np.sqrt(mean_squared_log_error(np.expm1(y), np.expm1(oof_xgb)))
print(f"\nOverall OOF RMSLE: {overall_rmsle:.6f}")   

In [None]:
# Destinations for the pickled out-of-fold and test predictions
out_path_oof = "/kaggle/working/oof_xgb.pkl"
out_path_test = "/kaggle/working/test_xgb.pkl"

# Persist both arrays with joblib, OOF first and test second
for payload, destination in ((oof_xgb, out_path_oof), (test_xgb, out_path_test)):
    joblib.dump(payload, destination)

print(f"OOF predictions saved to: {out_path_oof}")

In [None]:
# Fill the sample submission's Calories column with the test predictions.
# This relies on the sample submission rows being in the same order as `test`.
sub = pd.read_csv('/kaggle/input/playground-series-s5e5/sample_submission.csv')
sub = sub.assign(Calories=test_xgb)

sub.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
print(sub.head())