In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
%load_ext cudf.pandas
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Setting Matplotlib defaults
plt.style.use('seaborn-v0_8')
plt.rc('figure', figsize=(8,5), dpi=220)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=15, titlepad=10)
plt.rc('animation', html='html5')
plt.tight_layout()
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import warnings
warnings.simplefilter('ignore')

pd.set_option('display.max_columns', 500)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e5/sample_submission.csv
/kaggle/input/playground-series-s5e5/train.csv
/kaggle/input/playground-series-s5e5/test.csv


<Figure size 1760x1100 with 0 Axes>

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s5e5/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/playground-series-s5e5/test.csv', index_col='id')

# Data Understanding

In [3]:
train.shape

(750000, 8)

In [4]:
test.shape

(250000, 7)

In [5]:
train.head(10)

Unnamed: 0_level_0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,female,38,166.0,61.0,25.0,102.0,40.6,146.0
5,female,26,156.0,56.0,19.0,100.0,40.5,103.0
6,female,21,172.0,73.0,3.0,81.0,38.3,9.0
7,male,46,188.0,94.0,23.0,100.0,40.8,145.0
8,female,33,166.0,63.0,25.0,107.0,40.5,161.0
9,male,65,185.0,88.0,23.0,104.0,41.0,185.0


In [6]:
train.describe()

Unnamed: 0,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
count,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0
mean,41.420404,174.697685,75.145668,15.421015,95.483995,40.036253,88.282781
std,15.175049,12.824496,13.982704,8.354095,9.449845,0.779875,62.395349
min,20.0,126.0,36.0,1.0,67.0,37.1,1.0
25%,28.0,164.0,63.0,8.0,88.0,39.6,34.0
50%,40.0,174.0,74.0,15.0,95.0,40.3,77.0
75%,52.0,185.0,87.0,23.0,103.0,40.7,136.0
max,79.0,222.0,132.0,30.0,128.0,41.5,314.0


In [7]:
train.isnull().sum()

Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64

In [8]:
train.info()

<class 'cudf.core.dataframe.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   Sex         750000 non-null  object
 1   Age         750000 non-null  int64
 2   Height      750000 non-null  float64
 3   Weight      750000 non-null  float64
 4   Duration    750000 non-null  float64
 5   Heart_Rate  750000 non-null  float64
 6   Body_Temp   750000 non-null  float64
 7   Calories    750000 non-null  float64
dtypes: float64(6), int64(1), object(1)
memory usage: 52.2+ MB


# Data Preprocessing

## Reducing memory usage

In [9]:
train['Sex'] = train['Sex'].astype('category')
test['Sex'] = test['Sex'].astype('category')

In [10]:
train['Age'] = train['Age'].astype('int8')
test['Age'] = test['Age'].astype('int8')

In [11]:
num_cols = test.select_dtypes(include='float64').columns
for col in num_cols:
    train[num_cols] = train[num_cols].astype('float32')
    test[num_cols] = test[num_cols].astype('float32')

In [12]:
train.info()

<class 'cudf.core.dataframe.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   Sex         750000 non-null  category
 1   Age         750000 non-null  int8
 2   Height      750000 non-null  float32
 3   Weight      750000 non-null  float32
 4   Duration    750000 non-null  float32
 5   Heart_Rate  750000 non-null  float32
 6   Body_Temp   750000 non-null  float32
 7   Calories    750000 non-null  float64
dtypes: category(1), float32(5), float64(1), int8(1)
memory usage: 27.2 MB


In [13]:
from sklearn.preprocessing import KBinsDiscretizer

kmeans = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='kmeans')
train['age_bins'] = kmeans.fit_transform(train[['Age']])
test['age_bins'] = kmeans.transform(test[['Age']])

# Feature Understanding via Data Visualization

In [14]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train['Sex'] = le.fit_transform(train['Sex']).astype('int8')
test['Sex'] = le.transform(test['Sex']).astype('int8')

In [15]:
train.columns

Index(['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp',
       'Calories', 'age_bins'],
      dtype='object')

In [16]:
features = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

In [17]:
def add_features(df):
    for i, c1 in enumerate(features):
        for c2 in (features[i+1:]):
            m = f'{c1}*{c2}'
            df[m] = df[c1] * df[c2]
    return df

In [18]:
def feature_transform(df):
    df['duration_expo'] = df['Duration'] ** 2
    df['duration_log'] = np.log1p(df['Duration'])

    df['heart_expo'] = df['Heart_Rate'] ** 2
    df['heart_log'] = np.log1p(df['Heart_Rate'])

    df['body_expo'] = df['Body_Temp'] ** 2
    df['body_log'] = np.log1p(df['Body_Temp'] ** 2)


    df['BMI'] = df['Weight'] / (df['Height'] ** 2)
    
    return df

In [19]:
train.columns

Index(['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp',
       'Calories', 'age_bins'],
      dtype='object')

In [20]:
X = train.copy()
y = X.pop('Calories')
y = np.log1p(y)
X = add_features(X)
X = feature_transform(X)

X_test = test.copy()
X_test = add_features(X_test)
X_test = feature_transform(X_test)

In [21]:
from sklearn.model_selection import cross_val_score
import optuna
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from xgboost import XGBRegressor

n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=100)

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'eval_metric': 'rmsle',
        'tree_method':'gpu_hist',
        'device':'cuda',
    }

    scores = []
    for fold, (train_index, valid_index) in enumerate(kf.split(X, y), start=1):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
        xgb = XGBRegressor(**params, early_stopping_rounds=50, random_state=100).fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0)
        xgb_pred = xgb.predict(X_valid)
        fold_rmse = np.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(xgb_pred)))
        scores.append(fold_rmse)

    return np.mean(scores)
    
    
study = optuna.create_study(direction='minimize', study_name='XGB-RMSE-Optimization')
study.optimize(objective, n_trials=12)

print(f'Best cross-validation RMSE: {study.best_value:,.5f}')
print(f'Best parameters: {study.best_params}')

[I 2025-05-03 07:50:28,462] A new study created in memory with name: XGB-RMSE-Optimization
[I 2025-05-03 07:51:00,663] Trial 0 finished with value: 0.06049205819957685 and parameters: {'n_estimators': 1928, 'learning_rate': 0.09363150579450871, 'max_depth': 10, 'min_child_weight': 1, 'reg_alpha': 0.03075394991081595, 'reg_lambda': 0.2817035407634201, 'subsample': 0.8722826957044424, 'colsample_bytree': 0.6115778352187844, 'gamma': 0.1873131849241073}. Best is trial 0 with value: 0.06049205819957685.
[I 2025-05-03 07:51:58,576] Trial 1 finished with value: 0.06326293506271355 and parameters: {'n_estimators': 1691, 'learning_rate': 0.025471780745498277, 'max_depth': 3, 'min_child_weight': 1, 'reg_alpha': 2.5922706065581647, 'reg_lambda': 0.0278281793098291, 'subsample': 0.7812471806004735, 'colsample_bytree': 0.5070783194777522, 'gamma': 0.4302526142225117}. Best is trial 0 with value: 0.06049205819957685.
[I 2025-05-03 07:53:38,704] Trial 2 finished with value: 0.06275369841862814 and p

Best cross-validation RMSE: 0.05972
Best parameters: {'n_estimators': 1952, 'learning_rate': 0.005090096686488344, 'max_depth': 10, 'min_child_weight': 1, 'reg_alpha': 0.0876804809563427, 'reg_lambda': 9.131432978987068, 'subsample': 0.7835897739791482, 'colsample_bytree': 0.6396309543441028, 'gamma': 0.042073944894950116}


In [22]:
final_model = XGBRegressor(**study.best_params).fit(X, y)
test_xgb = final_model.predict(X_test)
test_xgb  = np.expm1(test_xgb)
y_preds = np.clip(test_xgb, 1, 314)

sub = pd.read_csv('/kaggle/input/playground-series-s5e5/sample_submission.csv')
sub['Calories'] = y_preds
sub.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
print(sub.head(6))

Your submission was successfully saved!
       id    Calories
0  750000   27.415216
1  750001  108.165894
2  750002   87.527657
3  750003  125.835953
4  750004   75.890327
5  750005   21.935183
