In [1]:
%load_ext cudf.pandas
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Setting Matplotlib defaults
plt.style.use('seaborn-v0_8')
plt.rc('figure', figsize=(8,5), dpi=180)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=15, titlepad=10)
plt.rc('animation', html='html5')
plt.tight_layout()

import warnings
warnings.simplefilter('ignore')

pd.set_option('display.max_columns', 500)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<Figure size 1440x900 with 0 Axes>

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s5e5/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/playground-series-s5e5/test.csv', index_col='id')

org = pd.read_csv("/kaggle/input/calories-burnt-prediction/calories.csv", index_col='User_ID')
org = org.rename(columns={'Gender': 'Sex'})

In [3]:
train = pd.concat([train, org], ignore_index=True)

In [4]:
strt_cols = list(test.columns)
print(train.drop_duplicates(subset=strt_cols + ['Calories']).shape, train.shape)

(762107, 8) (765000, 8)


# Data Understanding

In [5]:
train.shape

(765000, 8)

In [6]:
test.shape

(250000, 7)

In [7]:
train.head(10)

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,female,38,166.0,61.0,25.0,102.0,40.6,146.0
5,female,26,156.0,56.0,19.0,100.0,40.5,103.0
6,female,21,172.0,73.0,3.0,81.0,38.3,9.0
7,male,46,188.0,94.0,23.0,100.0,40.8,145.0
8,female,33,166.0,63.0,25.0,107.0,40.5,161.0
9,male,65,185.0,88.0,23.0,104.0,41.0,185.0


In [8]:
train.describe()

Unnamed: 0,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
count,765000.0,765000.0,765000.0,765000.0,765000.0,765000.0,765000.0
mean,41.447255,174.693126,75.142162,15.423163,95.484672,40.036041,88.307424
std,15.213677,12.854173,14.004122,8.353421,9.452476,0.779863,62.39676
min,20.0,123.0,36.0,1.0,67.0,37.1,1.0
25%,28.0,164.0,63.0,8.0,88.0,39.6,34.0
50%,40.0,174.0,74.0,15.0,95.0,40.3,77.0
75%,52.0,185.0,87.0,23.0,103.0,40.7,136.0
max,79.0,222.0,132.0,30.0,128.0,41.5,314.0


In [9]:
train.isnull().sum()

Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64

In [10]:
train.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 765000 entries, 0 to 764999
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   Sex         765000 non-null  object
 1   Age         765000 non-null  int64
 2   Height      765000 non-null  float64
 3   Weight      765000 non-null  float64
 4   Duration    765000 non-null  float64
 5   Heart_Rate  765000 non-null  float64
 6   Body_Temp   765000 non-null  float64
 7   Calories    765000 non-null  float64
dtypes: float64(6), int64(1), object(1)
memory usage: 47.4+ MB


# Data Preprocessing

## Reducing memory usage

In [11]:
train['Sex'] = train['Sex'].astype('category')
test['Sex'] = test['Sex'].astype('category')

In [12]:
train['Age'] = train['Age'].astype('int8')
test['Age'] = test['Age'].astype('int8')

In [13]:
num_cols = test.select_dtypes(include='float64').columns
for col in num_cols:
    train[num_cols] = train[num_cols].astype('float32')
    test[num_cols] = test[num_cols].astype('float32')

In [14]:
train.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 765000 entries, 0 to 764999
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   Sex         765000 non-null  category
 1   Age         765000 non-null  int8
 2   Height      765000 non-null  float32
 3   Weight      765000 non-null  float32
 4   Duration    765000 non-null  float32
 5   Heart_Rate  765000 non-null  float32
 6   Body_Temp   765000 non-null  float32
 7   Calories    765000 non-null  float64
dtypes: category(1), float32(5), float64(1), int8(1)
memory usage: 21.9 MB


In [15]:
from sklearn.preprocessing import KBinsDiscretizer

kmeans = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
train['age_bins'] = kmeans.fit_transform(train[['Age']])
test['age_bins'] = kmeans.transform(test[['Age']])

# Feature Understanding via Data Visualization

In [16]:
mapping = {'male': 1, 'female': 0}

train['Sex'] = train['Sex'].map(mapping)
test['Sex'] = test['Sex'].map(mapping)

In [17]:
train.head()

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories,age_bins
0,1,36,189.0,82.0,26.0,101.0,41.0,150.0,1.0
1,0,64,163.0,60.0,8.0,85.0,39.700001,34.0,3.0
2,0,51,161.0,64.0,7.0,84.0,39.799999,29.0,2.0
3,1,20,192.0,90.0,25.0,105.0,40.700001,140.0,0.0
4,0,38,166.0,61.0,25.0,102.0,40.599998,146.0,1.0


In [18]:
features = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

In [19]:
def add_features(df):
    for i, c1 in enumerate(features):
        for c2 in (features[i+1:]):
            m = f'{c1}*{c2}'
            df[m] = df[c1] * df[c2]  
    return df

In [20]:
def feature_engineer(df):
    df['BMI'] = df['Weight'] / ((df['Height']/100) ** 2)
    df['Heart_Rate_Per_Minute'] = df['Heart_Rate'] / df['Duration']

    return df

In [21]:
train.columns

Index(['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp',
       'Calories', 'age_bins'],
      dtype='object')

In [22]:
X = train.copy()
y = X.pop('Calories')
y = np.log1p(y)
#X = add_features(X)
#X = feature_engineer(X)

X_test = test.copy()
#X_test = add_features(X_test)
#X_test = feature_engineer(X_test)

In [23]:
from sklearn.model_selection import cross_val_score
import optuna
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from catboost import CatBoostRegressor

n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=100)

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 500, 2900),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 3, 12),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.5, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0.5, 1.0),
        'eval_metric': 'RMSE',
        'task_type':'GPU',
        'verbose': 0
    }

    scores = []
    for fold, (train_index, valid_index) in enumerate(kf.split(X, y), start=1):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
        cat = CatBoostRegressor(**params, early_stopping_rounds=100, random_state=100).fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0, use_best_model=True)
        cat_pred = cat.predict(X_valid)
        fold_rmse = np.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(cat_pred)))
        scores.append(fold_rmse)

    return np.mean(scores)
    
    
study = optuna.create_study(direction='minimize', study_name='CAT-RMSE-Optimization')
study.optimize(objective, n_trials=10)

print(f'Best cross-validation RMSE: {study.best_value:,.5f}')
print(f'Best parameters: {study.best_params}')

[I 2025-05-08 09:25:38,011] A new study created in memory with name: CAT-RMSE-Optimization
[I 2025-05-08 09:26:13,480] Trial 0 finished with value: 0.0623805978037687 and parameters: {'iterations': 1505, 'learning_rate': 0.013901587038867975, 'depth': 3, 'l2_leaf_reg': 0.3170921600345826, 'bagging_temperature': 0.5490630916925989, 'random_strength': 0.8148218310839448}. Best is trial 0 with value: 0.0623805978037687.
[I 2025-05-08 09:26:58,080] Trial 1 finished with value: 0.06033036369266755 and parameters: {'iterations': 1965, 'learning_rate': 0.033023582035792566, 'depth': 3, 'l2_leaf_reg': 1.6717513001217132, 'bagging_temperature': 0.9974144133429168, 'random_strength': 0.8216132788958227}. Best is trial 1 with value: 0.06033036369266755.
[I 2025-05-08 09:27:48,810] Trial 2 finished with value: 0.0597195916994358 and parameters: {'iterations': 2269, 'learning_rate': 0.06953986698642552, 'depth': 3, 'l2_leaf_reg': 0.00931025364693324, 'bagging_temperature': 0.7893703217772643, 'rand

Best cross-validation RMSE: 0.05887
Best parameters: {'iterations': 2233, 'learning_rate': 0.03813511357889326, 'depth': 9, 'l2_leaf_reg': 5.101121281815585, 'bagging_temperature': 0.6263562727371863, 'random_strength': 0.6489211175248135}


In [24]:
final_model = CatBoostRegressor(**study.best_params).fit(X, y)
test_cat = final_model.predict(X_test)
test_cat  = np.expm1(test_cat)
test_cat = np.clip(test_cat, 1, 314)

sub = pd.read_csv('/kaggle/input/playground-series-s5e5/sample_submission.csv')
sub['Calories'] = test_cat
sub.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
print(sub.head(6))

0:	learn: 0.9278874	total: 147ms	remaining: 5m 27s
1:	learn: 0.8940540	total: 237ms	remaining: 4m 24s
2:	learn: 0.8614762	total: 323ms	remaining: 4m
3:	learn: 0.8302523	total: 426ms	remaining: 3m 57s
4:	learn: 0.8001913	total: 492ms	remaining: 3m 39s
5:	learn: 0.7711851	total: 555ms	remaining: 3m 26s
6:	learn: 0.7434264	total: 610ms	remaining: 3m 14s
7:	learn: 0.7166303	total: 667ms	remaining: 3m 5s
8:	learn: 0.6908561	total: 747ms	remaining: 3m 4s
9:	learn: 0.6660364	total: 815ms	remaining: 3m 1s
10:	learn: 0.6421575	total: 872ms	remaining: 2m 56s
11:	learn: 0.6191998	total: 927ms	remaining: 2m 51s
12:	learn: 0.5971650	total: 983ms	remaining: 2m 47s
13:	learn: 0.5758494	total: 1.04s	remaining: 2m 44s
14:	learn: 0.5553486	total: 1.1s	remaining: 2m 42s
15:	learn: 0.5356220	total: 1.15s	remaining: 2m 39s
16:	learn: 0.5167154	total: 1.2s	remaining: 2m 37s
17:	learn: 0.4984791	total: 1.26s	remaining: 2m 35s
18:	learn: 0.4808482	total: 1.31s	remaining: 2m 33s
19:	learn: 0.4639695	total: 1.3