In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
# Matplotlib defaults shared by every figure in the notebook
plt.style.use('seaborn-v0_8')
# autolayout=True makes each figure apply tight_layout when it is drawn.
# A bare plt.tight_layout() call here, with no figure open, only creates
# an empty figure and does not affect later plots.
plt.rc('figure', figsize=(8,5), dpi=150, autolayout=True)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=15, titlepad=10)
plt.rc('animation', html='html5')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import warnings

# Print the full path of every file available under the read-only input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Silence all warnings for the rest of the session
warnings.filterwarnings("ignore")

# Let wide frames display up to 500 columns
pd.set_option('display.max_columns', 500)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e5/sample_submission.csv
/kaggle/input/playground-series-s5e5/train.csv
/kaggle/input/playground-series-s5e5/test.csv
/kaggle/input/calories-burnt-prediction/calories.csv


<Figure size 1200x750 with 0 Axes>

In [2]:
# Competition train/test files, indexed by their 'id' column
train = pd.read_csv('/kaggle/input/playground-series-s5e5/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/playground-series-s5e5/test.csv', index_col='id')

# External dataset: indexed by 'User_ID', with 'Gender' renamed to 'Sex'
org = (
    pd.read_csv('/kaggle/input/calories-burnt-prediction/calories.csv', index_col='User_ID')
    .rename(columns={'Gender': 'Sex'})
)

In [3]:
# Stack the external rows under the competition rows and renumber the index.
# Note: `train` is rebound, so re-running this cell appends `org` again.
frames = [train, org]
train = pd.concat(frames, ignore_index=True)

# Data Understanding

In [4]:
# Number of rows and columns in the test frame
test.shape

(250000, 7)

In [5]:
# Preview the first ten rows of the combined (competition + external) training frame
train.head(10)

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,female,38,166.0,61.0,25.0,102.0,40.6,146.0
5,female,26,156.0,56.0,19.0,100.0,40.5,103.0
6,female,21,172.0,73.0,3.0,81.0,38.3,9.0
7,male,46,188.0,94.0,23.0,100.0,40.8,145.0
8,female,33,166.0,63.0,25.0,107.0,40.5,161.0
9,male,65,185.0,88.0,23.0,104.0,41.0,185.0


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
train.describe()

Unnamed: 0,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
count,765000.0,765000.0,765000.0,765000.0,765000.0,765000.0,765000.0
mean,41.447255,174.693126,75.142162,15.423163,95.484672,40.036041,88.307424
std,15.213677,12.854173,14.004122,8.353421,9.452476,0.779863,62.39676
min,20.0,123.0,36.0,1.0,67.0,37.1,1.0
25%,28.0,164.0,63.0,8.0,88.0,39.6,34.0
50%,40.0,174.0,74.0,15.0,95.0,40.3,77.0
75%,52.0,185.0,87.0,23.0,103.0,40.7,136.0
max,79.0,222.0,132.0,30.0,128.0,41.5,314.0


In [7]:
# Count missing values per column
train.isnull().sum()

Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64

In [8]:
# Column dtypes, non-null counts and memory footprint of the training frame
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 765000 entries, 0 to 764999
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Sex         765000 non-null  object 
 1   Age         765000 non-null  int64  
 2   Height      765000 non-null  float64
 3   Weight      765000 non-null  float64
 4   Duration    765000 non-null  float64
 5   Heart_Rate  765000 non-null  float64
 6   Body_Temp   765000 non-null  float64
 7   Calories    765000 non-null  float64
dtypes: float64(6), int64(1), object(1)
memory usage: 46.7+ MB


# Data Preprocessing

## Reducing memory usage

In [9]:
# Store Age as int8 in both frames to save memory
for frame in (train, test):
    frame['Age'] = frame['Age'].astype('int8')

In [10]:
# float64 columns of the test frame; the same columns are downcast in both frames
num_cols = test.select_dtypes(include='float64').columns

# A single cast handles every column at once. The previous per-column loop
# never used its loop variable and recast all columns on each iteration.
float32_map = dict.fromkeys(num_cols, 'float32')
train = train.astype(float32_map)
test = test.astype(float32_map)

In [11]:
# Re-inspect dtypes and memory usage after the downcasting cells above
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 765000 entries, 0 to 764999
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Sex         765000 non-null  object 
 1   Age         765000 non-null  int8   
 2   Height      765000 non-null  float32
 3   Weight      765000 non-null  float32
 4   Duration    765000 non-null  float32
 5   Heart_Rate  765000 non-null  float32
 6   Body_Temp   765000 non-null  float32
 7   Calories    765000 non-null  float64
dtypes: float32(5), float64(1), int8(1), object(1)
memory usage: 27.0+ MB


# Feature Encoding and Model Training

In [12]:
# Binary encoding of Sex: male -> 1, female -> 0
mapping = dict(male=1, female=0)

# Apply the same encoding to both frames and store it as int8
for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(mapping).astype('int8')

In [13]:
# Features: every training column except the target
X = train.drop(columns='Calories')
# Target is modelled in log1p space
y = np.log1p(train['Calories'])
# Independent copy of the test features
X_test = test.copy()

In [14]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import optuna

# Shuffled K-fold split with a fixed seed
n_folds = 5
kf = KFold(random_state=100, shuffle=True, n_splits=n_folds)

# Zero-initialised buffers: out-of-fold predictions (one per training row)
# and accumulated test predictions (one per test row)
oof_dnn = np.zeros(y.shape[0])
test_dnn = np.zeros(X_test.shape[0])

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, BatchNormalization, Dropout, GaussianDropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam, SGD, Nadam
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping


2025-05-30 07:49:57.951744: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748591398.132832      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748591398.187986      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [16]:
def build_model(input_shape=None):
    """Build the uncompiled fully connected regression network.

    Architecture: Input -> BatchNormalization -> four Dense(relu) blocks,
    each followed by BatchNormalization and Dropout -> Dense(1, linear).

    Parameters
    ----------
    input_shape : tuple of int, optional
        Shape of a single sample, e.g. ``(n_features,)``. When omitted, the
        notebook-level ``input_shape`` variable is used, so existing
        ``build_model()`` calls keep working.

    Returns
    -------
    Sequential
        A new, uncompiled Keras model.

    Raises
    ------
    NameError
        If no shape is passed and no notebook-level ``input_shape`` exists.
    """
    if input_shape is None:
        # Backward-compatible fallback to the variable set by the training loop.
        try:
            input_shape = globals()['input_shape']
        except KeyError:
            raise NameError("name 'input_shape' is not defined") from None

    # (units, dropout rate) of each hidden block, in order
    hidden_blocks = [(450, 0.25), (250, 0.1), (125, 0.1), (300, 0.05)]

    layers = [Input(shape=input_shape), BatchNormalization()]
    for units, rate in hidden_blocks:
        layers += [
            Dense(units, activation='relu'),
            BatchNormalization(),
            Dropout(rate=rate),
        ]
    # Single linear output unit for regression
    layers.append(Dense(1, activation='linear'))

    return Sequential(layers)

In [17]:
def make_callbacks():
    """Create a fresh set of training callbacks for one ``fit`` call.

    All three callbacks monitor the validation RMSE metric.

    Returns
    -------
    list
        ``[ReduceLROnPlateau, EarlyStopping, ModelCheckpoint]`` in that order.
    """
    monitored = 'val_root_mean_squared_error'

    # Halve the learning rate after 4 epochs without improvement, floor at 1e-4
    reduce_lr = ReduceLROnPlateau(
        monitor=monitored,
        factor=0.5,
        patience=4,
        verbose=1,
        min_lr=1e-4,
    )

    # Stop after 20 epochs without improvement and restore the best weights
    stopper = EarlyStopping(
        monitor=monitored,
        mode='min',
        patience=20,
        restore_best_weights=True,
    )

    # Keep only the best model on disk (the same path is reused on every call)
    saver = ModelCheckpoint(
        filepath="/kaggle/working/best_model.keras",
        monitor=monitored,
        mode='min',
        save_best_only=True,
        verbose=0,
    )

    return [reduce_lr, stopper, saver]

In [18]:
# Seed Python, NumPy and TensorFlow so weight init and dropout are repeatable
# (some GPU kernels may still be nondeterministic).
tf.keras.utils.set_random_seed(100)

# Reset the accumulators so re-running this cell never double-counts folds.
oof_dnn = np.zeros(len(y))
test_dnn = np.zeros(len(test))

for fold, (train_idx, valid_idx) in enumerate(kf.split(X), start=1):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # Fit the scaler on the training fold only, then apply it to the
    # validation fold and to a fresh copy of the test features.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)
    X_test = scaler.transform(test.copy())

    # build_model() reads this notebook-level variable
    input_shape = (X_train.shape[1], )

    dnn = build_model()
    optimizer = Adam(learning_rate=0.0005)
    rmse = RootMeanSquaredError()

    dnn.compile(optimizer=optimizer, loss='mse', metrics=[rmse])

    dnn.fit(
        X_train, y_train,
        validation_data=(X_valid, y_valid),
        batch_size=250,
        epochs=30,
        callbacks=make_callbacks(),
        verbose=2
    )

    # Predictions stay in log1p space; they are inverted only for scoring
    y_pred = dnn.predict(X_valid, batch_size=256, verbose=2).flatten()
    oof_dnn[valid_idx] = y_pred

    test_dnn += dnn.predict(X_test, batch_size=256, verbose=2).flatten()

    # mean_squared_log_error expects (y_true, y_pred) and raises on
    # out-of-domain (negative) inputs, so log-space predictions are floored
    # at 0 before inverting; the stored predictions are left untouched.
    rmsle = np.sqrt(mean_squared_log_error(
        np.expm1(y_valid), np.expm1(np.clip(y_pred, 0, None))
    ))
    print(f'Fold: {fold} RMSLE: {rmsle:,.6f}')

overall_rmsle = np.sqrt(mean_squared_log_error(
    np.expm1(y), np.expm1(np.clip(oof_dnn, 0, None))
))
print(f"\nOverall OOF RMSLE: {overall_rmsle:.6f}")

# Average the summed per-fold test predictions
test_dnn /= n_folds

I0000 00:00:1748591410.121258      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1748591410.121921      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


Epoch 1/30


I0000 00:00:1748591415.744380      63 service.cc:148] XLA service 0x7ab108009900 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1748591415.744874      63 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1748591415.744919      63 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1748591416.198578      63 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1748591419.703165      63 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


2448/2448 - 15s - 6ms/step - loss: 0.9134 - root_mean_squared_error: 0.9557 - val_loss: 0.0113 - val_root_mean_squared_error: 0.1062 - learning_rate: 5.0000e-04
Epoch 2/30
2448/2448 - 6s - 3ms/step - loss: 0.0945 - root_mean_squared_error: 0.3074 - val_loss: 0.0093 - val_root_mean_squared_error: 0.0965 - learning_rate: 5.0000e-04
Epoch 3/30
2448/2448 - 6s - 3ms/step - loss: 0.0499 - root_mean_squared_error: 0.2234 - val_loss: 0.0076 - val_root_mean_squared_error: 0.0874 - learning_rate: 5.0000e-04
Epoch 4/30
2448/2448 - 6s - 2ms/step - loss: 0.0346 - root_mean_squared_error: 0.1859 - val_loss: 0.0052 - val_root_mean_squared_error: 0.0723 - learning_rate: 5.0000e-04
Epoch 5/30
2448/2448 - 6s - 2ms/step - loss: 0.0270 - root_mean_squared_error: 0.1642 - val_loss: 0.0061 - val_root_mean_squared_error: 0.0778 - learning_rate: 5.0000e-04
Epoch 6/30
2448/2448 - 6s - 2ms/step - loss: 0.0236 - root_mean_squared_error: 0.1535 - val_loss: 0.0057 - val_root_mean_squared_error: 0.0758 - learning_r

In [19]:
# Destination files for the pickled prediction arrays
out_path_oof = "/kaggle/working/oof_dnn.pkl"
out_path_test = "/kaggle/working/test_dnn.pkl"

# Persist the out-of-fold and test predictions with joblib
for array, path in ((oof_dnn, out_path_oof), (test_dnn, out_path_test)):
    joblib.dump(array, path)

print(f"OOF predictions saved to: {out_path_oof}")

OOF predictions saved to: /kaggle/working/oof_dnn.pkl


In [20]:
# Also store both prediction arrays via np.save in the working directory
for stem, array in (("oof_dnn", oof_dnn), ("test_dnn", test_dnn)):
    np.save(stem, array)

In [21]:
# test_dnn holds predictions in log1p space. Invert into a NEW variable so
# re-running this cell never applies expm1 twice, then clamp to the target
# range seen in the training data (min 1, max 314).
test_pred = np.clip(np.expm1(test_dnn), 1, 314)

sub = pd.read_csv('/kaggle/input/playground-series-s5e5/sample_submission.csv')

# Predictions follow the row order of `test` (indexed by id); refuse to write
# a submission whose ids are in a different order.
if not np.array_equal(sub['id'].to_numpy(), test.index.to_numpy()):
    raise ValueError("sample_submission ids do not match the order of the test set")

sub['Calories'] = test_pred
sub.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
print(sub.head(5))

Your submission was successfully saved!
       id    Calories
0  750000   27.612205
1  750001  107.759952
2  750002   87.328203
3  750003  126.014787
4  750004   75.982370
