In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
%load_ext cudf.pandas
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import cuml

# Setting Matplotlib defaults
plt.style.use('seaborn-v0_8')
plt.rc('figure', figsize=(8,5), dpi=150)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=15, titlepad=10)
plt.rc('animation', html='html5')
plt.tight_layout()
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import warnings
warnings.simplefilter('ignore')

pd.set_option('display.max_columns', 500)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e5/sample_submission.csv
/kaggle/input/playground-series-s5e5/train.csv
/kaggle/input/playground-series-s5e5/test.csv
/kaggle/input/calories-burnt-prediction/calories.csv


<Figure size 1200x750 with 0 Axes>

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s5e5/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/playground-series-s5e5/test.csv', index_col='id')
org = pd.read_csv('/kaggle/input/calories-burnt-prediction/calories.csv', index_col='User_ID')
org = org.rename(columns={'Gender': 'Sex'})

In [3]:
train = pd.concat([train, org], ignore_index=True)

In [4]:
strt_cols = list(test.columns)
print(train.drop_duplicates(subset=strt_cols + ['Calories']).shape, train.shape)

(762107, 8) (765000, 8)


# Data Understanding

In [5]:
test.shape

(250000, 7)

In [6]:
train.head(10)

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,female,38,166.0,61.0,25.0,102.0,40.6,146.0
5,female,26,156.0,56.0,19.0,100.0,40.5,103.0
6,female,21,172.0,73.0,3.0,81.0,38.3,9.0
7,male,46,188.0,94.0,23.0,100.0,40.8,145.0
8,female,33,166.0,63.0,25.0,107.0,40.5,161.0
9,male,65,185.0,88.0,23.0,104.0,41.0,185.0


In [7]:
train.describe()

Unnamed: 0,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
count,765000.0,765000.0,765000.0,765000.0,765000.0,765000.0,765000.0
mean,41.447255,174.693126,75.142162,15.423163,95.484672,40.036041,88.307424
std,15.213677,12.854173,14.004122,8.353421,9.452476,0.779863,62.39676
min,20.0,123.0,36.0,1.0,67.0,37.1,1.0
25%,28.0,164.0,63.0,8.0,88.0,39.6,34.0
50%,40.0,174.0,74.0,15.0,95.0,40.3,77.0
75%,52.0,185.0,87.0,23.0,103.0,40.7,136.0
max,79.0,222.0,132.0,30.0,128.0,41.5,314.0


In [8]:
train.isnull().sum()

Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64

In [9]:
train.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 765000 entries, 0 to 764999
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   Sex         765000 non-null  object
 1   Age         765000 non-null  int64
 2   Height      765000 non-null  float64
 3   Weight      765000 non-null  float64
 4   Duration    765000 non-null  float64
 5   Heart_Rate  765000 non-null  float64
 6   Body_Temp   765000 non-null  float64
 7   Calories    765000 non-null  float64
dtypes: float64(6), int64(1), object(1)
memory usage: 47.4+ MB


# Data Preprocessing

## Reducing memory usage

In [10]:
train['Sex'] = train['Sex'].astype('category')
test['Sex'] = test['Sex'].astype('category')

In [11]:
train['Age'] = train['Age'].astype('int8')
test['Age'] = test['Age'].astype('int8')

In [12]:
num_cols = test.select_dtypes(include='float64').columns
for col in num_cols:
    train[num_cols] = train[num_cols].astype('float32')
    test[num_cols] = test[num_cols].astype('float32')

In [13]:
train.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 765000 entries, 0 to 764999
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   Sex         765000 non-null  category
 1   Age         765000 non-null  int8
 2   Height      765000 non-null  float32
 3   Weight      765000 non-null  float32
 4   Duration    765000 non-null  float32
 5   Heart_Rate  765000 non-null  float32
 6   Body_Temp   765000 non-null  float32
 7   Calories    765000 non-null  float64
dtypes: category(1), float32(5), float64(1), int8(1)
memory usage: 21.9 MB


In [14]:
mapping = {'male': 1, 'female': 0}

train['Sex'] = train['Sex'].map(mapping).astype('int8')
test['Sex'] = test['Sex'].map(mapping).astype('int8')

In [15]:
features = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

In [16]:
def add_features(df):
    for i, c1 in enumerate(features):
        for c2 in (features[i+1:]):
            m = f'{c1}*{c2}'
            df[m] = df[c1] * df[c2]
            #a = f'{c1}+{c2}'
            #df[a] = df[c1] + df[c2]
            d = f'{c1}/{c2}'
            df[d] = df[c1] / (df[c2] + 1e-3)
    return df

In [17]:
def feature_engineer(df):

    df['Max_Heart_Rate'] = 207 - (0.7 * df['Age'])
    df['HR_Ratio'] = df['Heart_Rate'] / df['Max_Heart_Rate']
    df['HR_Reserve'] = (df['Heart_Rate'] / df['Max_Heart_Rate']) * df['Duration']

    df['MET_Estimate'] = (df['Heart_Rate'] / df['Max_Heart_Rate']) * 15
    df['MET_Calories_Rate'] = df['MET_Estimate'] * 3.5 * df['Weight'] / 200
    
    df['MET_Calories_Total'] = df['MET_Calories_Rate'] * df['Duration']
    
    df['Exercise_Intensity_Index'] = df['Heart_Rate'] * df['Body_Temp'] / (df['Weight'] * df['Age'])
    df['Intensity'] = (df['Heart_Rate'] - 60) / df['HR_Reserve']

    df['exertion_score'] = df['Duration'] * df['Heart_Rate'] * df['Body_Temp']
    
    df['BMI'] = df['Weight'] / (df['Height']/100) ** 2

    for col in ['Duration', 'Heart_Rate']:
        df[f'Sqrt_{col}'] = np.sqrt(df[col])
    
    df['body_theta'] = np.arctan2(df['Weight'], df['Height'])
    
    return df

In [18]:
X = train.copy()
y = X.pop('Calories')
y = np.log1p(y)

X = add_features(X)
X = feature_engineer(X)

X_test = test.copy()
X_test = add_features(X_test)
X_test = feature_engineer(X_test)

In [19]:
from sklearn.model_selection import KFold
from cuml.metrics import mean_squared_log_error
from xgboost import XGBRegressor

n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=100)

oof_xgb = np.zeros(len(y))
test_xgb = np.zeros(len(X_test))

for fold, (train_index, valid_index) in enumerate(kf.split(X, y), start=1):
        X_train, X_valid = X.iloc[train_index].copy(), X.iloc[valid_index].copy()
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        
        xgb = XGBRegressor(
            n_estimators=3200, learning_rate= 0.010042341305141641, max_depth=10, 
            min_child_weight=7, alpha= 0.031799792818970524, reg_lambda=0.3580931274078332, 
            subsample=0.8572426766452907, colsample_bytree=0.5066402912386329, gamma=1.037011350094906e-08, 
            early_stopping_rounds=50, tree_method='gpu_hist', device='cuda', seed=100, 
            objective='reg:squarederror', eval_metric='rmse',
        ).fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0)
    
        valid_pred = xgb.predict(X_valid)
        oof_xgb[valid_index] = valid_pred
        test_xgb += xgb.predict(X_test) / n_folds
    
        fold_rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(valid_pred)))
        print(f"Fold {fold} RMSE: {fold_rmsle:.6f}")

test_xgb = np.expm1(test_xgb)
test_xgb = np.clip(test_xgb, 1, 314)
overall_rmsle = np.sqrt(mean_squared_log_error(np.expm1(y), np.expm1(oof_xgb)))
print(f"\nOverall OOF RMSLE: {overall_rmsle:.6f}")   

Fold 1 RMSE: 0.058545
Fold 2 RMSE: 0.058850
Fold 3 RMSE: 0.059639
Fold 4 RMSE: 0.059512
Fold 5 RMSE: 0.058857

Overall OOF RMSLE: 0.059082


In [20]:
X = train.copy()
y = X.pop('Calories')
y = np.log1p(y) 
X = add_features(X)
X = feature_engineer(X)
X['Sex'] = X['Sex'].astype('category')

X_test = test.copy()
X_test = add_features(X_test)
X_test = feature_engineer(X_test)
X_test['Sex'] = X_test['Sex'].astype('category')

In [21]:
from catboost import CatBoostRegressor
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import StratifiedKFold

bins = KBinsDiscretizer(n_bins=12, encode='ordinal', strategy='kmeans')
duration_bins = bins.fit_transform(train[['Duration']]).astype(int).flatten()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_cat = np.zeros(len(y))
test_cat = np.zeros(len(X_test))

params = {
    'iterations': 3000, 
    'learning_rate': 0.03813511357889326, 
    'depth': 9, 
    'l2_leaf_reg': 5.101121281815585, 
    'bagging_temperature': 0.6263562727371863, 
    'random_strength': 0.6489211175248135,
    'loss_function': 'RMSE',
    'cat_features': ['Sex'],
    'eval_metric': 'RMSE',
    'early_stopping_rounds': 100,
    'verbose': 0,
    'random_seed': 100,
    'task_type': 'GPU'   
}

for fold, (train_index, valid_index) in enumerate(skf.split(X, duration_bins), start=1):
        X_train, X_valid = X.iloc[train_index].copy(), X.iloc[valid_index].copy()
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        cat = CatBoostRegressor(**params).fit(X_train, y_train, eval_set=[(X_valid, y_valid)], use_best_model=True)
    
        valid_pred = cat.predict(X_valid)
        oof_cat[valid_index] = valid_pred
        test_cat += cat.predict(X_test) / n_folds
    
        fold_rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(valid_pred)))
        print(f"Fold {fold} RMSE: {fold_rmsle:.6f}")

test_cat = np.expm1(test_cat)
test_cat = np.clip(test_cat, 1, 314)
overall_rmsle = np.sqrt(mean_squared_log_error(np.expm1(y), np.expm1(oof_cat)))
print(f"\nOverall OOF RMSLE: {overall_rmsle:.6f}")


Fold 1 RMSE: 0.059025
Fold 2 RMSE: 0.058662
Fold 3 RMSE: 0.059499
Fold 4 RMSE: 0.059009
Fold 5 RMSE: 0.059988

Overall OOF RMSLE: 0.059238


In [22]:
from sklearn.linear_model import BayesianRidge

meta_train = np.column_stack((oof_xgb, oof_cat))
meta_test = np.column_stack((test_xgb, test_cat))

meta_model = BayesianRidge(n_iter=2000).fit(meta_train, y)

final_preds = meta_model.predict(meta_test)

In [23]:
sub = pd.read_csv('/kaggle/input/playground-series-s5e5/sample_submission.csv')
sub['Calories'] = final_preds
sub.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
print(sub.head(5))

Your submission was successfully saved!
       id    Calories
0  750000   27.278030
1  750001  107.761077
2  750002   87.586033
3  750003  125.720489
4  750004   75.909367
