In [1]:
import os

# Move the working directory two levels up, which is where the relative
# 'data/...' paths used later in this notebook are resolved from.
# The target is resolved to an absolute path only once: a bare
# os.chdir('../..') is relative, so every re-run of this cell would climb
# two more directories and silently break those paths.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath('../..')
os.chdir(PROJECT_ROOT)

# Default to the local layout; flipped to True when running on Kaggle.
RUN_ON_KAGGLE = False

In [2]:
RUN_ON_KAGGLE = True

# First, upgrade scikit-learn to 1.6.1 (quietly)
!pip install -qq scikit-learn==1.6.1

# Then, reinstall category-encoders without enforcing its old scikit-learn requirement
!pip install -qq category-encoders --no-deps --force-reinstall

# Optionally, reinstall bigframes without enforcing its old rich requirement (only if you use it)
!pip install -qq bigframes --no-deps --force-reinstall

In [3]:
import logging
import sys

# Notebook-wide logger; INFO and above are emitted.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Attach the stream handler only once: re-running this cell must not stack a
# second handler, which would print every record twice.
if len(logger.handlers) == 0:
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler = logging.StreamHandler(sys.stdout)  # stdout works better than stderr in Jupyter
    handler.setFormatter(formatter)
    logger.addHandler(handler)

In [4]:
import numpy as np
from sklearn.model_selection import KFold
import time
from sklearn.metrics import root_mean_squared_error, mean_squared_log_error
import lightgbm as lgb

def rmsle_sklearn(y_true, y_pred):
    """Adapter that reports `rmsle` as a custom eval metric.

    Returns a ``(name, value, flag)`` tuple; the value is handed to
    ``model.fit(eval_metric=...)`` by ``BaseModelWrapper.fit``.
    """
    # NOTE(review): the trailing False is presumably LightGBM's
    # "is_higher_better" flag (lower is better) - confirm against the
    # LGBMRegressor.fit documentation.
    return ('rmsle', rmsle(y_true, y_pred), False)

def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error, with negative inputs floored at zero.

    Both arrays are clamped to be non-negative before being passed to
    ``mean_squared_log_error``; the square root of that result is returned.
    """
    non_negative_true = np.maximum(y_true, 0)
    non_negative_pred = np.maximum(y_pred, 0)
    msle = mean_squared_log_error(non_negative_true, non_negative_pred)
    return np.sqrt(msle)

def rmse_sklearn(y_true, y_pred):
    """Adapter that reports `rmse` as a custom eval metric.

    Returns a ``(name, value, flag)`` tuple; the value is handed to
    ``model.fit(eval_metric=...)`` by ``BaseModelWrapper.fit``.
    """
    # NOTE(review): the trailing False is presumably LightGBM's
    # "is_higher_better" flag (lower is better) - confirm against the
    # LGBMRegressor.fit documentation.
    return ('rmse', rmse(y_true, y_pred), False)

def rmse(y_true, y_pred):
    """Root mean squared error, delegated to ``root_mean_squared_error``."""
    error = root_mean_squared_error(y_true, y_pred)
    return error

class BaseModelWrapper:
    """K-fold training wrapper that keeps one fitted model per fold.

    ``fit`` trains ``model_cls(**model_params)`` on each fold with LightGBM
    early-stopping/logging callbacks and records out-of-fold predictions in
    ``oof_preds``. ``predict`` averages the predictions of all stored models.
    ``retrain_full`` replaces the fold models with a single model fitted on
    all of the data.
    """

    def __init__(self, model_cls, model_params, eval_metric=rmsle, eval_metric_sklearn=rmsle_sklearn, name='lgbm'):
        """Store the model factory and the metrics used during training.

        Args:
            model_cls: Estimator class, instantiated as ``model_cls(**model_params)``.
            model_params: Dict of keyword arguments for ``model_cls``.
            eval_metric: Callable ``(y_true, y_pred) -> float`` used to score
                each fold and the full out-of-fold predictions.
            eval_metric_sklearn: Callable passed as ``eval_metric`` to the
                estimator's ``fit``.
            name: Label used in log messages.
        """
        self.model_cls = model_cls
        self.model_params = model_params
        self.name = name
        self.models = []
        self.oof_preds = None
        self.eval_metric = eval_metric
        self.eval_metric_sklearn = eval_metric_sklearn

    def fit(self, X, y, folds=5):
        """Train one model per fold and fill ``self.oof_preds``.

        Args:
            X: Feature frame; rows are selected with ``.iloc``.
            y: Target series; rows are selected with ``.iloc``.
            folds: Number of shuffled KFold splits (fixed ``random_state=42``).
        """
        start_time = time.time()
        logger.info(f"Starting training of {self.name} model with {folds} folds")

        # Start from a clean slate so a second fit() does not mix models from two runs.
        self.oof_preds = np.zeros(len(X))
        self.models = []
        kf = KFold(n_splits=folds, shuffle=True, random_state=42)

        for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
            fold_start = time.time()
            logger.info(f"Training {self.name} - Fold {fold}/{folds}")

            X_train = X.iloc[train_idx]
            X_val = X.iloc[val_idx]
            y_train = y.iloc[train_idx]
            y_val = y.iloc[val_idx]

            model = self.model_cls(**self.model_params)
            # The validation fold serves as the early-stopping set (patience of
            # 100 rounds); evaluation results are logged every 100 rounds.
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=self.eval_metric_sklearn,
                      callbacks=[
                        lgb.early_stopping(stopping_rounds=100),
                        lgb.log_evaluation(period=100)
                    ])
            self.oof_preds[val_idx] = model.predict(X_val)

            # Score this fold's held-out predictions with the configured eval_metric.
            # The log label below is fixed text and does not follow the metric.
            fold_score = self.eval_metric(y_val, self.oof_preds[val_idx])
            logger.info(f"Fold {fold} RMSLE: {fold_score:.4f}")

            self.models.append(model)

            fold_time = time.time() - fold_start
            logger.info(f"Completed {self.name} - Fold {fold}/{folds} in {fold_time:.2f} seconds")

        total_time = time.time() - start_time

        # Score the complete out-of-fold vector with the configured eval_metric.
        # Named oof_score so it does not shadow the module-level rmse() function.
        oof_score = self.eval_metric(y, self.oof_preds)
        logger.info(f"Out-of-fold RMSLE: {oof_score:.4f}")

        logger.info(f"Completed training of {self.name} model in {total_time:.2f} seconds")

    def predict(self, X):
        """Return the element-wise mean of the predictions of all stored models.

        Raises:
            ValueError: If no model has been fitted yet.
        """
        # Without this guard np.column_stack([]) raises an opaque
        # "need at least one array" ValueError; same exception type, clearer message.
        if not self.models:
            raise ValueError(
                f"{self.name} model has no fitted estimators; "
                "call fit() or retrain_full() before predict()"
            )
        preds = [model.predict(X) for model in self.models]
        return np.mean(np.column_stack(preds), axis=1)

    def retrain_full(self, X, y):
        """Fit a single fresh model on all of ``X``/``y`` and make it the only stored model.

        No validation set or callbacks are passed here, and ``oof_preds`` is
        left as it was.
        """
        start_time = time.time()
        logger.info(f"Starting full retraining of {self.name} model")

        model = self.model_cls(**self.model_params)
        model.fit(X, y)
        self.models = [model]

        total_time = time.time() - start_time
        logger.info(f"Completed full retraining of {self.name} model in {total_time:.2f} seconds")

# Data preprocessing

In [5]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures

# Pick the train/test CSV locations for the current environment
if RUN_ON_KAGGLE:
    train_csv, test_csv = '/kaggle/input/dataset/train.csv', '/kaggle/input/dataset/test.csv'
else:
    train_csv, test_csv = 'data/train.csv', 'data/test.csv'

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

# The model is trained on log1p(Calories); predictions are mapped back with expm1 later
df_train['Calories'] = np.log1p(df_train['Calories'])

def feature_engineering(df):
    """Return a new frame with the engineered features added.

    Reads the columns ``Sex``, ``Age``, ``Height``, ``Weight``, ``Duration``,
    ``Heart_Rate`` and ``Body_Temp``. ``Sex`` is replaced by two indicator
    columns; every other input column is carried over unchanged.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame containing the carried-over and derived columns.
    """
    # Work on a copy. Without it the two indicator columns below would be
    # written into the caller's frame before `drop` creates a new object.
    df = df.copy()

    # Create separate columns for male and female
    df['Sex_Male'] = (df['Sex'] == 'male').astype(int)
    df['Sex_Female'] = (df['Sex'] == 'female').astype(int)
    df = df.drop('Sex', axis=1)  # Drop original Sex column

    # BMI = weight / (height / 100) ** 2
    # (presumably Height is in cm and Weight in kg - confirm against the dataset)
    df['BMI'] = df['Weight'] / ((df['Height']/100) ** 2)

    # Z-score BMI within each sex group. The group mean/std come from the frame
    # passed in, so train and test are each normalized with their own statistics.
    df['BMI_Normalized'] = df.groupby(['Sex_Male', 'Sex_Female'])['BMI'].transform(lambda x: (x - x.mean()) / x.std())

    # Encode obesity levels based on BMI
    df['BMI_Category'] = pd.cut(df['BMI'],
                               bins=[0, 16.5, 18.5, 24.9, 29.9, 34.9, 39.9, float('inf')],
                               labels=[0, 1, 2, 3, 4, 5, 6])

    # Exercise intensity (heart rate / duration)
    df['Exercise_Intensity'] = df['Heart_Rate'] / df['Duration']

    # Heart rate duration
    df['Heart_Rate_Duration'] = df['Heart_Rate'] * df['Duration']

    # Temperature duration interaction
    df['Temp_Duration'] = df['Body_Temp'] * df['Duration']

    # HR divided by temp
    df['HR_div_Temp'] = df['Heart_Rate'] / df['Body_Temp']

    # Weight duration interaction
    df['Weight_Duration'] = df['Weight'] * df['Duration']

    # Max heart rate (220 - Age)
    df['Max_Heart_Rate'] = 220 - df['Age']

    # Heart rate intensity (heart rate / max heart rate)
    df['Heart_Rate_Intensity'] = df['Heart_Rate'] / df['Max_Heart_Rate']

    # Group age into bins
    df['Age_Bins'] = pd.cut(df['Age'], bins=[0, 20, 35, 50, 100], labels=[1, 2, 3, 4])

    # Heart rate zones, as a percentage of max heart rate:
    #   label 0  below 50%
    #   label 1  Very Light  50-60%
    #   label 2  Light       60-70%
    #   label 3  Moderate    70-80%
    #   label 4  Hard        80-90%
    #   label 5  Maximum     90% and above
    # The top bin is open-ended: a heart rate above the 220 - Age estimate would
    # otherwise fall outside every bin and become NaN, which the downstream
    # PolynomialFeatures step rejects.
    df['HR_Zone'] = pd.cut(df['Heart_Rate_Intensity'] * 100,
                          bins=[0, 50, 60, 70, 80, 90, float('inf')],
                          labels=[0, 1, 2, 3, 4, 5])

    # Calculate BMR using Mifflin-St Jeor equation with gender-specific constant
    df['BMR'] = (10 * df['Weight'] +
                 6.25 * df['Height'] -
                 5 * df['Age'] +
                 5 * df['Sex_Male'] -
                 161 * df['Sex_Female'])

    # Add log1p-transformed copies of the base numeric features
    skewed_feats = ['Age', 'Weight', 'Body_Temp', 'Height', 'Duration', 'Heart_Rate']
    for feat in skewed_feats:
        df[f'Log_{feat}'] = np.log1p(df[feat])

    return df

# Run the shared feature pipeline over the train and test frames
df_train = feature_engineering(df_train)
df_test = feature_engineering(df_test)

# The target is the (already log1p-transformed) Calories column
y_train = df_train['Calories']

# Model inputs: everything except the target and the row id, standardized and
# then expanded with all degree-2 polynomial terms (no bias column).
# The fitted scaler and poly objects are reused on the test set later.
scaler = StandardScaler()
poly = PolynomialFeatures(degree=2, include_bias=False)

# The transforms are nested so no large intermediate array stays referenced;
# the positional argument is evaluated first, so `poly` is already fitted when
# its output feature names are requested.
X_train = pd.DataFrame(
    poly.fit_transform(scaler.fit_transform(df_train.drop(columns=['Calories', 'id']))),
    columns=poly.get_feature_names_out(),
)

print(f"X has {X_train.shape[1]} features")

X has 405 features


In [8]:
import lightgbm as lgb

# Hyper-parameters handed to lgb.LGBMRegressor for every fold.
# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction)
# only takes effect when `subsample_freq` (bagging_freq) is non-zero - confirm
# whether row subsampling was intended here.
lgbm_params = dict(
    objective='regression',
    seed=42,
    n_estimators=3000,
    learning_rate=0.022,
    max_depth=-1,
    subsample=0.9,
    colsample_bytree=0.5,
    min_child_weight=1,
    verbose=1,
    device='gpu',
)

# The target is log1p(Calories), so the wrapper is scored with plain RMSE here.
lgbm_model = BaseModelWrapper(
    model_cls=lgb.LGBMRegressor,
    model_params=lgbm_params,
    eval_metric=rmse,
    eval_metric_sklearn=rmse_sklearn,
    name='lgbm',
)

lgbm_model.fit(X_train, y_train, folds=3)

2025-05-13 19:14:11,328 - INFO - Starting training of lgbm model with 3 folds
2025-05-13 19:14:11,363 - INFO - Training lgbm - Fold 1/3
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 83748
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 405
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 405 dense feature groups (194.55 MB) transferred to GPU in 0.182526 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 4.141365
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 0.0168854	valid_0's rmse: 0.129944
[200]	valid_0's l2: 0.00420952	valid_0's rmse: 0.0648808
[300]	valid_0's l2: 0.00381568	valid_0's rmse: 0.0617712
[400]	valid_0's l2: 0.00375203	valid_0's rmse: 0.0612538
[500]

In [None]:
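# Experiment notes, kept as comments so this cell does not raise a
# SyntaxError when the notebook is run top to bottom.
#
# Parameters used:
# {
#     'objective': 'regression',
#     'seed': 42,
#     'n_estimators': 3000,
#     'learning_rate': 0.022,
#     'max_depth': -1,
#     'subsample': 0.9,
#     'colsample_bytree': 0.5,
#     'min_child_weight': 1,
#     'verbose': 1,
#     'device': 'gpu'
# },
#
# Recorded scores:
# run 1: 0.0604865
# run 2: 0.0595551
# run 3: 0.0601071
#
# oof 0.0601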

In [9]:
# Map the out-of-fold predictions back from log1p space with expm1,
# then persist them as a .npy file at the environment-specific location
oof_path = 'lgbm2_oof_preds.npy' if RUN_ON_KAGGLE else 'data/ensemble/lgbm2_oof_preds.npy'
scaled_preds = np.expm1(lgbm_model.oof_preds)
np.save(oof_path, scaled_preds)


In [None]:
# Plot feature importances
import matplotlib.pyplot as plt

# Feature names and the importances of the first model held by the wrapper.
# The wrapper is `lgbm_model`; the previous `lgbm2_model` is not defined
# anywhere in this notebook and raised a NameError.
feature_names = X_train.columns
importances = lgbm_model.models[0].feature_importances_

n_cols = 22

# Sort features by importance (descending) and keep the top n_cols.
# The slice yields fewer entries when there are fewer than n_cols features,
# so the bar positions are sized from the slice rather than from n_cols.
indices = np.argsort(importances)[::-1][:n_cols]
n_shown = len(indices)

# Plot
plt.figure(figsize=(10,6))
plt.title(f'Top {n_shown} Feature Importances')
plt.bar(range(n_shown), importances[indices])
plt.xticks(range(n_shown), [feature_names[i] for i in indices], rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# All feature names, ordered from most to least important
feature_names[np.argsort(importances)[::-1]].tolist()

In [None]:
# Keep the ids for the submission, then separate them from the model inputs
test_ids = df_test['id']
df_test_without_id = df_test.drop(columns='id')

# Apply the scaler and polynomial expansion fitted on the training data,
# then wrap the result in a frame with the expansion's feature names
X_test = pd.DataFrame(
    poly.transform(scaler.transform(df_test_without_id)),
    columns=poly.get_feature_names_out(),
)

# Predict in log1p space and map back to the original scale with expm1
test_preds = lgbm_model.predict(X_test)
scaled_test_preds = np.expm1(test_preds)

# Assemble the submission table
submission = pd.DataFrame({
    'id': test_ids,
    'Calories': scaled_test_preds
})

# Write the submission to the environment-specific location
submission_path = 'lgbm2_submission.csv' if RUN_ON_KAGGLE else 'data/ensemble/lgbm2_submission.csv'
submission.to_csv(submission_path, index=False)