In [2]:
import os
# Work from the parent of the kernel's starting directory so that relative paths
# such as 'data/train.csv' resolve. The starting directory is remembered the first
# time this cell runs; a plain os.chdir('..') would climb one more level on every
# re-execution of the cell.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [1]:
import logging
import sys

# Notebook-wide logger at INFO level.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Re-running this cell must not stack a second handler (each one would repeat every message).
if len(logger.handlers) == 0:
    _log_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler = logging.StreamHandler(stream=sys.stdout)  # stdout works better than stderr in Jupyter
    handler.setFormatter(_log_format)
    logger.addHandler(handler)

In [4]:
import copy
import time

import lightgbm as lgb
import numpy as np
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import KFold

class BaseModelWrapper:
    """Train one estimator class with K-fold CV and keep its out-of-fold predictions.

    After ``fit`` the wrapper holds one fitted estimator per fold; after
    ``retrain_full`` it holds a single estimator trained on all rows.
    ``predict`` averages the predictions of whatever estimators are held.
    """

    def __init__(self, model_cls, model_params, name, preprocessor=None):
        """Store the estimator recipe; nothing is trained here.

        Args:
            model_cls: Estimator class, instantiated as ``model_cls(**model_params)``
                and used through ``fit(X, y)`` / ``predict(X)``.
            model_params: Keyword arguments passed to ``model_cls``.
            name: Label used in log messages.
            preprocessor: Optional callable ``preprocessor(X, fit=bool)`` that
                returns the transformed features.
        """
        self.model_cls = model_cls
        self.model_params = model_params
        self.name = name
        self.preprocessor = preprocessor
        self.models = []       # fitted estimators (one per fold, or one after retrain_full)
        self.oof_preds = None  # out-of-fold predictions, filled in by fit()
        # Snapshot of the fitted preprocessor that belongs to each entry of self.models.
        self._model_preprocessors = []

    @staticmethod
    def _apply(preprocessor, X, fit=False):
        """Run ``preprocessor`` on ``X``; pass ``X`` through when there is none."""
        return preprocessor(X, fit=fit) if preprocessor else X

    @staticmethod
    def _take_rows(X, idx):
        """Select rows by position for pandas objects and array-likes alike."""
        return X.iloc[idx] if hasattr(X, "iloc") else np.asarray(X)[idx]

    def _prep(self, X, fit=False):
        """Apply the wrapper's own preprocessor (fitting it in place when ``fit`` is true)."""
        return self._apply(self.preprocessor, X, fit=fit)

    def fit(self, X, y, folds=5, random_state=42):
        """Train one model per fold and record out-of-fold predictions in ``oof_preds``.

        Args:
            X: Feature matrix (pandas DataFrame or array-like), one row per sample.
            y: Target values aligned with ``X`` by position.
            folds: Number of KFold splits.
            random_state: Seed for the shuffled KFold split.

        Raises:
            ValueError: If ``X`` and ``y`` have different lengths.
        """
        start_time = time.time()
        logger.info(f"Starting training of {self.name} model with {folds} folds")

        # KFold yields positional indices. Indexing a pandas Series with them via
        # y[idx] is label-based and breaks when the index is not 0..n-1 (e.g. after
        # train_test_split), so work on a plain array instead.
        y_values = np.asarray(y)
        if len(y_values) != len(X):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y_values)}"
            )

        self.oof_preds = np.zeros(len(X))
        self.models = []
        self._model_preprocessors = []
        kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)

        for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
            fold_start = time.time()
            logger.info(f"Training {self.name} - Fold {fold}/{folds}")

            # The preprocessor is fitted on the training part only, then reused for validation.
            X_train = self._prep(self._take_rows(X, train_idx), fit=True)
            X_val = self._prep(self._take_rows(X, val_idx), fit=False)
            y_train = y_values[train_idx]

            model = self.model_cls(**self.model_params)
            model.fit(X_train, y_train)
            self.oof_preds[val_idx] = model.predict(X_val)
            self.models.append(model)
            # The shared preprocessor is refit on the next fold; keep the state this
            # model was trained with so predict() can reproduce it.
            self._model_preprocessors.append(copy.deepcopy(self.preprocessor))

            fold_time = time.time() - fold_start
            logger.info(f"Completed {self.name} - Fold {fold}/{folds} in {fold_time:.2f} seconds")

        total_time = time.time() - start_time
        logger.info(f"Completed training of {self.name} model in {total_time:.2f} seconds")

    def predict(self, X):
        """Return the mean prediction of all held models for ``X``.

        Each model sees ``X`` transformed by the preprocessor state it was trained with.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        if not self.models:
            raise RuntimeError(
                f"Model '{self.name}' is not fitted; call fit() or retrain_full() first"
            )

        preprocessors = self._model_preprocessors
        if len(preprocessors) != len(self.models):
            # Models were assigned outside fit()/retrain_full(): fall back to the shared preprocessor.
            preprocessors = [self.preprocessor] * len(self.models)

        preds = [
            model.predict(self._apply(prep, X, fit=False))
            for model, prep in zip(self.models, preprocessors)
        ]
        return np.mean(np.column_stack(preds), axis=1)

    def retrain_full(self, X, y):
        """Replace the fold models with a single model trained on all of ``X``/``y``.

        ``oof_preds`` is left untouched.
        """
        start_time = time.time()
        logger.info(f"Starting full retraining of {self.name} model")

        X_proc = self._prep(X, fit=True)
        model = self.model_cls(**self.model_params)
        model.fit(X_proc, y)
        self.models = [model]
        self._model_preprocessors = [copy.deepcopy(self.preprocessor)]

        total_time = time.time() - start_time
        logger.info(f"Completed full retraining of {self.name} model in {total_time:.2f} seconds")


class StackingEnsembler:
    """Two-level stacking: base models' out-of-fold predictions train a meta-model."""

    def __init__(self, base_models, meta_model_cls, meta_model_params, meta_preprocessor=None):
        """Store the ensemble recipe; nothing is trained here.

        Args:
            base_models: Wrappers exposing ``name``, ``fit(X, y, folds=...)``,
                ``oof_preds``, ``predict(X)`` and ``retrain_full(X, y)``.
            meta_model_cls: Estimator class for the second level, instantiated as
                ``meta_model_cls(**meta_model_params)``.
            meta_model_params: Keyword arguments for ``meta_model_cls``.
            meta_preprocessor: Optional callable ``meta_preprocessor(X, fit=bool)``
                applied to the stacked base predictions.
        """
        self.base_models = base_models
        self.meta_model_cls = meta_model_cls
        self.meta_model_params = meta_model_params
        self.meta_preprocessor = meta_preprocessor
        self.meta_model = None  # set by fit()

    def _prep(self, X, fit=False):
        """Apply the meta preprocessor, or pass ``X`` through when there is none."""
        return self.meta_preprocessor(X, fit=fit) if self.meta_preprocessor else X

    def fit(self, X, y, folds=5):
        """Fit every base model with K-fold CV, then fit the meta-model on their OOF predictions.

        Raises:
            ValueError: If the ensemble has no base models.
        """
        if not self.base_models:
            raise ValueError("StackingEnsembler needs at least one base model")

        start_time = time.time()
        logger.info(f"Starting stacking ensemble training with {len(self.base_models)} base models")

        # Train base models and collect OOF predictions (one column per base model)
        oof_features = []
        for i, model in enumerate(self.base_models, 1):
            logger.info(f"Training base model {i}/{len(self.base_models)}: {model.name}")
            model.fit(X, y, folds=folds)
            oof_features.append(model.oof_preds.reshape(-1, 1))

        logger.info("Training meta-model")
        meta_start = time.time()

        meta_X = np.hstack(oof_features)
        meta_X = self._prep(meta_X, fit=True)

        self.meta_model = self.meta_model_cls(**self.meta_model_params)
        self.meta_model.fit(meta_X, y)

        meta_time = time.time() - meta_start
        total_time = time.time() - start_time
        logger.info(f"Meta-model training completed in {meta_time:.2f} seconds")
        logger.info(f"Total stacking ensemble training completed in {total_time:.2f} seconds")

    def retrain_base_models(self, X, y):
        """Retrain every base model on all of ``X``/``y`` via its ``retrain_full``.

        The meta-model is left as fitted by ``fit``. No logging is done here
        because each base model's ``retrain_full`` is expected to report its own
        progress.
        """
        for model in self.base_models:
            model.retrain_full(X, y)

    def predict(self, X):
        """Predict with the meta-model on the stacked base-model predictions.

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        if self.meta_model is None:
            raise RuntimeError("StackingEnsembler is not fitted; call fit() first")

        logger.info("Generating predictions from stacking ensemble")
        start_time = time.time()

        base_preds = [np.asarray(model.predict(X)).reshape(-1, 1) for model in self.base_models]
        meta_X = np.hstack(base_preds)
        meta_X = self._prep(meta_X, fit=False)
        predictions = self.meta_model.predict(meta_X)

        pred_time = time.time() - start_time
        logger.info(f"Predictions generated in {pred_time:.2f} seconds")
        return predictions

In [5]:
from sklearn.preprocessing import StandardScaler

class StandardScalerPreprocessor:
    """Callable wrapper around ``StandardScaler`` with an explicit fit/transform switch."""

    def __init__(self):
        self.scaler = StandardScaler()
        self.fitted = False  # flipped to True after the first successful fit

    def __call__(self, X, fit=False):
        """Scale ``X``.

        With ``fit=True`` the scaler is (re)fitted on ``X`` before transforming;
        otherwise the previously fitted scaler is applied.

        Raises:
            RuntimeError: If called with ``fit=False`` before any fit.
        """
        if not fit:
            if self.fitted:
                return self.scaler.transform(X)
            raise RuntimeError("Preprocessor not fitted")

        scaled = self.scaler.fit_transform(X)
        self.fitted = True
        return scaled

In [9]:
# Base model 1: gradient-boosted trees on the raw features.
lgbm_model = BaseModelWrapper(
    lgb.LGBMRegressor,
    {
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 1024,
        'max_bin': 1024,
        'learning_rate': 0.02,
        'subsample': 0.8,
        # LightGBM only performs row subsampling (bagging) when the bagging frequency
        # is non-zero; with the default of 0 the 'subsample' setting above is ignored.
        'subsample_freq': 1,
        'n_estimators': 1000,
        'verbose': 1
    },
    name='lgbm'
)

# Base model 2: Lasso on standardised features.
lasso_model = BaseModelWrapper(
    Lasso,
    {'alpha': 0.005, 'max_iter': 10000},
    preprocessor=StandardScalerPreprocessor(),
    name='lasso'
)

# Now use them in wrappers
base_models = [
    lgbm_model,
    lasso_model,
]

# Level-2 model: Ridge on the standardised base-model predictions.
stack = StackingEnsembler(
    base_models=base_models,
    meta_model_cls=Ridge,
    meta_model_params={'alpha': 0.1},
    meta_preprocessor=StandardScalerPreprocessor()
)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# Transform Sex to 0 and 1. Series.map turns every value missing from the mapping
# into NaN, so fail loudly here instead of letting NaNs reach the models.
sex_codes = {'male': 0, 'female': 1}
for frame_name, frame in (('train', df_train), ('test', df_test)):
    frame['Sex'] = frame['Sex'].map(sex_codes)
    assert frame['Sex'].notna().all(), f"Unmapped or missing 'Sex' values in {frame_name} data"

# Transform target (Calories) with log1p; predictions are mapped back with expm1
df_train['Calories'] = np.log1p(df_train['Calories'])

# Create column interactions between all numerical columns
numeric_cols = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

# Create interaction features: one product column per unordered pair of numeric columns
for idx, col1 in enumerate(numeric_cols):
    for col2 in numeric_cols[idx + 1:]:
        interaction_name = f'{col1}_{col2}_interaction'
        df_train[interaction_name] = df_train[col1] * df_train[col2]
        df_test[interaction_name] = df_test[col1] * df_test[col2]

# Prepare features and target (kept under their own names so the split below
# does not overwrite the unsplit data)
X_full = df_train.drop(['Calories', 'id'], axis=1)
y_full = df_train['Calories']

# Split data into train and validation
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42, shuffle=True
)

# train_test_split keeps the original row labels, which are now shuffled and gappy.
# Reset them so labels equal positions; otherwise label-based lookups with KFold's
# positional indices (e.g. y[train_idx]) raise KeyError or select the wrong rows.
X_train, X_val = X_train.reset_index(drop=True), X_val.reset_index(drop=True)
y_train, y_val = y_train.reset_index(drop=True), y_val.reset_index(drop=True)

assert len(X_train) == len(y_train) and len(X_val) == len(y_val)

In [None]:
# Train level 2 model (fits every base model with K-fold CV, then the meta-model
# on their out-of-fold predictions)
stack.fit(X_train, y_train, folds=5)

# Retrain base models on full data. This goes through each wrapper's retrain_full,
# which is the retraining entry point BaseModelWrapper defines; the meta-model
# keeps the weights learned from the out-of-fold predictions.
for base_model in stack.base_models:
    base_model.retrain_full(X_train, y_train)

In [None]:
# Predict on validation set
from sklearn.metrics import mean_squared_error

y_val_pred = stack.predict(X_val)

# Calculate RMSE. mean_squared_error returns the MSE, so take the square root.
# The target was log1p-transformed, so this is the RMSE on the log scale
# (i.e. the RMSLE in original Calories units).
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"RMSE on validation set: {rmse:.4f}")

In [None]:
# Create submission file

# Save ids from test set
test_ids = df_test['id']

# Build the feature matrix as a new frame instead of overwriting df_test:
# dropping 'id' from df_test itself made this cell fail on a second run,
# because df_test['id'] no longer existed.
X_test = df_test.drop('id', axis=1)

# Predict on test set (predictions are on the log1p scale used for training)
y_test_pred_log = stack.predict(X_test)

# Inverse transform predictions using expm1
y_test_pred = np.expm1(y_test_pred_log)

# Create submission file with ids and predictions
submission = pd.DataFrame({'id': test_ids, 'Calories': y_test_pred})
submission.to_csv('submission.csv', index=False)