In [2]:
import os
# Move two directories up so the relative 'data/...' paths used later resolve.
# The chdir is relative, so running it twice would climb two further levels;
# the sentinel makes the cell safe to re-run within one kernel session.
if '_CWD_AFTER_CHDIR' not in globals():
    os.chdir('../..')
    _CWD_AFTER_CHDIR = os.getcwd()

In [3]:
import logging
import sys

# Fixed seed used by the randomised steps of the notebook
SEED = 42

# Notebook-level logger reporting INFO and above
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Attach the handler only when none is present yet, so re-running this cell
# does not stack handlers and repeat every message
if len(logger.handlers) == 0:
    log_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler = logging.StreamHandler(sys.stdout)  # stdout works better than stderr in Jupyter
    handler.setFormatter(log_format)
    logger.addHandler(handler)

In [4]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_squared_log_error
import numpy as np
import time
import logging

# Setup logging
# basicConfig configures the root logger so INFO records are shown.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# If this logger already owns a handler (the stdout handler attached in the
# earlier setup cell), stop its records from also reaching the root handler;
# otherwise every message is printed twice, once per handler.
logger.propagate = not logger.handlers

# Metrics
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error.

    Negative entries in either input are raised to zero before scoring, so
    the logarithm-based error is always computed on non-negative values.
    """
    clipped_true = np.clip(y_true, 0, None)
    clipped_pred = np.clip(y_pred, 0, None)
    msle = mean_squared_log_error(clipped_true, clipped_pred)
    return np.sqrt(msle)

# Linear model wrapper
class LinearModelWrapper:
    """K-fold cross-validation wrapper around a scikit-learn style regressor.

    ``fit`` trains one fresh model per fold, stores the out-of-fold (OOF)
    predictions in ``oof_preds`` and the fitted fold models in ``models``.
    ``predict`` returns the mean prediction of those fold models.
    """

    def __init__(self, model_cls=Ridge, model_params=None, metric_fn=rmsle, name="ridge_lr"):
        """Store the model recipe; nothing is trained here.

        Args:
            model_cls: Callable that builds an unfitted estimator exposing
                ``fit(X, y)`` and ``predict(X)``.
            model_params: Keyword arguments passed to ``model_cls`` for every
                fold. ``None`` means "no extra arguments".
            metric_fn: Callable ``(y_true, y_pred) -> float`` used for the
                per-fold and OOF scores written to the log.
            name: Label used in log and error messages.
        """
        self.model_cls = model_cls
        self.model_params = model_params
        self.metric_fn = metric_fn
        self.name = name
        self.models = []
        self.oof_preds = None
        self.float_type = np.float64

    def fit(self, X, y, folds=5):
        """Train one model per fold and record out-of-fold predictions.

        ``models`` and ``oof_preds`` are replaced only after every fold has
        finished, so an interrupted or failed run never leaves a partially
        filled OOF vector behind that looks like a valid result.

        Args:
            X: Feature matrix (ndarray, DataFrame or nested sequence).
            y: Target values (ndarray, Series or sequence), one per row of X.
            folds: Number of KFold splits.

        Raises:
            ValueError: If X and y have a different number of rows.
        """
        # Plain ndarrays make the positional fold indexing below valid for
        # DataFrame/Series input too (X[idx] on a DataFrame selects columns).
        X = np.asarray(X, dtype=self.float_type)
        y = np.asarray(y, dtype=self.float_type)
        if len(X) != len(y):
            raise ValueError(f"X has {len(X)} rows but y has {len(y)}")

        # The default model_params is None, which cannot be unpacked with **.
        params = self.model_params or {}
        # Not every callable carries __name__ (e.g. functools.partial).
        metric_name = getattr(self.metric_fn, "__name__", "metric").upper()

        oof_preds = np.zeros(len(X), dtype=self.float_type)
        models = []
        kf = KFold(n_splits=folds, shuffle=True, random_state=SEED)

        logger.info(f"Training {self.name} with {folds}-fold CV")
        start_time = time.perf_counter()  # monotonic clock for elapsed time

        for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
            model = self.model_cls(**params)
            model.fit(X[train_idx], y[train_idx])

            preds = model.predict(X[val_idx])
            oof_preds[val_idx] = preds
            fold_score = self.metric_fn(y[val_idx], preds)
            logger.info(f"Fold {fold} {metric_name}: {fold_score:.4f}")
            models.append(model)

        # Publish the results only now that all folds completed.
        self.models = models
        self.oof_preds = oof_preds

        total_score = self.metric_fn(y, self.oof_preds)
        logger.info(f"OOF {metric_name}: {total_score:.4f}")
        logger.info(f"Finished training {self.name} in {time.perf_counter() - start_time:.2f}s")

    def predict(self, X):
        """Return the mean prediction of all fold models for each row of X.

        Raises:
            ValueError: If no fold models are available (``fit`` not run, or
                not completed).
        """
        if not self.models:
            raise ValueError(f"{self.name} has no fitted models; call fit() before predict()")
        X = np.asarray(X, dtype=self.float_type)
        fold_preds = [model.predict(X) for model in self.models]
        return np.mean(np.column_stack(fold_preds), axis=1)


In [21]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures


# Read the raw train and test splits (paths are relative to the working directory)
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

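# Calories is left on its raw scale here: the log1p/expm1 target transform is
# applied at fit time by TransformedTargetRegressor in the model cell below.
# df_train['Calories'] = np.log1p(df_train['Calories'])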

def feature_engineering(df):
    """Derive model features from the raw exercise columns.

    Reads ``Sex``, ``Age``, ``Height``, ``Weight``, ``Duration``,
    ``Heart_Rate`` and ``Body_Temp``; any other columns are passed through
    unchanged. The frame passed in is not modified.

    Args:
        df: DataFrame containing the raw columns listed above.

    Returns:
        A new DataFrame without ``Sex`` and with the engineered columns
        appended.
    """
    # Build the sex indicators from the caller's frame but write them only to
    # the new frame returned by drop(), so the input is left untouched.
    is_male = (df['Sex'] == 'male').astype(int)
    is_female = (df['Sex'] == 'female').astype(int)
    df = df.drop('Sex', axis=1)  # Drop original Sex column
    df['Sex_Male'] = is_male
    df['Sex_Female'] = is_female

    # BMI = weight / (height / 100) ** 2
    # (assumes Weight in kg and Height in cm — TODO confirm against the data dictionary)
    df['BMI'] = df['Weight'] / ((df['Height'] / 100) ** 2)

    # Z-score of BMI within each sex group.
    # NOTE(review): the statistics come from the frame passed in, so train and
    # test are normalised with different means/stds — confirm this is intended.
    df['BMI_Normalized'] = df.groupby(['Sex_Male', 'Sex_Female'])['BMI'].transform(
        lambda x: (x - x.mean()) / x.std()
    )

    # Encode obesity levels based on BMI
    df['BMI_Category'] = pd.cut(df['BMI'],
                               bins=[0, 16.5, 18.5, 24.9, 29.9, 34.9, 39.9, float('inf')],
                               labels=[0, 1, 2, 3, 4, 5, 6])

    # Exercise intensity (heart rate / duration)
    df['Exercise_Intensity'] = df['Heart_Rate'] / df['Duration']

    # Heart rate duration
    df['Heart_Rate_Duration'] = df['Heart_Rate'] * df['Duration']

    # Temperature duration interaction
    df['Temp_Duration'] = df['Body_Temp'] * df['Duration']

    # HR divided by temp
    df['HR_div_Temp'] = df['Heart_Rate'] / df['Body_Temp']

    # Weight duration interaction
    df['Weight_Duration'] = df['Weight'] * df['Duration']

    # Max heart rate (220 - Age)
    df['Max_Heart_Rate'] = 220 - df['Age']

    # Heart rate intensity (heart rate / max heart rate)
    df['Heart_Rate_Intensity'] = df['Heart_Rate'] / df['Max_Heart_Rate']

    # Group age into bins. The last bin is open-ended so ages above 100 fall
    # into the oldest group instead of becoming NaN.
    df['Age_Bins'] = pd.cut(df['Age'], bins=[0, 20, 35, 50, float('inf')], labels=[1, 2, 3, 4])

    # Heart rate zone, as a percentage of max heart rate:
    #   label 0  below 50%
    #   label 1  Very Light  50-60%
    #   label 2  Light       60-70%
    #   label 3  Moderate    70-80%
    #   label 4  Hard        80-90%
    #   label 5  Maximum     90% and above
    # The top bin is open-ended: a heart rate above the estimated maximum
    # (intensity > 100%) would otherwise map to NaN and break scaling/fitting.
    df['HR_Zone'] = pd.cut(df['Heart_Rate_Intensity'] * 100,
                          bins=[0, 50, 60, 70, 80, 90, float('inf')],
                          labels=[0, 1, 2, 3, 4, 5])

    # Calculate BMR using Mifflin-St Jeor equation with gender-specific constant
    # (+5 for male, -161 for female)
    df['BMR'] = (10 * df['Weight'] +
                 6.25 * df['Height'] -
                 5 * df['Age'] +
                 5 * df['Sex_Male'] -
                 161 * df['Sex_Female'])

    # Add log transformations for skewed features
    skewed_feats = ['Age', 'Weight', 'Body_Temp', 'Height', 'Duration', 'Heart_Rate']
    for feat in skewed_feats:
        df[f'Log_{feat}'] = np.log1p(df[feat])

    return df

# Run the same feature pipeline on both splits so their columns line up
df_train = feature_engineering(df_train)
df_test = feature_engineering(df_test)

# Separate the target from the model inputs
y_train = df_train['Calories']
X_train = df_train.drop(columns=['Calories', 'id'])

# Standardise the inputs, then expand them with degree-2 polynomial terms
# (no bias column); both transformers stay fitted for reuse on the test split
scaler = StandardScaler()
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train = poly.fit_transform(scaler.fit_transform(X_train))

# Wrap the expanded matrix in a DataFrame labelled with the generated names
X_train = pd.DataFrame(X_train, columns=poly.get_feature_names_out())

print(f"X has {X_train.shape[1]} features")

X has 405 features


In [28]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.compose import TransformedTargetRegressor

# Linear regression fitted on log1p(target); predictions are mapped back to
# the original scale with expm1 by the TransformedTargetRegressor
ttlr_params = {
    'regressor': LinearRegression(),
    'func': np.log1p,
    'inverse_func': np.expm1,
}

linear_model = LinearModelWrapper(
    model_cls=TransformedTargetRegressor,
    model_params=ttlr_params,
    metric_fn=rmsle,
    name='TTLR',
)

# 5-fold CV on the plain ndarray view of the feature frame
linear_model.fit(X_train.values, y_train, folds=5)

2025-05-11 21:47:30,868 - INFO - Training TTLR with 5-fold CV


INFO:__main__:Training TTLR with 5-fold CV


2025-05-11 21:48:03,405 - INFO - Fold 1 RMSLE: 0.0603


INFO:__main__:Fold 1 RMSLE: 0.0603


KeyboardInterrupt: 

In [7]:
# Write the out-of-fold predictions to disk.
# Fail loudly when fit() has not produced them yet: np.save would otherwise
# silently pickle None into the file.
if linear_model.oof_preds is None:
    raise RuntimeError("linear_model has no OOF predictions; run linear_model.fit() first")
np.save('lr1_oof_preds.npy', linear_model.oof_preds)

array([145.65801656,  36.06555165,  28.99464365, ..., 234.30580657,
       106.72924764, 100.13956455], shape=(750000,))

In [8]:
# Separate the identifier from the features. The guard keeps this cell
# re-runnable: after the first run 'id' is already gone from df_test and
# test_ids is already set, so an unconditional lookup would raise KeyError.
if 'id' in df_test.columns:
    # Save the 'id' column before dropping it
    test_ids = df_test['id']
    df_test = df_test.drop('id', axis=1)

# Reuse the scaler and polynomial expansion that were fitted on the training data
X_test = scaler.transform(df_test)
X_test = poly.transform(X_test)

# Make predictions on test data (mean over the fold models)
test_preds = linear_model.predict(X_test)

# Create submission file
submission = pd.DataFrame({
    'id': test_ids,
    'Calories': test_preds
})

# Save submission file; create the output folder first so to_csv cannot fail
# on a missing directory
os.makedirs('data/ensemble', exist_ok=True)
submission.to_csv('data/ensemble/lr1_submission.csv', index=False)