In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Set working directory to the project root (1 folder up)
import os
os.chdir('..')

In [5]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# Transform Sex to 0 and 1
df_train['Sex'] = df_train['Sex'].map({'male': 0, 'female': 1})
df_test['Sex'] = df_test['Sex'].map({'male': 0, 'female': 1})

# Transform target (Calories) with log1p
df_train['Calories'] = np.log1p(df_train['Calories'])

# Create column interactions between all numerical columns
numeric_cols = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

# Create interaction features
for i in range(len(numeric_cols)):
    for j in range(i+1, len(numeric_cols)):
        col1, col2 = numeric_cols[i], numeric_cols[j]
        interaction_name = f'{col1}_{col2}_interaction'
        df_train[interaction_name] = df_train[col1] * df_train[col2]
        df_test[interaction_name] = df_test[col1] * df_test[col2]

# Prepare features and target
X_train = df_train.drop(['Calories', 'id'], axis=1)
y_train = df_train['Calories']

In [None]:
# I want to implement a ridge stack ensemble. So i need to split the data into 3 parts:
# 1. Training data for layer 1
# 2. Training data for layer 2
# 3. Validation data for the overall model

# Split the data into 3 parts
# First split: 50% train_1, 50% temp
X_train_1, X_temp, y_train_1, y_temp = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42, shuffle=True
)

# Second split: 60% of temp goes to train_2 and 40% to val (val is then 20% of total)
X_train_2, X_val, y_train_2, y_val = train_test_split(
    X_temp, y_temp, test_size=0.4, random_state=42, shuffle=True
)


# Layer 1 Model 1: LightGBM simple model

In [9]:
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

# Create LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

# Set parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 1024,
    'max_bin': 1024,
    'learning_rate': 0.02,
    'subsample': 0.8,
    'n_estimators': 1000,
    'verbose': 1
}

model = LGBMRegressor(**params)
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100)
    ]
)

# Make predictions
val_preds = model.predict(X_val)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print(f'Validation RMSE: {rmse}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.108482 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14672
[LightGBM] [Info] Number of data points in the train set: 750000, number of used features: 22
[LightGBM] [Info] Start training from score 4.141144
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.140323
[200]	valid_0's rmse: 0.0557788
[300]	valid_0's rmse: 0.050117
[400]	valid_0's rmse: 0.0478295
[500]	valid_0's rmse: 0.0461549
[600]	valid_0's rmse: 0.0448418
[700]	valid_0's rmse: 0.0436937
[800]	valid_0's rmse: 0.0427132
[900]	valid_0's rmse: 0.0418647
[1000]	valid_0's rmse: 0.0410542
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 0.0410542
Validation RMSE: 0.04105415415759354


# Layer 1 Model 2: Simle lasso regression

In [None]:
# Implement simple lasso regression
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# Scale the data that goes into the lasso model
scaler = StandardScaler()
X_train_1_scaled = scaler.fit_transform(X_train_1)
X_train_2_scaled = scaler.transform(X_train_2)
X_val_scaled = scaler.transform(X_val)

# Initialize the model
lasso_model = Lasso(alpha=0.005, max_iter=10000)

# Fit the model
lasso_model.fit(X_train_1_scaled, y_train_1)

# Make predictions
val_preds_lasso = lasso_model.predict(X_val_scaled)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_val, val_preds_lasso))
print(f'Validation RMSE: {rmse}')

Validation RMSE: 0.17631685929929347


# Use both models to train a level 2 model: ridge regression

In [22]:
# Get predictions from both models on the x_train_2
preds_lgb = model.predict(X_train_2)
preds_lasso = lasso_model.predict(X_train_2_scaled)

# Combine predictions
combined_preds = np.column_stack((preds_lgb, preds_lasso))

combined_preds

array([[2.2972325 , 2.4800148 ],
       [5.47134224, 5.48737974],
       [5.39962912, 5.52400475],
       ...,
       [4.74103985, 4.53896626],
       [3.51465688, 3.27450753],
       [4.75884739, 4.28753538]], shape=(225000, 2))

In [24]:
# Implement ridge regression on the combined predictions and their targets
from sklearn.linear_model import Ridge

# Make a new scaler for the combined predictions   
scaler_combined = StandardScaler()
combined_preds_scaled = scaler_combined.fit_transform(combined_preds)

# Initialize the ridge regression model
ridge_model = Ridge(alpha=0.1)

# Fit the model
ridge_model.fit(combined_preds_scaled, y_train_2)

# Get results on the validation set

In [25]:
val_preds_lgb = model.predict(X_val)
val_preds_lasso = lasso_model.predict(X_val_scaled)

# Combine predictions
combined_preds_val = np.column_stack((val_preds_lgb, val_preds_lasso))

# Scale the combined predictions
combined_preds_val_scaled = scaler_combined.transform(combined_preds_val)

# Make predictions
val_preds_ridge = ridge_model.predict(combined_preds_val_scaled)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_val, val_preds_ridge))
print(f'Validation RMSE: {rmse}')

Validation RMSE: 0.04093375815141997


# Generate submission

In [27]:
# Save the 'id' column before dropping it
test_ids = df_test['id']

# Drop 'id' column before prediction
df_test = df_test.drop('id', axis=1)

df_test_scaled = scaler.transform(df_test)

# Make predictions
sub_preds_lgb = model.predict(df_test)
sub_preds_lasso = lasso_model.predict(df_test_scaled)

# Combine predictions
combined_preds_test = np.column_stack((sub_preds_lgb, sub_preds_lasso))

# Scale the combined predictions
combined_preds_test_scaled = scaler_combined.transform(combined_preds_test)

# Make predictions
sub_preds_ridge = ridge_model.predict(combined_preds_test_scaled)

# Transform predictions back using exp1m
sub_preds_ridge_exp = np.expm1(sub_preds_ridge)

# Create submission DataFrame
submission = pd.DataFrame({
    'id': test_ids,
    'Calories': sub_preds_ridge_exp
})

# Save submission
submission.to_csv('submissions/ensemble_1.csv', index=False)