In [2]:
import os

# Move the working directory to the notebook's parent so the relative paths used
# further down resolve from there.
# The starting directory is captured only once per kernel: a bare os.chdir('..')
# climbs one level higher every time the cell is re-run, which silently breaks
# every relative path below. Anchoring on NOTEBOOK_DIR makes the cell idempotent.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Let wide frames render all of their columns instead of being truncated.
pd.set_option('display.max_columns', 500)

# Training table; its Calories column is kept as a plain array and is the
# target the ensemble weights are fitted against later on.
train_df = pd.read_csv("data/train.csv")
y_target = train_df["Calories"].values

# Preview the first rows as the cell output.
train_df.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [4]:
PATH = "data/ensemble/"

# When True, every set of predictions is brought to log1p space before blending.
TRANSFORM_TARGET = False

# Heuristic used to tell raw-scale predictions from log1p-scale ones: predictions
# whose mean exceeds this value are treated as raw and get log1p applied.
RAW_SCALE_MEAN_THRESHOLD = 10


def _log1p_if_raw_scale(values):
    """Return log1p(values) when their mean exceeds RAW_SCALE_MEAN_THRESHOLD.

    Values whose mean is at or below the threshold are returned unchanged
    (they are taken to be in log1p space already).
    """
    if values.mean() > RAW_SCALE_MEAN_THRESHOLD:
        return np.log1p(values)
    return values


files = []                   # labels ("oof_<model>") in load order
X_train = []                 # out-of-fold prediction arrays, one per model
submission_preds_files = []  # submission DataFrames, one per model (same order)

print("Loading files...")
for c in ['xgb1','lgbm1', 'nn1', 'knn1', 'lr1']:
    print(f"=> {c} ",end="")

    # Out-of-fold predictions of this model.
    oof_preds = np.load(f"{PATH}{c}_oof_preds.npy")
    if TRANSFORM_TARGET:
        oof_preds = _log1p_if_raw_scale(oof_preds)
    X_train.append(oof_preds)
    files.append(f"oof_{c}")

    # Matching test-set predictions; only the Calories column is rescaled.
    submission_preds = pd.read_csv(f"{PATH}{c}_submission.csv")
    if TRANSFORM_TARGET:
        submission_preds['Calories'] = _log1p_if_raw_scale(submission_preds['Calories'])
    submission_preds_files.append(submission_preds)

Loading files...
=> xgb1 => lgbm1 => nn1 => knn1 => lr1 

In [5]:
import numpy as np
from sklearn.metrics import root_mean_squared_error, mean_squared_log_error
from scipy.optimize import minimize

def rmsle(y_true, y_pred):
    """Root Mean Squared Logarithmic Error with negative inputs clipped to 0.

    Both arrays are floored at zero before being handed to
    ``mean_squared_log_error`` so the log-based metric never sees a negative
    value; the square root of that result is returned.
    """
    y_pred_clipped = np.maximum(y_pred, 0)
    y_true_clipped = np.maximum(y_true, 0)
    return np.sqrt(mean_squared_log_error(y_true_clipped, y_pred_clipped))


class MinimizeEnsembler:
    """Linear blend of model predictions with weights found by SLSQP.

    ``fit`` searches for the weight vector that minimises ``evaluation_metric``
    on out-of-fold predictions; ``predict`` applies those weights to new
    predictions from the same models, in the same order.
    """

    def __init__(self,
                 evaluation_metric=rmsle,
                 constraints=True,
                 allow_negative_weights=False):
        """
        Parameters:
        - evaluation_metric: callable ``metric(y_true, y_pred)`` returning a
          value to MINIMISE (default: ``rmsle``)
        - constraints: whether to constrain weights to sum to 1
        - allow_negative_weights: whether weights < 0 are allowed; when False
          every weight is bounded to [0, 1]
        """
        self.evaluation_metric = evaluation_metric
        self.constraints = constraints
        self.allow_negative_weights = allow_negative_weights

        # Populated by fit().
        self.weights = None
        self.performance_score = None
        self.n_models = None

    def fit(self, oof_predictions, y_true):
        """
        Fit optimal weights using scipy.optimize.minimize.

        Parameters:
        - oof_predictions: list of numpy arrays (each shape: (n_samples,))
        - y_true: numpy array of shape (n_samples,)

        Raises:
        - ValueError: if the predictions and y_true do not have the same
          number of samples.
        """
        y_true = np.asarray(y_true)
        prediction_matrix = np.column_stack(oof_predictions)  # shape: (n_samples, n_models)

        # Fail early with a clear message; a custom metric could otherwise
        # broadcast mismatched arrays silently.
        if y_true.ndim == 0 or prediction_matrix.shape[0] != y_true.shape[0]:
            raise ValueError(
                f"oof_predictions have {prediction_matrix.shape[0]} samples "
                f"but y_true has shape {y_true.shape}."
            )
        self.n_models = prediction_matrix.shape[1]

        def objective(weights):
            ensemble_pred = prediction_matrix @ weights
            return self.evaluation_metric(y_true, ensemble_pred)

        # Initial guess: uniform weights summing to 1
        initial_weights = np.full(self.n_models, 1 / self.n_models)

        # Optional equality constraint: sum(weights) == 1
        constraints = []
        if self.constraints:
            constraints.append({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})

        # Bounds: None leaves weights unbounded, otherwise each one is in [0, 1]
        if self.allow_negative_weights:
            bounds = None
        else:
            bounds = [(0, 1) for _ in range(self.n_models)]

        result = minimize(
            fun=objective,
            x0=initial_weights,
            bounds=bounds,
            constraints=constraints,
            method='SLSQP',
            options={'disp': False, 'ftol': 1e-8}
        )

        # The result is stored even if the optimiser reports failure; the
        # success flag is printed below so the caller can see it.
        self.weights = result.x
        self.performance_score = result.fun

        print(f"Optimization successful: {result.success}")
        print(f"Final score: {self.performance_score:.5f}")
        # count_nonzero (not "> 0") so negative weights are counted as selected too.
        print(f"Selected {np.count_nonzero(self.weights):d} models with non-zero weight.")

    def predict(self, prediction_list):
        """
        Predict using optimized ensemble weights.

        Parameters:
        - prediction_list: list of np.arrays, shape (n_samples,) each, in the
          same model order that was passed to fit()

        Returns:
        - np.array: ensemble prediction

        Raises:
        - ValueError: if the ensemble is not fitted, or the number of
          prediction arrays differs from the number of fitted weights.
        """
        if self.weights is None:
            raise ValueError("Ensemble is not fitted yet.")
        prediction_matrix = np.column_stack(prediction_list)
        if prediction_matrix.shape[1] != len(self.weights):
            raise ValueError(
                f"Expected {len(self.weights)} prediction arrays, "
                f"got {prediction_matrix.shape[1]}."
            )
        return prediction_matrix @ self.weights  # weighted sum per sample

    def get_ensemble_info(self):
        """
        Return the final weights and performance.

        Returns:
        - dict with weights, metric score, and constraint settings
        """
        metric = self.evaluation_metric
        return {
            "weights": self.weights,
            "final_score": self.performance_score,
            "constraints": self.constraints,
            "allow_negative_weights": self.allow_negative_weights,
            "evaluation_metric": metric.__name__ if hasattr(metric, '__name__') else str(metric),
        }


In [6]:
# Fit the blending weights on the out-of-fold predictions.
if TRANSFORM_TARGET:
    # The OOF predictions were moved to log1p space when loaded, so the target
    # must be in the same space. RMSE on log1p values equals RMSLE on the raw
    # values, and rmsle() would otherwise take the logarithm a second time.
    ensemble = MinimizeEnsembler(evaluation_metric=root_mean_squared_error)
    ensemble.fit(X_train, np.log1p(y_target))
else:
    ensemble = MinimizeEnsembler()
    ensemble.fit(X_train, y_target)

Optimization successful: True
Final score: 0.05966
Selected 5 models with non-zero weight.


In [11]:
# Inspect the fitted blend: weights (in the order the predictions were passed to
# fit), the final metric value, and the constraint settings that were used.
ensemble.get_ensemble_info()

{'weights': array([1.68231327e-01, 1.23878763e-02, 6.62390687e-01, 3.18679221e-04,
        1.56671430e-01]),
 'final_score': np.float64(0.05525657000196969),
 'constraints': True,
 'allow_negative_weights': False,
 'evaluation_metric': 'rmsle'}

In [12]:
# Get predictions from each model (same order as the fitted weights)
pred_arrays = [df['Calories'].values for df in submission_preds_files]

# The blend is computed row by row and the ids are taken from the first file,
# so every submission file must list the same ids in the same order.
reference_ids = submission_preds_files[0]['id'].values
for position, df in enumerate(submission_preds_files[1:], start=1):
    if not np.array_equal(df['id'].values, reference_ids):
        raise ValueError(
            f"Submission file #{position} does not have the same ids in the same "
            "order as the first one; predictions cannot be blended row-wise."
        )

# Generate ensemble prediction
y_pred = ensemble.predict(pred_arrays)

# Inverse transform using expm1 (predictions were blended in log1p space)
if TRANSFORM_TARGET:
    y_pred = np.expm1(y_pred)

# Create submission dataframe using original ids
submission = pd.DataFrame({
    'id': reference_ids,
    'Calories': y_pred
})

print("Generated predictions for", len(y_pred), "samples")
print("\nFirst few predictions:")
print(submission.head())

# Save submission file (create the output folder on a fresh checkout)
os.makedirs('submissions', exist_ok=True)
submission.to_csv('submissions/lgbm_xgb_nn_knn_lr_submission.csv', index=False)


Generated predictions for 250000 samples

First few predictions:
       id    Calories
0  750000   27.406036
1  750001  108.163807
2  750002   87.013523
3  750003  125.315315
4  750004   76.157769
