In [None]:
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
from src.regressor import SpaceBoostingRegressor
from sklearn.preprocessing import StandardScaler

import numpy as np
import lightgbm as lgb

In [14]:
# Build a synthetic regression problem. Every random draw in this cell goes
# through NumPy's global RNG, so the seed below is set once, up front.
# NOTE(review): make_regression is called without random_state; presumably it
# falls back to the seeded global RNG -- confirm against the scikit-learn docs.
np.random.seed(42)
X, y = make_regression(n_samples=20000, n_features=20, n_informative=20, noise=10)

# Append uniformly sampled rows (features in [-10, 10), targets in [-100, 100))
# to act as outliers and make the problem more challenging.
n_outliers = 50
outliers_X = np.random.uniform(-10, 10, size=(n_outliers, X.shape[1]))
outliers_y = np.random.uniform(-100, 100, size=n_outliers)
X = np.concatenate([X, outliers_X], axis=0)
y = np.concatenate([y, outliers_y])

# Non-linear and discrete contributions to the target, listed in the order
# they are accumulated into y (the order is kept fixed so the floating-point
# result and the RNG stream stay the same).
target_effects = [
    # Step: +20 where feature 0 exceeds 1, otherwise -10.
    np.where(X[:, 0] > 1, 20, -10),
    # Sinusoid of feature 1.
    5 * np.sin(X[:, 1]),
    # Logarithm of (1 + |feature 2|).
    3 * np.log(1 + np.abs(X[:, 2])),
    # Interaction: +15 where feature 3 > 0.2 and feature 4 < 0.4, otherwise -7.
    np.where((X[:, 3] > 0.2) & (X[:, 4] < 0.4), 15, -7),
    # Quadratic in feature 5.
    0.5 * np.power(X[:, 5], 2),
    # Random +/-12 jump, applied only where feature 6 is positive.
    12 * np.random.choice([1, -1], size=y.shape) * (X[:, 6] > 0).astype(int),
]
for effect in target_effects:
    y += effect

# Hold out a test set (train_test_split's default proportions, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Standardize features: statistics are learned from the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
# Baseline: LightGBM regressor with its log output silenced, fitted on the
# standardized training split.
tree = lgb.LGBMRegressor(random_state=42, verbose=-1)
tree.fit(X_train, y_train)

# Score the held-out split first, then the training split; both lines are
# printed with the same "MSE:" label, so the FIRST line is test error and the
# SECOND line is train error.
for eval_X, eval_y in ((X_test, y_test), (X_train, y_train)):
    predictions = tree.predict(eval_X)
    mse = mean_squared_error(y_true=eval_y, y_pred=predictions)
    print(f"MSE: {mse:.2f}")

MSE: 5880.94
MSE: 1951.61


In [23]:
# Candidate model: the project's SpaceBoostingRegressor (src.regressor),
# fitted on the same standardized training split.
tree = SpaceBoostingRegressor(random_state=42)
tree.fit(X_train, y_train)

# Same evaluation order as elsewhere in this notebook: the first printed
# "MSE:" line is the held-out test error, the second is the training error.
evaluation_splits = [(X_test, y_test), (X_train, y_train)]
for split_features, split_targets in evaluation_splits:
    predictions = tree.predict(split_features)
    mse = mean_squared_error(split_targets, predictions)
    print(f"MSE: {mse}")


MSE: 997.2054348262876
MSE: 168.73180086560603
