TODO:

1. Add a validation set for early stopping and evaluation.
2. Figure out a way to get quicker convergence.

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Generate input data
np.random.seed(42)
n_samples = 1000
x = np.linspace(0, 10, n_samples)


# Create correlated noise with increasing magnitude
def scale_cov_matrix(x, base_cov, scale_factor=0.1):
    """Return ``base_cov`` multiplied by ``1 + scale_factor * x``.

    The covariance is left unchanged at ``x == 0`` and inflated linearly as
    ``x`` increases. Standard NumPy broadcasting applies between ``x`` and
    ``base_cov``.
    """
    inflation = 1 + scale_factor * x
    return base_cov * inflation


# Base 3x3 noise covariance for the three outputs (variances 0.25, 0.49, 0.36
# on the diagonal, positive covariances off the diagonal).
cov_matrix = np.array([[0.25, 0.15, 0.1], [0.15, 0.49, 0.2], [0.1, 0.2, 0.36]])

# Draw one zero-mean 3-vector of noise per input value. The covariance of each
# draw is the base covariance scaled by scale_cov_matrix for that xi, so the
# noise grows with x. The draws consume the global NumPy RNG in order of x, so
# the result depends on the seed set earlier and on this exact draw order.
correlated_noise = np.array(
    [
        np.random.multivariate_normal(
            mean=[0, 0, 0], cov=scale_cov_matrix(xi, cov_matrix)
        )
        for xi in x
    ]
)

# Generate three dependent variables with smooth, non-monotonic relationships and correlated noise
# Names with a trailing underscore (y1_, y2_, y3_) are the noise-free signals;
# the names without it add the matching noise column.
y1_ = 2 * np.sin(x) + 0.5 * x
y1 = y1_ + correlated_noise[:, 0]
y2_ = 3 * np.cos(0.5 * x) + 0.3 * x**2
y2 = y2_ + correlated_noise[:, 1]
y3_ = 1.5 * np.sin(0.7 * x) * np.cos(0.3 * x) + 0.2 * x
y3 = y3_ + correlated_noise[:, 2]
# Combine the data
# data:      columns [x, y1, y2, y3]    (noisy observations)
# real_data: columns [x, y1_, y2_, y3_] (noise-free signals)
data = np.column_stack((x, y1, y2, y3))
real_data = np.column_stack((x, y1_, y2_, y3_))

# One scatter panel per output column, each plotted against the shared input x.
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(10, 10))
fig.suptitle("Relationships between input and outputs (with correlated noise)")

for i, panel in enumerate(axs):
    output_values = data[:, i + 1]  # column 0 of data is x, outputs start at 1
    panel.scatter(x, output_values, alpha=0.5)
    panel.set_xlabel("Input")
    panel.set_ylabel(f"Output {i+1}")

plt.tight_layout()
plt.show()

# Quick sanity check of the assembled array.
print("Data shape:", data.shape)
print("First few rows of the data:", data[:5], sep="\n")

# Heatmap of the empirical correlation between the three noisy outputs,
# with each cell annotated by its coefficient.
correlation_matrix = np.corrcoef(data[:, 1:], rowvar=False)
output_labels = ["y1", "y2", "y3"]

plt.figure(figsize=(8, 6))
plt.imshow(correlation_matrix, cmap="coolwarm", vmin=-1, vmax=1)
plt.colorbar()
plt.title("Correlation Matrix of Outputs")
plt.xticks(range(3), output_labels)
plt.yticks(range(3), output_labels)
# imshow puts the row index on the y-axis, hence text(x=j, y=i).
for (i, j), _ in np.ndenumerate(correlation_matrix):
    plt.text(j, i, f"{correlation_matrix[i, j]:.2f}", ha="center", va="center")
plt.tight_layout()
plt.show()

In [None]:
from lightgbmlss.model import LightGBMLSS
from lightgbmlss.utils import create_mv_dataset
from lightgbmlss.distributions.Gaussian import MultivariateGaussian

# Build the multivariate training set: column 0 of data is the single feature,
# the remaining columns are the three targets.
dtrain = create_mv_dataset(data[:, 0].reshape(-1, 1), data[:, 1:])

# Hand-picked hyperparameters for this first fit.
train_params = {
    "learning_rate": 0.1,
    "num_iterations": 1000,
    "lambda_l2": 10,
    "lambda_l1": 10,
    "num_leaves": 2,
}

# Fit a 3-dimensional multivariate Gaussian model.
lgblss = LightGBMLSS(MultivariateGaussian(n_dim=3, response_fn="exp"))
lgblss.train(params=train_params, train_set=dtrain)

# Predict on the training inputs; the bare name displays the result in the notebook.
preds = lgblss.predict(data[:, 0].reshape(-1, 1))
preds


In [None]:
from lightgbmlss.model import LightGBMLSS
from lightgbmlss.utils import create_mv_dataset
from lightgbmlss.distributions.Gaussian import MultivariateGaussian

# Create lightgbm dataset
# Column 0 of data is the single feature; columns 1-3 are the three targets.
dtrain = create_mv_dataset(
    data[:, 0].reshape(-1, 1),
    data[:, 1:],
)

# Search space. Each entry is [kind, spec]: for "float"/"int" the spec is a
# dict with low/high bounds and a log flag; for "categorical" it is the list
# of choices. Entries with low == high are effectively fixed.
param_dict = {
    "eta": ["float", {"low": 1e-5, "high": 1, "log": True}],
    "max_depth": ["int", {"low": 1, "high": 10, "log": False}],
    "num_leaves": [
        "int",
        {"low": 255, "high": 255, "log": False},
    ],  # set to constant for this example
    "min_data_in_leaf": [
        "int",
        {"low": 20, "high": 20, "log": False},
    ],  # set to constant for this example
    "min_gain_to_split": ["float", {"low": 1e-8, "high": 40, "log": False}],
    "min_sum_hessian_in_leaf": ["float", {"low": 1e-8, "high": 500, "log": True}],
    "subsample": ["float", {"low": 0.2, "high": 1.0, "log": False}],
    "feature_fraction": ["float", {"low": 0.2, "high": 1.0, "log": False}],
    "boosting": ["categorical", ["gbdt"]],
}

# Create the LightGBMLSS model and run the hyperparameter search
# (this cell only calls hyper_opt; the fit with the result happens in a later cell).
lgblss = LightGBMLSS(MultivariateGaussian(n_dim=3, response_fn="exp"))
opt_param = lgblss.hyper_opt(
    param_dict,
    dtrain,
    num_boost_round=100,  # Number of boosting iterations.
    nfold=5,  # Number of cv-folds.
    early_stopping_rounds=20,  # Number of early-stopping rounds
    max_minutes=10,  # Time budget in minutes, i.e., stop study after the given number of minutes.
    n_trials=30,  # The number of trials. If this argument is set to None, there is no limitation on the number of trials.
    silence=True,  # Controls the verbosity of the trial, i.e., user can silence the outputs of the trial.
    seed=123,  # Seed used to generate cv-folds.
    hp_seed=123,  # Seed for random number generator used in the Bayesian hyperparameter search.
)



In [None]:
from lightgbmlss.model import LightGBMLSS
from lightgbmlss.utils import create_mv_dataset
from lightgbmlss.distributions.Gaussian import MultivariateGaussian

# Create lightgbm dataset
# Column 0 of data is the single feature; columns 1-3 are the three targets.
dtrain = create_mv_dataset(
    data[:, 0].reshape(-1, 1),
    data[:, 1:],
)

# Second search space, focused on regularisation (lambda_l1 / lambda_l2),
# tree size and learning rate. Each entry is [kind, spec].
param_dict = {
    "lambda_l1": ["float", {"low": 1e-8, "high": 10.0, "log": True}],
    "lambda_l2": ["float", {"low": 1e-8, "high": 100.0, "log": True}],
    "num_leaves": [
        "int",
        {"low": 2, "high": 255, "log": False},
    ],  # searched over 2-255 here (not held constant as in the previous search)
    "min_child_samples": ["int", {"low": 5, "high": 100, "log": False}],
    "eta": ["float", {"low": 1e-5, "high": 1, "log": True}],
    # Currently disabled search dimensions:
    # "num_iterations": ["int", {"low": 100, "high": 1000, "log": False}],
    # "feature_fraction": ["float", {"low": 0.4, "high": 1.0, "log": False}],
    # "bagging_fraction": ["float", {"low": 0.4, "high": 1.0, "log": False}],
    # "bagging_freq": ["int", {"low": 1, "high": 7, "log": False}],
    "feature_pre_filter": ["bool", False],
}

# Create the LightGBMLSS model and run the hyperparameter search
# (this overwrites opt_param from the previous search cell).
lgblss = LightGBMLSS(MultivariateGaussian(n_dim=3, response_fn="exp"))
opt_param = lgblss.hyper_opt(
    param_dict,
    dtrain,
    num_boost_round=100,  # Number of boosting iterations.
    nfold=5,  # Number of cv-folds.
    early_stopping_rounds=20,  # Number of early-stopping rounds
    max_minutes=10000,  # Time budget in minutes, i.e., stop study after the given number of minutes.
    n_trials=30,  # The number of trials. If this argument is set to None, there is no limitation on the number of trials.
    silence=True,  # Controls the verbosity of the trial, i.e., user can silence the outputs of the trial.
    seed=123,  # Seed used to generate cv-folds.
    hp_seed=123,  # Seed for random number generator used in the Bayesian hyperparameter search.
)



In [None]:
# Display the value returned by the most recent hyper_opt call.
opt_param

In [None]:
opt_param

In [None]:
from lightgbmlss.model import LightGBMLSS
from lightgbmlss.utils import create_mv_dataset
from lightgbmlss.distributions.Gaussian import MultivariateGaussian

# Create lightgbm dataset
# Column 0 of data is the single feature; columns 1-3 are the three targets.
dtrain = create_mv_dataset(
    data[:, 0].reshape(-1, 1),
    data[:, 1:],
)

# Create lightgbm lss model and train
# The tuned parameters are unpacked first, so the explicit "num_iterations"
# entry wins over any key of the same name inside opt_param.
# NOTE(review): every key in opt_param is forwarded to train() as-is - confirm
# hyper_opt does not return bookkeeping entries that LightGBM does not expect.
lgblss = LightGBMLSS(MultivariateGaussian(n_dim=3, response_fn="exp"))
lgblss.train(params={**opt_param, "num_iterations": 1000}, train_set=dtrain)

# Make predictions
# Predictions are made on the training inputs themselves (no held-out set).
preds = lgblss.predict(data[:, 0].reshape(-1, 1))
preds


In [None]:
import torch
from torch.distributions.multivariate_normal import MultivariateNormal

# Rebuild the predicted distribution from the flat prediction frame: the first
# three columns are used as the mean, the remaining columns are reshaped
# row-major into one 3x3 matrix per sample and used as scale_tril.
pred_loc = torch.tensor(preds.iloc[:, :3].to_numpy())
pred_scale_tril = torch.tensor(
    preds.iloc[:, 3:].to_numpy().reshape(-1, 3, 3, order="C")
)
dist = MultivariateNormal(loc=pred_loc, scale_tril=pred_scale_tril)

# Total negative log-likelihood of the noisy targets; nansum skips NaN terms.
observed = torch.tensor(data[:, 1:])
loss = -torch.nansum(dist.log_prob(observed))
loss


In [None]:


# Oracle baseline: negative log-likelihood of the noisy data under the
# distribution that generated it, for comparison with the model's loss.
#
# The noise for sample i was drawn with covariance
# scale_cov_matrix(x[i], cov_matrix), so the baseline must use that scaled
# covariance per sample. Tiling the unscaled cov_matrix understates the
# variance for every x > 0 and gives a wrong reference value.
# Indexing x as (n, 1, 1) broadcasts against the (3, 3) base matrix and
# yields an (n, 3, 3) stack in a single call.
true_cov = scale_cov_matrix(x[:, None, None], cov_matrix)

dist = MultivariateNormal(
    loc=torch.tensor(real_data[:, 1:]),
    covariance_matrix=torch.tensor(true_cov),
)
loss = -torch.nansum(dist.log_prob(torch.tensor(data[:, 1:])))
loss

In [7]:
def get_loss(params):
    """Train a fresh model with ``params`` and return its training NLL.

    Uses the notebook-level ``dtrain`` for fitting and ``data`` for both the
    prediction inputs (column 0) and the targets (columns 1 onward). The
    result is the negated ``torch.nansum`` of the per-sample log-probabilities,
    returned as a torch tensor.

    NOTE(review): the distribution is built as ``MultivariateGaussian(n_dim=3)``
    without ``response_fn``, whereas the other cells pass ``response_fn="exp"``
    - confirm the default matches.
    """
    model = LightGBMLSS(MultivariateGaussian(n_dim=3))
    model.train(params=params, train_set=dtrain)
    fitted = model.predict(data[:, 0].reshape(-1, 1))

    # First three columns -> mean; the rest -> one row-major 3x3 scale_tril per sample.
    loc = torch.tensor(fitted.iloc[:, :3].to_numpy())
    scale_tril = torch.tensor(
        fitted.iloc[:, 3:].to_numpy().reshape(-1, 3, 3, order="C")
    )
    targets = torch.tensor(data[:, 1:])

    log_probs = MultivariateNormal(loc=loc, scale_tril=scale_tril).log_prob(targets)
    return -torch.nansum(log_probs)
# print(get_loss({"learning_rate": 1e-10, "num_iterations": 1}))
# print(get_loss({"learning_rate": 0.01, "num_iterations": 10}))
# print(get_loss({"learning_rate": 0.01, "num_iterations": 20}))
# print(get_loss({"learning_rate": 0.01, "num_iterations": 50}))
# print(get_loss({"learning_rate": 0.01, "num_iterations": 100}))
# print(get_loss({"learning_rate": 0.1, "num_iterations": 1000}))



In [None]:
# Plot the predicted mean and a +/- 1.96 standard-deviation band for each of
# the three outputs, overlaid on the observed data.
fig, axes = plt.subplots(3, 1, figsize=(12, 10), sharex=True)
fig.suptitle("LightGBMLSS MV Gaussian Predictions", fontsize=16)

# Compute covariance matrices
# Columns 3+ of preds are reshaped row-major into one 3x3 factor per sample;
# the covariance is factor @ factor.T. The batched matmul computes this for
# all samples at once instead of looping over them in Python.
st = preds.iloc[:, 3:].to_numpy().reshape(-1, 3, 3, order="C")
cov = st @ st.transpose(0, 2, 1)

# Use the same 1-D x for every plotting call.
x_values = data[:, 0]

for i, ax in enumerate(axes):
    mean = preds[f"loc_{i}"]
    std = np.sqrt(cov[:, i, i])  # marginal std dev of output i
    lower = mean - 1.96 * std
    upper = mean + 1.96 * std

    ax.plot(x_values, mean, label="Mean")
    ax.fill_between(x_values, lower, upper, alpha=0.3, label="95% CI")
    ax.scatter(x_values, data[:, i + 1], alpha=0.5, label="Data")
    ax.set_ylabel(f"Y{i+1}")
    ax.legend()
    ax.grid(True)

axes[-1].set_xlabel("Input")
plt.tight_layout()
plt.show()

In [None]:
# Inspect row 999 of data, i.e. [x, y1, y2, y3] for the last of the 1000 samples.
data[999, :]

## Gradient and Hessian analysis

To produce the data analyzed below, I saved the gradient and Hessian values for every (point, iteration, parameter) combination over the course of one training run of the model.


For more details on how LightGBM calculates the gain of each split from gradients and Hessians, see:

* [XGBoost paper](https://arxiv.org/pdf/1603.02754)
* [Code line where gain is calculated](https://github.com/microsoft/LightGBM/blob/d2b4e7374957e0d05a3a7d5ec695940287d4dc36/src/treelearner/feature_histogram.hpp#L808)

In [None]:
import pandas as pd
# Load the saved gradient/Hessian dump. The cells below rely on the columns
# "pt", "iter" and "cat" (with "grad" and "hess" among the "cat" values), and
# treat every other column as a value column.
grad = pd.read_csv("grad_hess5.csv")
grad

In [None]:
# Sum of value column "00" per iteration, with one column per "cat" value.
# NOTE(review): `tmp` is only assigned in the plotting cells further down, so
# this cell raises NameError unless one of those cells has been run first.
tmp.pivot_table(index="iter", columns="cat", values="00", aggfunc="sum")

In [None]:
import matplotlib.pyplot as plt


# Restrict to points 960-999 and to rows with -1 < iter < 1.
pts = range(960, 1000)
tmp = grad.query("pt.isin(@pts) & iter < 1 & iter > -1")
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(12, 12), sharex=True, sharey=True)

# Every column that is not an identifier gets its own panel, filled row by row.
id_columns = ("pt", "iter", "cat")
value_columns = [name for name in tmp.columns if name not in id_columns]

for panel_idx, value_col in enumerate(value_columns):
    panel = axes.flat[panel_idx]
    summed = tmp.pivot_table(
        index="iter", columns="cat", values=value_col, aggfunc="sum"
    )
    # grad^2 / hess of the summed values; the epsilon guards against a zero Hessian.
    summed["ratio"] = summed["grad"] ** 2 / (summed["hess"] + 1e-15)
    summed.plot(ax=panel, marker="o")
    panel.set_title(value_col)
    panel.axhline(0, color="k", linestyle="--")

fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt


# Restrict to points 960-999 and to rows with -1 < iter < 3.
pts = range(960, 1000)
tmp = grad.query("pt.isin(@pts) & iter < 3 & iter > -1")
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(12, 12), sharex=True, sharey=False)

# One panel per non-identifier column, showing only the signed ratio.
id_columns = ("pt", "iter", "cat")
value_columns = [name for name in tmp.columns if name not in id_columns]

for panel_idx, value_col in enumerate(value_columns):
    panel = axes.flat[panel_idx]
    summed = tmp.pivot_table(
        index="iter", columns="cat", values=value_col, aggfunc="sum"
    )
    # grad^2 / hess carrying the sign of the summed gradient; the epsilon
    # guards against a zero Hessian.
    magnitude = summed["grad"] ** 2 / (summed["hess"] + 1e-15)
    summed["ratio"] = magnitude * np.sign(summed["grad"])
    summed[["ratio"]].plot(ax=panel, marker="o")
    panel.set_title(value_col)

fig.tight_layout()
plt.show()

Building torch distribution gradient intuition

In [None]:
from torch.distributions import Normal, MultivariateNormal
import torch
from torch.autograd import grad as autograd


# Univariate toy case: one observation scored under Normal(loc, scale), with
# first and second derivatives of the negative log-likelihood taken per parameter.
target = torch.tensor([-5.])
loc = torch.tensor([1.], requires_grad=True)
scale = torch.tensor([1.], requires_grad=True)
loss = -torch.nansum(Normal(loc=loc, scale=scale).log_prob(target))

print(f"Point: {target.item():.2f}\nMean: {loc.item():.2f}, STD: {scale.item():.2f}\n")


def _first_and_second_order(objective, param):
    """Return the autograd results for d(objective)/d(param) and for the
    derivative of the summed gradient with respect to ``param``."""
    first = autograd(objective, inputs=param, create_graph=True)
    second = autograd(first[0].nansum(), inputs=param, retain_graph=True)
    return first, second


grad_loc, hess_loc = _first_and_second_order(loss, loc)
print(f"Mean Gradient: {grad_loc[0].item():.2f}")
print(f"Mean Hessian: {hess_loc[0].item():.2f}\n")

grad_scale, hess_scale = _first_and_second_order(loss, scale)
print(f"STD Gradient: {grad_scale[0].item():.2f}")
print(f"STD Hessian: {hess_scale[0].item():.2f}")

In [None]:
# grad**2 / hess for the scale parameter of the univariate Normal example
# (loc=1, scale=1, target=-5): its gradient is -35 and its Hessian is 107.
35**2/107

In [None]:
from torch.distributions import Normal, MultivariateNormal
import torch
from torch.autograd import grad as autograd


# Multivariate toy case: one 3-D observation scored under a multivariate
# normal parameterised by a mean vector and a lower-triangular scale factor.
loc = torch.tensor([2.87, 9.38, 1.42], requires_grad=True)
scale_tril = torch.tensor([[2.03, 0, 0], [6.25, 5.51, 0], [0.53, -0.15, 0.70]], requires_grad=True)
target = torch.tensor([4.41, 32.12, 1.65])
dist = MultivariateNormal(loc=loc, scale_tril=scale_tril)
loss = -torch.nansum(dist.log_prob(target))

print(f"Actual: {target}\n\nMean: {loc}\nscale_tril: {scale_tril}\n")
# The standard deviations are the square roots of the covariance DIAGONAL.
# Taking an element-wise sqrt of the whole covariance matrix also
# square-roots the off-diagonal covariances, which are not standard
# deviations and become NaN when a covariance is negative.
true_std = dist.stddev.detach().numpy()
print(f"True STD:\n{true_std}\n")

# The second autograd call differentiates the SUM of the gradient entries, so
# each "Hessian" entry is a row sum of the full Hessian (diagonal plus cross
# terms), not the diagonal alone.
grad_loc = autograd(loss, inputs=loc, create_graph=True)
hess_loc = autograd(grad_loc[0].nansum(), inputs=loc, retain_graph=True)
print(f"Mean Gradient: {grad_loc[0]}")
print(f"Mean Hessian: {hess_loc[0]}\n")
grad_scale_tril = autograd(loss, inputs=scale_tril, create_graph=True)
hess_scale_tril = autograd(grad_scale_tril[0].nansum(), inputs=scale_tril, retain_graph=True)
print(f"STD Gradient: {grad_scale_tril[0]}")
print(f"STD Hessian: {hess_scale_tril[0]}")

In [None]:
# Covariance matrix rebuilt by hand from the lower-triangular factor.
scale_tril @ scale_tril.T

In [None]:
# Covariance matrix as reported by the distribution object itself.
dist.covariance_matrix