### Optimal Binning

In [5]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Generate the data as a thousand draws from a random normal distribution
np.random.seed(123)
data = np.random.normal(loc=0, scale=1, size=1000)

# Function to recursively find optimal bins using Decision Tree with a specified maximum number of splits
def find_optimal_bins(data, max_splits, bin_edges=None):
    if bin_edges is None:
        bin_edges = np.array([np.min(data), np.max(data)])
    
    # Create the predictor variable (data) and the response variable (data)
    X = data[:, np.newaxis]  # Convert data to a 2D array
    y = data

    # Fit the Decision Tree Regressor
    tree_model = DecisionTreeRegressor(max_leaf_nodes=max_splits + 1)
    tree_model.fit(X, y)

    # Get the bin edges from the Decision Tree splits
    splits = np.sort(tree_model.tree_.threshold[tree_model.tree_.feature >= 0])
    bin_edges = np.sort(np.unique(np.concatenate([bin_edges, splits])))

    # If the number of splits is greater than or equal to the specified max_splits, return the bin edges
    if len(bin_edges) - 1 >= max_splits:
        return bin_edges
    else:
        # Recursively split further until reaching max_splits
        return find_optimal_bins(data, max_splits, bin_edges)

# Calculate the Mean Squared Error (MSE)
def calculate_mse(data, bin_means, bin_edges):
    binned_data = np.array([bin_means[np.digitize(val, bin_edges, right=True) - 1] for val in data])
    return np.mean((data - binned_data)**2)

# Set the maximum number of splits (bins - 1)
max_splits = 15

# Find the optimal bins using Decision Tree with a specified maximum number of splits
bin_edges = find_optimal_bins(data, max_splits)

# Calculate the bin means based on the data
bin_means = np.array([(bin_edges[i] + bin_edges[i+1]) / 2 for i in range(len(bin_edges) - 1)])

# Calculate the Mean Squared Error (MSE)
mse = calculate_mse(data, bin_means, bin_edges)

print("Optimal bin edges:", bin_edges)
print("MSE:", mse)

Optimal bin edges: [-3.23105501 -2.34872711 -1.69677424 -1.33771819 -0.99379808 -0.74967471
 -0.49406621 -0.26091591 -0.03464736  0.18503929  0.39758199  0.62069392
  0.86372635  1.13615036  1.46575928  2.13694978  3.57157922]
MSE: 0.04942159608021452
