In [8]:
# Re-import necessary libraries after execution state reset
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import gaussian_kde

# Function for KDE binning
def kde_binning(data, k):
    kde = gaussian_kde(data)
    bin_edges = np.linspace(np.min(data), np.max(data), k + 1)
    bin_means = [(bin_edges[i] + bin_edges[i+1]) / 2 for i in range(k)]
    quantized = np.digitize(data, bin_edges, right=True)
    return np.array([bin_means[min(q, k-1)] for q in quantized])

# Function for CART binning
def cart_binning(data, k):
    tree = DecisionTreeRegressor(max_leaf_nodes=k)
    tree.fit(data.reshape(-1, 1), data)
    return tree.predict(data.reshape(-1, 1))

# Function for K-means quantization
def kmeans_quantization(data, k):
    kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
    kmeans.fit(data.reshape(-1, 1))
    return kmeans.cluster_centers_[kmeans.predict(data.reshape(-1, 1))].flatten()

# Function for Lloyd-Max quantization
def lloyd_max_quantization(data, k, max_iter=100, tol=1e-5):
    thresholds = np.linspace(np.min(data), np.max(data), k + 1)
    levels = (thresholds[:-1] + thresholds[1:]) / 2

    for _ in range(max_iter):
        quantized = np.digitize(data, thresholds) - 1
        new_levels = np.array([data[quantized == i].mean() if np.any(quantized == i) else levels[i] for i in range(k)])
        new_thresholds = (new_levels[:-1] + new_levels[1:]) / 2
        new_thresholds = np.insert(new_thresholds, 0, np.min(data))
        new_thresholds = np.append(new_thresholds, np.max(data))

        if np.max(np.abs(new_levels - levels)) < tol:
            break

        levels, thresholds = new_levels, new_thresholds

    quantized = np.digitize(data, thresholds) - 1
    return np.array([levels[min(q, k-1)] for q in quantized])

# Function for Dynamic Programming quantization
def dynamic_programming_quantization(data, k):

    data_sorted = np.sort(data)
    n = len(data_sorted)
    cost = np.full((n, k), np.inf)
    breaks = np.zeros((n, k), dtype=int)

    # Precompute variance for all segments to avoid redundant calculations
    variance_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            variance_matrix[i, j] = np.var(data_sorted[i:j+1]) * (j - i + 1)

    # Base case for k=1 (single bin)
    for i in range(n):
        cost[i, 0] = variance_matrix[0, i]

    # Fill DP table
    for j in range(1, k):
        for i in range(j, n):
            min_cost, best_split = min(
                [(cost[s, j-1] + variance_matrix[s+1, i], s) for s in range(j-1, i)],
                key=lambda x: x[0]
            )
            cost[i, j] = min_cost
            breaks[i, j] = best_split

    # Backtrack to assign bin levels
    bins = np.zeros(n)
    idx = n - 1
    for j in range(k-1, -1, -1):
        split = breaks[idx, j]
        bins[split+1:idx+1] = j
        idx = split

    quantized_values = np.array([np.mean(data_sorted[bins == i]) for i in range(k)])[bins.astype(int)]
    return np.interp(data, data_sorted, quantized_values)  # Map back to original data

# Function for Percentile Binning
def percentile_binning(data, k):
    percentiles = np.percentile(data, np.linspace(0, 100, k + 1))
    bin_means = [(percentiles[i] + percentiles[i+1]) / 2 for i in range(k)]
    quantized = np.digitize(data, percentiles, right=True)
    return np.array([bin_means[min(q, k-1)] for q in quantized])

# Simulation and MSE evaluation
def evaluate_methods():
    np.random.seed(42)
    distributions = {
        "Normal": np.random.normal(0, 1, 1000),
        "Exponential": np.random.exponential(1, 1000),
        "Bimodal": np.concatenate([np.random.normal(-2, 0.5, 500), np.random.normal(2, 0.5, 500)]),
        "Uniform": np.random.uniform(-2, 2, 1000)
    }
    k_values = [4, 8, 16]
    results = []

    for dist_name, data in distributions.items():
        for k in k_values:
            methods = {
                "KDE Binning": kde_binning(data, k),
                "CART": cart_binning(data, k),
                "K-Means": kmeans_quantization(data, k),
                "Lloyd-Max": lloyd_max_quantization(data, k),
                "Dynamic Programming": dynamic_programming_quantization(data, k),
                "Percentile Binning": percentile_binning(data, k),
            }
            for method, quantized_data in methods.items():
                mse = np.mean((data - quantized_data) ** 2)
                results.append({"Distribution": dist_name, "k": k, "Method": method, "MSE": mse})

    df_results = pd.DataFrame(results)
    return df_results

# Run evaluation and display results
df_results = evaluate_methods()

In [9]:
df_results.groupby("Method")["MSE"].mean().reset_index()

Unnamed: 0,Method,MSE
0,CART,0.043105
1,Dynamic Programming,0.042686
2,K-Means,0.042755
3,KDE Binning,1.204267
4,Lloyd-Max,0.04425
5,Percentile Binning,1.201735
