# Compressed Ridge Regression 

Comparison with scikit-learn

Ridge is computable in closed-form using normal equations:

$$
\hat{\beta} = (X^\top X + \lambda I)^{-1} X^\top y
$$

We can also show [[ESL](https://hastie.su.domains/ElemStatLearn/) Chapter 3 problem 3.12] that the ridge solution is obtained by running ordinary least squares on the following augmented design matrix

$$
\widetilde{X} = \begin{bmatrix} X \\ \sqrt{\lambda} I \end{bmatrix} 
$$

and response vector
$$
\widetilde{y} = \begin{bmatrix} y \\ 0_{p \times 1} \end{bmatrix} 
$$

where $I$ is the $p \times p$ identity matrix and $0_{p \times 1}$ is a vector of zeros of length $p$ (the number of features). We add $p$ rows to our design matrix, which is typically not a big deal since $p$ is usually much smaller than $n$ in the settings we're considering.

This lets us use optimized OLS routines [e.g. LAPACK drivers such as `gelsd` and `gelsy`] that don't require brittle matrix inversions. This, like Frisch-Waugh-Lovell, might seem peculiar when one encounters it in the classroom -- don't we have fast computers already? But with large datasets, these numerical tricks can make an enormous difference.

Importantly, the same compression trick that the rest of the package uses for OLS can also be used to solve ridge regression.  

In [1]:
import numpy as np
import pandas as pd
import duckdb
import time
from sklearn.linear_model import Ridge, RidgeCV
from duckreg.regularized import DuckRidge

In [2]:
def generate_large_dataset(N=1_000_000, seed=42):
    """Simulate a synthetic regression dataset with discrete covariates.

    The outcome is a linear combination of four discrete covariates plus
    standard-normal noise:

        Y = 1.0 * D + 2.0 * f1 + 1.5 * f2 + 0.8 * f3 + noise

    Args:
        N: Number of rows to generate.
        seed: Seed for ``np.random.default_rng``; the same seed and ``N``
            always produce the same frame.

    Returns:
        A ``pandas.DataFrame`` with float columns ``Y``, ``D``, ``f1``,
        ``f2``, ``f3`` and an integer ``rowid`` column running ``0..N-1``.
    """
    rng = np.random.default_rng(seed)
    shape = (N, 1)

    # Discrete covariates (similar to introduction.ipynb). The draw order
    # (D, f1, f2, f3, then noise) fixes the random stream for a given seed.
    D = rng.choice([0, 1], size=shape)
    f1, f2, f3 = (rng.choice(range(levels), shape, True) for levels in (3, 4, 2))
    noise = rng.normal(size=shape)

    # True coefficients of the data-generating process.
    coef_D, coef_f1, coef_f2, coef_f3 = 1.0, 2.0, 1.5, 0.8
    signal = coef_D * D + coef_f1 * f1 + coef_f2 * f2 + coef_f3 * f3
    Y = signal + noise

    frame = pd.DataFrame(
        np.hstack([Y, D, f1, f2, f3]),
        columns=["Y", "D", "f1", "f2", "f3"],
    )
    return frame.assign(rowid=range(N))


def create_duckdb_database(df, db_name="ridge_test.db", table="data"):
    """Create (or replace) a DuckDB table holding the contents of ``df``.

    Args:
        df: DataFrame to store. The SQL below refers to it by the name ``df``.
        db_name: Path of the DuckDB database file to open or create.
        table: Name of the table to (re)create. It is interpolated into the
            SQL text verbatim, so pass trusted identifiers only.

    The connection is always closed, even if one of the statements fails,
    so a failed load does not leave the database file locked.
    """
    conn = duckdb.connect(db_name)
    try:
        conn.execute(f"DROP TABLE IF EXISTS {table}")
        # `df` in the query is resolved by DuckDB to the local DataFrame
        # argument of this function (replacement scan on Python variables).
        conn.execute(f"CREATE TABLE {table} AS SELECT * FROM df")
    finally:
        conn.close()
    print(f"Data loaded into DuckDB database: {db_name}")

### compression

In [3]:
"""Test how much compression we achieve"""
# Banner for this section of the notebook output.
banner = "=" * 60
print("", banner, "COMPRESSION EFFECTIVENESS TEST", banner, sep="\n")

# Build the 1M-row dataset and load it into its own DuckDB file.
db_name = "compression_test.db"
df = generate_large_dataset(N=1_000_000, seed=42)
create_duckdb_database(df, db_name)

# Run only the data-preparation and compression steps (no model fit) so the
# size of the compressed frame can be inspected.
duck_ridge = DuckRidge(
    db_name=db_name,
    table_name="data",
    formula="Y ~ D + f1 + f2 + f3",
    cv_folds=1,
    seed=42,
)
duck_ridge.prepare_data()
duck_ridge.compress_data()

# Rows before vs. after compression.
original_size = len(df)
compressed_size = len(duck_ridge.df_compressed)
compression_ratio = original_size / compressed_size

print(
    f"Original dataset size: {original_size:,} rows",
    f"Compressed dataset size: {compressed_size:,} rows",
    f"Compression ratio: {compression_ratio:.1f}x",
    sep="\n",
)

print("\nSample compressed data:")
print(duck_ridge.df_compressed.head())



COMPRESSION EFFECTIVENESS TEST
Data loaded into DuckDB database: compression_test.db
Original dataset size: 1,000,000 rows
Compressed dataset size: 48 rows
Compression ratio: 20833.3x

Sample compressed data:
     D   f1   f2   f3  count          sum_Y       sum_Y_sq    mean_Y
0  0.0  1.0  1.0  0.0  20775   72463.054506  273419.426647  3.487993
1  0.0  1.0  0.0  1.0  20928   58647.566319  185433.134264  2.802349
2  1.0  0.0  3.0  1.0  20824  130984.656395  844668.731370  6.290081
3  1.0  0.0  1.0  0.0  20792   51822.931946  149741.285761  2.492446
4  1.0  1.0  2.0  0.0  21132  126700.405564  780735.341177  5.995666


In [None]:
"""Compare DuckRidge vs sklearn Ridge performance"""
print("=" * 60)
print("DUCKRIDGE VS SKLEARN RIDGE COMPARISON")
print("=" * 60)

# Generate data
print("Generating synthetic dataset...")
df = generate_large_dataset(N=50_000_000, seed=42)
db_name = "ridge_test.db"
create_duckdb_database(df, db_name)

print(f"Dataset shape: {df.shape}")
print(f"Sample data:\n{df.head()}\n")

# Test different lambda values
lambda_values = np.logspace(-5, 10, 10)

for lam in lambda_values:
    print(f"\n--- Testing λ = {lam} ---")

    # DuckRidge
    print("Running DuckRidge...")
    # perf_counter is monotonic and high-resolution, which makes it the
    # appropriate clock for measuring elapsed time in a benchmark.
    start_time = time.perf_counter()

    duck_ridge = DuckRidge(
        db_name=db_name,
        table_name="data",
        formula="Y ~ D + f1 + f2 + f3",
        lambda_grid=[lam],  # Single lambda for fair comparison
        cv_folds=1,  # No CV for speed
        seed=42,
    )
    duck_ridge.fit(lambda_selection="single")
    duck_time = time.perf_counter() - start_time
    duck_coefs = duck_ridge.point_estimate

    print(f"DuckRidge time: {duck_time:.3f} seconds")
    print(f"DuckRidge coefficients: {duck_coefs}")

    # Sklearn Ridge (on full data)
    print("Running sklearn Ridge...")
    start_time = time.perf_counter()

    # Prepare sklearn data. This stays inside the timed region so the
    # reported sklearn time covers the same work as before.
    X_sklearn = df[["D", "f1", "f2", "f3"]].values
    y_sklearn = df["Y"].values

    sklearn_ridge = Ridge(alpha=lam, fit_intercept=True, solver="svd")
    sklearn_ridge.fit(X_sklearn, y_sklearn)
    sklearn_time = time.perf_counter() - start_time

    # Intercept first, then slopes, to line up with the DuckRidge estimate.
    sklearn_coefs = np.concatenate([[sklearn_ridge.intercept_], sklearn_ridge.coef_])

    print(f"Sklearn time: {sklearn_time:.3f} seconds")
    print(f"Sklearn coefficients: {sklearn_coefs}")

    # Compare results
    speedup = sklearn_time / duck_time
    # ravel() guards against a column-vector estimate silently broadcasting
    # into a matrix of pairwise differences.
    coef_diff = np.abs(np.ravel(duck_coefs) - sklearn_coefs)
    max_diff = np.max(coef_diff)

    print(f"Speedup: {speedup:.1f}x")
    print(f"Max abs coefficient difference: {max_diff:.6g}")

The penalization factor $\lambda$ is scaled differently in the compressed form, so the coefficients are not identical across the two methods for a given $\lambda$. However, for a sufficiently fine grid of $\lambda$ values, we can still find the optimal $\lambda$ that minimizes the cross-validated error, where compression yields even greater speedups.

### cross-validation

In [None]:
"""Test DuckRidge cross-validation"""
print("\n" + "=" * 60)
print("DUCKRIDGE CROSS-VALIDATION TEST")
print("=" * 60)

# Use smaller dataset for CV demo
print("Generating dataset for CV test...")
df = generate_large_dataset(N=10_000_000, seed=42)
db_name = "ridge_cv_test.db"
create_duckdb_database(df, db_name)

# Defined once so both estimators search the same candidates with the same
# number of folds, and the two settings cannot drift apart.
lambda_grid = np.logspace(-3, 1, 20)  # 20 lambda values
n_folds = 5

# Test CV with lambda grid
print("Running cross-validation...")
# perf_counter is monotonic and high-resolution, which makes it the
# appropriate clock for measuring elapsed time in a benchmark.
start_time = time.perf_counter()

duck_ridge_cv = DuckRidge(
    db_name=db_name,
    table_name="data",
    formula="Y ~ D + f1 + f2 + f3",
    lambda_grid=lambda_grid,
    cv_folds=n_folds,
    seed=42,
)
duck_ridge_cv.fit(lambda_selection="cv")
cv_time = time.perf_counter() - start_time

print(f"CV time: {cv_time:.3f} seconds")
print(f"Best lambda: {duck_ridge_cv.best_lambda:.6f}")
print(f"Best coefficients: {duck_ridge_cv.point_estimate}")

# Compare with sklearn RidgeCV
print("\nRunning sklearn RidgeCV...")
start_time = time.perf_counter()

# Array extraction stays inside the timed region, as in the original timing.
X_sklearn = df[["D", "f1", "f2", "f3"]].values
y_sklearn = df["Y"].values

sklearn_ridge_cv = RidgeCV(alphas=lambda_grid, cv=n_folds, fit_intercept=True)
sklearn_ridge_cv.fit(X_sklearn, y_sklearn)
sklearn_cv_time = time.perf_counter() - start_time

# Intercept first, then slopes, to line up with the DuckRidge estimate.
sklearn_cv_coefs = np.concatenate(
    [[sklearn_ridge_cv.intercept_], sklearn_ridge_cv.coef_]
)

print(f"Sklearn CV time: {sklearn_cv_time:.3f} seconds")
print(f"Sklearn best alpha: {sklearn_ridge_cv.alpha_:.6f}")
print(f"Sklearn coefficients: {sklearn_cv_coefs}")

cv_speedup = sklearn_cv_time / cv_time
print(f"CV Speedup: {cv_speedup:.1f}x")


```

============================================================
DUCKRIDGE CROSS-VALIDATION TEST
============================================================
Generating dataset for CV test...
Data loaded into DuckDB database: ridge_cv_test.db
Running cross-validation...
CV time: 0.549 seconds
Best lambda: 2.335721
Best coefficients: [1.10838176e-03 9.99475426e-01 1.99977411e+00 1.49963835e+00
 8.00333145e-01]

Running sklearn RidgeCV...

Sklearn CV time: 23.304 seconds
Sklearn best alpha: 3.792690
Sklearn coefficients: [1.10960621e-03 9.99474844e-01 1.99977367e+00 1.49963817e+00
 8.00332678e-01]
CV Speedup: 27.9x
```