# `duckreg`: Out-of-memory regressions with `duckdb`

Carefully examine the underlying queries that generate the data.

In [16]:
import numpy as np
import pandas as pd
from duckreg.estimators import DuckRegression
import duckdb
import pyfixest as pf

## Cross-sectional Regression

In [2]:
# Generate sample data
def generate_sample_data(N=10_000_000, seed=12345):
    """Simulate a cross-sectional dataset with a binary treatment and two discrete covariates.

    Parameters
    ----------
    N : int
        Number of rows to simulate.
    seed : int
        Seed for the NumPy ``Generator`` used for every draw.

    Returns
    -------
    pandas.DataFrame
        ``N`` rows with columns ``Y``, ``Y2``, ``D``, ``f1``, ``f2`` (all stored as
        floats because they are stacked into one array) plus an integer ``rowid``.
        ``Y`` has a coefficient of +1 on ``D`` and ``Y2`` a coefficient of -1; both
        load on ``f1`` and ``f2`` with coefficients 1 and 2 and carry standard-normal
        noise.
    """
    gen = np.random.default_rng(seed)

    # The order of the four draws below fixes the simulated values for a given seed.
    treatment = gen.choice([0, 1], size=(N, 1))
    covariates = gen.choice(range(20), (N, 2), True)  # integers 0..19, with replacement
    noise_y = gen.normal(size=(N, 1))
    noise_y2 = gen.normal(size=(N, 1))

    # Shared covariate contribution: 1 * f1 + 2 * f2.
    beta = np.array([1, 2]).reshape(2, 1)
    covariate_part = covariates @ beta

    outcome = treatment + covariate_part + noise_y
    outcome2 = -1 * treatment + covariate_part + noise_y2

    stacked = np.hstack([outcome, outcome2, treatment, covariates])
    frame = pd.DataFrame(stacked, columns=["Y", "Y2", "D", "f1", "f2"])
    frame["rowid"] = range(N)
    return frame


# Function to create and populate DuckDB database
def create_duckdb_database(df, db_name="large_dataset.db", table="data"):
    """Write ``df`` into a table of an on-disk DuckDB database, replacing the table if it exists.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to load. It is referenced by name (``FROM df``) inside the SQL below.
    db_name : str
        Path of the DuckDB database file to connect to.
    table : str
        Name of the table to (re)create. It is interpolated into the SQL text
        unquoted, so it must be a trusted, valid identifier.
    """
    # The context manager closes the connection even when a statement raises,
    # so a failed load does not leave the database file held open.
    with duckdb.connect(db_name) as conn:
        conn.execute(f"DROP TABLE IF EXISTS {table}")
        conn.execute(f"CREATE TABLE {table} AS SELECT * FROM df")
    print(f"Data loaded into DuckDB database: {db_name}")

In [3]:
# Simulate the cross-sectional sample with the default arguments
# (N=10_000_000, seed=12345) and persist it to an on-disk DuckDB file;
# the table name falls back to the helper's default, "data".
df = generate_sample_data()
db_name = 'large_dataset.db'
create_duckdb_database(df, db_name)

Data loaded into DuckDB database: large_dataset.db


In [4]:
# Peek at the first rows of the stored table. The context manager closes the
# connection once the preview is fetched, so no open handle to the database
# file lingers into the following cells.
db_name = 'large_dataset.db'
query = "SELECT * FROM data limit 5"
with duckdb.connect(db_name) as conn:
    preview = conn.execute(query).fetchdf()
preview

Unnamed: 0,Y,Y2,D,f1,f2,rowid
0,27.226866,24.444717,1.0,16.0,5.0,0
1,35.088713,35.392007,0.0,1.0,17.0,1
2,22.472292,21.185366,1.0,6.0,8.0,2
3,39.842856,38.01272,0.0,9.0,15.0,3
4,23.136978,22.168634,0.0,10.0,6.0,4


### Regression

$$
Y_i = X_i \beta + \alpha_i + \epsilon_i
$$


In [5]:
# Fit Y ~ D + f1 + f2 against the table stored in DuckDB, requesting 100
# bootstrap replications with f1 as the cluster column.
m = DuckRegression(
    db_name='large_dataset.db',
    table_name='data',
    formula="Y ~ D + f1 + f2",
    cluster_col="f1",
    n_bootstraps=100,
    seed = 232
)
m.fit()
results = m.summary()

# Put point estimates and standard errors side by side in one table.
restab = pd.DataFrame(
    np.c_[results["point_estimate"], results["standard_error"]],
    columns=["point_estimate", "standard_error"],
)
restab

100%|██████████| 100/100 [00:27<00:00,  3.60it/s]


Unnamed: 0,point_estimate,standard_error
0,-0.000274,0.00067
1,0.999347,0.000727
2,1.000035,4.7e-05
3,2.000067,4e-05


Powered by the following queries:

In [6]:
# Show the SQL text stored on the fitted estimator: first the aggregation
# query, then the query used for the bootstrap replicates.
print(m.agg_query)
print(m.bootstrap_query)


        SELECT D, f1, f2, COUNT(*) as count, SUM(Y) as sum_Y
        FROM data
        GROUP BY D, f1, f2
        

            SELECT D, f1, f2, COUNT(*) as count, SUM(Y) as sum_Y
            FROM data
            WHERE f1 IN (SELECT unnest((?)))
            GROUP BY D, f1, f2
            


For the `DuckRegression` class, you can choose to run the model fit via the `pyfixest` package by passing `fitter="feols"`,
in which case `DuckRegression.fit()` returns a `pyfixest` model object.

In [7]:
# Same specification as the previous fit, but with fitter="feols" so that
# fit() hands back a pyfixest model object that etable() can tabulate.
m = DuckRegression(
    db_name='large_dataset.db',
    table_name='data',
    formula="Y ~ D + f1 + f2",
    cluster_col="f1",
    n_bootstraps=100,
    seed = 232,
    fitter = "feols"
)
duckreg_fit = m.fit()

# Reference fit on the in-memory DataFrame. `df` must still be the
# cross-sectional sample here; it is rebound to the panel data later on.
# NOTE(review): this formula puts f1 and f2 after the `|` (absorbed as fixed
# effects), whereas the duckreg formula enters them as regressors -- confirm
# that comparing only the coefficient on D is the intent.
feols_fit = pf.feols("Y ~ D | f1 + f2", data = df)
pf.etable([duckreg_fit, feols_fit], digits = 6)

100%|██████████| 100/100 [00:23<00:00,  4.17it/s]


                                     est1                    est2
------------  ---------------------------  ----------------------
depvar                             mean_Y                       Y
-----------------------------------------------------------------
Intercept     -0.000274*** (8e-06.000000)
D              0.999347*** (6e-06.000000)  0.999347*** (0.000941)
f1             1.000035*** (1e-06.000000)
f2             2.000067*** (1e-06.000000)
-----------------------------------------------------------------
f1                                      -                       x
f2                                      -                       x
-----------------------------------------------------------------
R2                                      -                0.994031
S.E. type                    NP Bootstrap                  by: f1
Observations                     10000000                10000000
-----------------------------------------------------------------
Significance lev

## Panel Data

In [8]:
def sim_panel(
    N=1_000_000,  # Number of units
    T=35,  # Number of time periods
    T0=5,  # Treatment starts at period T0
    tau=0.005,  # Treatment effect
    sigma_list=(5, 2, 0.01, 2),
    seed=42,
):
    """Simulate a balanced panel with unit/time effects, unit trends and AR(1) noise.

    Parameters
    ----------
    N : int
        Number of units.
    T : int
        Number of time periods per unit.
    T0 : int
        First treated period; treated units have ``W = 1`` for ``time >= T0``.
    tau : float
        Treatment effect added to ``Y`` whenever ``W = 1``.
    sigma_list : sequence of 4 floats
        Standard deviations ``(sigma_unit, sigma_time, sigma_tt, sigma_e)`` of the
        unit effects, time effects, unit-specific trend slopes and the
        first-period residual. Any length-4 sequence (tuple or list) works.
    seed : int
        Seed for the random draws.

    Returns
    -------
    pandas.DataFrame
        ``N * T`` rows, ordered by unit then time, with columns ``unit``,
        ``time``, ``Y`` and ``W``.
    """
    # A private RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.* calls, without overwriting the global RNG state.
    rng = np.random.RandomState(seed)
    sigma_unit, sigma_time, sigma_tt, sigma_e = sigma_list

    # Long-format identifiers: every unit repeated for T consecutive periods.
    unit_ids = np.repeat(np.arange(N), T)
    time_ids = np.tile(np.arange(T), N)

    # Unit intercepts, common per-period shocks, and unit-specific trend slopes.
    unit_fe = rng.normal(0, sigma_unit, N)
    time_fe = rng.normal(0, sigma_time, T)
    unit_tt = rng.normal(0, sigma_tt, N)

    # Treatment: each unit is treated with probability 0.5, switched on from T0.
    W = rng.binomial(1, 0.5, N)
    W = np.repeat(W, T)
    W = W * (time_ids >= T0)

    # Serially correlated residuals, built one period at a time for all units.
    rho = 0.7  # Autoregressive parameter for residuals
    residuals = np.zeros((N, T))
    residuals[:, 0] = rng.normal(0, sigma_e, N)
    epsilon = rng.normal(0, 1, (N, T - 1))
    # NOTE(review): the innovation scale is hard-coded to 0.5 and does not use
    # sigma_e, unlike the first-period draw above -- confirm this is intended.
    factor = 0.5 * np.sqrt(1 - rho**2)
    for t in range(1, T):
        residuals[:, t] = rho * residuals[:, t - 1] + factor * epsilon[:, t - 1]

    # Outcome
    Y = (
        np.repeat(unit_fe, T)  # unit FE
        + np.repeat(unit_tt, T) * time_ids  # unit-specific linear trend
        + tau * W  # treatment effect
        + np.tile(time_fe, N)  # time FE
        + residuals.flatten()  # idiosyncratic noise
    )

    df = pd.DataFrame({"unit": unit_ids, "time": time_ids, "Y": Y, "W": W})

    return df


# Simulate the panel with a treatment effect of 1 (overriding the default
# tau=0.005). Note that this rebinds `df`, which until now held the
# cross-sectional sample.
df = sim_panel(tau=1)

In [9]:
db_name, table_name = "panel_data.db", "panel_data"
# Write the simulated panel (`df`) to the database, replacing any earlier copy
# of the table. The context manager closes the connection even if one of the
# statements fails.
with duckdb.connect(db_name) as conn:
    conn.execute(f"DROP TABLE IF EXISTS {table_name}")
    conn.execute(f"CREATE TABLE {table_name} AS SELECT * from df")

Peek at the data

In [10]:
# Fetch the first rows of the table that was just written, using the same
# `table_name` as the write step, and close the connection before displaying.
with duckdb.connect(db_name) as conn:
    panel_preview = conn.execute(f"SELECT * FROM {table_name} LIMIT 5").fetchdf()
panel_preview

   unit  time         Y  W
0     0     0  3.096240  0
1     0     1  2.656935  0
2     0     2  5.084101  0
3     0     3  3.274999  0
4     0     4  4.638763  0


In [11]:
from duckreg.estimators import DuckMundlak, DuckDoubleDemeaning

### Mundlak


One-way Mundlak

$$
Y_{it} = \alpha + \beta X_{it} + \gamma \bar{X}_{i} + \epsilon_{it}
$$

Two-way Mundlak

$$
Y_{it} = \alpha + \beta X_{it} + \gamma \bar{X}_{i, \cdot} + \delta \bar{X}_{\cdot, t} + \epsilon_{it}
$$

both of which can be compressed easily with `duckdb`.

These representations are much more efficient than the above general procedure because the unit and time fixed effects are typically very high dimensional, but covariate averages are not. Also see [Arkhangelsky and Imbens](https://arxiv.org/abs/1807.02099) on this.

In [12]:
# Mundlak regression of Y on W for the panel table, requesting 50 bootstrap
# replications with `unit` as the cluster column. The queries printed in the
# next cell show that both unit and time averages of W enter the design matrix.
mundlak = DuckMundlak(
    db_name="panel_data.db",
    table_name="panel_data",
    outcome_var="Y",
    covariates=["W"],
    unit_col="unit",
    time_col="time",
    cluster_col="unit",
    n_bootstraps=50,
    seed = 929
)
mundlak.fit()

mundlak_results = mundlak.summary()

# Put point estimates and standard errors side by side in one table.
restab = pd.DataFrame(
    np.c_[mundlak_results["point_estimate"], mundlak_results["standard_error"]],
    columns=["point_estimate", "standard_error"],
)
restab

100%|██████████| 50/50 [04:45<00:00,  5.71s/it]


Unnamed: 0,point_estimate,standard_error
0,0.896134,0.005633
1,1.003877,0.001765
2,0.009106,0.009642
3,-2.413955,0.002209


Powered by the following sequence of queries

In [13]:
# Show the SQL text stored on the fitted estimator: unit averages, time
# averages, the joined design matrix, the compression (GROUP BY) query, and
# the bootstrap query.
print(mundlak.unit_avg_query)
print(mundlak.time_avg_query)
print(mundlak.design_matrix_query)
print(mundlak.compress_query)
print(mundlak.bootstrap_query)


        CREATE TEMP TABLE unit_avgs AS
        SELECT unit,
               AVG(W) AS avg_W_unit
        FROM panel_data
        GROUP BY unit
        

            CREATE TEMP TABLE time_avgs AS
            SELECT time,
                   AVG(W) AS avg_W_time
            FROM panel_data
            GROUP BY time
            

        CREATE TEMP TABLE design_matrix AS
        SELECT
            t.unit,
            t.time,
            t.Y,
            t.W,
            u.avg_W_unit
            , tm.avg_W_time
        FROM panel_data t
        JOIN unit_avgs u ON t.unit = u.unit
        JOIN time_avgs tm ON t.time = tm.time
        

        SELECT
            W,
            avg_W_unit
            , avg_W_time,
            COUNT(*) as count,
            SUM(Y) as sum_Y
        FROM design_matrix
        GROUP BY W,
                    avg_W_unit
                    , avg_W_time
        

            SELECT
                W,
                avg_W_unit
                , avg_W_time,
      

### Double Demeaning

$$
Y_{it} = \alpha + \ddot{X}_{it} \beta + \epsilon_{it}
$$

where $\ddot{X}_{it} = X_{it} - \bar{X}_{i, \cdot} - \bar{X}_{\cdot, t} + \bar{X}$

In [14]:
# Double-demeaning estimator: Y regressed on the demeaned treatment W,
# requesting 100 bootstrap replications with `unit` as the cluster column.
# The queries printed in the next cell show how the demeaned W is built.
double_demean = DuckDoubleDemeaning(
    db_name="panel_data.db",
    table_name="panel_data",
    outcome_var="Y",
    treatment_var="W",
    unit_col="unit",
    time_col="time",
    cluster_col="unit",
    n_bootstraps=100,
    seed = 828
)

double_demean.fit()

dd_results = double_demean.summary()

# Put point estimates and standard errors side by side in one table.
restab = pd.DataFrame(
    np.c_[dd_results["point_estimate"], dd_results["standard_error"]],
    columns=["point_estimate", "standard_error"],
)
restab

100%|██████████| 100/100 [10:14<00:00,  6.15s/it]


Unnamed: 0,point_estimate,standard_error
0,0.295524,0.003857
1,1.003877,0.002097


In [15]:
# Show the SQL text stored on the fitted estimator: overall, unit and time
# means of W, the double-demeaning join, the compression (GROUP BY) query,
# and the bootstrap query.
print(double_demean.overall_mean_query)
print(double_demean.unit_mean_query)
print(double_demean.time_mean_query)
print(double_demean.double_demean_query)
print(double_demean.compress_query)
print(double_demean.bootstrap_query)


        CREATE TEMP TABLE overall_mean AS
        SELECT AVG(W) AS mean_W
        FROM panel_data
        

        CREATE TEMP TABLE unit_means AS
        SELECT unit, AVG(W) AS mean_W_unit
        FROM panel_data
        GROUP BY unit
        

        CREATE TEMP TABLE time_means AS
        SELECT time, AVG(W) AS mean_W_time
        FROM panel_data
        GROUP BY time
        

        CREATE TEMP TABLE double_demeaned AS
        SELECT
            t.unit,
            t.time,
            t.Y,
            t.W - um.mean_W_unit - tm.mean_W_time + om.mean_W AS ddot_W
        FROM panel_data t
        JOIN unit_means um ON t.unit = um.unit
        JOIN time_means tm ON t.time = tm.time
        CROSS JOIN overall_mean om
        

        SELECT
            ddot_W,
            COUNT(*) as count,
            SUM(Y) as sum_Y
        FROM double_demeaned
        GROUP BY ddot_W
        

            SELECT
                ddot_W,
                COUNT(*) as count,
                SUM(Y) a