In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import statsmodels.api as sm

from sklearn import preprocessing

In [2]:
class BinomialFactorial(object):
    """Synthetic factorial experiment with binary treatments.

    Every unit receives ``k`` treatments, each drawn independently from a
    Binomial(1, ``p_t``) distribution. The treatment matrix is expanded with an
    interaction-only ``PolynomialFeatures`` transform of degree ``order + 1``
    (bias column included), and the outcome is a linear function of the
    expanded columns plus zero-mean Gaussian noise.

    Parameters
    ----------
    n : int
        Number of units generated by each call to :meth:`sample`.
    p_t : float, default 0.5
        Success probability handed to the binomial sampler for the treatments.
    k : int, default 1
        Number of treatment columns.
    sigma : float, default 0.1
        Standard deviation of the additive Gaussian noise.
    order : int, default 1
        The interaction expansion uses ``degree = order + 1``.
    sparsity : float, default 0.5
        Fraction of the coefficients that are set to exactly zero; the number
        zeroed is ``int(n_coefficients * sparsity)``.
    beta_seed : int, default 42
        Seed of the generator used to draw the ground-truth coefficients.

    Attributes
    ----------
    xfm : PolynomialFeatures
        Fitted interaction expansion for ``k`` input columns.
    beta : numpy.ndarray
        float32 ground-truth coefficients, one per expanded column.
    rng, mu, eps
        Set by the most recent :meth:`sample` call: the sampling generator,
        the noiseless response, and the noise draw.
    """

    def __init__(
        self,
        n,
        p_t=0.5,
        k=1,
        sigma=0.1,
        order=1,
        sparsity=0.5,
        beta_seed=42,
    ) -> None:
        self.n = n
        self.p_t = p_t
        self.k = k
        self.sigma = sigma
        self.order = order
        self.sparsity = sparsity
        self.beta_seed = beta_seed
        # Dedicated generator for the coefficients, so beta depends only on
        # beta_seed and not on the seed later passed to sample().
        self.rng_beta = np.random.default_rng(beta_seed)
        # Interaction expansion shared by the constructor and sample().
        self.xfm = preprocessing.PolynomialFeatures(
            degree=self.order + 1, interaction_only=True, include_bias=True
        )
        # A single all-zero row of width k is enough to fit the expansion and
        # learn how many output columns it produces.
        self.xfm.fit(np.zeros((1, self.k), dtype="float32"))
        n_terms = self.xfm.n_output_features_
        # Draw dense coefficients first, then zero out a random subset. The
        # order of these two generator calls determines the resulting beta.
        self.beta = self.rng_beta.normal(0, 1, n_terms).astype("float32")
        zero_indices = self.rng_beta.choice(
            n_terms,
            size=int(n_terms * self.sparsity),
            replace=False,
        )
        self.beta[zero_indices] = 0.0

    def sample(self, seed=None):
        """Draw one dataset of ``n`` units.

        Parameters
        ----------
        seed : int or None, default None
            Seed for the sampling generator; it is passed straight to
            ``np.random.default_rng``.

        Returns
        -------
        t : numpy.ndarray
            float32 array of shape ``(n, k)`` holding the 0/1 treatments.
        y : numpy.ndarray
            Array of length ``n`` with the noisy outcomes ``mu + eps``.

        Notes
        -----
        The generator, the noiseless response and the noise draw are kept on
        the instance as ``rng``, ``mu`` and ``eps``.
        """
        self.rng = np.random.default_rng(seed)
        # sample treatment array
        t = self.rng.binomial(1, self.p_t, (self.n, self.k)).astype("float32")
        # Expand with the transformer fitted in __init__; refitting on every
        # draw is unnecessary because the number of columns never changes.
        T = self.xfm.transform(t)
        # build response surface
        self.mu = T @ self.beta
        # sample outcome
        self.eps = self.rng.normal(0, self.sigma, size=self.n)
        y = self.mu + self.eps
        return t, y

## Initialize Dataset

In [9]:
# Experiment settings, kept as named variables so they can be tuned in one place.
n, k = 1000, 3  # units per sample, number of binary treatments
sigma = 0.1  # forwarded as the noise standard deviation
order, sparsity = 3, 0.5  # expansion uses degree order + 1; fraction of zeroed betas

# p_t is left at its default of 0.5; beta_seed fixes the ground-truth coefficients.
ds = BinomialFactorial(
    n=n, k=k, sigma=sigma, order=order, sparsity=sparsity, beta_seed=42
)

## Print the true beta terms 
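The coefficients are ordered as [intercept, $\beta_{t_1}$, $\beta_{t_2}$, $\beta_{t_3}$, $\beta_{t_1, t_2}$, $\beta_{t_1, t_3}$, $\beta_{t_2, t_3}$, $\beta_{t_1, t_2, t_3}$], i.e. the intercept first, then the main effects, then the two-way interactions, then the three-way interaction.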

In [10]:
# Ground-truth coefficients drawn in BinomialFactorial.__init__; entries equal to
# 0. are the ones zeroed out by the sparsity setting.
print(ds.beta)

[ 0.        -1.0399841  0.         0.        -1.9510351  0.
  0.1278404 -0.3162426]


## Sample a dataset

In [17]:
# Draw one dataset with a fixed seed: t holds the 0/1 treatment assignments and
# y the corresponding noisy outcomes.
t, y = ds.sample(seed=0)
print(t.shape, y.shape)

(1000, 3) (1000,)


## Fit an incorrect linear model

In [12]:
# Regress y on an intercept plus the raw treatment columns only. The data were
# generated from an interaction expansion of t, so this main-effects model
# leaves out every interaction column and is misspecified whenever an
# interaction coefficient is non-zero.
m = sm.OLS(y, sm.add_constant(t))
results = m.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.812
Model:                            OLS   Adj. R-squared:                  0.812
Method:                 Least Squares   F-statistic:                     1437.
Date:                Thu, 22 Feb 2024   Prob (F-statistic):               0.00
Time:                        11:49:15   Log-Likelihood:                -783.35
No. Observations:                1000   AIC:                             1575.
Df Residuals:                     996   BIC:                             1594.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.5058      0.034     14.866      0.0

## Fit a well-specified linear model

In [16]:
# Reuse the generator's own fitted interaction expansion instead of declaring a
# second PolynomialFeatures with hand-copied settings, so the design-matrix
# columns are guaranteed to line up one-to-one with the entries of ds.beta.
T = ds.xfm.transform(t)
print(T.shape)

(1000, 8)


In [18]:
# T already carries a bias column (it was built with include_bias=True), so no
# sm.add_constant call is needed here.
m = sm.OLS(y, T)
results = m.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.993
Model:                            OLS   Adj. R-squared:                  0.993
Method:                 Least Squares   F-statistic:                 2.093e+04
Date:                Thu, 22 Feb 2024   Prob (F-statistic):               0.00
Time:                        11:50:46   Log-Likelihood:                 881.00
No. Observations:                1000   AIC:                            -1746.
Df Residuals:                     992   BIC:                            -1707.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0085      0.009      0.964      0.3

In [20]:
# Print the ground-truth coefficients again for side-by-side comparison with the
# coef column of the regression summary above.
print(ds.beta)

[ 0.        -1.0399841  0.         0.        -1.9510351  0.
  0.1278404 -0.3162426]


We see that the estimated coefficients (the `coef` column of the summary) are good approximations of the true beta parameters of the model.