In [1]:
import numpy as np
import pandas as pd

### Data Generation

Metrics
- Continuous
- Binary
- Ratio

Data structure
- User-level data and randomization
- Pre-treatment metrics and other covariates for variance reduction and bias correction
- Treatment indicator

In [None]:
# Simulation settings read by the data-generation cell below.
n_users = 1000       # number of simulated users (rows of W)
n_features = 10      # number of feature columns in W
n_confounders = 4    # number of W columns sampled into X
endogenous = False   # True: assignment probability depends on features; False: fair coin flip
seed = 123           # value handed to np.random.seed


def treatment_effect_func(x):
    """Return the treatment effect for one user's covariate row (a constant 5)."""
    return 5


In [None]:
# Reseed NumPy's global RNG so every draw below is reproducible.
np.random.seed(seed)

# Covariates and features: one standard-normal row per user.
W = np.random.normal(0, 1, size=(n_users, n_features))

# Treatment effect: a random subset of W's columns (without repeats) forms X,
# and the effect function is evaluated once per user row of X.
X = W[:, np.random.choice(n_features, n_confounders, replace=False)]

TE = np.array([treatment_effect_func(x_i) for x_i in X])

# Treatment assignment
if endogenous:
    # Assignment probability depends on the first five columns of W
    # (not on the sampled X columns specifically), plus uniform noise.
    coefs_T = np.random.uniform(0, 1, size=n_features)
    log_odds = np.dot(W[:, :5], coefs_T[:5]) + np.random.uniform(-1, 1, size=n_users)
    T_sigmoid = 1 / (1 + np.exp(-log_odds))
    # One Bernoulli draw per user; binomial broadcasts over the probability array.
    T = np.random.binomial(1, T_sigmoid)
else:
    # Pure randomization: each user is treated with probability 0.5.
    T = np.random.binomial(1, 0.5, n_users)

# Outcome coefficients: one weight per feature column of W.
coefs_Y = np.random.uniform(0, 1, size=n_features)

In [None]:
def generate_ab_test_data(
    n_users=1000,  # Number of users
    n_covariates=4,  # Number of individual characteristics
    n_features=10,  # Number of feature variables
    treatment_effect_func=lambda x: np.exp(2 * x[0]),  # Heterogeneous treatment effect
    beta=5,  # Baseline treatment effect
    endogenous=False,  # Whether treatment assignment is endogenous
    seed=42  # Random seed for reproducibility
):
    """Simulate user-level A/B test data with a known treatment effect.

    The outcome is ``Y = TE * T + W @ coefs_Y + epsilon`` where ``W`` is a
    standard-normal feature matrix, ``coefs_Y`` is drawn from U(0, 1),
    ``epsilon`` from U(-1, 1), and ``TE`` is ``treatment_effect_func`` applied
    to each user's covariate row.

    Parameters
    ----------
    n_users : int
        Number of rows (users) to simulate.
    n_covariates : int
        Number of columns of ``W`` sampled (without replacement) into ``X``.
    n_features : int
        Number of columns in the feature matrix ``W``.
    treatment_effect_func : callable
        Called once per user with that user's row of ``X``; its return value
        is that user's treatment effect.
    beta : numeric
        NOTE(review): accepted but not used anywhere in the computation. Kept
        so existing calls stay valid; confirm how a baseline effect is meant
        to enter ``TE`` before wiring it in.
    endogenous : bool
        If True, the treatment probability is a logistic function of
        ``W @ coefs_T`` plus U(-1, 1) noise; otherwise each user is treated
        with probability 0.5.
    seed : int
        Passed to ``np.random.seed``. This reseeds NumPy's *global* RNG as a
        side effect.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``user_id`` (1..n_users), ``T`` (0/1),
        ``X1..X{n_covariates}``, ``W1..W{n_features}``, ``Y``. ``user_id`` and
        ``T`` are integer-typed; the remaining columns are floats.

    Raises
    ------
    ValueError
        If ``n_covariates`` exceeds ``n_features``.
    """
    if n_covariates > n_features:
        raise ValueError(
            f"n_covariates ({n_covariates}) cannot exceed n_features ({n_features})"
        )

    np.random.seed(seed)

    # Generate features
    W = np.random.normal(0, 1, size=(n_users, n_features))

    # Select a random subset of W as covariates X
    covariate_indices = np.random.choice(n_features, n_covariates, replace=False)
    X = W[:, covariate_indices]

    # Define heterogeneous treatment effects (one value per user)
    TE = np.array([treatment_effect_func(x_i) for x_i in X])

    # Generate treatment assignment
    if endogenous:
        # Treatment probability depends on all features through a logistic link
        coefs_T = np.random.uniform(0, 1, size=n_features)
        log_odds = np.dot(W, coefs_T) + np.random.uniform(-1, 1, size=n_users)
        T_sigmoid = 1 / (1 + np.exp(-log_odds))
        # binomial broadcasts over the probability array: one draw per user,
        # in row order, replacing the per-row Python loop.
        T = np.random.binomial(1, T_sigmoid)
    else:
        # Random treatment assignment
        T = np.random.binomial(1, 0.5, n_users)

    # Generate outcomes
    coefs_Y = np.random.uniform(0, 1, size=n_features)
    epsilon = np.random.uniform(-1, 1, size=n_users)
    Y = TE * T + np.dot(W, coefs_Y) + epsilon

    # Assemble the frame column by column so user_id and T keep their integer
    # dtype (stacking everything into one array would coerce them to float).
    columns = {
        "user_id": np.arange(1, n_users + 1),
        "T": T,
    }
    columns.update({f"X{i + 1}": X[:, i] for i in range(n_covariates)})
    columns.update({f"W{i + 1}": W[:, i] for i in range(n_features)})
    columns["Y"] = Y

    return pd.DataFrame(columns)

# Example usage: simulate 1,000 users with feature-dependent (endogenous)
# treatment assignment and print the first five rows.
data = generate_ab_test_data(endogenous=True, n_users=1000)
print(data.head(n=5))
