In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Set matplotlib's default font size to 20.
plt.rcParams.update({'font.size': 20})

# Number of rows drawn per simulated data set; passed as `size` to the
# multivariate-normal sampler in generate_xs.
NUMBER_EXAMPLES = 100000 # Number of intervals

# Mean vector of the 3-dimensional normal draw in generate_xs.
MEAN = [0, 0, 0] # Means for the three features

In [2]:
import contextlib
import io

def generate_xs(cov, rng=None, mean=None, size=None):
    """Draw three jointly normal features and non-linearly warp the last two.

    A sample is taken from a 3-dimensional multivariate normal distribution.
    The first feature is returned as drawn, the second is replaced by
    ``-0.2 * x2**2`` and the third by ``0.05 * x3**3``.

    Parameters
    ----------
    cov : array-like, shape (3, 3)
        Covariance matrix of the underlying normal draw.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, the module-level ``np.random``
        functions (global legacy state) are used, which is the historical
        behaviour of this function.
    mean : array-like of length 3, optional
        Mean vector of the normal draw. Defaults to the module constant
        ``MEAN``.
    size : int, optional
        Number of rows to draw. Defaults to the module constant
        ``NUMBER_EXAMPLES``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x1s, x2s, x3s)``, each a 1-D array with ``size`` entries.
    """
    sampler = np.random if rng is None else rng
    # Resolve the defaults lazily so the module constants are only needed
    # when the caller does not override them.
    if mean is None:
        mean = MEAN
    if size is None:
        size = NUMBER_EXAMPLES

    # The draw has shape (size, 3); transposing yields one row per feature.
    x1s, x2s, x3s = sampler.multivariate_normal(mean, cov, size=size).T

    x2s = -0.2 * x2s**2
    x3s = 0.05 * x3s**3

    return x1s, x2s, x3s

def generate_ys(coeffs, pows, x1s, x2s, x3s, rng=None):
    """Sample Poisson counts whose log-rate is a polynomial in the features.

    The log-rate is::

        a + b1*x1**pow1 + b2*x2**pow2 + b3*x3**pow3
          + c12*x1*x2 + c13*x1*x3 + c23*x2*x3

    and each count is drawn from ``Poisson(exp(log-rate))``.

    Parameters
    ----------
    coeffs : sequence of 7 numbers
        ``(a, b1, b2, b3, c12, c13, c23)``: intercept, the three main-effect
        coefficients and the three pairwise-interaction coefficients.
    pows : sequence of 3 numbers
        Exponents applied to ``x1s``, ``x2s`` and ``x3s`` in the main-effect
        terms.
    x1s, x2s, x3s : array-like
        Feature values; combined elementwise.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, ``np.random.poisson`` (global
        legacy state) is used, which is the historical behaviour.

    Returns
    -------
    numpy.ndarray
        Sampled counts, one per feature row.
    """
    a, b1, b2, b3, c12, c13, c23 = coeffs
    pow1, pow2, pow3 = pows

    log_rate = (
        a
        + b1*x1s**pow1
        + b2*x2s**pow2
        + b3*x3s**pow3
        + c12*x1s*x2s
        + c13*x1s*x3s
        + c23*x2s*x3s
    )

    sampler = np.random if rng is None else rng
    return sampler.poisson(lam=np.exp(log_rate))

def fit_poisson_model_of_ys(data, features):
    """Fit a Poisson regression of ``data['y']`` on ``features`` plus a constant.

    Side effect: two columns are written to ``data`` in place, suffixed with
    the number of features ``k = len(features)``:

    * ``predicted_y_model{k}``: the fitted model's prediction for each row;
    * ``standardized_residuals_model{k}``: ``(predicted - y) / sqrt(predicted)``,
      i.e. predicted minus observed, scaled by the root of the prediction.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'y'`` column and every column named in ``features``.
    features : list of str
        Column names used as regressors.

    Returns
    -------
    The ``params`` attribute of the fitted statsmodels result.
    """
    k = len(features)
    predicted_col = f'predicted_y_model{k}'
    residual_col = f'standardized_residuals_model{k}'

    # Design matrix: the chosen feature columns with a constant column added.
    design = sm.add_constant(data[features])
    fitted = sm.Poisson(data['y'], design).fit()

    data[predicted_col] = fitted.predict(design)
    predicted = data[predicted_col]
    data[residual_col] = (predicted - data['y']) / np.sqrt(predicted)

    return fitted.params

def fit_poisson_model_wrapper(data): 
    """Fit three nested Poisson models and print each coefficient vector.

    Model k uses the first k of the columns x1, x2, x3 as regressors:
    Model 1 is y ~ x1, Model 2 is y ~ x1 + x2, Model 3 is y ~ x1 + x2 + x3.
    Each fit goes through fit_poisson_model_of_ys, so ``data`` gains the
    prediction and residual columns that function writes.
    """
    nested_features = ['x1', 'x2', 'x3']
    for k in range(1, len(nested_features) + 1):
        params = fit_poisson_model_of_ys(data, nested_features[:k])
        print(f"Coefficients for Model {k}:", params)

def compute_bias_effect_sizes(data, num_feat): 
    """Report which feature is most strongly tied to a model's residuals.

    The columns x1, x2, x3 are standardized with StandardScaler, and the
    ``standardized_residuals_model{num_feat}`` column is regressed on them
    with an ordinary LinearRegression. The feature whose coefficient has the
    largest absolute value is printed together with that absolute value.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain x1, x2, x3 and the residual column for ``num_feat``.
        The frame is copied first and is not modified.
    num_feat : int
        Selects which model's residual column to analyse.
    """
    feature_names = ['x1', 'x2', 'x3']
    frame = data.copy()

    scaled_features = StandardScaler().fit_transform(frame[feature_names])
    residuals = frame[f'standardized_residuals_model{num_feat}']

    regression = LinearRegression().fit(scaled_features, residuals)

    # Track the coefficient with the largest magnitude. Starting from 0 with
    # a strict comparison means the name stays None if no coefficient is
    # strictly greater than zero in absolute value.
    strongest_name, strongest_magnitude = None, 0
    for feature, weight in zip(feature_names, regression.coef_):
        magnitude = abs(weight)
        if magnitude > strongest_magnitude:
            strongest_name, strongest_magnitude = feature, magnitude

    print(
        f"Feature with the largest direct effect on the bias of Model {num_feat}: "
        f"{strongest_name} with absolute effect size {strongest_magnitude}"
    )

def compute_bias_effect_sizes_wrapper(data):
    """Run compute_bias_effect_sizes for Models 1, 2 and 3, in that order."""
    for model_index in (1, 2, 3):
        compute_bias_effect_sizes(data, num_feat=model_index)

def compute_bias_effect_sizes_for_various_x_distributions_and_y_models(seed=None):
    """Run the residual-bias analysis over a grid of simulated data sets.

    For each covariance matrix (correlated and independent features) one set
    of features is generated. For each (coefficients, powers) pair, counts are
    then simulated, the three nested Poisson models are fitted, and the
    feature most associated with each model's residuals is printed.

    Parameters
    ----------
    seed : int, optional
        When given, NumPy's global random state is seeded before any data is
        generated so the printed results are reproducible. When omitted
        (default), the global state is left untouched, as before.
    """
    if seed is not None:
        # generate_xs / generate_ys sample through the module-level
        # np.random functions, so seeding the global state covers both.
        np.random.seed(seed)

    cov1 = [
        [1.0, 0.8, 0.5],  # Variance of X1 and covariances with X2 and X3
        [0.8, 1.0, 0.3],  # Covariance of X2 with X1 and variance of X2 and covariance with X3
        [0.5, 0.3, 1.0]   # Covariance of X3 with X1 and X2 and variance of X3
    ]
    cov2 = [
        [1.0, 0.0, 0.0],  # Variance of X1 and covariances with X2 and X3
        [0.0, 1.0, 0.0],  # Covariance of X2 with X1 and variance of X2 and covariance with X3
        [0.0, 0.0, 1.0]   # Covariance of X3 with X1 and X2 and variance of X3
    ]
    covs = [cov1, cov2]

    # Each row is (a, b1, b2, b3, c12, c13, c23) as unpacked by generate_ys;
    # the interaction coefficients are all zero here.
    coefficients = [
        [5, 0.5, 0.4, 0.3, 0, 0, 0],
        [5, 0.05, 0.04, 0.03, 0, 0, 0],
    ]
    # Exponents for the main-effect terms. Paired with `coefficients` by
    # position: the first row goes with the first, the second with the second.
    powers = [
        [1, 1, 1],
        [2, 2, 2],
    ]

    for cov in covs:
        # Features are drawn once per covariance and reused for every y-model.
        x1s, x2s, x3s = generate_xs(cov)
        for coeffs, pows in zip(coefficients, powers):
            print('x covariances', cov)
            print('x coefficients', coeffs)
            print('x powers', pows)
            print()
            ys = generate_ys(coeffs, pows, x1s, x2s, x3s)
            data = pd.DataFrame({'x1': x1s, 'x2': x2s, 'x3': x3s, 'y': ys})
            # Discard everything printed to stdout while fitting; only the
            # bias summary below is shown.
            with contextlib.redirect_stdout(io.StringIO()):
                fit_poisson_model_wrapper(data)
            compute_bias_effect_sizes_wrapper(data)
            print()
            print('***************************************')
            print()

# Run the whole experiment grid; the summary it prints is the cell output.
compute_bias_effect_sizes_for_various_x_distributions_and_y_models()

x covariances [[1.0, 0.8, 0.5], [0.8, 1.0, 0.3], [0.5, 0.3, 1.0]]
x coefficients [5, 0.5, 0.4, 0.3, 0, 0, 0]
x powers [1, 1, 1]

Feature with the largest direct effect on the bias of Model 1: x2 with absolute effect size 1.1701629498670179
Feature with the largest direct effect on the bias of Model 2: x3 with absolute effect size 0.7888239488950385
Feature with the largest direct effect on the bias of Model 3: x2 with absolute effect size 0.0017621630918475677

***************************************

x covariances [[1.0, 0.8, 0.5], [0.8, 1.0, 0.3], [0.5, 0.3, 1.0]]
x coefficients [5, 0.05, 0.04, 0.03, 0, 0, 0]
x powers [2, 2, 2]

Feature with the largest direct effect on the bias of Model 1: x2 with absolute effect size 0.856150566291958
Feature with the largest direct effect on the bias of Model 2: x2 with absolute effect size 0.010142767855590675
Feature with the largest direct effect on the bias of Model 3: x2 with absolute effect size 0.0101515614371161

**************************