## Experiments with real-world data

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from sklearn.linear_model import LassoCV
from factorial_model import FactorialModel
from forward_selection import ForwardSelection
from evaluate import *

from scipy.special import comb

### Charitable Giving

In [3]:
# Load the data into a pandas dataframe
# (Stata file read from a path relative to the notebook's working directory)
df = pd.read_stata('data/AER merged.dta')

In [4]:
# Pull the ten treatment columns plus `amount` (kept as the last column) out
# of the dataframe as a single numpy array
X = df[[
    'ratio', 'ratio2', 'ratio3',
    'size25', 'size50', 'size100', 'sizeno',
    'askd1', 'askd2', 'askd3',
    'amount',
]].values
# Recode column 0 (`ratio`) in place: 1 where the value equals 1, else 0
X[:, 0] = np.where(X[:, 0] == 1, 1, 0)

In [5]:
# Set up the factorial model for the charitable-giving array `X`
MAX_DEGREE = 3  # passed as `degree`; presumably the max interaction order — confirm in FactorialModel
fm = FactorialModel(
    n=len(X),          # one observation per row of X
    k=X.shape[1] - 1,  # every column of X except the last one
    degree=MAX_DEGREE,
    contrast_coding=True,
    beta_seed=0,
)

In [6]:
# Pass all but the last column of X (cast to int) and the last column (cast to
# float) to the model, then run the Lasso evaluation on it.
# NOTE(review): seed=None presumably leaves the split unseeded, so the metrics
# printed further down may differ between runs — confirm and consider a fixed seed.
fm.convert_and_split_data(
    X[:, :-1].astype(int),
    X[:, -1].astype(float),
    is_dummy_coded=True,
    seed=None,
)
evaluate_lasso(fm)

In [7]:
# Fit forward selection on the training split held by `fm`, then evaluate it
# on the matching test split
fs = ForwardSelection(
    fm.T_train,
    fm.y_train,
    fm.k,
    MAX_DEGREE,
    strong_heredity=False,
)
evaluate_forward_selection(fs, fm.T_test, fm.y_test)

In [8]:
# Report the MSE stored on each evaluated object, one line per method
print(
    f"Lasso MSE: {fm.mse}",
    f"Forward Selection MSE: {fs.mse}",
    sep="\n",
)

Lasso MSE: 58.912250366960755
Forward Selection MSE: 58.89970243384569


In [9]:
# Report the R^2 stored on each evaluated object, one line per method
print(
    f"Lasso R^2: {fm.r2}",
    f"Forward Selection R^2: {fs.r2}",
    sep="\n",
)

Lasso R^2: 0.0003582878972400172
Forward Selection R^2: 0.0


## Appendix

### 5-bit DAC process (Liu and Huang, NIST)

In [None]:
# Read the DAC data file, skipping its first 25 lines, and keep only the first
# six columns (the response and treatment variables)
dac = np.loadtxt("data/LIU.DAT", skiprows=25)[:, :6]

In [None]:
# Set up the factorial model for the DAC array; `degree` equals `k`, i.e. the
# number of columns after the first one
fm = FactorialModel(
    n=len(dac),
    k=dac.shape[1] - 1,
    degree=dac.shape[1] - 1,
    contrast_coding=True,
    beta_seed=0,
)

In [None]:
# Run the Lasso / forward-selection comparison NUM_TRIALS times on the DAC
# data, collecting per-trial metrics, coefficients and expected outcomes, then
# average the coefficient and expected-outcome vectors across trials.
# NOTE(review): seed=None presumably gives a fresh unseeded split each trial,
# so results are not reproducible run-to-run — confirm.
NUM_TRIALS = 100
lasso_mses, fs_mses = [], []
lasso_r2s, fs_r2s = [], []
lasso_betas, fs_betas = [], []
lasso_expected_outcomes, fs_expected_outcomes = [], []

for trial in range(NUM_TRIALS):
    # --- Lasso: column 0 of `dac` is the response, the rest are treatments ---
    fm.convert_and_split_data(dac[:, 1:], dac[:, 0], is_dummy_coded=True, seed=None)
    evaluate_lasso(fm)
    lasso_mses.append(fm.mse)
    lasso_r2s.append(fm.r2)
    lasso_betas.append(fm.beta_hat)
    lasso_expected_outcomes.append(fm.expected_outcomes)

    # --- Forward selection on the same train/test split ---
    fs = ForwardSelection(fm.T_train, fm.y_train, fm.k, fm.degree, strong_heredity=False)
    evaluate_forward_selection(fs, fm.T_test, fm.y_test)
    fs_mses.append(fs.mse)
    fs_r2s.append(fs.r2)
    fs_betas.append(fs.results.params)

    # Expand the feature-power table through the model's feature transformer
    # and multiply by the fitted parameters to get the expected outcomes
    beta_mask = fm.pf.fit_transform(fm.pf.powers_)
    fs_expected_outcome = beta_mask @ fs.results.params
    fs_expected_outcomes.append(fs_expected_outcome)

# Element-wise averages over the trial axis
avg_lasso_betas = np.mean(lasso_betas, axis=0)
avg_fs_betas = np.mean(fs_betas, axis=0)
avg_lasso_expected_outcomes = np.mean(lasso_expected_outcomes, axis=0)
avg_fs_expected_outcomes = np.mean(fs_expected_outcomes, axis=0)

In [None]:
# Compute sparsity for theoretical lower bound on observations
# (number of nonzero entries in the trial-averaged Lasso coefficient vector).
# NOTE(review): because the vector is an average over trials, a coefficient
# that is nonzero in even one trial will typically count here — confirm this
# is the intended notion of sparsity.
dac_sparsity = np.count_nonzero(avg_lasso_betas)

In [None]:
# MSE comparison: mean and (population) standard deviation over the trials
avg_lasso_mse, std_lasso_mse = np.mean(lasso_mses), np.std(lasso_mses)
avg_fs_mses, std_fs_mses = np.mean(fs_mses), np.std(fs_mses)
print(f"Lasso MSE: {avg_lasso_mse} +/- {std_lasso_mse}")
print(f"Forward Selection MSE: {avg_fs_mses} +/- {std_fs_mses}")

In [None]:
# R2 comparison: mean and (population) standard deviation over the trials
avg_lasso_r2, std_lasso_r2 = np.mean(lasso_r2s), np.std(lasso_r2s)
avg_fs_r2, std_fs_r2 = np.mean(fs_r2s), np.std(fs_r2s)
print(f"Lasso R2: {avg_lasso_r2} +/- {std_lasso_r2}")
print(f"Forward Selection R2: {avg_fs_r2} +/- {std_fs_r2}")

In [None]:
# Plot MSE boxplot: distribution of the per-trial MSE values for each method
plt.figure()
plt.boxplot([lasso_mses, fs_mses])
# Name the two boxes through xticks instead of boxplot's `labels=` keyword:
# Matplotlib 3.9 deprecated `labels` in favour of `tick_labels`, whereas
# xticks works on every version and matches the other boxplot cells here.
plt.xticks([1, 2], ["Lasso", "Forward Selection"])
plt.ylabel("MSE")
plt.title("MSE Comparison")
plt.show()

In [None]:
# Plot R2 boxplot: distribution of the per-trial R2 values for each method
plt.figure()
plt.boxplot([lasso_r2s, fs_r2s])
plt.xticks([1, 2], ["Lasso", "Forward Selection"])
# Label the y-axis so the figure stands on its own, as the MSE boxplot does
plt.ylabel("R2")
plt.title("R2 Comparison")
plt.show()

In [None]:
# Grouped bar chart of the trial-averaged coefficients: Lasso bars sit at the
# integer positions, forward-selection bars are shifted right by one bar width
plt.figure()
bar_width = 0.35
index = np.arange(len(avg_lasso_betas))
plt.bar(x=index, height=avg_lasso_betas, width=bar_width, label="Lasso")
plt.bar(
    x=index + bar_width,
    height=avg_fs_betas,
    width=bar_width,
    label="Forward Selection",
)
plt.title("Average Beta Comparison")
plt.xlabel("Beta Index")
plt.ylabel("Beta Value")
plt.legend()
plt.show()

In [None]:
# Grouped bar chart of the trial-averaged expected outcomes: Lasso bars sit at
# the integer positions, forward-selection bars are shifted right by one bar width
plt.figure()
bar_width = 0.35
index = np.arange(len(avg_lasso_expected_outcomes))
plt.bar(x=index, height=avg_lasso_expected_outcomes, width=bar_width, label="Lasso")
plt.bar(
    x=index + bar_width,
    height=avg_fs_expected_outcomes,
    width=bar_width,
    label="Forward Selection",
)
plt.title("Average Expected Outcome Comparison")
plt.xlabel("Sample Index")
plt.ylabel("Expected Outcome")
plt.legend()
plt.show()

### Carlson (2015)

In [None]:
# Load R's `base` and `utils` packages through rpy2
base = importr('base')
utils = importr('utils')
# Install FindIt from CRAN only when it is not already available, so that
# re-running the notebook does not trigger a download/reinstall every time
robjects.r('if (!requireNamespace("FindIt", quietly = TRUE)) install.packages("FindIt", repos="https://CRAN.R-project.org/")')

In [None]:
# Import the FindIt R package, load its bundled `Carlson` dataset into the R
# session and convert it to a pandas object for use from Python.
findit = importr('FindIt')
# data(...) has to run before the lookup below so that `Carlson` exists in R
robjects.r('data("Carlson", package = "FindIt")')
carlson_rdf = robjects.r['Carlson']
carlson_df = pandas2ri.rpy2py(carlson_rdf)
# Preview the first rows
carlson_df.head()

In [None]:
# Convert Record, Coethnicity and Degree to binary, working on an integer copy
# of the first five dataframe columns
carlson = carlson_df.iloc[:, :5].to_numpy(dtype=int)
# Column 1: 1 when the value is below 4, otherwise 0
carlson[:, 1] = (carlson[:, 1] < 4).astype(int)
# Columns 3 onward: 0 when the value equals 2, otherwise 1
carlson[:, 3:] = (carlson[:, 3:] != 2).astype(int)

In [None]:
# Convert 3 promises to 2 dummy coded categories: one-hot encode column 2
# (values are shifted by -1 to index the rows of a 3x3 identity), then swap
# the original column for the first two indicator columns.
# NOTE: this cell rebuilds `carlson` from itself, so it must be run exactly
# once after the cell that creates `carlson`.
promises = np.eye(3)[carlson[:, 2] - 1]
carlson = np.insert(
    np.delete(carlson, 2, axis=1),
    2,
    promises[:, :2].T,
    axis=1,
)
carlson[:5]

In [None]:
# Set up the factorial model for the Carlson array; `degree` equals `k`, i.e.
# the number of columns after the first one
fm = FactorialModel(
    n=len(carlson),
    k=carlson.shape[1] - 1,
    degree=carlson.shape[1] - 1,
    contrast_coding=True,
    beta_seed=0,
)

In [None]:
# Run the Lasso / forward-selection comparison NUM_TRIALS times on the Carlson
# data (both evaluators are called with logistic=True), collecting per-trial
# MSEs, coefficients and expected outcomes, then average the coefficient and
# expected-outcome vectors across trials.
# NOTE(review): seed=None presumably gives a fresh unseeded split each trial,
# so results are not reproducible run-to-run — confirm.
NUM_TRIALS = 30
lasso_mses, fs_mses = [], []
lasso_betas, fs_betas = [], []
lasso_expected_outcomes, fs_expected_outcomes = [], []

for trial in range(NUM_TRIALS):
    # --- Lasso: column 0 of `carlson` is the response, the rest are treatments ---
    fm.convert_and_split_data(carlson[:, 1:], carlson[:, 0], is_dummy_coded=True, seed=None)
    evaluate_lasso(fm, logistic=True)
    lasso_mses.append(fm.mse)
    lasso_betas.append(fm.beta_hat)
    lasso_expected_outcomes.append(fm.expected_outcomes)

    # --- Forward selection on the same train/test split ---
    fs = ForwardSelection(fm.T_train, fm.y_train, fm.k, fm.degree, strong_heredity=False)
    evaluate_forward_selection(fs, fm.T_test, fm.y_test, logistic=True)
    fs_mses.append(fs.mse)
    fs_betas.append(fs.results.params)

    # Expand the feature-power table through the model's feature transformer
    # and multiply by the fitted parameters to get the expected outcomes
    beta_mask = fm.pf.fit_transform(fm.pf.powers_)
    fs_expected_outcome = beta_mask @ fs.results.params
    fs_expected_outcomes.append(fs_expected_outcome)

# Element-wise averages over the trial axis
avg_lasso_betas = np.mean(lasso_betas, axis=0)
avg_fs_betas = np.mean(fs_betas, axis=0)
avg_lasso_expected_outcomes = np.mean(lasso_expected_outcomes, axis=0)
avg_fs_expected_outcomes = np.mean(fs_expected_outcomes, axis=0)

In [None]:
# Compute sparsity for theoretical lower bound on observations
# (number of nonzero entries in the trial-averaged Lasso coefficient vector).
# NOTE(review): because the vector is an average over trials, a coefficient
# that is nonzero in even one trial will typically count here — confirm this
# is the intended notion of sparsity.
carlson_sparsity = np.count_nonzero(avg_lasso_betas)

In [None]:
# MSE comparison: mean and (population) standard deviation over the trials
avg_lasso_mse, std_lasso_mse = np.mean(lasso_mses), np.std(lasso_mses)
avg_fs_mses, std_fs_mses = np.mean(fs_mses), np.std(fs_mses)
print(f"Lasso MSE: {avg_lasso_mse} +/- {std_lasso_mse}")
print(f"Forward Selection MSE: {avg_fs_mses} +/- {std_fs_mses}")

In [None]:
# Plot MSE boxplot: distribution of the per-trial MSE values for each method
plt.figure()
plt.boxplot([lasso_mses, fs_mses])
plt.xticks([1, 2], ["Lasso", "Forward Selection"])
plt.ylabel("MSE")
# Title the figure so it stands alone, as the other boxplots in this notebook do
plt.title("MSE Comparison")
plt.show()

### Lower bound on number of observations

### DAC

In [None]:
# Lower bound on observations for DAC: sparsity times the natural log of the
# number of coefficients, where the coefficient count is 2 ** (#treatments)
num_treatments = dac.shape[1] - 1   # every column except the first
num_coeffs = 2 ** num_treatments
general_bound = np.log(num_coeffs) * dac_sparsity

In [None]:
# Lasso MSE as a function of how many DAC observations are used: sizes run
# from 20 up to (but excluding) the full row count, in steps of 20.
# NOTE(review): each size uses the *first* n rows of `dac` rather than a random
# subsample, and seed=None presumably leaves the split unseeded — confirm both
# are intended.
num_obs = np.arange(20, dac.shape[0], 20)
mse_over_num_obs = []

for n_obs in num_obs:
    # Fresh model sized for this subset
    fm = FactorialModel(
        n=n_obs,
        k=num_treatments,
        degree=num_treatments,
        contrast_coding=True,
        beta_seed=0,
    )

    # Column 0 is the response, the remaining columns are the treatments
    fm.convert_and_split_data(
        dac[:n_obs, 1:],
        dac[:n_obs, 0],
        is_dummy_coded=True,
        seed=None,
    )
    fm.fit_lasso()
    fm.predict()
    fm.compute_mse()
    mse_over_num_obs.append(fm.mse)

In [None]:
# Line plot of MSE against sample size, with a dashed vertical marker at the
# general bound
plt.figure()
plt.plot(num_obs, mse_over_num_obs, label="MSE")
plt.axvline(
    x=general_bound,
    color="green",
    linestyle="--",
    label="General bound",
)
plt.title("DAC: MSE Over Number of Observations")
plt.xlabel("Number of Observations")
plt.ylabel("MSE")
plt.legend()
plt.show()

### Carlson

In [None]:
# Lower bound on observations for Carlson: sparsity times the natural log of
# the number of coefficients, where the coefficient count is 2 ** (#treatments)
num_treatments = carlson.shape[1] - 1   # every column except the first
num_coeffs = 2 ** num_treatments
general_bound = np.log(num_coeffs) * carlson_sparsity

In [None]:
# Lasso MSE as a function of how many Carlson observations are used: sizes run
# from 20 up to (but excluding) the full row count, in steps of 20.
# NOTE(review): each size uses the *first* n rows of `carlson` rather than a
# random subsample, and seed=None presumably leaves the split unseeded.
# NOTE(review): only fit_lasso receives logistic=True here; predict() and
# compute_mse() are called without it — confirm they follow the logistic fit.
num_obs = np.arange(20, carlson.shape[0], 20)
mse_over_num_obs = []

for n_obs in num_obs:
    # Fresh model sized for this subset
    fm = FactorialModel(
        n=n_obs,
        k=num_treatments,
        degree=num_treatments,
        contrast_coding=True,
        beta_seed=0,
    )

    # Column 0 is the response, the remaining columns are the treatments
    fm.convert_and_split_data(
        carlson[:n_obs, 1:],
        carlson[:n_obs, 0],
        is_dummy_coded=True,
        seed=None,
    )
    fm.fit_lasso(logistic=True)
    fm.predict()
    fm.compute_mse()
    mse_over_num_obs.append(fm.mse)

In [None]:
# Line plot of MSE against sample size, with a dashed vertical marker at the
# general bound
plt.figure()
plt.plot(num_obs, mse_over_num_obs, label="MSE")
plt.axvline(
    x=general_bound,
    color="green",
    linestyle="--",
    label="General bound",
)
plt.title("Carlson: MSE Over Number of Observations")
plt.xlabel("Number of Observations")
plt.ylabel("MSE")
plt.legend()
plt.show()