# Validation

This page shows how different datasets (for 2024) perform at reproducing various official statistics when used with the PolicyEngine US microsimulation model.

Note that the Enhanced CPS dataset is explicitly calibrated to these official statistics, so it is expected to perform well. Because these statistics are numerous and diverse, we expect this calibration to also improve the dataset's performance at predicting reform impacts.

In [None]:
from policyengine_us_data.datasets import CPS_2024, PUF_2024, EnhancedCPS_2024
from policyengine_us_data.utils import build_loss_matrix
from policyengine_us import Microsimulation
import pandas as pd
import plotly.express as px
import numpy as np


def compare_datasets():
    """Compare each dataset's weighted estimates with the 2024 targets.

    For each of ``CPS_2024``, ``PUF_2024`` and ``EnhancedCPS_2024`` this
    builds the loss matrix and target array for 2024, multiplies the
    household weights into the loss matrix to get one estimate per target,
    and derives error metrics against the target values.

    Returns:
        A ``pd.DataFrame`` with one row per (dataset, target) pair and the
        columns ``name``, ``estimate``, ``actual``, ``error``,
        ``rel_error``, ``abs_error``, ``abs_rel_error`` and ``dataset``.
        The per-dataset frames are stacked without resetting the index.
    """
    # Collect the per-dataset frames and concatenate once at the end.
    # Growing a DataFrame with pd.concat inside the loop re-copies all rows
    # accumulated so far and starts from an empty frame, which pandas warns
    # about when concatenating.
    frames = []
    for dataset in [CPS_2024, PUF_2024, EnhancedCPS_2024]:
        sim = Microsimulation(dataset=dataset)
        weights = sim.calculate("household_weight").values
        loss_matrix, targets_array = build_loss_matrix(dataset, 2024)
        # One weighted total per loss-matrix column (i.e. per target).
        estimates = weights @ loss_matrix.values
        comparison = pd.DataFrame(
            {
                "name": loss_matrix.columns,
                "estimate": estimates,
                "actual": targets_array,
            }
        )
        comparison["error"] = comparison["estimate"] - comparison["actual"]
        # rel_error is left unsanitised: a zero target yields inf or NaN.
        comparison["rel_error"] = comparison["error"] / comparison["actual"]
        comparison["abs_error"] = comparison["error"].abs()
        # For abs_rel_error, a zero target (division by zero) is reported
        # as 0 rather than inf/NaN.
        comparison["abs_rel_error"] = (
            (comparison["abs_error"] / comparison["actual"].abs())
            .replace([np.inf, -np.inf], np.nan)
            .fillna(0)
        )
        comparison["dataset"] = dataset.label
        frames.append(comparison)

    return pd.concat(frames)

df = compare_datasets()

# Build one wide row per target name: CPS columns get the "_cps" suffix,
# Enhanced CPS columns "_ecps", and PUF columns "_puf". The PUF slice is
# renamed up front because the first merge has already resolved every
# column-name clash, so no suffix would be applied by the second merge.
merged = (
    df[df["dataset"] == "CPS 2024 (2022-based)"]
    .merge(
        df[df["dataset"] == "Enhanced CPS 2024"],
        on=["name"],
        suffixes=("_cps", "_ecps"),
    )
    .merge(
        df[df["dataset"] == "PUF 2024 (2015-based)"].rename(
            columns={c: f"{c}_puf" for c in df.columns if c != "name"}
        ),
        on=["name"],
    )
)
import pandas as pd
from itables import init_notebook_mode
import itables.options as opt
# Render DataFrames as interactive itables widgets in this notebook.
init_notebook_mode(all_interactive=True)
opt.maxBytes = "1MB"
# Remove pandas' cap on the number of displayed columns.
pd.set_option("display.max_columns", None)

# Change in absolute relative error when moving to the Enhanced CPS from
# each baseline; negative values mean the Enhanced CPS is closer to target.
merged = merged.assign(
    ecps_abs_rel_error_change_over_cps=lambda t: (
        t["abs_rel_error_ecps"] - t["abs_rel_error_cps"]
    ),
    ecps_abs_rel_error_change_over_puf=lambda t: (
        t["abs_rel_error_ecps"] - t["abs_rel_error_puf"]
    ),
    # "prev best" is whichever of CPS and PUF has the smaller error per target.
    ecps_abs_rel_error_change_over_prev_best=lambda t: (
        t["abs_rel_error_ecps"]
        - np.minimum(t["abs_rel_error_cps"], t["abs_rel_error_puf"])
    ),
)

# Final expression of the cell: the table that the notebook displays.
merged.rename(columns={"actual_cps": "actual"}).loc[
    :,
    [
        "name",
        "actual",
        "estimate_cps",
        "estimate_puf",
        "estimate_ecps",
        "abs_rel_error_cps",
        "abs_rel_error_puf",
        "abs_rel_error_ecps",
        "ecps_abs_rel_error_change_over_cps",
        "ecps_abs_rel_error_change_over_puf",
        "ecps_abs_rel_error_change_over_prev_best",
    ],
]

INFO:root:Targeting Medicaid enrollment for AK with target 231577k
INFO:root:Targeting Medicaid enrollment for AL with target 766009k
INFO:root:Targeting Medicaid enrollment for AR with target 733561k
INFO:root:Targeting Medicaid enrollment for AZ with target 1778734k
INFO:root:Targeting Medicaid enrollment for CA with target 12172695k
INFO:root:Targeting Medicaid enrollment for CO with target 1058326k
INFO:root:Targeting Medicaid enrollment for CT with target 904321k
INFO:root:Targeting Medicaid enrollment for DC with target 240020k
INFO:root:Targeting Medicaid enrollment for DE with target 236840k
INFO:root:Targeting Medicaid enrollment for FL with target 3568648k
INFO:root:Targeting Medicaid enrollment for GA with target 1699279k
INFO:root:Targeting Medicaid enrollment for HI with target 376318k
INFO:root:Targeting Medicaid enrollment for IA with target 586748k
INFO:root:Targeting Medicaid enrollment for ID with target 296968k
INFO:root:Targeting Medicaid enrollment for IL with targ

In [None]:
from IPython.display import Markdown

# A target counts as "outperformed" when the ECPS error change is negative,
# i.e. the ECPS absolute relative error is strictly lower than the baseline's.
ecps_outperforms_puf = merged["ecps_abs_rel_error_change_over_puf"].lt(0)
ecps_outperforms_cps = merged["ecps_abs_rel_error_change_over_cps"].lt(0)

# The mean of a boolean series is the share of targets where it is True.
Markdown(
    "Overall, the ECPS outperforms the Census' CPS in "
    f"**{ecps_outperforms_cps.mean():.1%}** of the targets and the IRS' PUF in "
    f"**{ecps_outperforms_puf.mean():.1%}** of the targets."
)

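The histogram below shows the distribution of the 'relative error change under the ECPS', comparing each metric's ECPS performance to the better of the CPS and the PUF. Negative values mean the ECPS has a lower absolute relative error than the better of the two; values are clipped to the range -100% to +100%.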

In [None]:
from utils import show
import plotly.express as px

# Clip to [-1, 1] so extreme changes pile up in the edge bins instead of
# stretching the x-axis.
clipped = merged["ecps_abs_rel_error_change_over_prev_best"].clip(
    lower=-1, upper=1
)

fig = px.histogram(
    clipped,
    nbins=100,
    title="ECPS improvement over best of CPS and PUF",
)
fig.update_layout(
    showlegend=False,
    xaxis={
        "range": [-1, 1],
        "title": "Absolute relative error change",
        "tickformat": ".0%",
    },
    yaxis={"title": "Count"},
)
show(fig)

There is also a sparse implementation of the model, fit with L0 regularization. It was motivated by the method described in the paper [Learning Sparse Neural Networks Through L0 Regularization](https://arxiv.org/pdf/1712.01312). An example follows.

In [None]:
from policyengine_core.data import Dataset
from policyengine_us_data.storage import STORAGE_FOLDER

# Load the sparse Enhanced CPS 2024 file from the package's storage folder
# and run a microsimulation on it.
sparse_dataset = Dataset.from_file(STORAGE_FOLDER / "sparse_enhanced_cps_2024.h5")
sparse_sim = Microsimulation(dataset=sparse_dataset)
# Total of "tip_income" under the sparse dataset. The value is only assigned,
# so this cell displays nothing.
# NOTE(review): presumably calculate() returns a weighted series, making
# .sum() a weighted total; confirm against the Microsimulation API.
tip_estimate_sparse_model = sparse_sim.calculate("tip_income").sum()