In [16]:
import re
import pandas as pd
import pyfixest as pf

# Load the dataset
psid = pd.read_csv(
    "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/refs/heads/master/csv/AER/PSID7682.csv"
)

# Prepare data
psid["experience"] = pd.Categorical(psid["experience"])
psid["year"] = pd.Categorical(psid["year"])

# Fit the full regression model
fit_full = pf.feols(
    "log(wage) ~ gender + ethnicity + education + experience + occupation + industry + year",
    data=psid,
)

# Perform Gelbach decomposition
gb = fit_full.decompose(param="gender[T.male]", digits=5)

# Generate the final table
table = pf.make_table(gb)
print(table)

gb_combined = fit_full.decompose(
    param="gender[T.male]",
    combine_covariates={
        "job": re.compile(r".*(occupation|industry).*"),
        "personal": re.compile(r".*(education|experience|ethnicity).*"),
        "year": re.compile("year"),
    },
)
table_combined = pf.make_table(gb_combined)
print(table_combined)



100%|██████████| 1000/1000 [00:04<00:00, 226.15it/s]


GT(_tbl_data=                  index       direct_effect         full_effect  \
0        gender[T.male]             0.47447             0.41034   
1                        [0.46030, 0.51865]  [0.39091, 0.43884]   
2    ethnicity[T.other]                                           
3                                                                 
4             education                                           
..                  ...                 ...                 ...   
117                                                               
118        year[T.1981]                                           
119                                                               
120        year[T.1982]                                           
121                                                               

        explained_effect  
0                0.06413  
1     [0.03465, 0.10798]  
2                0.02275  
3     [0.01633, 0.03188]  
4                0.00064  
..                  

100%|██████████| 1000/1000 [00:01<00:00, 674.72it/s]


GT(_tbl_data=            index     direct_effect       full_effect   explained_effect
0  gender[T.male]            0.4745            0.4103             0.0641
1                  [0.4441, 0.5090]  [0.3739, 0.4360]   [0.0329, 0.0952]
2             job                                                 0.0017
3                                                      [-0.0068, 0.0144]
4        personal                                                 0.0624
5                                                       [0.0458, 0.0749]
6            year                                                 0.0000
7                                                      [-0.0143, 0.0120], _body=<great_tables._gt_data.Body object at 0x12d61c250>, _boxhead=Boxhead([ColInfo(var='index', type=<ColInfoTypeEnum.stub: 2>, column_label='index', column_align='center', column_width=None), ColInfo(var='direct_effect', type=<ColInfoTypeEnum.default: 1>, column_label='direct_effect', column_align='center', column_width=None)

In [17]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# Load dataset
psid = pd.read_csv(
    "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/refs/heads/master/csv/AER/PSID7682.csv"
)

# Create the log_wage column
psid["log_wage"] = np.log(psid["wage"])

# Convert gender to a 0/1 variable: 1 for male, 0 for female
psid["male"] = (psid["gender"] == "male").astype(int)

def gelbach_decomp_multi(data, y, x1, covariates):
    """
    Covariates can be a list of factor or numeric columns, e.g. ['C(occupation)', 'education'].
    We'll expand them to match the actual dummy columns created by statsmodels.
    """

    # 1. Short model
    short_model = smf.ols(f"{y} ~ {x1}", data=data).fit()
    b1_short = short_model.params[x1]

    # 2. Long model
    full_formula = f"{y} ~ {x1} + {' + '.join(covariates)}"
    long_model = smf.ols(full_formula, data=data).fit()
    b1_long = long_model.params[x1]
    explained_effect = b1_short - b1_long

    # 3. Identify *all* parameters from the long model except x1 and the intercept
    param_names = [p for p in long_model.params.index if p not in [x1, "Intercept"]]

    # We'll store partial contributions at the level of each dummy column
    partial_contrib = {}

    for p_name in param_names:
        # p_name might be something like "C(occupation)[T.Clerical]"
        # or "education" if numeric, etc.

        # 3a. Coefficient from the long model
        b2_p = long_model.params[p_name]

        # 3b. Auxiliary regression x1 ~ that dummy column alone
        # We need that column in the data. statsmodels has built it internally,
        # so we replicate that encoding manually via patsy or get_dummies.
        # For a quick hack, we can do:
        data_temp = data.copy()
        data_temp["__dummy__"] = long_model.model.exog[:, long_model.model.exog_names.index(p_name)]
        aux_model = smf.ols(f"{x1} ~ __dummy__", data=data_temp).fit()
        gamma_k = aux_model.params["__dummy__"]

        partial_contrib[p_name] = gamma_k * b2_p

    # 4. If you want group-level sums, e.g. for "occupation" or "industry", just sum them:
    group_contrib = {}
    for cov in covariates:
        # e.g. cov might be "C(occupation)" or "C(year)"
        # we gather all partial_contrib keys that start with that factor name
        group_contrib[cov] = sum(val for key, val in partial_contrib.items() if key.startswith(cov))

    return {
        "direct_effect": b1_short,
        "full_effect": b1_long,
        "explained_effect": explained_effect,
        "partial_contrib": partial_contrib,  # each dummy
        "group_contrib": group_contrib       # aggregated
    }


# --- Run the decomposition with a simple set of covariates --- #
x2_simple = ["C(ethnicity)", "education", "experience"]
result_simple = gelbach_decomp(
    data=psid,
    y="log_wage",
    x1="male",
    x2=x2_simple
)

print("=== Simple Decomposition ===")
print("Direct effect (male):", result_simple["direct_effect"])
print("Full effect (male):", result_simple["full_effect"])
print("Explained effect:", result_simple["explained_effect"])
print("Contributions:", result_simple["contributions"])

x2_full = [
    "C(ethnicity)",
    "education",
    "experience",
    "C(occupation)",
    "C(industry)",
    "C(year)"
]

result_full = gelbach_decomp(
    data=psid,
    y="log_wage",
    x1="male",
    x2=x2_full
)

print("\n=== Full Decomposition ===")
print("Direct effect (male):", result_full["direct_effect"])
print("Full effect (male):", result_full["full_effect"])
print("Explained effect:", result_full["explained_effect"])
print("Contributions:", result_full["contributions"])


=== Simple Decomposition ===
Direct effect (male): 0.4744662947566457
Full effect (male): 0.40815296641721865
Explained effect: 0.06631332833942705
Contributions: {'C(ethnicity)': 0.0, 'education': np.float64(1.0198629169616293e-05), 'experience': np.float64(3.1805222826660014e-05)}

=== Full Decomposition ===
Direct effect (male): 0.4744662947566457
Full effect (male): 0.42159565900045654
Explained effect: 0.052870635756189166
Contributions: {'C(ethnicity)': 0.0, 'education': np.float64(8.173082158478142e-06), 'experience': np.float64(2.165781458291908e-05), 'C(occupation)': 0.0, 'C(industry)': 0.0, 'C(year)': 0.0}


In [23]:
import re
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices

#########################
# 1) Helper: Extract colnames
#########################
def _extract_colnames_from_formula(formula):
    """
    Given a formula like 'y ~ C(gender) + education + C(occupation)',
    return the list ['y', 'gender', 'education', 'occupation'].
    """
    import re
    lhs, rhs = formula.split("~")
    rhs_terms = re.split(r"\+", rhs)
    rhs_terms = [x.strip() for x in rhs_terms]
    all_terms = [lhs.strip()] + rhs_terms

    # Remove any 'C(...)' wrappers using a regex, e.g. 'C(gender)' -> 'gender'
    colnames = []
    pattern = re.compile(r"C\((.+?)\)")
    for term in all_terms:
        match = pattern.search(term)
        if match:
            colnames.append(match.group(1))
        else:
            colnames.append(term)
    return list(set(colnames))  # ensure uniqueness

#########################
# 2) Main Gelbach Function
#########################
def gelbach_decomp_statsmodels(
    data,
    formula_short,           # e.g. "log_wage ~ C(gender)"
    formula_long,            # e.g. "log_wage ~ C(gender) + C(ethnicity) + ..."
    param,                   # e.g. "C(gender)[T.male]"
    combine_covariates=None, # dict of {group_name: regex_pattern}
    dropna_cols=None
):
    """
    Perform a Gelbach decomposition using statsmodels + patsy expansions.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame containing all variables.
    formula_short : str
        Patsy formula for the short model (e.g. 'log_wage ~ C(gender)').
    formula_long : str
        Patsy formula for the long model (e.g. 'log_wage ~ C(gender) + ...').
    param : str
        The coefficient name from the long model to decompose, e.g. "C(gender)[T.male]".
    combine_covariates : dict, optional
        A dictionary grouping columns by a regex, e.g.:
          {
            "job": re.compile(r".*(occupation|industry).*"),
            "personal": re.compile(r".*(education|experience|ethnicity).*"),
            "year": re.compile("year"),
          }
        Each expanded column’s contribution is summed into these named groups.
    dropna_cols : list, optional
        List of real column names to drop if they have NA. If None, derived from the formula.

    Returns
    -------
    dict
        {
            "direct_effect": float,
            "full_effect": float,
            "explained_effect": float,
            "contributions": {group_or_column_name: contribution}
        }
    """

    # 1. Determine columns for NA-dropping
    if dropna_cols is None:
        short_cols = _extract_colnames_from_formula(formula_short)
        long_cols  = _extract_colnames_from_formula(formula_long)
        dropna_cols = list(set(short_cols + long_cols))

    # 2. Drop any rows with missing data in those columns
    data_clean = data.dropna(subset=dropna_cols).copy()

    # 3. Fit the short model
    short_fit = smf.ols(formula_short, data=data_clean).fit()
    if param not in short_fit.params.index:
        raise ValueError(f"Parameter '{param}' not in short model. "
                         f"Short model columns: {list(short_fit.params.index)}")

    b1_short = short_fit.params[param]

    # 4. Fit the long model
    long_fit = smf.ols(formula_long, data=data_clean).fit()
    if param not in long_fit.params.index:
        raise ValueError(f"Parameter '{param}' not in long model. "
                         f"Long model columns: {list(long_fit.params.index)}")

    b1_long = long_fit.params[param]

    # 5. Explained effect
    explained_effect = b1_short - b1_long

    # 6. Expand the design matrix for the long formula
    #    This yields each dummy column for factor variables
    y_matrix, X_long = dmatrices(formula_long, data=data_clean, return_type="dataframe")
    param_series = X_long[param]

    contributions = {}
    for col in X_long.columns:
        # Skip intercept & the param column itself
        if col == "Intercept" or col == param:
            continue

        # Auxiliary regression: param_series ~ col
        aux_X = sm.add_constant(X_long[[col]], prepend=True)
        aux_fit = sm.OLS(param_series, aux_X).fit()

        gamma_k = aux_fit.params.get(col, 0.0)
        beta_k = long_fit.params.get(col, 0.0)
        contrib_value = gamma_k * beta_k

        # 7. If grouping, sum them
        if combine_covariates is not None:
            matched_group = None
            for group_name, pattern in combine_covariates.items():
                if pattern.match(col):
                    matched_group = group_name
                    break
            if matched_group is not None:
                contributions[matched_group] = contributions.get(matched_group, 0.0) + contrib_value
                continue

        # If no grouping or no match, store individually
        contributions[col] = contrib_value

    return {
        "direct_effect": b1_short,
        "full_effect": b1_long,
        "explained_effect": explained_effect,
        "contributions": contributions
    }

#########################
# 3) Run everything
#########################
if __name__ == "__main__":
    # -----------------------------------------------------------
    # Load and prep the PSID data similarly to pyfixest usage
    # -----------------------------------------------------------
    psid = pd.read_csv(
        "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/refs/heads/master/csv/AER/PSID7682.csv"
    )
    # Log wage
    psid["log_wage"] = np.log(psid["wage"])

    # Convert to categorical with alphabetical ordering
    # so that reference categories match typical fixest defaults
    for col in ["gender","ethnicity","experience","occupation","industry","year"]:
        psid[col] = psid[col].astype("category")
        # Sort categories alphabetically (in-place reordering)
        sorted_cats = sorted(psid[col].cat.categories)
        psid[col] = psid[col].cat.reorder_categories(sorted_cats, ordered=False)

    # -----------------------------------------------------------
    # Define formulas
    # -----------------------------------------------------------
    #  - short model includes only gender
    #  - long model includes the same covariates as pyfixest
    formula_short = "log_wage ~ C(gender)"
    formula_long = (
        "log_wage ~ C(gender) + C(ethnicity) + education + C(experience) "
        "+ C(occupation) + C(industry) + C(year)"
    )

    # param we want to decompose (this matches how patsy names it)
    param_of_interest = "C(gender)[T.male]"  

    # Combine covariates by regex, same as you do in pyfixest
    combine_covs = {
        "job": re.compile(r".*(occupation|industry).*"),
        "personal": re.compile(r".*(education|experience|ethnicity).*"),
        "year": re.compile("year"),
    }

    # -----------------------------------------------------------
    # Run Gelbach Decomposition
    # -----------------------------------------------------------
    result = gelbach_decomp_statsmodels(
        data=psid,
        formula_short=formula_short,
        formula_long=formula_long,
        param=param_of_interest,
        combine_covariates=combine_covs,
        dropna_cols=None  # auto-derived from formula
    )

    # -----------------------------------------------------------
    # Print results
    # -----------------------------------------------------------
    print("=== Gelbach Decomposition ===")
    print(f"Direct effect ({param_of_interest}): {result['direct_effect']:.5f}")
    print(f"Full effect   ({param_of_interest}): {result['full_effect']:.5f}")
    print(f"Explained effect: {result['explained_effect']:.5f}\n")

    print("Contributions (grouped) =>")
    for k, v in result["contributions"].items():
        print(f"  {k}: {v:.5f}")


=== Gelbach Decomposition ===
Direct effect (C(gender)[T.male]): 0.47447
Full effect   (C(gender)[T.male]): 0.41034
Explained effect: 0.06413

Contributions (grouped) =>
  personal: -0.18866
  job: 0.00101
  C(year)[T.1977]: -0.00000
  C(year)[T.1978]: -0.00000
  C(year)[T.1979]: -0.00000
  C(year)[T.1980]: -0.00000
  C(year)[T.1981]: -0.00000
  C(year)[T.1982]: -0.00000
