# PyFixest Benchmarks

In [43]:
%load_ext autoreload
%autoreload 2

import time

import pandas as pd
from tqdm import tqdm  # note: tqdm is not a dependency of pyfixest

from pyfixest.estimation import feols, fepois


def run_standard_benchmark(model, fixed_effect):
    """
    Run the fixest standard benchmark for one estimator and one fixed effect spec.

    Every data set is fitted once per replication. Only the call to the
    estimator is timed; reading the data happens outside the timed region.

    Args:
        model (str): "feols" or "fepois"
        fixed_effect (str): "dum_1" or "dum_1+dum_2" or "dum_1+dum_2+dum_3"
    Returns:
        A pd.DataFrame with one row per timed fit and the columns
        "method", "n_obs", "G", "rep" and "time" (elapsed seconds).
    Raises:
        ValueError: If `model` or `fixed_effect` is not one of the supported values.
    """
    valid_models = ("feols", "fepois")
    valid_fixed_effects = ("dum_1", "dum_1+dum_2", "dum_1+dum_2+dum_3")
    # Explicit exceptions instead of `assert`, which is stripped under `python -O`.
    if model not in valid_models:
        raise ValueError(f"model must be one of {valid_models}, got {model!r}")
    if fixed_effect not in valid_fixed_effects:
        raise ValueError(
            f"fixed_effect must be one of {valid_fixed_effects}, got {fixed_effect!r}"
        )

    if model == "feols":
        estimator = feols
        fml_base = "ln_y ~ X1"
    else:
        estimator = fepois
        fml_base = "y ~ X1"

    fml = f"{fml_base} | {fixed_effect}"
    n_fixed_effects = len(fixed_effect.split("+"))

    # Warm up with the estimator that is actually timed, so that any first-call
    # overhead is not attributed to the first replication.
    df = pd.read_stata("./data/_STATA/base_s2_r1.dta")
    estimator(fml, data=df)

    records = []
    for size in tqdm(range(1, 6)):
        if size == 5:
            if model == "fepois":
                # No 10M run for fepois. Skip the replications entirely;
                # otherwise the data frame left over from size 4 would be
                # timed again and recorded as additional rows.
                continue
            # The 10M data set is a single file, loaded once and reused
            # for all replications.
            df = pd.read_csv("./data/data/base_10M.csv")

        for rep in range(1, 11):
            if size < 5:
                df = pd.read_stata(f"./data/_STATA/base_s{size}_r{rep}.dta")

            # perf_counter is monotonic and high resolution, which makes it
            # the appropriate clock for measuring short intervals.
            tic = time.perf_counter()
            estimator(fml, data=df)
            toc = time.perf_counter()

            records.append(
                {
                    "method": model,
                    "n_obs": df.shape[0],
                    "G": n_fixed_effects,
                    "rep": rep,
                    "time": toc - tic,
                }
            )

    return pd.DataFrame(records, columns=["method", "n_obs", "G", "rep", "time"])


def run_all_benchmarks():
    """
    Run all the benchmarks and write the timings to "./results_py.csv".

    Calls `run_standard_benchmark` for every combination of model
    ("feols", "fepois") and fixed effect specification (one, two and three
    fixed effects). The per-combination frames are stacked row-wise, so the
    CSV has one row per timed fit with the columns returned by
    `run_standard_benchmark`.
    """
    frames = []
    for model in ["feols", "fepois"]:
        for fixef in ["dum_1", "dum_1+dum_2", "dum_1+dum_2+dum_3"]:
            frames.append(run_standard_benchmark(model, fixef))

    # Stack along the rows (axis=0). Concatenating along the columns would put
    # the runs side by side with repeated column names instead of producing
    # one long table. A single concat also avoids re-copying the accumulated
    # frame on every iteration.
    res = pd.concat(frames, axis=0, ignore_index=True)

    res.to_csv("./results_py.csv")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
# Run every model / fixed effect combination and write ./results_py.csv.
run_all_benchmarks()

100%|██████████| 3/3 [00:00<00:00,  3.54it/s]
100%|██████████| 3/3 [00:00<00:00,  3.33it/s]


In [39]:
# Single benchmark run (feols with one fixed effect) to inspect the result frame.
a = run_standard_benchmark("feols", "dum_1")
a

100%|██████████| 3/3 [00:00<00:00,  3.61it/s]


Unnamed: 0,method,n_obs,G,rep,time
0,feols,1000,1,1,0.011204
1,feols,1000,1,2,0.009486
2,feols,1000,1,3,0.010987
3,feols,1000,1,4,0.010687
4,feols,1000,1,5,0.011018
5,feols,1000,1,6,0.009798
6,feols,1000,1,7,0.008976
7,feols,1000,1,8,0.008977
8,feols,1000,1,9,0.008486
9,feols,1000,1,10,0.007978


In [21]:
# Display the transposed benchmark frame.
a.T

Unnamed: 0,method,n_obs,G,rep,time,method.1,n_obs.1,G.1,rep.1,time.1,...,method.2,n_obs.2,G.2,rep.2,time.2,method.3,n_obs.3,G.3,rep.3,time.3
0,feols,1000,1,1,0.009973,feols,1000,1,2,0.01043,...,feols,10000000,2,9,10.188134,feols,10000000,2,10,10.136924


## Visualisation

In [5]:
# Read both result files and stack them row-wise into a single frame.
res_all = pd.concat(
    [
        pd.read_csv(csv_path)
        for csv_path in (
            "./benchmarks/results_py.csv",
            "./benchmarks/results_all.txt",
        )
    ]
)

In [6]:
# Display the combined timings.
res_all

Unnamed: 0,method,n_obs,G,rep,time,model
0,fepois,1000.0,1,1,0.060000,Poisson
1,glmmboot,1000.0,1,1,0.020000,Poisson
2,feglm (alpaca),1000.0,1,1,0.020000,Poisson
3,fepois,1000.0,1,2,0.030000,Poisson
4,glmmboot,1000.0,1,2,0.010000,Poisson
...,...,...,...,...,...,...
1575,FixedEffectModels,10000000.0,3,6,6.800669,Gaussian
1576,FixedEffectModels,10000000.0,3,7,6.756505,Gaussian
1577,FixedEffectModels,10000000.0,3,8,6.802480,Gaussian
1578,FixedEffectModels,10000000.0,3,9,6.761793,Gaussian
