In [49]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.api import OLS
import linearmodels as lm
from linearmodels.panel import PanelOLS

# ground-truth underlying model

there are 3 racecars, each of which has a different BHP-to-weight ratio (`slow/medium/fast`), each of whose drivers get a jolt of adrenaline in the homestretch (`initial/middle/final lap`), and each of which consumes fuel at a different rate. we want to understand the relationship between these variables and the racecar's lap speed.

- 3 entities ("slow", "med", "fast")
- 3 timesteps ("init", "mid", "fin")

in truth, we know that $E[\text{speed}_{i,t} \mid \text{is_fast}_i, \text{is_fin}_t, \text{fuel}_{i, t}] = \text{is_fast}_i + \text{is_fin}_t + \text{fuel}_{i, t}$.

# construct dataset

In [77]:
slow_x = pd.DataFrame(
    {"is_slow": {"init": 1, "mid": 1, "fin": 1},
     "is_med": {"init": 0, "mid": 0, "fin": 0},
     "is_fast": {"init": 0, "mid": 0, "fin": 0},
     "is_init": {"init": 1, "mid": 0, "fin": 0},
     "is_mid": {"init": 0, "mid": 1, "fin": 0},
     "is_fin": {"init": 0, "mid": 0, "fin": 1},
     "fuel": {"init": 1, "mid": 0.75, "fin": 0.5},
     "speed": {"init": 0 + 0 + 1 + np.random.normal(scale=0.01),
               "mid": 0 + 0 + 0.75 + np.random.normal(scale=0.01),
               "fin": 0 + 1 + 0.5 + np.random.normal(scale=0.01)}
    # rename because PanelOLS requires time variable to be numeric
    }).rename(index={"init": 0, "mid": 1, "fin": 2})
slow_x

Unnamed: 0,is_slow,is_med,is_fast,is_init,is_mid,is_fin,fuel,speed
0,1,0,0,1,0,0,1.0,0.997558
1,1,0,0,0,1,0,0.75,0.758721
2,1,0,0,0,0,1,0.5,1.512726


In [78]:
med_x = pd.DataFrame(
    {"is_slow": {"init": 0, "mid": 0, "fin": 0},
     "is_med": {"init": 1, "mid": 1, "fin": 1},
     "is_fast": {"init": 0, "mid": 0, "fin": 0},
     "is_init": {"init": 1, "mid": 0, "fin": 0},
     "is_mid": {"init": 0, "mid": 1, "fin": 0},
     "is_fin": {"init": 0, "mid": 0, "fin": 1},
     "fuel": {"init": 1, "mid": 0.66, "fin": 0.33},
     "speed": {"init": 0 + 0 + 1 + np.random.normal(scale=0.01),
               "mid": 0 + 0 + 0.66 + np.random.normal(scale=0.01),
               "fin": 0 + 1 + 0.33 + np.random.normal(scale=0.01)}
    # rename because PanelOLS requires time variable to be numeric
    }).rename(index={"init": 0, "mid": 1, "fin": 2})
med_x

Unnamed: 0,is_slow,is_med,is_fast,is_init,is_mid,is_fin,fuel,speed
0,0,1,0,1,0,0,1.0,1.013252
1,0,1,0,0,1,0,0.66,0.6608
2,0,1,0,0,0,1,0.33,1.314227


In [79]:
fast_x = pd.DataFrame(
    {"is_slow": {"init": 0, "mid": 0, "fin": 0},
     "is_med": {"init": 0, "mid": 0, "fin": 0},
     "is_fast": {"init": 1, "mid": 1, "fin": 1},
     "is_init": {"init": 1, "mid": 0, "fin": 0},
     "is_mid": {"init": 0, "mid": 1, "fin": 0},
     "is_fin": {"init": 0, "mid": 0, "fin": 1},
     "fuel": {"init": 1, "mid": 0.5, "fin": 0},
     "speed": {"init": 1 + 0 + 1 + np.random.normal(scale=0.01),
               "mid": 1 + 0 + 0.5 + np.random.normal(scale=0.01),
               "fin": 1 + 1 + 0 + np.random.normal(scale=0.01)}
    # rename because PanelOLS requires time variable to be numeric
    }).rename(index={"init": 0, "mid": 1, "fin": 2})
fast_x

Unnamed: 0,is_slow,is_med,is_fast,is_init,is_mid,is_fin,fuel,speed
0,0,0,1,1,0,0,1.0,2.003324
1,0,0,1,0,1,0,0.5,1.490619
2,0,0,1,0,0,1,0.0,1.986909


In [80]:
x = pd.concat([slow_x, med_x, fast_x], axis=0, keys=["slow", "med", "fast"])
x

Unnamed: 0,Unnamed: 1,is_slow,is_med,is_fast,is_init,is_mid,is_fin,fuel,speed
slow,0,1,0,0,1,0,0,1.0,0.997558
slow,1,1,0,0,0,1,0,0.75,0.758721
slow,2,1,0,0,0,0,1,0.5,1.512726
med,0,0,1,0,1,0,0,1.0,1.013252
med,1,0,1,0,0,1,0,0.66,0.6608
med,2,0,1,0,0,0,1,0.33,1.314227
fast,0,0,0,1,1,0,0,1.0,2.003324
fast,1,0,0,1,0,1,0,0.5,1.490619
fast,2,0,0,1,0,0,1,0.0,1.986909


# regress

## panel OLS estimate is good

In [81]:
fe_te_mod = PanelOLS(dependent=x["speed"], exog=x["fuel"], entity_effects=True, time_effects=True)
fe_te_res = fe_te_mod.fit()

print(fe_te_res.params)
print()
print(fe_te_res.tstats)

fuel    1.049268
Name: parameter, dtype: float64

fuel    23.522213
Name: tstat, dtype: float64


## vanilla OLS estimate is not good...

In [82]:
mod = OLS(endog=x["speed"], exog=x["fuel"])
res = mod.fit()

print(res.params)
print()
print(res.tvalues)

fuel    1.509588
dtype: float64

fuel    3.531628
dtype: float64


# replicate

## ... until we add the requisite structure to the "stacked" vanilla OLS and replicate PanelOLS's coefficient point estimates and t-stats!

note: we're able to replicate PanelOLS's t-stats only because we didn't specify a "sandwich" SE estimator for the PanelOLS. in practice, college classes teach you how to use entity- and time-clustered SE's properly, and then everybody definitely remembers how to use them forever.

In [86]:
rep_mod = OLS(endog=x["speed"], exog=x.loc[:, :"fuel"], hasconst=False)
rep_res = rep_mod.fit()

print(rep_res.params)
print()
print(rep_res.tvalues)

is_slow   -0.014802
is_med    -0.017440
is_fast    0.984798
is_init   -0.028742
is_mid    -0.015506
is_fin     0.996805
fuel       1.049268
dtype: float64

is_slow     -0.738249
is_med      -1.064854
is_fast     99.771865
is_init     -0.929936
is_mid      -1.015618
is_fin     166.818564
fuel        23.522213
dtype: float64
