# -----------------------------------------------------------------------
# Letra A

## Estimation for Fixed Effects


Let's estimate a fixed effects model.

$$
y_{it} = x_{it}'\beta + \alpha_i + \varepsilon_{it}
$$


### Fixed Effects Model

We will regress the probability that an individual commits a crime on the log of the probability of being arrested, on the log of the probability of conviction, on the log of the probability of receiving a prison sentence, on the log of the average prison sentence length, and finally on the log of policing per capita.

$$
\tilde{Y}_{it} = \tilde{X}_{it}'\beta + \tilde{\varepsilon}_{it}
$$

Let's calculate:

$$
\hat{\beta}_{FE} = \left ( \sum^{n}_{i=1}\sum_{t} \tilde{X}_{it} \tilde{X}_{it}' \right )^{-1} \left ( \sum^{n}_{i=1}\sum_{t} \tilde{X}_{it} \tilde{y}_{it} \right )
$$

### Date import

In [1]:
import pandas as pd
import numpy as np
import wooldridge as woo
import statsmodels.api as sm
from linearmodels.panel import PooledOLS
from linearmodels.panel import PanelOLS
import linearmodels as plm
from linearmodels import RandomEffects
from linearmodels.panel import RandomEffects
from scipy import stats

In [2]:
df = pd.read_stata("cornwell.dta")
df = df.set_index(["county", "year"], drop=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,crmrte,prbarr,prbconv,prbpris,avgsen,polpc,density,taxpc,west,central,...,lpctymle,lpctmin,clcrmrte,clprbarr,clprbcon,clprbpri,clavgsen,clpolpc,cltaxpc,clmix
county,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,81,0.039885,0.289696,0.402062,0.472222,5.61,0.001787,2.307159,25.697630,0,1,...,-2.433870,3.006608,,,,,,,,
1,82,0.038345,0.338111,0.433005,0.506993,5.59,0.001767,2.330254,24.874252,0,1,...,-2.449038,3.006608,-0.039376,0.154542,0.074143,0.071048,-0.003571,-0.011364,-0.032565,0.030857
1,83,0.030305,0.330449,0.525703,0.479705,5.80,0.001836,2.341801,26.451443,0,1,...,-2.464036,3.006608,-0.235316,-0.022922,0.193987,-0.055326,0.036879,0.038413,0.061477,-0.244732
1,84,0.034726,0.362525,0.604706,0.520104,6.89,0.001886,2.346420,26.842348,0,1,...,-2.478925,3.006608,0.136180,0.092641,0.140006,0.080857,0.172213,0.026930,0.014670,-0.027331
1,85,0.036573,0.325395,0.578723,0.497059,6.55,0.001924,2.364896,28.140337,0,1,...,-2.497306,3.006608,0.051825,-0.108054,-0.043918,-0.045320,-0.050606,0.020199,0.047223,0.172125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,83,0.015575,0.226667,0.480392,0.428571,7.77,0.001073,0.869048,18.905853,1,0,...,-2.538060,1.697597,-0.148666,-0.010969,-0.127018,0.164303,0.157158,0.149330,0.070461,0.020250
197,84,0.013662,0.204188,1.410260,0.372727,10.11,0.001109,0.872024,22.704754,1,0,...,-2.548068,1.697597,-0.131037,-0.104441,1.076927,-0.139610,0.263255,0.032795,0.183103,0.026842
197,85,0.013086,0.180556,0.830769,0.333333,5.96,0.001054,0.875000,24.123611,1,0,...,-2.561072,1.697597,-0.043091,-0.123000,-0.529178,-0.111704,-0.528454,-0.050473,0.060617,-0.366374
197,86,0.012874,0.112676,2.250000,0.244444,7.68,0.001088,0.880952,24.981979,1,0,...,-2.580968,1.697597,-0.016311,-0.471524,0.996334,-0.310156,0.253549,0.031580,0.034964,-0.067911


In [3]:
df.columns

Index(['crmrte', 'prbarr', 'prbconv', 'prbpris', 'avgsen', 'polpc', 'density',
       'taxpc', 'west', 'central', 'urban', 'pctmin80', 'wcon', 'wtuc', 'wtrd',
       'wfir', 'wser', 'wmfg', 'wfed', 'wsta', 'wloc', 'mix', 'pctymle', 'd82',
       'd83', 'd84', 'd85', 'd86', 'd87', 'lcrmrte', 'lprbarr', 'lprbconv',
       'lprbpris', 'lavgsen', 'lpolpc', 'ldensity', 'ltaxpc', 'lwcon', 'lwtuc',
       'lwtrd', 'lwfir', 'lwser', 'lwmfg', 'lwfed', 'lwsta', 'lwloc', 'lmix',
       'lpctymle', 'lpctmin', 'clcrmrte', 'clprbarr', 'clprbcon', 'clprbpri',
       'clavgsen', 'clpolpc', 'cltaxpc', 'clmix'],
      dtype='object')

In [4]:
# Calcular a média ao longo do tempo PARA CADA COUNTY (CONDADO) 
df_mean = df.groupby('county').mean()
df_mean # cada linha um condado e cada coluna a média temporal de cada variável

Unnamed: 0_level_0,crmrte,prbarr,prbconv,prbpris,avgsen,polpc,density,taxpc,west,central,...,lpctymle,lpctmin,clcrmrte,clprbarr,clprbcon,clprbpri,clavgsen,clpolpc,cltaxpc,clmix
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.035741,0.324358,0.512017,0.478874,6.292857,0.001846,2.356978,27.534382,0.0,1.0,...,-2.485800,3.006608,-0.018925,0.004861,0.045287,-0.013236,0.029841,0.003788,0.031231,-0.036702
3,0.014936,0.176669,0.997528,0.427240,7.404286,0.000661,1.014341,24.055300,0.0,1.0,...,-2.463531,2.068926,-0.012002,-0.071614,0.088900,-0.005735,-0.047619,0.037970,0.102249,-0.103135
5,0.012567,0.537032,0.390403,0.427434,7.030000,0.001243,0.414590,26.782335,1.0,0.0,...,-2.611558,1.150740,0.054648,0.014835,-0.001495,0.030387,0.033472,0.067988,0.104212,0.286573
7,0.023045,0.418395,0.573859,0.412003,7.812857,0.001467,0.489949,43.795879,0.0,1.0,...,-2.541019,3.869452,0.033240,-0.027848,-0.029470,0.001581,0.028992,0.010949,0.019329,-0.030535
9,0.011378,0.480105,0.583061,0.408591,8.418571,0.000850,0.541583,22.113031,1.0,0.0,...,-2.589735,0.585668,0.057628,-0.032971,-0.025562,0.023969,0.025827,0.004890,0.084936,-0.065799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,0.028850,0.192302,0.403041,0.348085,11.118571,0.002026,1.076433,26.397387,1.0,0.0,...,-1.863639,0.684712,-0.000820,-0.019709,-0.002077,0.001595,-0.028863,0.007781,0.061081,0.005904
191,0.037461,0.238157,0.369684,0.438734,9.270000,0.001194,1.769727,23.442673,0.0,0.0,...,-2.340729,3.538879,0.035364,-0.075250,0.050018,-0.034123,-0.039846,0.012371,0.094527,-0.035777
193,0.020501,0.339669,0.502536,0.463439,6.688571,0.001037,0.801102,22.723814,1.0,0.0,...,-2.497699,1.780208,0.033785,-0.063744,0.053580,-0.050618,0.038906,0.040236,0.081869,-0.053717
195,0.045657,0.220812,0.837039,0.473233,11.207143,0.002903,1.720397,33.712444,0.0,0.0,...,-2.470089,3.622502,-0.116389,-0.017483,0.290986,0.007456,0.042160,0.155238,0.129819,0.040086


In [5]:
# Subtrair as médias temporais
df_mean_expanded = df_mean.reindex(df.index, level = "county")
df_mean_expanded

Unnamed: 0_level_0,Unnamed: 1_level_0,crmrte,prbarr,prbconv,prbpris,avgsen,polpc,density,taxpc,west,central,...,lpctymle,lpctmin,clcrmrte,clprbarr,clprbcon,clprbpri,clavgsen,clpolpc,cltaxpc,clmix
county,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,81,0.035741,0.324358,0.512017,0.478874,6.292857,0.001846,2.356978,27.534382,0.0,1.0,...,-2.485800,3.006608,-0.018925,0.004861,0.045287,-0.013236,0.029841,0.003788,0.031231,-0.036702
1,82,0.035741,0.324358,0.512017,0.478874,6.292857,0.001846,2.356978,27.534382,0.0,1.0,...,-2.485800,3.006608,-0.018925,0.004861,0.045287,-0.013236,0.029841,0.003788,0.031231,-0.036702
1,83,0.035741,0.324358,0.512017,0.478874,6.292857,0.001846,2.356978,27.534382,0.0,1.0,...,-2.485800,3.006608,-0.018925,0.004861,0.045287,-0.013236,0.029841,0.003788,0.031231,-0.036702
1,84,0.035741,0.324358,0.512017,0.478874,6.292857,0.001846,2.356978,27.534382,0.0,1.0,...,-2.485800,3.006608,-0.018925,0.004861,0.045287,-0.013236,0.029841,0.003788,0.031231,-0.036702
1,85,0.035741,0.324358,0.512017,0.478874,6.292857,0.001846,2.356978,27.534382,0.0,1.0,...,-2.485800,3.006608,-0.018925,0.004861,0.045287,-0.013236,0.029841,0.003788,0.031231,-0.036702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,83,0.015046,0.188538,1.108684,0.351408,8.820000,0.001038,0.872874,21.561247,1.0,0.0,...,-2.553578,1.697597,-0.038324,0.044537,0.018134,0.002093,0.012446,0.059289,0.074073,0.015064
197,84,0.015046,0.188538,1.108684,0.351408,8.820000,0.001038,0.872874,21.561247,1.0,0.0,...,-2.553578,1.697597,-0.038324,0.044537,0.018134,0.002093,0.012446,0.059289,0.074073,0.015064
197,85,0.015046,0.188538,1.108684,0.351408,8.820000,0.001038,0.872874,21.561247,1.0,0.0,...,-2.553578,1.697597,-0.038324,0.044537,0.018134,0.002093,0.012446,0.059289,0.074073,0.015064
197,86,0.015046,0.188538,1.108684,0.351408,8.820000,0.001038,0.872874,21.561247,1.0,0.0,...,-2.553578,1.697597,-0.038324,0.044537,0.018134,0.002093,0.012446,0.059289,0.074073,0.015064


In [6]:
# Criar variáveis transformadas: y_it - ȳ_i  e  X_it - X̄_i
vars_all = ["lcrmrte", "lprbarr", "lprbconv", "lprbpris", "lavgsen", "lpolpc"]

df_within = df[vars_all] - df_mean_expanded[vars_all]

In [7]:
# Variável dependente transformada
y_within = df_within["lcrmrte"]

# Regressoras transformadas
X_within = df_within[["lprbarr", "lprbconv", "lprbpris", "lavgsen", "lpolpc"]]

In [8]:
# Calcular coeficientes na "mão"

# Transpor matriz
X_within_transpose = np.transpose(X_within)


# Calcular os coeficientes estimados por OLS (X'X)^{-1}%*%(X'y) 
beta = np.dot(np.linalg.inv(np.dot(X_within_transpose, X_within)), np.dot(X_within_transpose, y_within))
beta

array([-0.38353664, -0.30597535, -0.19545129,  0.03566426,  0.41377085],
      dtype=float32)

In [9]:
# Verificar OLS com pacote
model_manual = sm.OLS(y_within, X_within).fit()

print(model_manual.summary())

                                 OLS Regression Results                                
Dep. Variable:                lcrmrte   R-squared (uncentered):                   0.359
Model:                            OLS   Adj. R-squared (uncentered):              0.354
Method:                 Least Squares   F-statistic:                              70.01
Date:                Fri, 05 Dec 2025   Prob (F-statistic):                    4.00e-58
Time:                        14:46:28   Log-Likelihood:                          366.25
No. Observations:                 630   AIC:                                     -722.5
Df Residuals:                     625   BIC:                                     -700.3
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

The probability of being arrested, the probability of conviction, and the probability of receiving a sentence all negatively influence the probability of committing a crime.

The only coefficient that is not statistically significant is the average sentence length.

All coefficients can be interpreted as elasticities. A 1% increase in the regressor is associated with a β% change in the crime rate.

Increasing the average sentence length does not significantly affect the likelihood of someone committing crimes.

For the first coefficient, a 1% increase in the probability of being arrested reduces the probability of committing a crime by 0.38%.

# -----------------------------------------------------------------------
# Letra B

## Random Effects

Now the model becames:


$$
y_{it} = X_{it}\beta + \alpha_i + \varepsilon_{it}
$$

$$
\hat{\beta}_{GLS} = \left ( \sum^{N}_{i=1} X_{it}' \hat{\Omega}_i^{-1} X_{it} \right )^{-1} \left ( \sum^{N}_{i=1} X_{it}' \hat{\Omega}_i^{-1} y_{i}\right)
$$


In [10]:
df = pd.read_stata("cornwell.dta")
df = df.set_index(["county", "year"], drop=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,crmrte,prbarr,prbconv,prbpris,avgsen,polpc,density,taxpc,west,central,...,lpctymle,lpctmin,clcrmrte,clprbarr,clprbcon,clprbpri,clavgsen,clpolpc,cltaxpc,clmix
county,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,81,0.039885,0.289696,0.402062,0.472222,5.61,0.001787,2.307159,25.697630,0,1,...,-2.433870,3.006608,,,,,,,,
1,82,0.038345,0.338111,0.433005,0.506993,5.59,0.001767,2.330254,24.874252,0,1,...,-2.449038,3.006608,-0.039376,0.154542,0.074143,0.071048,-0.003571,-0.011364,-0.032565,0.030857
1,83,0.030305,0.330449,0.525703,0.479705,5.80,0.001836,2.341801,26.451443,0,1,...,-2.464036,3.006608,-0.235316,-0.022922,0.193987,-0.055326,0.036879,0.038413,0.061477,-0.244732
1,84,0.034726,0.362525,0.604706,0.520104,6.89,0.001886,2.346420,26.842348,0,1,...,-2.478925,3.006608,0.136180,0.092641,0.140006,0.080857,0.172213,0.026930,0.014670,-0.027331
1,85,0.036573,0.325395,0.578723,0.497059,6.55,0.001924,2.364896,28.140337,0,1,...,-2.497306,3.006608,0.051825,-0.108054,-0.043918,-0.045320,-0.050606,0.020199,0.047223,0.172125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,83,0.015575,0.226667,0.480392,0.428571,7.77,0.001073,0.869048,18.905853,1,0,...,-2.538060,1.697597,-0.148666,-0.010969,-0.127018,0.164303,0.157158,0.149330,0.070461,0.020250
197,84,0.013662,0.204188,1.410260,0.372727,10.11,0.001109,0.872024,22.704754,1,0,...,-2.548068,1.697597,-0.131037,-0.104441,1.076927,-0.139610,0.263255,0.032795,0.183103,0.026842
197,85,0.013086,0.180556,0.830769,0.333333,5.96,0.001054,0.875000,24.123611,1,0,...,-2.561072,1.697597,-0.043091,-0.123000,-0.529178,-0.111704,-0.528454,-0.050473,0.060617,-0.366374
197,86,0.012874,0.112676,2.250000,0.244444,7.68,0.001088,0.880952,24.981979,1,0,...,-2.580968,1.697597,-0.016311,-0.471524,0.996334,-0.310156,0.253549,0.031580,0.034964,-0.067911


In [11]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,crmrte,prbarr,prbconv,prbpris,avgsen,polpc,density,taxpc,west,central,...,lpctymle,lpctmin,clcrmrte,clprbarr,clprbcon,clprbpri,clavgsen,clpolpc,cltaxpc,clmix
county,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,81,0.039885,0.289696,0.402062,0.472222,5.61,0.001787,2.307159,25.697630,0,1,...,-2.433870,3.006608,,,,,,,,
1,82,0.038345,0.338111,0.433005,0.506993,5.59,0.001767,2.330254,24.874252,0,1,...,-2.449038,3.006608,-0.039376,0.154542,0.074143,0.071048,-0.003571,-0.011364,-0.032565,0.030857
1,83,0.030305,0.330449,0.525703,0.479705,5.80,0.001836,2.341801,26.451443,0,1,...,-2.464036,3.006608,-0.235316,-0.022922,0.193987,-0.055326,0.036879,0.038413,0.061477,-0.244732
1,84,0.034726,0.362525,0.604706,0.520104,6.89,0.001886,2.346420,26.842348,0,1,...,-2.478925,3.006608,0.136180,0.092641,0.140006,0.080857,0.172213,0.026930,0.014670,-0.027331
1,85,0.036573,0.325395,0.578723,0.497059,6.55,0.001924,2.364896,28.140337,0,1,...,-2.497306,3.006608,0.051825,-0.108054,-0.043918,-0.045320,-0.050606,0.020199,0.047223,0.172125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,83,0.015575,0.226667,0.480392,0.428571,7.77,0.001073,0.869048,18.905853,1,0,...,-2.538060,1.697597,-0.148666,-0.010969,-0.127018,0.164303,0.157158,0.149330,0.070461,0.020250
197,84,0.013662,0.204188,1.410260,0.372727,10.11,0.001109,0.872024,22.704754,1,0,...,-2.548068,1.697597,-0.131037,-0.104441,1.076927,-0.139610,0.263255,0.032795,0.183103,0.026842
197,85,0.013086,0.180556,0.830769,0.333333,5.96,0.001054,0.875000,24.123611,1,0,...,-2.561072,1.697597,-0.043091,-0.123000,-0.529178,-0.111704,-0.528454,-0.050473,0.060617,-0.366374
197,86,0.012874,0.112676,2.250000,0.244444,7.68,0.001088,0.880952,24.981979,1,0,...,-2.580968,1.697597,-0.016311,-0.471524,0.996334,-0.310156,0.253549,0.031580,0.034964,-0.067911


In [12]:
# Ajuste: garantir colunas 'county' e 'year' existirem (ou extrair do índice)
if ("county" not in df.columns) or ("year" not in df.columns):
    # se o índice for MultiIndex (county, year)
    if isinstance(df.index, pd.MultiIndex):
        df = df.reset_index()  # traz county, year como colunas
    else:
        raise ValueError("DataFrame precisa ter colunas 'county' e 'year' ou ser MultiIndex (county,year).")

# variáveis do modelo
y_name = "lcrmrte"
x_names = ["lprbarr", "lprbconv", "lprbpris", "lavgsen", "lpolpc"]

# ordenar por county, year
df = df.sort_values(["county", "year"]).reset_index(drop=True)



In [13]:
# N, T
N = df["county"].nunique()
T = df.groupby("county")["year"].nunique().mode().iloc[0]   # assume balanced panel; ajuste se necessary
K = len(x_names)  # número de regressores (sem constante)

# --- 1) ESTIMADOR WITHIN (FE) manual: demean por county e OLS das transformed vars ---
# calcular médias por county
means = df.groupby("county")[[y_name] + x_names].transform("mean")

In [14]:
# within (demeaned)
df_within = df[[y_name] + x_names] - means

# regressão OLS sem constante sobre as variáveis demeaned
y_within = df_within[y_name]
X_within = df_within[x_names]
fe_within = sm.OLS(y_within, X_within).fit()

# residuos within (usados para sigma_e)
resid_within = fe_within.resid

# estimar sigma_e^2
# # grau de liberdade sugerido: N*(T-1) - K pelas notas de aula
df_dof_e = N * (T - 1) - K
sigma_e2 = (resid_within**2).sum() / df_dof_e
# garantir não-negatividade
sigma_e2 = float(sigma_e2)

In [15]:
# BETWEEN estimator: regressão das médias por county ---
group_means = df.groupby("county")[[y_name] + x_names].mean()
Y_bar = group_means[y_name]
X_bar = group_means[x_names]

In [16]:
# adicionar constante ao between (aqui usamos OLS com constante)
Xb = sm.add_constant(X_bar)
Xb

Unnamed: 0_level_0,const,lprbarr,lprbconv,lprbpris,lavgsen,lpolpc
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.0,-1.128344,-0.678477,-0.738277,1.835435,-6.295127
3,1.0,-1.744634,-0.022630,-0.856210,1.987864,-7.323944
5,1.0,-0.646785,-1.047950,-0.891884,1.917702,-6.704923
7,1.0,-0.891184,-0.576678,-0.910456,2.030138,-6.525218
9,1.0,-0.750425,-0.545380,-0.897885,2.116307,-7.071893
...,...,...,...,...,...,...
189,1.0,-1.669677,-0.957528,-1.097128,2.364090,-6.205800
191,1.0,-1.446865,-1.003777,-0.828621,2.215291,-6.730804
193,1.0,-1.085446,-0.708932,-0.779676,1.883593,-6.875694
195,1.0,-1.523761,-0.510981,-0.748827,2.408579,-5.937487


In [17]:
be_between = sm.OLS(Y_bar, Xb).fit()
print(be_between.summary())

                            OLS Regression Results                            
Dep. Variable:                lcrmrte   R-squared:                       0.710
Model:                            OLS   Adj. R-squared:                  0.693
Method:                 Least Squares   F-statistic:                     41.11
Date:                Fri, 05 Dec 2025   Prob (F-statistic):           3.47e-21
Time:                        14:46:46   Log-Likelihood:                -17.692
No. Observations:                  90   AIC:                             47.38
Df Residuals:                      84   BIC:                             62.38
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.5155      0.706     -2.146      0.0

In [18]:
Y_bar

county
1     -3.334696
3     -4.214238
5     -4.384351
7     -3.778031
9     -4.500291
         ...   
189   -3.549659
191   -3.290085
193   -3.894663
195   -3.151571
197   -4.205367
Name: lcrmrte, Length: 90, dtype: float32

In [19]:
resid_between = be_between.resid

# --- estimar sigma_a^2 conforme notas de aula ---
# SSR_between / (N - (K+1))  menos sigma_e2 / T
df_dof_a = N - (K + 1)   # se usamos constante no between, K+1 parâmetros
SSR_between = (resid_between**2).sum()
sigma_a2_hat = SSR_between / df_dof_a - sigma_e2 / T
# truncar se negativo
sigma_a2_hat = max(0.0, float(sigma_a2_hat))

In [20]:
sigma_a2_hat

0.08986708985722343

In [21]:
# calcular lambda (λ̂) conforme as notas de aula: λ̂ = 1 - sqrt( σ_e^2 / (T σ_a^2 + σ_e^2) ) 
den = sigma_e2 + T * sigma_a2_hat
if den <= 0:
    lam = 0.0
else:
    lam = 1.0 - np.sqrt(sigma_e2 / den)

# transformar variáveis (quasi-GLS): y - λ y_bar_i , X - λ X_bar_i ---
# expandir as médias para o formato original (cada observação)
means_expanded = df.groupby("county")[[y_name] + x_names].transform("mean")
y_re = df[y_name] - lam * means_expanded[y_name]
X_re = df[x_names] - lam * means_expanded[x_names]

# adicionar constante (no RE a constante)
#X_re_const = sm.add_constant(X_re)

# OLS nas variáveis transformadas => estimador RE (GLS) manual ---
re_manual = sm.OLS(y_re, X_re).fit()

In [24]:
print(re_manual.summary())

                                 OLS Regression Results                                
Dep. Variable:                lcrmrte   R-squared (uncentered):                   0.939
Model:                            OLS   Adj. R-squared (uncentered):              0.939
Method:                 Least Squares   F-statistic:                              1929.
Date:                Wed, 03 Dec 2025   Prob (F-statistic):                        0.00
Time:                        21:56:46   Log-Likelihood:                          225.64
No. Observations:                 630   AIC:                                     -441.3
Df Residuals:                     625   BIC:                                     -419.1
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [22]:
# comparar com RandomEffects do linearmodels ---
# precisamos do painel index (county, year) para linearmodels
df_panel = df.set_index(["county", "year"])
re_lm = RandomEffects(df_panel[y_name], df_panel[x_names]).fit()

# ---  imprimir resultados e comparação ---
print("=== Resumo dos passos e estimativas ===")
print(f"N = {N}, T = {T}, K = {K}")
print(f"sigma_e^2 (from FE residuals) = {sigma_e2:.6g}")
print(f"sigma_a^2 (estimated from BETWEEN) = {sigma_a2_hat:.6g}")
print(f"lambda_hat = {lam:.6g}\n")

print("---- Coeficientes: RE manual (GLS via transformação) ----")
print(re_manual.params)
print("\n---- Coeficientes: RandomEffects (linearmodels) ----")
print(re_lm.params)

print("\nDiferença (manual - linearmodels):")
print(re_manual.params - re_lm.params)

=== Resumo dos passos e estimativas ===
N = 90, T = 7, K = 5
sigma_e^2 (from FE residuals) = 0.0215555
sigma_a^2 (estimated from BETWEEN) = 0.0898671
lambda_hat = 0.817982

---- Coeficientes: RE manual (GLS via transformação) ----
lprbarr    -0.436484
lprbconv   -0.401534
lprbpris   -0.128234
lavgsen    -0.062246
lpolpc      0.670338
dtype: float32

---- Coeficientes: RandomEffects (linearmodels) ----
lprbarr    -0.434564
lprbconv   -0.400164
lprbpris   -0.129210
lavgsen    -0.061275
lpolpc      0.669860
Name: parameter, dtype: float64

Diferença (manual - linearmodels):
lprbarr    -0.001920
lprbconv   -0.001370
lprbpris    0.000976
lavgsen    -0.000971
lpolpc      0.000478
dtype: float64


# -----------------------------------------------------------------------
# Letra C

In [23]:
# --- Ajuste nomes conforme conforme o df ---
y_name = "lcrmrte"
x_names = ["lprbarr", "lprbconv", "lprbpris", "lavgsen", "lpolpc"]

# df estiver com MultiIndex (county,year), execute df = df.reset_index()
df = df.reset_index() if isinstance(df.index, pd.MultiIndex) else df

# ordenar
df = df.sort_values(["county", "year"]).reset_index(drop=True)

# --- 1) FE within estimator (manual) ---
# calcular médias por county e subtrair (demean)
means = df.groupby("county")[[y_name] + x_names].transform("mean")
df_within = df[[y_name] + x_names] - means

y_within = df_within[y_name].values
X_within = df_within[x_names].values  # sem constante!

fe_ols = sm.OLS(y_within, X_within).fit()
beta_fe = fe_ols.params  # array, ordem = x_names

# cov matrix of FE estimates
V_fe = fe_ols.cov_params()  # (k x k) matrix

In [24]:
sigma_e2

0.021555506626022198

In [25]:
V_fe

array([[ 9.58764355e-04,  3.76106246e-04,  2.50325738e-04,
         3.22427153e-05, -2.48211449e-04],
       [ 3.76106246e-04,  4.08965229e-04,  1.59750697e-04,
         1.69522965e-05, -2.64275688e-04],
       [ 2.50325738e-04,  1.59750697e-04,  9.52846490e-04,
        -2.02880079e-05, -9.60854999e-05],
       [ 3.22427153e-05,  1.69522965e-05, -2.02880079e-05,
         5.84218582e-04,  1.88717176e-05],
       [-2.48211449e-04, -2.64275688e-04, -9.60854999e-05,
         1.88717176e-05,  6.45879677e-04]])

In [26]:
# --- 2) RE (GLS) manual exactly as in previous step ---
# (recompute sigma_e2, sigma_a2, lambda, transform, and OLS on transformed)
# pooled OLS to get residuals for sigma_e^2
X_pooled = sm.add_constant(df[x_names])
y = df[y_name]
pooled = sm.OLS(y, X_pooled).fit()
e = pooled.resid.values

N = df["county"].nunique()
# assume balanced or use modal T; if unbalanced compute mean(T_i) or adapt
T = int(df.groupby("county")["year"].nunique().mode().iloc[0])
K = len(x_names)

# sigma_e^2 per slide (N*(T-1) - K in denom)
df_dof_e = N*(T-1) - K
sigma_e2 = ( ( ( (df_within[y_name].values - fe_ols.predict(X_within))**2 ).sum() ) ) / df_dof_e

# BETWEEN: regress group means
group_means = df.groupby("county")[[y_name] + x_names].mean()
Y_bar = group_means[y_name]
X_bar = group_means[x_names]
Xb = sm.add_constant(X_bar)
be = sm.OLS(Y_bar, Xb).fit()
resid_between = be.resid
df_dof_a = N - (K + 1)
SSR_between = (resid_between**2).sum()
sigma_a2 = SSR_between / df_dof_a - sigma_e2 / T
sigma_a2 = max(0.0, float(sigma_a2))

# lambda
den = sigma_e2 + T * sigma_a2
lam = 0.0 if den <= 0 else 1.0 - np.sqrt(sigma_e2 / den)

# transform data
means_expanded = df.groupby("county")[[y_name] + x_names].transform("mean")
y_re = df[y_name] - lam * means_expanded[y_name]
X_re = df[x_names] - lam * means_expanded[x_names]
X_re_const = sm.add_constant(X_re)  # include constant for RE

re_ols = sm.OLS(y_re, X_re_const).fit()
# take only slope coefficients (drop constant) to compare with FE
beta_re = re_ols.params[x_names].values

V_re_full = re_ols.cov_params()
# covariance matrix of slope parameters (exclude const row/col)
V_re = V_re_full.loc[x_names, x_names].values if hasattr(V_re_full, "loc") else V_re_full[1:,1:]

# --- 3) Preparo para Hausman: use apenas coeficientes presentes em ambos ---
# Here both have same regressors x_names in same order; ensure shapes match
assert beta_fe.shape[0] == beta_re.shape[0] == len(x_names)

# difference
d = beta_fe - beta_re  # vector (k,)

# variance difference V = Var(beta_fe) - Var(beta_re)
# Make sure both cov matrices are numpy arrays in same order
V_fe_arr = V_fe if isinstance(V_fe, np.ndarray) else V_fe.values
V_re_arr = V_re if isinstance(V_re, np.ndarray) else np.asarray(V_re)

V_diff = V_fe_arr - V_re_arr

# --- 4) Compute Hausman statistic H = d' V_diff^{-1} d ---
# invert; if singular use pseudo-inverse and warn
try:
    invV = np.linalg.inv(V_diff)
    use_pinv = False
except np.linalg.LinAlgError:
    invV = np.linalg.pinv(V_diff)
    use_pinv = True

H = abs(float(d.T @ invV @ d))
df_chi = len(d)  # degrees of freedom = number of tested coefficients

# p-value
p_value = stats.chi2.sf(H, df_chi)

# --- 5) Output results ---
print("Hausman test (manual)")
print("=====================")
print(f"Tested regressors: {x_names}")
print(f"k = {df_chi}")
print(f"H statistic = {H:.4f}")
print(f"p-value = {p_value:.6f}")
if use_pinv:
    print("Nota: matriz de variância-diferença singular — foi usado pseudo-inverso (pinv).")
print()
if p_value < 0.05:
    print("Conclusão: rejeitamos H0 ao nível 5% -> há evidência de correlação entre α_i e X_i (usar FE).")
else:
    print("Conclusão: não rejeitamos H0 ao nível 5% -> não há evidência forte de correlação (RE pode ser usado).")


Hausman test (manual)
Tested regressors: ['lprbarr', 'lprbconv', 'lprbpris', 'lavgsen', 'lpolpc']
k = 5
H statistic = 66.8360
p-value = 0.000000

Conclusão: rejeitamos H0 ao nível 5% -> há evidência de correlação entre α_i e X_i (usar FE).


# -----------------------------------------------------------------------
# Letra D

In [30]:
# pip install pandas statsmodels linearmodels scipy

import numpy as np
import pandas as pd
import statsmodels.api as sm
from linearmodels.panel import RandomEffects
from scipy import stats

# --- Ajuste nomes conforme seu df ---
y_name = "lcrmrte"
x_names = ["lprbarr", "lprbconv", "lprbpris", "lavgsen", "lpolpc"]

# --- 0) garantir formatação ---
# MultiIndex (county,year), execute df = df.reset_index()
df = df.reset_index() if isinstance(df.index, pd.MultiIndex) else df

# ordenar (opcional)
df = df.sort_values(["county", "year"]).reset_index(drop=True)

In [31]:
import pandas as pd
import statsmodels.api as sm

# ordena os dados por county e year
df = df.sort_values(["county", "year"])

# cria primeiras diferenças por county
df_fd = df.groupby("county")[["lcrmrte","lprbarr","lprbconv","lprbpris","lavgsen","lpolpc"]].diff()

# remove primeira linha de cada county (que vira NaN)
df_fd = df_fd.dropna()

# variável dependente
y_fd = df_fd["lcrmrte"]

# regressoras nas primeiras diferenças
X_fd = df_fd[["lprbarr","lprbconv","lprbpris","lavgsen","lpolpc"]]

# OLS sem constante (a diferença de um intercepto é zero)
fd_model = sm.OLS(y_fd, X_fd).fit()

print(fd_model.summary())


                                 OLS Regression Results                                
Dep. Variable:                lcrmrte   R-squared (uncentered):                   0.378
Model:                            OLS   Adj. R-squared (uncentered):              0.372
Method:                 Least Squares   F-statistic:                              65.04
Date:                Wed, 03 Dec 2025   Prob (F-statistic):                    5.25e-53
Time:                        22:01:30   Log-Likelihood:                          223.71
No. Observations:                 540   AIC:                                     -437.4
Df Residuals:                     535   BIC:                                     -416.0
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

# -----------------------------------------------------------------------
# Letra E

Vamos refazer a estimação do $\beta_{FE}$, $\beta_{GLS}$ e o Teste de Hausman controlando por novas variáveis.

In [32]:
df = pd.read_stata("cornwell.dta")
df = df.set_index(["county", "year"], drop=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,crmrte,prbarr,prbconv,prbpris,avgsen,polpc,density,taxpc,west,central,...,lpctymle,lpctmin,clcrmrte,clprbarr,clprbcon,clprbpri,clavgsen,clpolpc,cltaxpc,clmix
county,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,81,0.039885,0.289696,0.402062,0.472222,5.61,0.001787,2.307159,25.697630,0,1,...,-2.433870,3.006608,,,,,,,,
1,82,0.038345,0.338111,0.433005,0.506993,5.59,0.001767,2.330254,24.874252,0,1,...,-2.449038,3.006608,-0.039376,0.154542,0.074143,0.071048,-0.003571,-0.011364,-0.032565,0.030857
1,83,0.030305,0.330449,0.525703,0.479705,5.80,0.001836,2.341801,26.451443,0,1,...,-2.464036,3.006608,-0.235316,-0.022922,0.193987,-0.055326,0.036879,0.038413,0.061477,-0.244732
1,84,0.034726,0.362525,0.604706,0.520104,6.89,0.001886,2.346420,26.842348,0,1,...,-2.478925,3.006608,0.136180,0.092641,0.140006,0.080857,0.172213,0.026930,0.014670,-0.027331
1,85,0.036573,0.325395,0.578723,0.497059,6.55,0.001924,2.364896,28.140337,0,1,...,-2.497306,3.006608,0.051825,-0.108054,-0.043918,-0.045320,-0.050606,0.020199,0.047223,0.172125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,83,0.015575,0.226667,0.480392,0.428571,7.77,0.001073,0.869048,18.905853,1,0,...,-2.538060,1.697597,-0.148666,-0.010969,-0.127018,0.164303,0.157158,0.149330,0.070461,0.020250
197,84,0.013662,0.204188,1.410260,0.372727,10.11,0.001109,0.872024,22.704754,1,0,...,-2.548068,1.697597,-0.131037,-0.104441,1.076927,-0.139610,0.263255,0.032795,0.183103,0.026842
197,85,0.013086,0.180556,0.830769,0.333333,5.96,0.001054,0.875000,24.123611,1,0,...,-2.561072,1.697597,-0.043091,-0.123000,-0.529178,-0.111704,-0.528454,-0.050473,0.060617,-0.366374
197,86,0.012874,0.112676,2.250000,0.244444,7.68,0.001088,0.880952,24.981979,1,0,...,-2.580968,1.697597,-0.016311,-0.471524,0.996334,-0.310156,0.253549,0.031580,0.034964,-0.067911


In [33]:
# Vamos nomear as variáveis
y_name = "lcrmrte"
x_names = ["lprbarr","lprbconv","lprbpris","lavgsen","lpolpc","lwcon","lwser"]
id_col = "county"
time_col = "year"

# Verificar se há MultiIndex => transformar
if isinstance(df.index, pd.MultiIndex):
    df = df.reset_index()


# manter só colunas necessárias e remover missings
required = [id_col, time_col, y_name] + x_names
required

df = df[required].dropna().copy()   # dropna para simplicidade
df = df.sort_values([id_col, time_col]).reset_index(drop=True)

In [34]:
# Número de unidades
N = df[id_col].nunique()
T_mode = int(df.groupby(id_col)[time_col].nunique().mode().iloc[0])
K = len(x_names)

print(f"N = {N}, T (mode) = {T_mode}, K = {K}")
print("Observações após dropna:", df.shape[0])

N = 90, T (mode) = 7, K = 7
Observações após dropna: 630


In [35]:
# calcular médias por county e construir demeaned
means = df.groupby(id_col)[[y_name] + x_names].transform("mean")
df_within = df[[y_name] + x_names] - means

df_within

Unnamed: 0,lcrmrte,lprbarr,lprbconv,lprbpris,lavgsen,lpolpc,lwcon,lwser
0,0.112938,-0.110579,-0.232672,-0.012029,-0.110884,-0.032213,-0.124182,-0.141251
1,0.073562,0.043963,-0.158529,0.059018,-0.114456,-0.043577,-0.094249,-0.070384
2,-0.161753,0.021041,0.035458,0.003693,-0.077577,-0.005164,-0.061759,-0.034002
3,-0.025574,0.113682,0.175464,0.084550,0.094636,0.021766,-0.045316,0.015910
4,0.026251,0.005629,0.131546,0.039230,0.044030,0.041965,0.041782,0.049556
...,...,...,...,...,...,...,...,...
625,0.043260,0.208983,-0.712753,0.210483,-0.094545,0.039442,-0.119931,0.022176
626,-0.087777,0.104542,0.364174,0.070873,0.168710,0.072237,-0.014068,0.002720
627,-0.130868,-0.018458,-0.165004,-0.040831,-0.359745,0.021764,0.080398,0.075660
628,-0.147179,-0.489982,0.831330,-0.350987,-0.106196,0.053344,0.183634,0.113590


In [36]:
# montar e rodar OLS sem constante
y_within = df_within[y_name].values
X_within = df_within[x_names].values  # ordem = x_names
fe_ols = sm.OLS(y_within, X_within).fit()

print(fe_ols.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.372
Model:                            OLS   Adj. R-squared (uncentered):              0.365
Method:                 Least Squares   F-statistic:                              52.78
Date:                Wed, 03 Dec 2025   Prob (F-statistic):                    4.51e-59
Time:                        22:01:58   Log-Likelihood:                          372.85
No. Observations:                 630   AIC:                                     -731.7
Df Residuals:                     623   BIC:                                     -700.6
Df Model:                           7                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [37]:
beta_fe = fe_ols.params # Parâmetros(betas estimados)
V_fe = fe_ols.cov_params() #Matriz de covariância dos betas estimados
se_fe = np.sqrt(np.diag(V_fe))

print("\n=== FE (within) ===")
for nm, b, s in zip(x_names, beta_fe, se_fe):
    print(f"{nm:6s}  coef = {b:10.4f}   se = {s:8.4f}")


=== FE (within) ===
lprbarr  coef =    -0.3888   se =   0.0308
lprbconv  coef =    -0.3095   se =   0.0201
lprbpris  coef =    -0.2093   se =   0.0308
lavgsen  coef =     0.0332   se =   0.0240
lpolpc  coef =     0.4197   se =   0.0253
lwcon   coef =    -0.0990   se =   0.0305
lwser   coef =    -0.0146   se =   0.0180


In [38]:
# Estimar sigma_e^2 via residuos within
resid_within = fe_ols.resid # resíduos da regressão
df_dot_e = N*(T_mode - 1) - K
sigma_e2 = (resid_within**2).sum() / df_dof_e
sigma_e2 = float(sigma_e2)
print(f"\nsigma_e^2 (from FE residuals) = {sigma_e2:.6g}")


sigma_e^2 (from FE residuals) = 0.0211086


In [39]:
# BETWEEN estimator (médias por county) para obter sigma_a^2

group_means = df.groupby(id_col)[[y_name] + x_names].mean()
Y_bar = group_means[y_name]
X_bar = group_means[x_names]
Xb = sm.add_constant(X_bar)
be = sm.OLS(Y_bar, Xb).fit()
resid_between = be.resid

df_dof_a = N - (K + 1)   # K slopes + intercept
SSR_between = (resid_between**2).sum()
sigma_a2 = SSR_between / df_dof_a - sigma_e2 / T_mode
sigma_a2 = max(0.0, float(sigma_a2))
print(f"sigma_a^2 (estimated from BETWEEN) = {sigma_a2:.6g}")

sigma_a^2 (estimated from BETWEEN) = 0.0903249


In [40]:
# Cálculo de lambda (λ̂) e transformar (GLS quasi)
den = sigma_e2 + T_mode * sigma_a2
lam = 0.0 if den <= 0 else 1.0 - np.sqrt(sigma_e2 / den)
print(f"lambda_hat = {lam:.6g}")

lambda_hat = 0.82026


In [41]:
means_expanded = df.groupby(id_col)[[y_name] + x_names].transform("mean")
y_re = df[y_name] - lam * means_expanded[y_name]
X_re = df[x_names] - lam * means_expanded[x_names]
X_re_const = sm.add_constant(X_re)

In [42]:
# RE (GLS manual via OLS nas variáveis transformadas)
re_ols = sm.OLS(y_re, X_re_const).fit()
# extrair apenas os coeficientes de slope (mesma ordem x_names)
beta_re = re_ols.params[x_names].values
V_re_full = re_ols.cov_params()
# obter cov matrix das slopes (exclui constante)
if hasattr(V_re_full, "loc"):
    V_re = V_re_full.loc[x_names, x_names].values
else:
    # fallback: assume const is primeira coluna
    V_re = V_re_full[1:,1:]

se_re = np.sqrt(np.diag(V_re))

print("\n=== RE (GLS manual) ===")
for nm, b, s in zip(x_names, beta_re, se_re):
    print(f"{nm:6s}  coef = {b:10.4f}   se = {s:8.4f}")


=== RE (GLS manual) ===
lprbarr  coef =    -0.4526   se =   0.0326
lprbconv  coef =    -0.3494   se =   0.0214
lprbpris  coef =    -0.1985   se =   0.0349
lavgsen  coef =     0.0257   se =   0.0274
lpolpc  coef =     0.4224   se =   0.0269
lwcon   coef =    -0.0727   se =   0.0346
lwser   coef =    -0.0144   se =   0.0205


In [43]:
# Hausman test (FE vs RE) - usando slopes comuns (x_names)
#    H = (b_FE - b_RE)' [Var(b_FE)-Var(b_RE)]^{-1} (b_FE - b_RE)
# ---------------------------
# garantir matrizes numpy
V_fe_arr = np.asarray(V_fe)
V_re_arr = np.asarray(V_re)
# alinhar dimensões (devem ser k x k)
if V_fe_arr.shape != V_re_arr.shape:
    raise ValueError("Mismatch in covariance matrix shapes between FE and RE")


d = beta_fe - beta_re
V_diff = V_fe_arr - V_re_arr

In [27]:
# inverter (usar pseudo-inverso se singular)
try:
    invV = np.linalg.inv(V_diff)
    used_pinv = False
except np.linalg.LinAlgError:
    invV = np.linalg.pinv(V_diff)
    used_pinv = True

H = abs(float(d.T @ invV @ d))
df_chi = len(d)
p_value = stats.chi2.sf(H, df_chi)

print("\n=== Hausman test (FE vs RE) ===")
print(f"k = {df_chi}")
print(f"H = {H:.4f}")
print(f"p-value = {p_value:.6f}")
if used_pinv:
    print("Nota: matriz Var(FE)-Var(RE) sing. => usado pseudo-inverso.")

if p_value < 0.05:
    print("Conclusão: rejeitamos H0 ao nível 5% -> há evidência de correlação entre α_i e X_i (usar FE).")
else:
    print("Conclusão: não rejeitamos H0 ao nível 5% -> não há evidência forte de correlação (RE pode ser usado).")


=== Hausman test (FE vs RE) ===
k = 5
H = 66.8360
p-value = 0.000000
Conclusão: rejeitamos H0 ao nível 5% -> há evidência de correlação entre α_i e X_i (usar FE).
