# Ejemplo de Estimación por Datos Panel

Referencia: Vella and M. Verbeek (1998), “Whose Wages Do Unions Raise? A Dynamic Model of Unionism and Wage Rate Determination for Young Men,” Journal of Applied Econometrics 13, 163-183.

## 1. Dependencias

In [1]:
#!pip install linearmodels==4.24
#!pip install linearmodels==4.5
!pip install linearmodels

Collecting linearmodels
  Downloading linearmodels-6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Collecting mypy-extensions>=0.4 (from linearmodels)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Collecting pyhdfe>=0.1 (from linearmodels)
  Downloading pyhdfe-0.2.0-py3-none-any.whl.metadata (4.0 kB)
Collecting formulaic>=1.0.0 (from linearmodels)
  Downloading formulaic-1.1.1-py3-none-any.whl.metadata (6.9 kB)
Collecting setuptools-scm<9.0.0,>=8.0.0 (from setuptools-scm[toml]<9.0.0,>=8.0.0->linearmodels)
  Downloading setuptools_scm-8.2.0-py3-none-any.whl.metadata (6.8 kB)
Collecting interface-meta>=1.2.0 (from formulaic>=1.0.0->linearmodels)
  Downloading interface_meta-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Downloading linearmodels-6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from linearmodels.panel.model import PooledOLS, PanelOLS
from linearmodels.panel import RandomEffects
from linearmodels.panel import compare # Para comparar modelos

#
import warnings
warnings.filterwarnings('ignore')

## 2. Importación de datos

In [3]:
# Importamos el Data Set
data_to_load = 'wage_panel.csv'

# Read the and
wage_df = pd.read_csv(data_to_load)
wage_df.head()

Unnamed: 0,nr,year,black,exper,hisp,hours,married,educ,union,lwage,expersq,occupation
0,13,1980,0,1,0,2672,0,14,0,1.19754,1,9
1,13,1981,0,2,0,2320,0,14,1,1.85306,4,9
2,13,1982,0,3,0,2940,0,14,0,1.344462,9,9
3,13,1983,0,4,0,2960,0,14,0,1.433213,16,9
4,13,1984,0,5,0,3071,0,14,0,1.568125,25,5


### Los datos importados son:
* nr: person identifier
* year: 1980 to 1987
* black: =1 if black
* exper: labor market experience
* hisp: =1 if Hispanic
* hours: annual hours worked
* married: =1 if married
* educ: years of schooling
* union: =1 if in union
* lwage: log(wage)
* expersq: exper^2
* occupation: Occupation code

In [4]:
# Adecuaciones al índice para hacerlo Panel:
year = wage_df.year
wage_df = wage_df.set_index(['nr', 'year'])
wage_df#.head()
#indice para indicar la combinacion del panel

Unnamed: 0_level_0,Unnamed: 1_level_0,black,exper,hisp,hours,married,educ,union,lwage,expersq,occupation
nr,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
13,1980,0,1,0,2672,0,14,0,1.197540,1,9
13,1981,0,2,0,2320,0,14,1,1.853060,4,9
13,1982,0,3,0,2940,0,14,0,1.344462,9,9
13,1983,0,4,0,2960,0,14,0,1.433213,16,9
13,1984,0,5,0,3071,0,14,0,1.568125,25,5
...,...,...,...,...,...,...,...,...,...,...,...
12548,1983,0,8,0,2080,1,9,0,1.591879,64,5
12548,1984,0,9,0,2080,1,9,1,1.212543,81,5
12548,1985,0,10,0,2080,1,9,0,1.765962,100,5
12548,1986,0,11,0,2080,1,9,1,1.745894,121,5


In [5]:
# Agregamos una columna adicional de año (opción 1):
wage_df['year'] = pd.Categorical( year )
wage_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,black,exper,hisp,hours,married,educ,union,lwage,expersq,occupation,year
nr,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
13,1980,0,1,0,2672,0,14,0,1.19754,1,9,1980
13,1981,0,2,0,2320,0,14,1,1.85306,4,9,1981
13,1982,0,3,0,2940,0,14,0,1.344462,9,9,1982
13,1983,0,4,0,2960,0,14,0,1.433213,16,9,1983
13,1984,0,5,0,3071,0,14,0,1.568125,25,5,1984


In [6]:
[ wage_df , pd.get_dummies(wage_df['year']) ]

[            black  exper  hisp  hours  married  educ  union     lwage  \
 nr    year                                                              
 13    1980      0      1     0   2672        0    14      0  1.197540   
       1981      0      2     0   2320        0    14      1  1.853060   
       1982      0      3     0   2940        0    14      0  1.344462   
       1983      0      4     0   2960        0    14      0  1.433213   
       1984      0      5     0   3071        0    14      0  1.568125   
 ...           ...    ...   ...    ...      ...   ...    ...       ...   
 12548 1983      0      8     0   2080        1     9      0  1.591879   
       1984      0      9     0   2080        1     9      1  1.212543   
       1985      0     10     0   2080        1     9      0  1.765962   
       1986      0     11     0   2080        1     9      1  1.745894   
       1987      0     12     0   3380        1     9      1  1.466543   
 
             expersq  occupation  ye

In [9]:
# Agregamos columnas de dummies de Year (opción 2):
wage_df = pd.concat( [ wage_df , pd.get_dummies(wage_df['year']) ], axis=1)

In [10]:
# Show data:
wage_df

Unnamed: 0_level_0,Unnamed: 1_level_0,black,exper,hisp,hours,married,educ,union,lwage,expersq,occupation,...,1986,1987,1980,1981,1982,1983,1984,1985,1986,1987
nr,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
13,1980,0,1,0,2672,0,14,0,1.197540,1,9,...,False,False,True,False,False,False,False,False,False,False
13,1981,0,2,0,2320,0,14,1,1.853060,4,9,...,False,False,False,True,False,False,False,False,False,False
13,1982,0,3,0,2940,0,14,0,1.344462,9,9,...,False,False,False,False,True,False,False,False,False,False
13,1983,0,4,0,2960,0,14,0,1.433213,16,9,...,False,False,False,False,False,True,False,False,False,False
13,1984,0,5,0,3071,0,14,0,1.568125,25,5,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12548,1983,0,8,0,2080,1,9,0,1.591879,64,5,...,False,False,False,False,False,True,False,False,False,False
12548,1984,0,9,0,2080,1,9,1,1.212543,81,5,...,False,False,False,False,False,False,True,False,False,False
12548,1985,0,10,0,2080,1,9,0,1.765962,100,5,...,False,False,False,False,False,False,False,True,False,False
12548,1986,0,11,0,2080,1,9,1,1.745894,121,5,...,True,False,False,False,False,False,False,False,True,False


## 3. Regresión Pooled

In [11]:
# Definición de variables exógeneas y endógena
X = [ 'black','hisp','exper','expersq','married', 'educ', 'union', 'year', 'hours' ]
X = sm.add_constant( wage_df[X] )
X

Unnamed: 0_level_0,Unnamed: 1_level_0,const,black,hisp,exper,expersq,married,educ,union,year,hours
nr,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
13,1980,1.0,0,0,1,1,0,14,0,1980,2672
13,1981,1.0,0,0,2,4,0,14,1,1981,2320
13,1982,1.0,0,0,3,9,0,14,0,1982,2940
13,1983,1.0,0,0,4,16,0,14,0,1983,2960
13,1984,1.0,0,0,5,25,0,14,0,1984,3071
...,...,...,...,...,...,...,...,...,...,...,...
12548,1983,1.0,0,0,8,64,1,9,0,1983,2080
12548,1984,1.0,0,0,9,81,1,9,1,1984,2080
12548,1985,1.0,0,0,10,100,1,9,0,1985,2080
12548,1986,1.0,0,0,11,121,1,9,1,1986,2080


In [12]:
# Definición de variables exógeneas y endógena
X1 = [ 'black','hisp','exper','expersq','married', 'educ', 'union', 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 'hours' ]
X1 = sm.add_constant( wage_df[X1] )
X1

Unnamed: 0_level_0,Unnamed: 1_level_0,const,black,hisp,exper,expersq,married,educ,union,1981,1981,...,1985,1985,1985,1986,1986,1986,1987,1987,1987,hours
nr,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
13,1980,1.0,0,0,1,1,0,14,0,False,False,...,False,False,False,False,False,False,False,False,False,2672
13,1981,1.0,0,0,2,4,0,14,1,True,True,...,False,False,False,False,False,False,False,False,False,2320
13,1982,1.0,0,0,3,9,0,14,0,False,False,...,False,False,False,False,False,False,False,False,False,2940
13,1983,1.0,0,0,4,16,0,14,0,False,False,...,False,False,False,False,False,False,False,False,False,2960
13,1984,1.0,0,0,5,25,0,14,0,False,False,...,False,False,False,False,False,False,False,False,False,3071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12548,1983,1.0,0,0,8,64,1,9,0,False,False,...,False,False,False,False,False,False,False,False,False,2080
12548,1984,1.0,0,0,9,81,1,9,1,False,False,...,False,False,False,False,False,False,False,False,False,2080
12548,1985,1.0,0,0,10,100,1,9,0,False,False,...,True,True,True,False,False,False,False,False,False,2080
12548,1986,1.0,0,0,11,121,1,9,1,False,False,...,False,False,False,True,True,True,False,False,False,2080


In [13]:
#
Y = wage_df[ 'lwage' ]
Y

Unnamed: 0_level_0,Unnamed: 1_level_0,lwage
nr,year,Unnamed: 2_level_1
13,1980,1.197540
13,1981,1.853060
13,1982,1.344462
13,1983,1.433213
13,1984,1.568125
...,...,...
12548,1983,1.591879
12548,1984,1.212543
12548,1985,1.765962
12548,1986,1.745894


In [14]:
# Regresión
model_1 = PooledOLS(Y, X1)
pooled_res_1 = model_1.fit()
print(pooled_res_1)

AttributeError: 'DataFrame' object has no attribute 'dtype'

In [15]:
# Regresión
model = PooledOLS(Y, X)
pooled_res = model.fit()
print(pooled_res)

                          PooledOLS Estimation Summary                          
Dep. Variable:                  lwage   R-squared:                        0.1952
Estimator:                  PooledOLS   R-squared (Between):              0.2034
No. Observations:                4360   R-squared (Within):               0.1856
Date:                Mon, Mar 10 2025   R-squared (Overall):              0.1952
Time:                        14:41:13   Log-likelihood                   -2966.1
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      70.221
Entities:                         545   P-value                           0.0000
Avg Obs:                       8.0000   Distribution:                 F(15,4344)
Min Obs:                       8.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             70.221
                            

## 4. Efectos aleatorios

In [16]:
# Regresión
model = RandomEffects(Y, X)
re_res = model.fit()
print(re_res)

                        RandomEffects Estimation Summary                        
Dep. Variable:                  lwage   R-squared:                        0.1976
Estimator:              RandomEffects   R-squared (Between):              0.1716
No. Observations:                4360   R-squared (Within):               0.2013
Date:                Mon, Mar 10 2025   R-squared (Overall):              0.1854
Time:                        14:41:33   Log-likelihood                   -1569.1
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      71.315
Entities:                         545   P-value                           0.0000
Avg Obs:                       8.0000   Distribution:                 F(15,4344)
Min Obs:                       8.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             71.315
                            

In [17]:
# Descomposición de varianza
re_res.variance_decomposition

Unnamed: 0,Variance Decomposition
Effects,0.107522
Residual,0.120095
Percent due to Effects,0.472383


## 5. Efectos fijos

In [18]:
# Regresion
#(Estas líneas generan un error que es intencional..)
model = PanelOLS(Y, X, entity_effects = True)
fe_res = model.fit()
print(fe_res)

AbsorbingEffectError: 
The model cannot be estimated. The included effects have fully absorbed
one or more of the variables. This occurs when one or more of the dependent
variable is perfectly explained using the effects included in the model.

The following variables or variable combinations have been fully absorbed
or have become perfectly collinear after effects are removed:

          const, black, hisp, exper, educ, union, year.1981, year.1982, year.1983, year.1984, year.1985, year.1986, year.1987
          const, black, hisp, exper, married, educ, union, year.1981, year.1982, year.1983, year.1984, year.1985, year.1986, year.1987
          const, black, hisp, exper, expersq, married, educ, union, year.1981, year.1982, year.1983, year.1984, year.1985, year.1986, year.1987, hours
          const, black, hisp, exper, expersq, married, educ, union, year.1981, year.1982, year.1983, year.1984, year.1985, year.1986, year.1987, hours

Set drop_absorbed=True to automatically drop absorbed variables.


In [19]:
# Regresion con efectos fijos por entidad
# Omitimos: 'exper', 'black','hisp', 'educ'
X = [ 'expersq', 'union', 'married', 'year', 'hours' ]
X = sm.add_constant(wage_df[X])
model = PanelOLS(Y, X, entity_effects = True)
fe_res = model.fit()
print(fe_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  lwage   R-squared:                        0.2022
Estimator:                   PanelOLS   R-squared (Between):             -0.0726
No. Observations:                4360   R-squared (Within):               0.2022
Date:                Mon, Mar 10 2025   R-squared (Overall):              0.0546
Time:                        14:43:21   Log-likelihood                   -1266.4
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      87.669
Entities:                         545   P-value                           0.0000
Avg Obs:                       8.0000   Distribution:                 F(11,3804)
Min Obs:                       8.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             87.669
                            

In [20]:
# Regresion con efectos fijos por entidad y tiempo
# Omitimos: 'exper', 'black','hisp', 'educ', 'year'
X = ['expersq', 'union', 'married', 'hours']
X = sm.add_constant(wage_df[X])
model = PanelOLS(Y, X, entity_effects = True, time_effects = True)
fe_te_res = model.fit()
print(fe_te_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:                  lwage   R-squared:                        0.0474
Estimator:                   PanelOLS   R-squared (Between):             -0.0726
No. Observations:                4360   R-squared (Within):              -0.6951
Date:                Mon, Mar 10 2025   R-squared (Overall):             -0.3606
Time:                        14:43:53   Log-likelihood                   -1266.4
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      47.359
Entities:                         545   P-value                           0.0000
Avg Obs:                       8.0000   Distribution:                  F(4,3804)
Min Obs:                       8.0000                                           
Max Obs:                       8.0000   F-statistic (robust):             47.359
                            

## 6. Comparación de modelos

In [21]:
#
print(compare( { 'Fix efect Ent.': fe_res,
                 'Fix Efect Ent-Time': fe_te_res,
                 'Radom efects': re_res,
                 'Pooled': pooled_res } ))

                                     Model Comparison                                     
                        Fix efect Ent. Fix Efect Ent-Time      Radom efects         Pooled
------------------------------------------------------------------------------------------
Dep. Variable                    lwage              lwage             lwage          lwage
Estimator                     PanelOLS           PanelOLS     RandomEffects      PooledOLS
No. Observations                  4360               4360              4360           4360
Cov. Est.                   Unadjusted         Unadjusted        Unadjusted     Unadjusted
R-squared                       0.2022             0.0474            0.1976         0.1952
R-Squared (Within)              0.2022            -0.6951            0.2013         0.1856
R-Squared (Between)            -0.0726            -0.0726            0.1716         0.2034
R-Squared (Overall)             0.0546            -0.3606            0.1854         0.1952

## 7. Varianza Robusta:

In [None]:
# Rregresión
X = ['expersq', 'union', 'married', 'year', 'hours']
X = sm.add_constant(wage_df[X])
model = PanelOLS(Y, X, entity_effects = True)
#fe_res = model.fit(cov_type = 'robust')
# NOTAS: “unadjusted”, “homoskedastic” - Assume residual are homoskedastic, AND
#       “robust”, “heteroskedastic” - Control for heteroskedasticity using White’s estimator
fe_res = model.fit(cov_type = "clustered", cluster_entity = True)
# NOTAS: clust_entity_time = mod.fit(cov_type='clustered', cluster_entity=True, cluster_time=True)

print(fe_res)