In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
from numpy import linalg as la
from scipy.stats import chi2
from tabulate import tabulate
import pandas as pd
from io import StringIO
from tabulate import tabulate
from matplotlib import pyplot as plt

#Suppress Future Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Import this weeks LinearModels.py file
import EstimatesandTest as lm
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
y, x, T, year, label_y, label_x = lm.load_example_data()

### Part 1: Pooled OLS not used in assignment but nice to have


In [3]:
# First, regress y on x without any transformations. Store the resulting dictionary.
# Tip: If you want robust standard errors, you can add the argument robust_se=True to the estimate function.
pols_result = lm.estimate(y, x, T=T)

# Then, print the resulting dictionary using the provided print_table() function. The labels should have been provided to you.
lm.print_table((label_y, label_x), pols_result, title="Pooled OLS", floatfmt='.4f')

Pooled OLS
Dependent variable: Log deflated sales

               Beta      Se    t-values
-----------  ------  ------  ----------
Constant     0.0000  0.0050      0.0000
Log labour   0.6748  0.0102     66.4625
Log capital  0.3100  0.0091     33.9237
R² = 0.914
σ² = 0.131


In [4]:
pols_result_robust = lm.estimate(y, x, T=T, robust_se=True)
lm.print_table((label_y, label_x), pols_result_robust, title="Pooled OLS (Robust SE)", floatfmt='.4f')

Pooled OLS (Robust SE)
Dependent variable: Log deflated sales

               Beta      Se    t-values
-----------  ------  ------  ----------
Constant     0.0000  0.0161      0.0000
Log labour   0.6748  0.0366     18.4526
Log capital  0.3100  0.0324      9.5810
R² = 0.914
σ² = 0.131


### Fixed effects

In [5]:
def remove_zero_columns(x, label_x, tol=1e-10):
    """
    Drop columns that are (numerically) all zeros and keep their labels aligned.

    Args:
        x: regressor matrix.
        label_x: list of column labels.
        tol: tolerance for treating entries as zero.

    Returns:
        Filtered matrix and matching labels.
    """
    mask = ~np.all(np.isclose(x, 0.0, atol=tol), axis=0)
    x_nonzero = x[:, mask]
    label_nonzero = [lbl for lbl, keep in zip(label_x, mask) if keep]
    return x_nonzero, label_nonzero


In [6]:
# Transform the data
Q_T = np.eye(T) - 1/T * np.ones((T, T))
y_dot = lm.perm(Q_T, y)
x_dot = lm.perm(Q_T, x)

# Remove the columns that are only zeroes
x_dot, label_x_dot = remove_zero_columns(x_dot, label_x)

# Estimate 
fe_result = lm.estimate(y_dot, x_dot, transform='fe', T=T, robust_se=True)
lm.print_table((label_y, label_x_dot), fe_result, title="Fixed Effects", floatfmt='.4f')

Fixed Effects
Dependent variable: Log deflated sales

               Beta      Se    t-values
-----------  ------  ------  ----------
Log labour   0.6942  0.0417     16.6674
Log capital  0.1546  0.0299      5.1630
R² = 0.477
σ² = 0.018


### First differences part one we go back to this in tests

In [7]:
# Transform the data
D_T = - np.eye(T-1, T) + np.eye(T-1, T, k=1)
y_diff = lm.perm(D_T, y)
x_diff = lm.perm(D_T, x)

# Remove the columns that are only zeroes
x_diff, label_x_diff = remove_zero_columns(x_diff, label_x)

# Estimate 
fd_result = lm.estimate(y_diff, x_diff, transform='fd', T=T-1, robust_se=True)
lm.print_table((label_y, label_x_diff), fd_result, title="First Difference", floatfmt='.4f')

First Difference
Dependent variable: Log deflated sales

               Beta      Se    t-values
-----------  ------  ------  ----------
Log labour   0.5487  0.0292     18.8191
Log capital  0.0630  0.0232      2.7097
R² = 0.165
σ² = 0.014


### Lets go to the fun part: Random effects pre set up

In [8]:
# Transform the data
P_T = np.ones((1,T)) * 1/T
y_mean = lm.perm(P_T, y)
x_mean = lm.perm(P_T, x)

# Estimate 
be_result = lm.estimate(y_mean, x_mean, transform='be', T=T)
lm.print_table((label_y, label_x), be_result, title="Between Estimator", floatfmt='.4f')

Between Estimator
Dependent variable: Log deflated sales

               Beta      Se    t-values
-----------  ------  ------  ----------
Constant     0.0000  0.0161      0.0000
Log labour   0.6672  0.0343     19.4572
Log capital  0.3188  0.0309     10.3230
R² = 0.923
σ² = 0.115


In [9]:
# Calculate lambda (note lambda is a reserved keyword in Python, so we use _lambda instead)
sigma2_u = fe_result['sigma2']
sigma2_w = be_result['sigma2']
sigma2_c = sigma2_w - 1/T * sigma2_u
_lambda = 1 - np.sqrt(sigma2_u / (sigma2_u + T*sigma2_c))

# Print lambda 
print(f'Lambda is approximately equal to {_lambda.item():.4f}.')

Lambda is approximately equal to 0.8873.


### Randoms effects in action

In [10]:
# Transform the data
P_T_full = np.ones((T, T)) / T
C_T = np.eye(T) - _lambda * P_T_full
y_re = lm.perm(C_T, y)
x_re = lm.perm(C_T, x)

# Estimate 
re_result = lm.estimate(y_re, x_re, transform='re', T=T, robust_se=True)
lm.print_table((label_y, label_x), re_result, title="Random Effects", floatfmt='.4f')

Random Effects
Dependent variable: Log deflated sales

               Beta      Se    t-values
-----------  ------  ------  ----------
Constant     0.0000  0.0168      0.0000
Log labour   0.7197  0.0335     21.4637
Log capital  0.1989  0.0261      7.6174
R² = 0.642
σ² = 0.018


### CRS testing

In [11]:
# Constant-returns-to-scale Wald tests for FE, FD, and RE

def crs_wald(result, skip=0):
    R = np.array([[1.0, 1.0]])
    q = np.array([[1.0]])
    b = result['b_hat'][skip:, :]
    cov = result['cov'][skip:, skip:]
    diff = R @ b - q
    var_rb = R @ cov @ R.T
    stat = float(diff.T @ la.inv(var_rb) @ diff)
    crit = chi2.ppf(0.95, 1)
    pval = 1 - chi2.cdf(stat, 1)
    return stat, crit, pval

W_fe, crit_fe, p_fe = crs_wald(fe_result, skip=0)
print(f'CRS Wald test (FE): {W_fe:.4f}')
print(f'Critical value (5%): {crit_fe:.4f}')
print(f'p-value: {p_fe:.4f}')

W_fd, crit_fd, p_fd = crs_wald(fd_result, skip=0)
print(f'CRS Wald test (FD): {W_fd:.4f}')
print(f'Critical value (5%): {crit_fd:.4f}')
print(f'p-value: {p_fd:.4f}')

W_re, crit_re, p_re = crs_wald(re_result, skip=1)
print(f'CRS Wald test (RE): {W_re:.4f}')
print(f'Critical value (5%): {crit_re:.4f}')
print(f'p-value: {p_re:.4f}')


CRS Wald test (FE): 19.4029
Critical value (5%): 3.8415
p-value: 0.0000
CRS Wald test (FD): 150.0280
Critical value (5%): 3.8415
p-value: 0.0000
CRS Wald test (RE): 18.6793
Critical value (5%): 3.8415
p-value: 0.0000


  stat = float(diff.T @ la.inv(var_rb) @ diff)


### Strict exogeneity testing

In [12]:
# Strict exogeneity test for FE using a lead of log labour
F_T = np.eye(T, k=1)[:-1]
labour_lead = lm.perm(F_T, x[:, 1].reshape(-1, 1))

I_T = np.eye(T)[:-1]
x_exo = lm.perm(I_T, x)
y_exo = lm.perm(I_T, y)

x_exo = np.hstack((x_exo, labour_lead))

Q_T_exo = np.eye(T - 1) - 1/(T - 1) * np.ones((T - 1, T - 1))
y_exo_w = lm.perm(Q_T_exo, y_exo)
x_exo_w = lm.perm(Q_T_exo, x_exo)

labels_exo = label_x + ['Lead log labour']
x_exo_w, labels_exo = remove_zero_columns(x_exo_w, labels_exo)

fe_exo_result = lm.estimate(y_exo_w, x_exo_w, transform='fe', T=T-1, robust_se=True)
lm.print_table((label_y, labels_exo), fe_exo_result, title='Strict Exogeneity Test (FE)', floatfmt='.4f')

lead_beta = fe_exo_result['b_hat'][-1, 0]
lead_se = fe_exo_result['se'][-1, 0]
wald_lead = (lead_beta / lead_se) ** 2
crit_lead = chi2.ppf(0.95, 1)
p_lead = 1 - chi2.cdf(wald_lead, 1)
print(f'Wald test H0: lead coefficient = 0 -> {wald_lead:.4f} (crit 5% = {crit_lead:.4f}, p = {p_lead:.4f})')

if p_lead < 0.05:
    print('-> Reject H0: lead term is zero (evidence against strict exogeneity).')
else:
    print('-> Do NOT reject H0: no evidence against strict exogeneity in FE panel.')


Strict Exogeneity Test (FE)
Dependent variable: Log deflated sales

                   Beta      Se    t-values
---------------  ------  ------  ----------
Log labour       0.5681  0.0397     14.3113
Log capital      0.1495  0.0291      5.1287
Lead log labour  0.1532  0.0281      5.4442
R² = 0.473
σ² = 0.016
Wald test H0: lead coefficient = 0 -> 29.6395 (crit 5% = 3.8415, p = 0.0000)
-> Reject H0: lead term is zero (evidence against strict exogeneity).


#### Question 4: Comparing FE and RE (Hausman Testing)

In [13]:
# Hausman test using homoskedastic covariance matrices
fe_nr = lm.estimate(y_dot, x_dot, transform='fe', T=T, robust_se=False)
re_nr = lm.estimate(y_re, x_re, transform='re', T=T, robust_se=False)

# Use only the time-varying regressors
b_fe = fe_nr['b_hat']
b_re = re_nr['b_hat'][1:, :]
cov_fe = fe_nr['cov']
cov_re = re_nr['cov'][1:, 1:]

# Calculate the test statistic
b_diff = b_fe - b_re
cov_diff = cov_fe - cov_re
H = b_diff.T @ la.inv(cov_diff) @ b_diff

# 5% chi-square critical value with M degrees of freedom
M = len(b_diff)
crit_val = chi2.ppf(0.95, M)
p_val = 1 - chi2.cdf(H.item(), M)

print(f"Hausman test statistic: {H.item():.2f}")
print(f"Critical value (5%): {crit_val:.2f}")
print(f"p-value: {p_val:.8f}")
if p_val < 0.05:
    print('-> Reject H0: FE and RE differ (Hausman favors FE).')
else:
    print('-> Do NOT reject H0: no evidence against RE consistency (Hausman).')


Hausman test statistic: 73.54
Critical value (5%): 5.99
p-value: 0.00000000
-> Reject H0: FE and RE differ (Hausman favors FE).


This violates RE.1b

### Sequential rationality testing

In [14]:
import numpy as np
from numpy import linalg as la
import pandas as pd
from io import StringIO
from tabulate import tabulate
from matplotlib import pyplot as plt

#Supress Future Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import EstimatesandTest as lm
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# First, import the data into numpy. 
data = pd.read_csv('firms.csv')
id_array = np.array(data.iloc[:, 0])

# Count how many persons we have. This returns a tuple with the unique IDs,
# and the number of times each person is observed.
unique_id = np.unique(id_array, return_counts=True)
N = unique_id[0].size
T = int(unique_id[1].mean())
year = np.array(data.iloc[:, 1], dtype=int)

# Load the rest of the data into arrays.
y = data['lcap'].to_numpy().reshape(-1, 1)

# x needs to have a constant vector in the first row. How would you add this? 
# Note that the order is set to match the order of variables in the model.
x = np.column_stack([
    np.ones(N * T),
    data['lemp'].to_numpy(),
    data['ldsa'].to_numpy()
])

# Lets also make some variable names
label_y = 'Log capital'
label_x = [
    'Constant',
    'Log employment',
    'Log DSA'
]

In [16]:
from EstimatesandTest import fd_exogeneity_lead_test

# FD test in log differences (assignment style)
fd_exogeneity_lead_test(y, x, N, T, cap_col=1, emp_col=2, logs=True, drop_zeros=True)


FD Exogeneity Test (lead-variable)
Model: Δ log y_it = b1 Δ log K_it + b2 Δ log L_it + b3 Δ log L_{i,t+1} + Δu_it

              Variable        Beta          SE         t
                 const     -0.0000      0.0035     -0.01
          Δlog Capital      0.0753      0.0345      2.18
       Δlog Employment     -0.0015      0.0241     -0.06
  Lead Δlog Employment      0.0338      0.0192      1.76

Test H0: b3 (Lead term) = 0 → t=1.76, p=0.08036 (df clustered=141)
→ Do NOT reject exogeneity in FD.
