# Creating the data to test in STATA and Python

In [7]:
import numpy as np
import pandas as pd
from ujive1 import *
from ujive2 import *
from tsls import * 

#Pick a vector length:
n = 1000

#Getting our Z's and making a Z matrix:
Z = np.random.randn(n, 1)
column_of_ones = np.ones((Z.shape[0], 1))
Z = np.hstack((column_of_ones, Z))
#pprint(Z)

#Parameter vectors:
α = np.array([1,1])
β = np.array([1,2])
#pprint(α)
#pprint(β)

#Error terms:
e1 = np.random.normal(0,5,n)
e2 = np.random.normal(0,5,n)
δ = np.random.normal(0,1)
ε = 5*e1 - 5*e2 + δ

#Making our endogenous variable:
x = np.dot(Z,α) + .2*e1
X = np.column_stack((column_of_ones, x))
#pprint(X)

#Outcome vector:
Y = np.dot(X,β) + ε

#OLS benchmark:
bhat_ols = np.dot(np.linalg.inv(np.dot(X.T,X)), np.dot(X.T, Y))

# Check to see if the Z'Z matrix is invertible
if np.linalg.matrix_rank(Z.T @ Z) == Z.shape[1]:  # Should be True
    print("Z'Z is invertible")
else:
    print("Z'Z is not invertible")

cond_number = np.linalg.cond(Z.T @ Z)
print(f"Condition number of Z.T @ Z: {cond_number}")
if cond_number > 1e10:  # Threshold for ill-conditioning
    raise ValueError("Z.T @ Z is ill-conditioned and may cause numerical instability.")


#2sls comparison:
Zt_Z = np.dot(Z.T, Z)
Zt_Z_inv = np.linalg.inv(Zt_Z)
pz = np.dot(np.dot(Z, Zt_Z_inv), Z.T)
proj_x = np.dot(pz, X)
first = np.linalg.inv(np.dot(proj_x.T, X))
second = np.dot(proj_x.T, Y)
bhat_2sls = np.dot(first, second)


ujive1 = UJIVE1(Y,X,Z,talk=False)
ujive2 = UJIVE2(Y,X,Z,talk=False)
#tsls = TSLS(Y,X,Z)

# Combine matrices into a single DataFrame
df = pd.DataFrame({
    "Y": Y,  # Outcome vector
    **{f"X{i}": X[:, i] for i in range(X.shape[1])},  # Endogenous variables
    **{f"Z{i}": Z[:, i] for i in range(Z.shape[1])}   # Instrumental variables
})

# Save the DataFrame to a CSV file
df.to_csv('data.csv', index=False)

#Compare them:
print("OLS:", bhat_ols[1])
print("2SLS:", bhat_2sls[1])
print("Jive 1:", ujive1['beta'])
print(ujive1.summary())
print("Jive 2:",ujive2['beta'])
print(ujive2.summary())

  P = Z @ np.linalg.inv(Z.T @ Z) @ Z.T
  P = Z @ np.linalg.inv(Z.T @ Z) @ Z.T
  P = Z @ np.linalg.inv(Z.T @ Z) @ Z.T
  fit = P @ X #  Z @ np.linalg.inv(Z.T @ Z) @ Z.T @ X
  fit = P @ X #  Z @ np.linalg.inv(Z.T @ Z) @ Z.T @ X
  fit = P @ X #  Z @ np.linalg.inv(Z.T @ Z) @ Z.T @ X
  yfit = X @ beta_jive1
  yfit = X @ beta_jive1
  yfit = X @ beta_jive1
  F = ((np.sum((yfit-ybar)**2)) / (q-1)) / ((e.T @ e)/(N-q))
  F = ((np.sum((yfit-ybar)**2)) / (q-1)) / ((e.T @ e)/(N-q))
  F = ((np.sum((yfit-ybar)**2)) / (q-1)) / ((e.T @ e)/(N-q))
  fs_fit = Z @ np.linalg.inv(Z.T @ Z) @ Z.T @ X_fs
  fs_fit = Z @ np.linalg.inv(Z.T @ Z) @ Z.T @ X_fs
  fs_fit = Z @ np.linalg.inv(Z.T @ Z) @ Z.T @ X_fs
  fs_F = ((np.sum((fs_fit - xbar) ** 2))/(q_fs-1))/((e_fs.T @ e_fs)/(N-q_fs))
  fs_F = ((np.sum((fs_fit - xbar) ** 2))/(q_fs-1))/((e_fs.T @ e_fs)/(N-q_fs))
  fs_F = ((np.sum((fs_fit - xbar) ** 2))/(q_fs-1))/((e_fs.T @ e_fs)/(N-q_fs))
Normally this estimator is used when Z has more columns than X. In this case Z 

Z'Z is invertible
Condition number of Z.T @ Z: 1.035242579245795
OLS: 14.834920684452495
2SLS: 3.370775391299805
Jive 1: [-1.57784354  3.32739487]

UJIVE1 Regression Results
 Coefficient  Std. Error    t-stat    P>|t|  Conf. Int. Low  Conf. Int. High
   -1.577844    1.589756 -0.992507 0.321191       -4.697491         1.541804
    3.327395    1.066825 -0.992507 0.321191        1.233918         5.420872
--------------------------------------------------------------------------------
R-squared: 0.132188
Adjusted R-squared: 0.131319
F-statistic: 19.201961
Root MSE: 34.905596
None
Jive 2: [-1.57517123  3.32473983]

UJIVE2 Regression Results
 Coefficient  Std. Error    t-stat    P>|t|  Conf. Int. Low  Conf. Int. High
   -1.575171    1.589923 -0.990721 0.322062       -4.695148         1.544805
    3.324740    1.067072 -0.990721 0.322062        1.230777         5.418703
--------------------------------------------------------------------------------
R-squared: 0.132096
Adjusted R-squared: 0.13

  fit = Z @ np.linalg.inv(Z.T @ Z) @ Z.T @ X
  fit = Z @ np.linalg.inv(Z.T @ Z) @ Z.T @ X
  fit = Z @ np.linalg.inv(Z.T @ Z) @ Z.T @ X
  leverage = np.diag(Z @ np.linalg.inv(Z.T @ Z) @ Z.T)
  leverage = np.diag(Z @ np.linalg.inv(Z.T @ Z) @ Z.T)
  leverage = np.diag(Z @ np.linalg.inv(Z.T @ Z) @ Z.T)
  yfit = X @ beta_jive2
  yfit = X @ beta_jive2
  yfit = X @ beta_jive2
  F = ((np.sum((yfit - ybar) ** 2)) / (q - 1)) / ((e.T @ e) / (N - q))
  F = ((np.sum((yfit - ybar) ** 2)) / (q - 1)) / ((e.T @ e) / (N - q))
  F = ((np.sum((yfit - ybar) ** 2)) / (q - 1)) / ((e.T @ e) / (N - q))
  fs_fit = Z @ np.linalg.inv(Z.T @ Z) @ Z.T @ X_fs
  fs_fit = Z @ np.linalg.inv(Z.T @ Z) @ Z.T @ X_fs
  fs_fit = Z @ np.linalg.inv(Z.T @ Z) @ Z.T @ X_fs
  fs_F = ((np.sum((fs_fit - xbar) ** 2)) / (q_fs - 1)) / ((e_fs.T @ e_fs) / (N - q_fs))
  fs_F = ((np.sum((fs_fit - xbar) ** 2)) / (q_fs - 1)) / ((e_fs.T @ e_fs) / (N - q_fs))
  fs_F = ((np.sum((fs_fit - xbar) ** 2)) / (q_fs - 1)) / ((e_fs.T @ e_fs) / (N - q_fs)

In [8]:
(Z @ np.linalg.inv(np.dot(Z.T,Z))) #@ Z.T

array([[ 9.93887380e-04,  7.14128733e-04],
       [ 1.01062506e-03, -1.24131060e-03],
       [ 1.01812446e-03, -2.11745554e-03],
       ...,
       [ 1.00010196e-03, -1.19112685e-05],
       [ 9.97412397e-04,  3.02306068e-04],
       [ 9.95333090e-04,  5.45228540e-04]], shape=(1000, 2))

In [9]:
ZtZ_inv = np.linalg.inv(Z.T @ Z)
print(ZtZ_inv)


[[ 1.00007109e-03 -8.30506095e-06]
 [-8.30506095e-06  9.70268567e-04]]


In [10]:
Z

array([[ 1.        ,  0.74457096],
       [ 1.        , -1.27078789],
       [ 1.        , -2.17378007],
       ...,
       [ 1.        , -0.00371671],
       [ 1.        ,  0.32012902],
       [ 1.        ,  0.57049524]], shape=(1000, 2))

In [11]:
assert np.allclose(Z.T @ Z, (Z.T @ Z).T), "Z'Z is not symmetric."