# Some notes on regression statistics

In [15]:
import numpy as np
import pandas as pd
from scipy.stats import t
import statsmodels.api as sm
import matplotlib.pyplot as plt

# generate random regression data
N=1000
M=5
X = np.random.ranf((N,M))
B = np.random.ranf(M)
eps = np.random.normal(size=N) * 10
Y = X @ B + eps

In [14]:
# Estimate betas
reg = pd.DataFrame()
B_hat = np.linalg.inv(X.T @ X) @ X.T @ Y
reg['Beta hat'] = B_hat

# get residuals
eps_hat = Y - X @ B_hat

# Get R2
TSS = Y.T @ Y # total SS
RSS = eps_hat.T @ eps_hat.T # residual SS
R2 = 1-RSS/TSS
print(f'R2: [{R2:.3f}]')

# Get SE per beta
# 1: VAR(AY) = A VAR(Y) A^T
# 2: B_hat = (X^T X)^{-1} X^T Y
# substituting U = (X^T X)^{-1} X^T => VAR(B_hat) = VAR(U Y) = U VAR(Y) U^T
U = np.linalg.inv(X.T @ X) @ X.T
E_HAT = Y - X @ B_hat
VAR_E = np.sum(E_HAT.T @ E_HAT)/(N-M) - (np.sum(E_HAT)/(N-M))**2
SE_beta = (U @ (np.identity(N) * VAR_E) @ U.T).diagonal() ** 0.5
reg['Beta SEs'] = SE_beta

# Get t-stat per beta
t_stats = B_hat / SE_beta
reg['t-stats'] = t_stats

# Get p-stat per beta
p_null = 1 - 2 * np.abs(t.cdf(t_stats, N-M) - 0.5)  # 1 - prob of being further from center
reg['p-vals'] = p_null

print(reg)

R2: [0.015]
   Beta hat  Beta SEs   t-stats    p-vals
0  1.022937  1.023936  0.999025  0.318025
1  0.487853  1.020993  0.477822  0.632882
2  0.447468  1.013443  0.441533  0.658923
3 -0.936951  0.997309 -0.939478  0.347713
4  1.223707  1.005457  1.217066  0.223868


In [9]:
# Get log-likelihood
# Get BIC

print(sm.OLS(Y, X).fit().summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.015
Model:                            OLS   Adj. R-squared (uncentered):              0.010
Method:                 Least Squares   F-statistic:                              2.963
Date:                Sat, 31 Jul 2021   Prob (F-statistic):                      0.0116
Time:                        17:16:52   Log-Likelihood:                         -3748.7
No. Observations:                1000   AIC:                                      7507.
Df Residuals:                     995   BIC:                                      7532.
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------