In [None]:
import numpy as np
import pandas as pd
from typing import Tuple

import statsmodels.api as sm

  import pandas.util.testing as tm


In [None]:
def generateData(n: int, betas: Tuple[float, float], seed: int=1) -> pd.DataFrame:
  r"""Generates data according to a linear distribution.

  Data is generated according to X ~ N(0, 1), Y ~ N(\beta_0 + \beta_1*X, 1).

  Args:
    n: number of observations.
    betas: (\beta_0, \beta_1), i.e. the true values of the linear function.
    seed: random seed.

  Returns:
    A pandas DataFrame with n rows and two columns, 'X' and 'Y', containing
    the realized values.
  """
  # A private generator yields the same stream as np.random.seed(seed) followed
  # by module-level draws, but leaves the global NumPy RNG state untouched.
  rng = np.random.RandomState(seed)
  X = rng.normal(size=n)
  # The mean is passed as an array, so one unit-variance draw is made per row.
  Y = rng.normal(betas[0] + betas[1]*X)
  return pd.DataFrame({'X': X, 'Y': Y})

In [None]:
# Training sample: 10 observations with true coefficients (beta_0, beta_1) = (0, 1).
P_n = generateData(n=10, betas=[0, 1])
P_n

Unnamed: 0,X,Y
0,1.624345,3.086453
1,-0.611756,-2.671897
2,-0.528172,-0.850589
3,-1.072969,-1.457023
4,0.865408,1.999177
5,-2.301539,-3.40143
6,1.744812,1.572384
7,-0.761207,-1.639065
8,0.319039,0.361253
9,-0.24937,0.333445


In [None]:
# Model 1: regress Y on X with a constant column added to the regressors.
X = sm.add_constant(P_n['X'])
linear_model_1 = sm.OLS(endog=P_n['Y'], exog=X)
fitted_model_1 = linear_model_1.fit()
print(fitted_model_1.summary())

                            OLS Regression Results                            
Dep. Variable:                      Y   R-squared:                       0.857
Model:                            OLS   Adj. R-squared:                  0.839
Method:                 Least Squares   F-statistic:                     47.89
Date:                Fri, 16 Jul 2021   Prob (F-statistic):           0.000122
Time:                        23:44:45   Log-Likelihood:                -11.372
No. Observations:                  10   AIC:                             26.74
Df Residuals:                       8   BIC:                             27.35
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.1162      0.268     -0.434      0.6

  "anyway, n=%i" % int(n))


In [None]:
# Model 2: regress Y on X alone; no constant column is added here.
linear_model_2 = sm.OLS(endog=P_n['Y'], exog=P_n['X'])
fitted_model_2 = linear_model_2.fit()
print(fitted_model_2.summary())

                                 OLS Regression Results                                
Dep. Variable:                      Y   R-squared (uncentered):                   0.856
Model:                            OLS   Adj. R-squared (uncentered):              0.840
Method:                 Least Squares   F-statistic:                              53.53
Date:                Fri, 16 Jul 2021   Prob (F-statistic):                    4.48e-05
Time:                        23:46:56   Log-Likelihood:                         -11.488
No. Observations:                  10   AIC:                                      24.98
Df Residuals:                       9   BIC:                                      25.28
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

  "anyway, n=%i" % int(n))


## Evaluating model performance

In [None]:
# Draw the test sample with a seed different from the default (seed=1) used for
# the training sample; reusing that seed would reproduce the training X values
# as the first rows of the test set.
P_n_test = generateData(1000, [0, 1], seed=2)

In [None]:
# Score Model 1 on the test sample: add the constant column, predict, then
# average the squared prediction errors.
X_test = sm.add_constant(P_n_test['X'])
y_hat_1 = fitted_model_1.predict(exog=X_test)
mse_1 = (y_hat_1 - P_n_test['Y']).pow(2).mean()

In [None]:
# Score Model 2 on the test sample; it was fitted on X alone, so the raw X
# column is passed without a constant.
y_hat_2 = fitted_model_2.predict(exog=P_n_test['X'])
mse_2 = (y_hat_2 - P_n_test['Y']).pow(2).mean()

In [None]:
# Report both test-set mean squared errors, one per line, to three decimals.
print('MSE_1 = %0.3f' % mse_1, 'MSE_2 = %0.3f' % mse_2, sep='\n')

MSE_1 = 1.343
MSE_2 = 1.336
