In [4]:
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
import statsmodels.api as sm         # good if used to R language

# Simulated data for a simple linear regression (the same setup is in 5.1.2_quiz.py).
n = 100        # number of observations
beta_0 = 5     # true intercept used to generate y
beta_1 = 2     # true slope used to generate y
np.random.seed(1)   # seed NumPy's global RNG so the draws below repeat on every run

# Predictor: n Uniform(0, 1) draws, scaled by 10 so the values of x lie between 0 and 10.
unit_draws = ss.uniform.rvs(size=n)
x = 10 * unit_draws

# Outcome: the true line plus Gaussian noise (mean 0, standard deviation 1), one draw per observation.
noise = ss.norm.rvs(loc=0, scale=1, size=n)
y = beta_0 + beta_1 * x + noise

# First attempt: regress y on x alone, i.e. with no constant column in the design.
# The fitted line is then forced through the origin, so the estimated slope comes
# out artificially large. Uncomment the print below to inspect that fit.
mod = sm.OLS(endog=y, exog=x)
est = mod.fit()
# print(est.summary())

In [5]:
# Build the design matrix: x plus a column of constants, so OLS can estimate an intercept.
X = sm.add_constant(x)

# Refit with the constant included; `mod` and `est` now refer to this intercept model.
mod = sm.OLS(endog=y, exog=X)
est = mod.fit()

# Reported estimates: const = 5.2370 and the predictor x1 = 1.9685.
# The intercept (about 5.2) is the value of the outcome y when every predictor,
# here only x1, is set to 0.
fit_report = est.summary()
print(fit_report)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.977
Model:                            OLS   Adj. R-squared:                  0.977
Method:                 Least Squares   F-statistic:                     4115.
Date:                Fri, 19 Nov 2021   Prob (F-statistic):           7.47e-82
Time:                        14:10:31   Log-Likelihood:                -130.72
No. Observations:                 100   AIC:                             265.4
Df Residuals:                      98   BIC:                             270.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.2370      0.174     30.041      0.0

In [None]:
# Narrative for the reader: what the R-squared value in the regression summary means,
# expressed through the total (TSS) and residual (RSS) sums of squares.
r_squared_notes = """
So how well did we do?
The output summary includes the so-called r-squared statistic,
which is the proportion of variance explained.
And because it's a proportion, it's always between 0 and 1.
But what does variance explained actually mean?
Before we fit our model, we can compute what
is called the total sum of squares, or TSS, which
is defined as the sum of the squared differences between outcome
yi and the mean outcome.
Now after we've created the model, we compute a similar quantity
called the residual sum of squares, RSS, which
is defined as the sum of the squared differences between the outcome yi
and the outcome predicted by the model yi hat.
If the model is useful at all, we would expect
the RSS is to be smaller than the TSS.
The r-squared statistic takes the difference between TSS and RSS,
and then divides that quantity by TSS.
A number near 0, therefore, indicates that the model did not
explain much of the variability in the response or the outcome.
Larger values are better, but what values of r-squared are considered good
always depends on the application context."""
print(r_squared_notes)