In [78]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import bernoulli, binom, zscore
from sklearn.linear_model import LinearRegression, LogisticRegression

In [79]:
np.random.seed(37)
sns.set(color_codes=True)
n = 1_000
X = np.array(np.random.normal(0, np.sqrt(3), n))
e = np.array(np.random.normal(0, 1, n))
y = 5 + 3 * X + e

In [80]:
reg = LinearRegression().fit(X.reshape(-1, 1), y)

In [81]:
reg.coef_

array([2.99777468])

In [82]:
reg.intercept_

4.983964548516604

In [83]:
np.corrcoef(X, y)

array([[1.       , 0.9816779],
       [0.9816779, 1.       ]])

In [84]:
reg.coef_[0] * X.std() / y.std()

0.9816778972902336

In [85]:
np.corrcoef(X, y)[0, 1] * y.std() / X.std()

2.997774678128484

In [86]:
reg_x = LinearRegression().fit(y.reshape(-1, 1), X)

In [87]:
reg_x.intercept_

-1.601386267280085

In [88]:
reg_x.coef_

array([0.32146896])

In [89]:
n = 1_000
I = np.random.normal(30_000, np.sqrt(5_000), n)
A = np.random.normal(45, np.sqrt(5), n)
e = np.random.normal(0, 1, n)
H = 250_000 + 3 * I - 0.5 * A - e

In [90]:
df = pd.DataFrame({'income': I,
                   'age': A,
                   'y': H})

In [91]:
Z = df.apply(zscore)

In [92]:
df.corr()

Unnamed: 0,income,age,y
income,1.0,-0.037504,0.999973
age,-0.037504,1.0,-0.043038
y,0.999973,-0.043038,1.0


In [93]:
m = LinearRegression()
m.fit(df[["income", "age"]], df["y"])

In [94]:
m.coef_, m.intercept_

(array([ 2.99977821, -0.51706961]), 250007.39210775678)

In [95]:
m.fit(Z[["income", "age"]], Z["y"])
m.coef_, m.intercept_

(array([ 0.99976525, -0.00554293]), 1.0625843071916774e-13)

In [96]:
df = pd.DataFrame({
    "y_true": [10, 8, 7, 9, 4],
    "y_pred": [11, 7, 5, 11, 1]
})

In [97]:
df["E"] = df["y_true"] - df["y_pred"]
df["AE"] = np.abs(df["y_true"] - df["y_pred"])
df["SE"] = (df["y_true"] - df["y_pred"]) ** 2
df

Unnamed: 0,y_true,y_pred,E,AE,SE
0,10,11,-1,1,1
1,8,7,1,1,1
2,7,5,2,2,4
3,9,11,-2,2,4
4,4,1,3,3,9


In [98]:
errors = df[["E", "AE", "SE"]].mean()
errors["SE"] = np.sqrt(errors["SE"])
errors.index = ["ME", "MAE", "RMSE"]
errors

ME      0.600000
MAE     1.800000
RMSE    1.949359
dtype: float64

In [99]:
errors

ME      0.600000
MAE     1.800000
RMSE    1.949359
dtype: float64

In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.518
Model:                            OLS   Adj. R-squared:                  0.507
Method:                 Least Squares   F-statistic:                     46.27
Date:                Tue, 21 Nov 2023   Prob (F-statistic):           3.83e-62
Time:                        11:33:41   Log-Likelihood:                -2386.0
No. Observations:                 442   AIC:                             4794.
Df Residuals:                     431   BIC:                             4839.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        152.1335      2.576     59.061      0.0

In [2]:
lm = LinearRegression()
lm.fit(X,y)
params = np.append(lm.intercept_,lm.coef_)
predictions = lm.predict(X)

newX = pd.DataFrame({"Constant":np.ones(len(X))}).join(pd.DataFrame(X))
MSE = (sum((y-predictions)**2))/(len(newX)-len(newX.columns))

# Note if you don't want to use a DataFrame replace the two lines above with
# newX = np.append(np.ones((len(X),1)), X, axis=1)
# MSE = (sum((y-predictions)**2))/(len(newX)-len(newX[0]))

var_b = MSE*(np.linalg.inv(np.dot(newX.T,newX)).diagonal())
sd_b = np.sqrt(var_b)
ts_b = params / sd_b

p_values =[2*(1-stats.t.cdf(np.abs(i),(len(newX)-len(newX[0])))) for i in ts_b]

sd_b = np.round(sd_b,3)
ts_b = np.round(ts_b,3)
p_values = np.round(p_values,3)
params = np.round(params,4)

myDF3 = pd.DataFrame()
myDF3["Coefficients"],myDF3["Standard Errors"],myDF3["t values"],myDF3["Probabilities"] = [params,sd_b,ts_b,p_values]
print(myDF3)

    Coefficients  Standard Errors  t values  Probabilities
0       152.1335            2.576    59.061            NaN
1       -10.0099           59.749    -0.168            NaN
2      -239.8156           61.222    -3.917            NaN
3       519.8459           66.533     7.813            NaN
4       324.3846           65.422     4.958            NaN
5      -792.1756          416.680    -1.901            NaN
6       476.7390          339.030     1.406            NaN
7       101.0433          212.531     0.475            NaN
8       177.0632          161.476     1.097            NaN
9       751.2737          171.900     4.370            NaN
10       67.6267           65.984     1.025            NaN
