In [10]:
import pandas as pd
import numpy as np
import statsmodels.api as stm
import matplotlib.pyplot as plt
import math 
import os

## creating random data 

In [8]:
d = { "x1":[2,4,6,7,9,12,15,18,20,24,26],
          "x2":[3,9,4,7,15,12,13,14,12,25,30],
          "x3":[5,8,4,8,14,12,14,11,12,24,22],
          "y":[12,14,16,14,12,17,12,22,25,30,28]}
df = pd.DataFrame(d)
df

Unnamed: 0,x1,x2,x3,y
0,2,3,5,12
1,4,9,8,14
2,6,4,4,16
3,7,7,8,14
4,9,15,14,12
5,12,12,12,17
6,15,13,14,12
7,18,14,11,22
8,20,12,12,25
9,24,25,24,30


## Adding Constant

In [21]:
df = stm.add_constant(df)
df   

Unnamed: 0,const,x1,x2,x3,y
0,1.0,2,3,5,12
1,1.0,4,9,8,14
2,1.0,6,4,4,16
3,1.0,7,7,8,14
4,1.0,9,15,14,12
5,1.0,12,12,12,17
6,1.0,15,13,14,12
7,1.0,18,14,11,22
8,1.0,20,12,12,25
9,1.0,24,25,24,30


In [26]:
predictor = df[["const","x1","x2","x3"]]
target = df[["y"]]


## Calculating Equation through library for now from scratch it is done below file

In [27]:
model = stm.OLS( target, predictor).fit()
model.params

const    10.069263
x1        0.782152
x2        0.245998
x3       -0.418160
dtype: float64

 ## so equation of linear regression is 
 ## y = 10.069263 + 0.782152* x1 + 0.245998* x2 + x3*(-0.418160)

In [29]:
df["y_pred"] = 10.069263 + 0.782152* df["x1"] + 0.245998* df["x2"] + df["x3"]*(-0.418160)

In [30]:
df

Unnamed: 0,const,x1,x2,x3,y,y_pred
0,1.0,2,3,5,12,10.280761
1,1.0,4,9,8,14,12.066573
2,1.0,6,4,4,16,14.073527
3,1.0,7,7,8,14,13.921033
4,1.0,9,15,14,12,14.944361
5,1.0,12,12,12,17,17.389143
6,1.0,15,13,14,12,19.145277
7,1.0,18,14,11,22,22.992211
8,1.0,20,12,12,25,23.646359
9,1.0,24,25,24,30,24.955021


In [32]:
df["y"].mean()

18.363636363636363

## Calculating Residual value from scratch

In [34]:
df["error"] = df["y"] - df["y_pred"]
df["SSE"] = df["error"]*df["error"]
df["SSR"] = df["y_pred"] - df["y"].mean()*df["y_pred"] - df["y"].mean()
df["SST"] = df["SSE"] + df["SSR"]

In [37]:
df

Unnamed: 0,const,x1,x2,x3,y,y_pred,error,SSE,SSR,SST
0,1.0,2,3,5,12,10.280761,1.719239,2.955783,-196.875032,-193.919249
1,1.0,4,9,8,14,12.066573,1.933427,3.73814,-227.883222,-224.145082
2,1.0,6,4,4,16,14.073527,1.926473,3.711298,-262.731242,-259.019943
3,1.0,7,7,8,14,13.921033,0.078967,0.006236,-260.083391,-260.077155
4,1.0,9,15,14,12,14.944361,-2.944361,8.669262,-277.852086,-269.182825
5,1.0,12,12,12,17,17.389143,-0.389143,0.151432,-320.302392,-320.15096
6,1.0,15,13,14,12,19.145277,-7.145277,51.054983,-350.795264,-299.740281
7,1.0,18,14,11,22,22.992211,-0.992211,0.984483,-417.592027,-416.607545
8,1.0,20,12,12,25,23.646359,1.353641,1.832344,-428.950415,-427.118071
9,1.0,24,25,24,30,24.955021,5.044979,25.451813,-451.673546,-426.221733


# R^2 =   = 1 - SSE/SST

In [43]:
Resd = 1 - df["SSE"].sum()/df["SST"].sum()

In [45]:
Resd

1.0273915692357436

## Now calculating values of b0 , b1 , b2 , b3 from scratch 

# ### b = inv(Htrans.H)* Htans*y

In [52]:
X = np.matrix(predictor)
y = np.matrix(target)

In [53]:
print(X.shape)
print(y.shape)

(11, 4)
(11, 1)


In [54]:
tx = np.transpose(X)

In [55]:
br = np.matmul(tx , X)
br

matrix([[  11.,  143.,  144.,  134.],
        [ 143., 2531., 2461., 2188.],
        [ 144., 2461., 2558., 2253.],
        [ 134., 2188., 2253., 2030.]])

In [56]:
inv_br = (np.linalg.inv(br))

In [57]:
br2 = np.matmul(tx , y)

In [58]:
final_matrix = np.matmul(inv_br , br2)

In [59]:
final_matrix

matrix([[10.06926311],
        [ 0.7821525 ],
        [ 0.24599774],
        [-0.41815952]])

## We got same output using both methods 

In [60]:
model = stm.OLS( target, predictor).fit()
model.params

const    10.069263
x1        0.782152
x2        0.245998
x3       -0.418160
dtype: float64