# Understanding:
### When we have multiple input columns, the best fit is a hyperplane rather than a line

In [1]:
from sklearn.datasets import make_regression
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [2]:
# Synthetic two-feature regression problem with Gaussian noise.
# random_state pins the draw so a fresh "Restart & Run All" regenerates the
# same data, and therefore the same split, fit and metrics.
X,y=make_regression(n_samples=100, n_features=2, n_informative=2, n_targets=1, noise=50, random_state=42)

In [3]:
# Collect both feature columns and the target in one frame for plotting.
df = pd.DataFrame(
    dict(
        feature1=X[:, 0],
        feature2=X[:, 1],
        target=y,
    )
)
df.head()

Unnamed: 0,feature1,feature2,target
0,-0.501007,0.082259,55.255826
1,-0.807827,0.616041,29.542894
2,0.491738,1.094367,89.173017
3,-1.43783,-0.45265,-21.040742
4,0.258525,-0.769603,19.773329


In [4]:
# Interactive 3-D scatter: the two features are the horizontal axes,
# the target is the vertical axis.
fig = px.scatter_3d(
    data_frame=df,
    x='feature1',
    y='feature2',
    z='target',
)
fig.show()

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition itself repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

In [7]:
from sklearn.linear_model import LinearRegression

In [9]:
type(X_train)

numpy.ndarray

In [10]:
X_train.shape

(80, 2)

In [14]:
X_test.shape

(20, 2)

In [11]:
# Fit on the training rows (fit() hands back the estimator itself), then
# score the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)


In [15]:
y_pred

array([115.28677129,  28.76292369, 127.4571248 ,  86.21338768,
        41.3987467 , -32.37028733,  30.3981363 , -57.74926425,
         8.73248025, -94.74580223,  58.32020037,  56.84889178,
       110.49125051, -16.78789567,   6.77042539, -62.14435761,
        99.38551164, -39.96373537, -25.36008239, -50.53376811])

In [16]:
# Report the three regression metrics on the held-out set, one per line.
print(f"MAE {mean_absolute_error(y_test, y_pred)}")
print(f"MSE {mean_squared_error(y_test, y_pred)}")
print(f"R2 score {r2_score(y_test, y_pred)}")

MAE 31.14654022606254
MSE 1455.8960075219186
R2 score 0.7634947401709911


In [21]:
# Evaluate the fitted model on a regular grid over the feature plane; the
# resulting heights `z` are the regression plane drawn by the surface plot.
# NOTE: `x` and `y` are plotting axes here, so `y` no longer refers to the
# regression target after this cell runs.
x = np.linspace(-5, 5, 10)
y = np.linspace(-5, 5, 10)
# meshgrid(x, y) gives xGrid[i, j] == x[j] and yGrid[i, j] == y[i], which is
# the z[i][j] <-> (x[j], y[i]) layout a surface trace expects. The previous
# (y, x) order only worked because both axes happened to be identical.
xGrid, yGrid = np.meshgrid(x, y)
# One row per grid point: column 0 is feature1 (x axis), column 1 is feature2 (y axis).
final = np.column_stack((xGrid.ravel(), yGrid.ravel()))
# Reshape from the grid itself instead of hard-coding 10 x 10.
z_final = lr.predict(final).reshape(xGrid.shape)
z = z_final


In [22]:
# Redraw the raw points, then overlay the fitted plane on the same figure.
fig = px.scatter_3d(df, x='feature1', y='feature2', z='target')

# x, y are the grid axes and z the model predictions computed in the previous cell.
fig.add_trace(go.Surface(x = x, y = y, z =z ))

fig.show()

In [None]:
lr.coef_

array([96.71919406, 59.03974991])

In [None]:
lr.intercept_

np.float64(-3.1361402688130298)

# Using scikit-learn's LinearRegression class

In [27]:
from sklearn.datasets import load_diabetes
# return_X_y=True yields a (features, target) pair; X and y are rebound here,
# replacing whatever earlier cells stored under those names.
X,y = load_diabetes(return_X_y=True)

In [28]:
X

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286131, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452873, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00422151,  0.00306441]])

In [29]:
y

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [30]:
# Show the feature-matrix shape and the target shape on separate lines.
print(X.shape, y.shape, sep="\n")

(442, 10)
(442,)


In [31]:
# 80/20 train/test partition of the diabetes data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [32]:
reg=LinearRegression()

In [33]:
reg.fit(X_train,y_train)

In [34]:
y_pred=reg.predict(X_test)

In [35]:
y_pred

array([154.1213881 , 204.81835118, 124.93755353, 106.08950893,
       258.5348576 , 256.3310074 , 118.75087616, 119.52440696,
       101.50816735, 190.54048661, 141.70656811, 172.51883961,
       174.33861649, 134.80942706, 294.13994537,  94.11798038,
       211.97059795, 156.49579378, 134.21000428, 119.62664644,
       148.87842251, 165.00873409, 151.10021038, 176.04063756,
       133.27769647, 221.29555392, 197.17324941,  96.1577688 ,
        50.26012711, 230.48580317, 242.06073866, 114.11129218,
        67.07532417,  94.52943825, 201.21415375, 167.05136201,
       159.881268  , 192.78746659, 114.49551325, 233.48234551,
       140.82563045, 121.0680409 , 192.27480772, 191.12738845,
       179.16865788, 148.34935601, 163.47414622, 276.81647884,
       100.17926432, 164.10555298, 255.80762189, 136.9466204 ,
       152.37503699, 107.92237882, 194.21924678,  77.34670792,
       118.50482479,  68.38335763, 154.29258529, 162.48840259,
       168.36788326, 156.87790322,  97.14191797, 238.16

In [36]:
r2_score(y_test,y_pred)

0.4399338661568968

In [58]:
reg.coef_

array([  -9.15865318, -205.45432163,  516.69374454,  340.61999905,
       -895.5520019 ,  561.22067904,  153.89310954,  126.73139688,
        861.12700152,   52.42112238])

In [37]:
reg.intercept_

np.float64(151.88331005254167)

# Code from scratch

In [43]:
from sklearn.datasets import load_diabetes

# Start the from-scratch section from the same data and the same split
# (test_size=0.2, random_state=2) as the scikit-learn section, so the two
# models can be compared directly.
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [59]:
class multiple_linear_regression:
  """Ordinary least-squares regression on several input features.

  Learns ``intercept_`` and one weight per feature (``coef_``) such that
  ``X @ coef_ + intercept_`` is the least-squares approximation of ``y``.

  Attributes:
    coef_: 1-D array of per-feature weights, or ``None`` before ``fit``.
    intercept_: Bias term, or ``None`` before ``fit``.
  """

  def __init__(self):
    self.coef_=None
    self.intercept_=None

  def fit(self,X_train,y_train):
    """Estimate the intercept and weights from training data.

    Args:
      X_train: Array-like of shape (n_samples, n_features).
      y_train: Array-like of shape (n_samples,) with the targets.

    Returns:
      The fitted estimator itself, so calls can be chained.
    """
    X_train=np.asarray(X_train,dtype=float)
    y_train=np.asarray(y_train,dtype=float)

    # Prepend a column of ones so the intercept is learned as beta[0].
    design=np.insert(X_train,0,1,axis=1)

    # Solve the least-squares problem directly rather than forming and
    # inverting X^T X: explicit inversion raises or returns garbage when
    # features are collinear, whereas lstsq returns the minimum-norm solution.
    beta=np.linalg.lstsq(design,y_train,rcond=None)[0]

    self.intercept_=beta[0]
    self.coef_=beta[1:]
    return self

  def predict(self,X_test):
    """Return predictions ``X_test @ coef_ + intercept_``.

    Args:
      X_test: Array-like of shape (n_samples, n_features).

    Raises:
      RuntimeError: If called before ``fit``.
    """
    if self.coef_ is None or self.intercept_ is None:
      raise RuntimeError("multiple_linear_regression must be fitted before calling predict")
    y_pred=np.dot(X_test,self.coef_)+self.intercept_
    return y_pred


In [60]:
mylr=multiple_linear_regression()
mylr.fit(X_train,y_train)

In [61]:
X_train.shape

(353, 10)

In [62]:
y_pred = mylr.predict(X_test)

In [63]:
y_pred.shape

(89,)

In [64]:
y_test.shape

(89,)

In [65]:
r2_score(y_test,y_pred)

0.43993386615689634

In [66]:
mylr.coef_

array([  -9.15865318, -205.45432163,  516.69374454,  340.61999905,
       -895.5520019 ,  561.22067904,  153.89310954,  126.73139688,
        861.12700152,   52.42112238])

In [67]:
mylr.intercept_

np.float64(151.88331005254165)