# Multiple Linear Regression

In [1]:
import numpy as np
import pandas as pd
import polars as pl

# Location of the startups dataset. NOTE(review): this is an absolute, machine-specific
# path, so the notebook will only load the data where this exact file exists.
DATA_PATH = '/work/Users/sammy/Documents/Sammy_DS/Multiple Linear Regression/50_Startups.csv'
df = pd.read_csv(DATA_PATH)

### Notes
#### Assumptions of Linear Regression:
    - Linearity
    - Homoscedasticity
    - Multivariate Normality
    - No Autocorrelation
    - Lack of Multicollinearity
    - Outlier Check
#### Dummy Variables 
    - One-hot encode each categorical variable, then drop one of the resulting columns (k categories -> k - 1 dummy columns)
#### p-Value
    - Null Hypothesis
    - Z-test, T-test
#### 5 Ways to Build a Model
    - All-in
    - Backward Elimination
    - Forward Elimination
    - Bi-directional Elimination
    - Score Comparison
Backward, Forward and Bidirectional Elimination are stepwise regression methods.
#### Backward Elimination
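    - Choose a significance level (SL) for a predictor to stay in the model
    - Fit the model with all predictors
    - Find the predictor with the highest p-value; if that p-value > SL, remove the predictor (otherwise stop: the model is final)
    - Refit the model without the removed predictor
    - Repeat steps 3 and 4 until the highest remaining p-value is <= SL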
#### Forward Elimination
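    - Choose a significance level (SL) for an estimator to enter the model
    - Fit a simple regression for each estimator on its own and select the estimator with the lowest p-value
    - Keep the selected estimator(s) and fit one model per remaining estimator, adding them one at a time; compare the p-values of the added estimators
    - Add the estimator with the lowest p-value to the model, then repeat step 3 with the estimators that are left
    - Stop when the lowest p-value among the candidates is > SL; that candidate is not added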
#### Bidirectional Elimination
    - SL to enter, SL to stay
    - Perform Forward Elimination using SLenter
    - Perform Backward Elimination using SLstay
    - Repeat step 2 and 3 as you keep adding variables in step 2
#### Building a Model: Steps
    - Select a goodness-of-fit criterion, e.g. the Akaike information criterion (AIC) or R-squared
    - Fit every possible regression model: $2^N - 1$ combinations for N predictors
    - Select one with best goodness of fit criterion

In [2]:
# Preview the first five rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


> >> EDA > Select Model > Preprocessing > Model Building > Model Testing > Model Deployment
>> Impute nan values > one-hot/dummy/label encode categorical vars > train-test split > feature scaling > build model > fit model > check predictions > check goodness of fit

In [4]:
# Features are every column except the last one; the target is the last column.
# Both are pulled out of the DataFrame as NumPy arrays.
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

In [6]:
# Display the feature matrix; at this point the last column still holds the State strings.
X

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida'],
       [131876.9, 99814.71, 362861.36, 'New York'],
       [134615.46, 147198.87, 127716.82, 'California'],
       [130298.13, 145530.06, 323876.68, 'Florida'],
       [120542.52, 148718.95, 311613.29, 'New York'],
       [123334.88, 108679.17, 304981.62, 'California'],
       [101913.08, 110594.11, 229160.95, 'Florida'],
       [100671.96, 91790.61, 249744.55, 'California'],
       [93863.75, 127320.38, 249839.44, 'Florida'],
       [91992.39, 135495.07, 252664.93, 'California'],
       [119943.24, 156547.42, 256512.92, 'Florida'],
       [114523.61, 122616.84, 261776.23, 'New York'],
       [78013.11, 121597.55, 264346.06, 'California'],
       [94657.16, 145077.58, 282574.31, 'New York'],
       [91749.16, 114175.79, 29491

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [8]:
# One-hot encode the State column (the last feature column) and pass the numeric
# columns through unchanged. In the result the encoded columns come first.
ct = ColumnTransformer([('encoder', OneHotEncoder(), [-1])], remainder='passthrough')
# Encode from df instead of from X itself: the previous version overwrote X with a
# transform of X, so running this cell a second time would one-hot encode the
# Marketing Spend column of the already-encoded array. Reading the raw features from
# df keeps the cell idempotent while producing the same X on a fresh run.
X = ct.fit_transform(df.iloc[:, :-1].values)

In [9]:
# Display the encoded matrix: three one-hot State columns followed by the numeric columns.
X

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [0.0, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1.0, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [0.0, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [1.0, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [0.0, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [0.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1.0, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [0.0, 0.0, 1.0, 94657.16, 145077.58

In [11]:
# pandas alternative to OneHotEncoder: expands State into dummy_<value> columns.
# The result is only displayed here; it is not assigned, so df is left unchanged.
pd.get_dummies(df, columns = ['State'], prefix = 'dummy')

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,dummy_California,dummy_Florida,dummy_New York
0,165349.2,136897.8,471784.1,192261.83,0,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1,0
3,144372.41,118671.85,383199.62,182901.99,0,0,1
4,142107.34,91391.77,366168.42,166187.94,0,1,0
5,131876.9,99814.71,362861.36,156991.12,0,0,1
6,134615.46,147198.87,127716.82,156122.51,1,0,0
7,130298.13,145530.06,323876.68,155752.6,0,1,0
8,120542.52,148718.95,311613.29,152211.77,0,0,1
9,123334.88,108679.17,304981.62,149759.96,1,0,0


In [12]:
# Show only the first five rows of the encoded feature matrix.
X[:5]

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42]], dtype=object)

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

IMPORTANT

- Simple linear regression - feature scaling not needed since we are dealing with a single estimator

- Multiple linear regression - feature scaling not needed since each coefficient adjusts to the scale of its own feature

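- In this walkthrough we do not check the assumptions of linear regression up front. A poor test score can hint that an assumption is violated, but a good score does not prove the assumptions hold, so check them (e.g. with residual plots) before relying on the coefficients or p-values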

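- The regression class we will be using (scikit-learn's `LinearRegression`) still fits and predicts correctly when all dummy columns are kept, so for prediction we do not need to manually remove a dummy. The individual dummy coefficients are not uniquely determined in that case, so drop one dummy (e.g. `OneHotEncoder(drop='first')`) when the coefficients themselves need to be interpreted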

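- The regression class does not select estimators for us: `LinearRegression` fits every feature it is given (the "all-in" approach). Backward/forward/bidirectional elimination has to be done separately if we want it (e.g. using p-values from `statsmodels`, or the tools in `sklearn.feature_selection`)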

- The class also recognises automatically the number of variables fed in and picks between simple or multiple linear models

In [17]:
from sklearn.linear_model import LinearRegression

In [18]:
# Fit ordinary least squares on the training split; fit() returns the estimator itself,
# so construction and fitting can be chained.
reg = LinearRegression().fit(X_train, y_train)
# Keep the fitted estimator as the cell's last expression so it is still displayed.
reg

In [19]:
# Predict the target for the held-out test features.
y_pred = reg.predict(X_test)

In [21]:
# Print NumPy floats with two decimal places in the array outputs that follow.
np.set_printoptions(precision = 2)

In [22]:
# Predicted values for the test rows.
y_pred

array([103015.2 , 132582.28, 132447.74,  71976.1 , 178537.48, 116161.24,
        67851.69,  98791.73, 113969.44, 167921.07])

In [23]:
# Actual target values for the same test rows, for comparison with y_pred.
y_test

array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39, 105008.31,
        81229.06,  97483.56, 110352.25, 166187.94])

In [26]:
# Side-by-side comparison: column 0 holds the actual values, column 1 the predictions.
# column_stack turns each 1-D array into a column, giving the same (n, 2) array as
# reshaping both to (n, 1) and concatenating along axis 1.
np.column_stack((y_test, y_pred))

array([[103282.38, 103015.2 ],
       [144259.4 , 132582.28],
       [146121.95, 132447.74],
       [ 77798.83,  71976.1 ],
       [191050.39, 178537.48],
       [105008.31, 116161.24],
       [ 81229.06,  67851.69],
       [ 97483.56,  98791.73],
       [110352.25, 113969.44],
       [166187.94, 167921.07]])

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=57d5770d-d68f-47c5-bc32-65c81e0e368b' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>