In [1]:
# importing all required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm

# One hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Train test split set
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

In [2]:
# Read the 50-startups CSV into a DataFrame and preview its first five rows
dataset = pd.read_csv(filepath_or_buffer="50_Startups.csv")
dataset.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Number of missing values in each column
dataset.isna().sum(axis=0)

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

# Assumptions of linear Regression

1.) Linearity: the relationship between y and each X is linear
    eg: the residuals are randomly distributed around zero, with no visible curve

2.) Homoscedasticity : Equal variance

3.) Multivariate Normality: the errors (residuals) are normally distributed
    eg: the points are scattered evenly above and below the regression plane

4.) Independence (no autocorrelation): the errors are independent of each other
    eg: a violation shows up as a pattern in the residuals, like a wave

5.) Lack of Multicollinearity: predictors are not correlated with each other
    eg: X1 !~ X2

6.) The outlier check 


In [4]:
# Show only the first row as a reminder of the column layout
dataset.iloc[:1]

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83


In [5]:
# Creating a dummy variable: first inspect the column dtypes to find the
# non-numeric (object) columns that have to be encoded
dataset.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [6]:
# Distinct categories of the State column
dataset.loc[:, 'State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [7]:
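# Dummy variable trap: do not include all the dummy variables. For k categories keep only k-1 dummy columns, otherwise they are perfectly collinear with the intercept.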

In [8]:
# p-value and significance level (refer to the statistics notebook)

# Building the Multi Linear Model.

1.) All-in 
    Using all the variables

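2.) Backward elimination
    - i) select the significance level (eg. SL = 0.05)
    - ii) fit the model with all possible predictors
    - iii) consider the predictor with the highest p-value; if p-value > significance level, go to iv), otherwise finish
    - iv) remove the predictor
    - v) fit the model without the removed predictor, then go back to iii)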

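3.) Forward selection
    - i) select the significance level (eg. SL = 0.05)
    - ii) fit all simple regression models (y against each single predictor) and select the one with the lowest p-value
    - iii) keep the variable and fit all possible models with one extra predictor added
    - iv) consider the predictor with the lowest p-value; if P < SL, go to iii),
    else finish (and keep the previous model)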
    
4.) bidirectional elimination
    - i) select significance level
    - ii) perform forward selection
    - iii) perform backward elimination
    - iv) no new variables can enter and no old variables can exit

5.) Score Comparisons
    - i) select a criterion of goodness of fit (eg. Akaike criterion)
    - ii) construct all possible regression models: 2^n - 1 total combinations
    - iii) select the one with the best criterion
    

The fastest one is the backward elimination


In [9]:
# Separate the target (last column) from the predictors (every other column)

y = dataset.iloc[:, -1]
X = dataset.iloc[:, :-1]
X.head(n=3)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida


In [10]:
# OneHotEncoding - Categorical Encoding
# Column 3 of the predictors is the categorical State column. In the encoded
# array its dummy columns come first, followed by the untouched numeric columns.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
# Encode straight from `dataset` instead of from `X`: the cell overwrites `X`,
# so reading `X` here would re-encode the already encoded array on a re-run.
X = np.array(ct.fit_transform(dataset.iloc[:, :-1]))

X

array([[0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.6534920e+05,
        1.3689780e+05, 4.7178410e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.6259770e+05,
        1.5137759e+05, 4.4389853e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.5344151e+05,
        1.0114555e+05, 4.0793454e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.4437241e+05,
        1.1867185e+05, 3.8319962e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.4210734e+05,
        9.1391770e+04, 3.6616842e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.3187690e+05,
        9.9814710e+04, 3.6286136e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.3461546e+05,
        1.4719887e+05, 1.2771682e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.3029813e+05,
        1.4553006e+05, 3.2387668e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.2054252e+05,
        1.4871895e+05, 3.1161329e+05],
       [1.0000000e+00, 0.0000000e+00,

In [11]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [12]:
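# scikit-learn's LinearRegression can still be fitted with all the dummy columns present, so no dummy column is dropped before training here. The linear regression assumptions listed above still apply and should be checked before trusting the model.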

In [13]:
# Fit a multiple linear regression model on the training rows

regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression()

In [17]:
# Show two decimals when printing arrays, then predict the profit of the test rows
np.set_printoptions(precision=2)
y_pred = regressor.predict(X_test)


In [19]:
# Predicted profit (left column) next to the actual profit (right column)
print(np.column_stack((y_pred, y_test.values)))

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


In [None]:
# Building the optimal model using Backward Elimination
#
# Starting from all predictors, repeatedly refit an OLS model and remove the
# predictor with the highest p-value, until every remaining predictor is
# significant at SIGNIFICANCE_LEVEL.
SIGNIFICANCE_LEVEL = 0.05

# X holds [California, Florida, New York, R&D Spend, Administration, Marketing Spend].
# The first dummy column is left out (dummy variable trap: together with an
# intercept, all three dummies are perfectly collinear) and a column of ones is
# prepended because sm.OLS does not add an intercept on its own.
# A new array is built instead of rebinding X, so the cell can be re-run safely.
ols_columns = ['const', 'Florida', 'New York', 'R&D Spend', 'Administration', 'Marketing Spend']
X_design = np.append(arr=np.ones((X.shape[0], 1)), values=X[:, 1:], axis=1).astype(np.float64)
assert X_design.shape[1] == len(ols_columns), "X does not have the expected six encoded columns"

kept = list(range(X_design.shape[1]))
while True:
    X_opt = X_design[:, kept]
    regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
    p_values = np.asarray(regressor_OLS.pvalues)
    if len(kept) == 1:
        break  # only the intercept is left
    # Position 0 is the intercept, which is never a candidate for removal.
    worst = 1 + int(np.argmax(p_values[1:]))
    if p_values[worst] <= SIGNIFICANCE_LEVEL:
        break  # every remaining predictor is significant
    print(f"Removing {ols_columns[kept[worst]]} (p-value = {p_values[worst]:.3f})")
    del kept[worst]

print("Remaining columns:", [ols_columns[i] for i in kept])
regressor_OLS.summary()

In [21]:
# Single prediction: the profit of a startup with R&D Spend = 160000,
# Administration Spend = 130000, Marketing Spend = 300000 and State = 'California'.
# The first three values are the state dummies (1, 0, 0 encodes California),
# followed by the three spend figures.

print(
    regressor.predict(
        [[1, 0, 0, 160000, 130000, 300000]]
    )
)

[181566.92]


In [22]:
# Getting the final linear regression equation with the values of the coefficients
print(regressor.coef_)
print(regressor.intercept_)


[ 8.66e+01 -8.73e+02  7.86e+02  7.73e-01  3.29e-02  3.66e-02]
42467.52924857047


Profit = 86.6 × California − 873 × Florida + 786 × New York + 0.773 × R&D Spend + 0.0329 × Administration + 0.0366 × Marketing Spend + 42467.53 (California, Florida and New York are the 0/1 state dummy variables)