# Importing the libraries

In [13]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset

In [25]:
# Load the startups data. Every column except the last one is a feature and
# the last column is the regression target.
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
# Pick the target by position from the end (-1) so it always agrees with the
# `:-1` feature slice above instead of relying on a hard-coded column index.
y = dataset.iloc[:, -1].values

# Quick sanity check of shapes and the first few rows.
print(X.shape, y.shape)
print(X[0:5,:])
print(y[0:5])

(50, 4) (50,)
[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']]
[192261.83 191792.06 191050.39 182901.99 166187.94]


# Encoding categorical data

In [26]:
# Encoding the Independent Variable
# (LabelEncoder is not used in this cell; the import is kept unchanged in
# case later cells rely on it.)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Rebuild the raw feature matrix from `dataset` instead of reading it back
# from `X`: this cell overwrites `X`, so sourcing the input from `X` would
# make a second run of the cell one-hot encode a numeric column.
features_raw = dataset.iloc[:, :-1].values

onehotencoder = OneHotEncoder()
# The categorical (state) column sits at index 3; the encoder needs 2-D input,
# hence the reshape to a single-column matrix.
X1 = onehotencoder.fit_transform(features_raw[:, 3].reshape(-1, 1)).toarray()

# Final layout: one dummy column per category first, then the three numeric
# columns in their original order.
X = np.c_[X1, features_raw[:, 0:3]]

print(X1[0:5,:])
print(X[0:5,:])

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]
[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]]


In [27]:
# Avoiding the Dummy Variable Trap
# Keep every column except the first dummy: with an intercept in the model,
# the full set of one-hot columns would be perfectly collinear.
# NOTE(review): running this cell twice drops a second column; run it once.
X = X[:, 1:]
print(X[:5])

[[0.0 1.0 165349.2 136897.8 471784.1]
 [0.0 0.0 162597.7 151377.59 443898.53]
 [1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 1.0 144372.41 118671.85 383199.62]
 [1.0 0.0 142107.34 91391.77 366168.42]]


# Splitting the dataset into the Training set and Test set

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore every downstream number) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Confirm the split sizes and peek at the first training rows.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(X_train[:5])

(40, 5) (10, 5) (40,) (10,)
[[1.0 0.0 55493.95 103057.49 214634.81]
 [0.0 1.0 46014.02 85047.44 205517.64]
 [1.0 0.0 75328.87 144135.98 134050.07]
 [0.0 0.0 46426.07 157693.92 210797.67]
 [1.0 0.0 91749.16 114175.79 294919.57]]


# Fitting Multiple Linear Regression to the Training set

In [29]:
from sklearn.linear_model import LinearRegression

# Fit a multiple linear regression on the training split. The fit call is
# left as the cell's last expression so the fitted estimator is displayed.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression()

# Predicting the Test set results

In [30]:
# Predict on the held-out rows, then report the fitted parameters together
# with the model's score on the test split (R^2 for LinearRegression).
y_pred = regressor.predict(X_test)
test_score = regressor.score(X_test, y_test)

print('coef= ', regressor.coef_)
print('intercept= ', regressor.intercept_)
print('score= ', test_score)

coef=  [-9.59284160e+02  6.99369053e+02  7.73467193e-01  3.28845975e-02
  3.66100259e-02]
intercept=  42554.167617767
score=  0.9347068473282987


In [31]:
# Same metric on the training rows, for comparison with the test-set score.
train_score = regressor.score(X_train, y_train)
print('score= ', train_score)

score=  0.9501847627493607


# Building the optimal model using Backward Elimination

In [32]:
import statsmodels.api as sm

# Prepend an intercept column of ones to the training features (the printed
# shape goes from 5 to 6 columns). Note that this overwrites X_train.
X_train = sm.add_constant(X_train)
print(X_train.shape)

# Backward elimination, step 1: start from the full model. The column indices
# are spelled out explicitly so that each later step can simply remove one
# index from this list.
kept_columns = [0, 1, 2, 3, 4, 5]
# X_train inherits object dtype from the mixed string/number feature matrix,
# so convert the selected columns to plain floats before fitting.
X_opt = X_train[:, kept_columns].astype(float)

regressor_OLS = sm.OLS(endog=y_train, exog=X_opt).fit()
print(regressor_OLS.summary())

(40, 6)
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.943
Method:                 Least Squares   F-statistic:                     129.7
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           3.91e-21
Time:                        06:34:36   Log-Likelihood:                -421.10
No. Observations:                  40   AIC:                             854.2
Df Residuals:                      34   BIC:                             864.3
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.255e+04   8358.538      5.091 

In [33]:
# Backward elimination, step 2: refit without column 2 (the second state
# dummy) -- presumably the predictor with the highest p-value in the previous
# summary; verify against the full summary table.
kept_columns = [0, 1, 3, 4, 5]
X_opt = X_train[:, kept_columns].astype(float)  # object dtype -> float
regressor_OLS = sm.OLS(endog=y_train, exog=X_opt).fit()
print(regressor_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.944
Method:                 Least Squares   F-statistic:                     166.7
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           2.87e-22
Time:                        06:35:00   Log-Likelihood:                -421.12
No. Observations:                  40   AIC:                             852.2
Df Residuals:                      35   BIC:                             860.7
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.292e+04   8020.397      5.352      0.0

In [34]:
# Backward elimination, step 3: refit without column 1 as well (the remaining
# state dummy), leaving the intercept and the three numeric predictors.
# Presumably chosen by highest p-value; verify against the previous summary.
kept_columns = [0, 3, 4, 5]
X_opt = X_train[:, kept_columns].astype(float)  # object dtype -> float
regressor_OLS = sm.OLS(endog=y_train, exog=X_opt).fit()
print(regressor_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.946
Method:                 Least Squares   F-statistic:                     227.8
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           1.85e-23
Time:                        06:35:02   Log-Likelihood:                -421.19
No. Observations:                  40   AIC:                             850.4
Df Residuals:                      36   BIC:                             857.1
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.299e+04   7919.773      5.428      0.0

In [35]:
# Backward elimination, step 4: refit without column 4 (the second numeric
# predictor), keeping the intercept and columns 3 and 5.
# Presumably chosen by highest p-value; verify against the previous summary.
kept_columns = [0, 3, 5]
X_opt = X_train[:, kept_columns].astype(float)  # object dtype -> float
regressor_OLS = sm.OLS(endog=y_train, exog=X_opt).fit()
print(regressor_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.947
Method:                 Least Squares   F-statistic:                     349.0
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           9.65e-25
Time:                        06:35:04   Log-Likelihood:                -421.30
No. Observations:                  40   AIC:                             848.6
Df Residuals:                      37   BIC:                             853.7
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.635e+04   2971.236     15.598      0.0

In [36]:
# Backward elimination, step 5: refit with only the intercept and column 3
# (the first numeric predictor). The saved summaries show R-squared moving
# from 0.950 to 0.945 with this last removal.
kept_columns = [0, 3]
X_opt = X_train[:, kept_columns].astype(float)  # object dtype -> float
regressor_OLS = sm.OLS(endog=y_train, exog=X_opt).fit()
print(regressor_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.945
Model:                            OLS   Adj. R-squared:                  0.944
Method:                 Least Squares   F-statistic:                     652.4
Date:                Tue, 08 Mar 2022   Prob (F-statistic):           1.56e-25
Time:                        06:35:06   Log-Likelihood:                -423.09
No. Observations:                  40   AIC:                             850.2
Df Residuals:                      38   BIC:                             853.6
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.842e+04   2842.717     17.032      0.0

In [37]:
# Display the training design matrix as it stands after the backward
# elimination cells: column 0 is the constant added by sm.add_constant,
# columns 1-2 are the state dummies, columns 3-5 are the numeric predictors.
# NOTE(review): this dumps all 40 rows; a slice such as X_train[:5] would be
# enough for inspection.
X_train

array([[1.0, 1.0, 0.0, 55493.95, 103057.49, 214634.81],
       [1.0, 0.0, 1.0, 46014.02, 85047.44, 205517.64],
       [1.0, 1.0, 0.0, 75328.87, 144135.98, 134050.07],
       [1.0, 0.0, 0.0, 46426.07, 157693.92, 210797.67],
       [1.0, 1.0, 0.0, 91749.16, 114175.79, 294919.57],
       [1.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [1.0, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [1.0, 0.0, 1.0, 1000.23, 124153.04, 1903.93],
       [1.0, 0.0, 1.0, 542.05, 51743.15, 0.0],
       [1.0, 0.0, 1.0, 65605.48, 153032.06, 107138.38],
       [1.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1.0, 1.0, 0.0, 61994.48, 115641.28, 91131.24],
       [1.0, 0.0, 0.0, 63408.86, 129219.61, 46085.25],
       [1.0, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [1.0, 0.0, 0.0, 23640.93, 96189.63, 148001.11],
       [1.0, 0.0, 0.0, 76253.86, 113867.3, 298664.47],
       [1.0, 0.0, 1.0, 15505.73, 127382.3, 35534.17],
       [1.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       