In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [4]:
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype=np.float)

In [7]:
X = X[:, 1:]  # avoiding dummy variable trap

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [10]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [11]:
y_pred = reg.predict(X_test)

In [16]:
# building the optimal model using backward elimination

import statsmodels.formula.api as sm
X = np.append(arr=np.ones((50, 1)).astype(int), values=X, axis=1)
X_opt = X[: , [0, 1, 2, 3, 4, 5]]
reg_ols = sm.OLS(endog=y, exog=X_opt).fit()
reg_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,272.4
Date:,"Sat, 17 Aug 2019",Prob (F-statistic):,2.76e-29
Time:,10:11:22,Log-Likelihood:,-527.35
No. Observations:,50,AIC:,1063.0
Df Residuals:,46,BIC:,1070.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.625e+04,1013.373,16.036,0.000,1.42e+04,1.83e+04
x1,1.625e+04,1013.373,16.036,0.000,1.42e+04,1.83e+04
x2,1.625e+04,1013.373,16.036,0.000,1.42e+04,1.83e+04
x3,1163.8663,3379.767,0.344,0.732,-5639.252,7966.985
x4,9.5969,3312.045,0.003,0.998,-6657.204,6676.398
x5,0.8530,0.030,28.226,0.000,0.792,0.914

0,1,2,3
Omnibus:,13.418,Durbin-Watson:,1.122
Prob(Omnibus):,0.001,Jarque-Bera (JB):,17.605
Skew:,-0.907,Prob(JB):,0.00015
Kurtosis:,5.271,Cond. No.,3.86e+21


In [17]:
X_opt = X[: , [0, 1, 2, 4, 5]]
reg_ols = sm.OLS(endog=y, exog=X_opt).fit()
reg_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.944
Method:,Least Squares,F-statistic:,416.4
Date:,"Sat, 17 Aug 2019",Prob (F-statistic):,1.2599999999999999e-30
Time:,10:13:34,Log-Likelihood:,-527.42
No. Observations:,50,AIC:,1061.0
Df Residuals:,47,BIC:,1067.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.64e+04,903.607,18.152,0.000,1.46e+04,1.82e+04
x1,1.64e+04,903.607,18.152,0.000,1.46e+04,1.82e+04
x2,1.64e+04,903.607,18.152,0.000,1.46e+04,1.82e+04
x3,-560.3051,2841.908,-0.197,0.845,-6277.486,5156.876
x4,0.8545,0.030,28.843,0.000,0.795,0.914

0,1,2,3
Omnibus:,14.03,Durbin-Watson:,1.127
Prob(Omnibus):,0.001,Jarque-Bera (JB):,19.472
Skew:,-0.916,Prob(JB):,5.91e-05
Kurtosis:,5.448,Cond. No.,2.44e+21


In [18]:
X_opt = X[: , [0, 1, 2, 5]]
reg_ols = sm.OLS(endog=y, exog=X_opt).fit()
reg_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Sat, 17 Aug 2019",Prob (F-statistic):,3.5000000000000004e-32
Time:,10:14:17,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.634e+04,845.966,19.320,0.000,1.46e+04,1.8e+04
x1,1.634e+04,845.966,19.320,0.000,1.46e+04,1.8e+04
x2,1.634e+04,845.966,19.320,0.000,1.46e+04,1.8e+04
x3,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,1.08e+21
