# Multiple Linear Regression Using Python

## Import Necessary Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing dataset

In [2]:
# Read the startup data from the CSV file and preview its first rows.
csv_path = "50_Startups.csv"
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Features are every column except the last one; the target is the last
# column (Profit in the preview above). Both selections use the relative
# "last column" index so they cannot drift apart if the column count changes.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

## Encoding Categorical Variables

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# One-hot encode the categorical column at index 3 (State in the preview
# above). The `categorical_features` argument of OneHotEncoder was removed from
# scikit-learn, so the column is selected through a ColumnTransformer instead.
# OneHotEncoder accepts string categories directly, which makes the former
# LabelEncoder pre-pass unnecessary.
# The encoded dummy columns come first in the result, followed by the
# untouched remaining columns in their original order.
onehotencoder = ColumnTransformer(
    transformers=[("state", OneHotEncoder(), [3])],
    remainder="passthrough",
    sparse_threshold=0,  # always return a dense array
)
X = onehotencoder.fit_transform(X)
# Cast to float (not int) so the fractional part of the amounts is kept.
X = np.array(X, dtype=float)

## Avoid dummy variable trap

In [5]:
# Drop the first column (one of the dummy columns produced by the encoding
# step) and keep the rest. NOTE: re-running this cell drops a further column.
X = X[:, 1:]

## Splitting the dataset into Training set and Test set

In [6]:
# `sklearn.cross_validation` was removed from scikit-learn;
# `train_test_split` is provided by `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)



## Fitting Multiple Linear Regression model to the training set

In [7]:
from sklearn.linear_model import LinearRegression

# Create the linear model with default settings and fit it on the training
# split only.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## Predicting the Test set result

In [8]:
# Predict the target for the held-out test rows and display the predictions.
y_pred = regressor.predict(X=X_test)
y_pred

array([ 103015.24646784,  132581.94062687,  132448.09397395,
         71975.74395634,  178537.52007852,  116161.05196902,
         67851.47761322,   98791.74112204,  113969.41004647,
        167921.22416078])

## Building the optimal model using Backward Elimination 

In [9]:
# The backward elimination below is based purely on p-values, but it could
# also be driven by the R-squared or adjusted R-squared value.
# `OLS` is imported from `statsmodels.api`; recent statsmodels releases no
# longer expose it through `statsmodels.formula.api`.
import statsmodels.api as sm

# Prepend a column of ones (column 0) so the model gets an intercept term.
# The row count is taken from X instead of being hard-coded.
X = np.append(arr=np.ones((X.shape[0], 1)).astype(int), values=X, axis=1)

# Start from the full model: the intercept plus all five predictor columns.
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Thu, 18 Jan 2018",Prob (F-statistic):,1.34e-27
Time:,16:13:47,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,5.013e+04,6884.855,7.281,0.000,3.63e+04 6.4e+04
x1,198.7542,3371.026,0.059,0.953,-6595.103 6992.611
x2,-42.0063,3256.058,-0.013,0.990,-6604.161 6520.148
x3,0.8060,0.046,17.368,0.000,0.712 0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132 0.078
x5,0.0270,0.017,1.574,0.123,-0.008 0.062

0,1,2,3
Omnibus:,14.783,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.267
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [10]:
# Refit without column 2 of X, the predictor with the highest p-value (0.990)
# in the previous summary.
kept_columns = [0, 1, 3, 4, 5]
X_opt = X[:, kept_columns]
ols_model = sm.OLS(endog=y, exog=X_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Thu, 18 Jan 2018",Prob (F-statistic):,8.49e-29
Time:,16:13:48,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,5.011e+04,6647.901,7.537,0.000,3.67e+04 6.35e+04
x1,220.1847,2900.553,0.076,0.940,-5621.828 6062.197
x2,0.8060,0.046,17.606,0.000,0.714 0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131 0.077
x4,0.0270,0.017,1.592,0.118,-0.007 0.061

0,1,2,3
Omnibus:,14.759,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.173
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [11]:
# Refit without column 1 of X as well, the predictor with the highest p-value
# (0.940) in the previous summary.
kept_columns = [0, 3, 4, 5]
X_opt = X[:, kept_columns]
ols_model = sm.OLS(endog=y, exog=X_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Thu, 18 Jan 2018",Prob (F-statistic):,4.53e-30
Time:,16:13:49,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,5.012e+04,6572.384,7.626,0.000,3.69e+04 6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715 0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130 0.076
x3,0.0272,0.016,1.655,0.105,-0.006 0.060

0,1,2,3
Omnibus:,14.839,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.443
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.587,Cond. No.,1400000.0


In [12]:
# Refit without column 4 of X as well, the predictor with the highest p-value
# (0.602) in the previous summary.
kept_columns = [0, 3, 5]
X_opt = X[:, kept_columns]
ols_model = sm.OLS(endog=y, exog=X_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Thu, 18 Jan 2018",Prob (F-statistic):,2.1600000000000003e-31
Time:,16:13:50,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,4.698e+04,2689.941,17.464,0.000,4.16e+04 5.24e+04
x1,0.7966,0.041,19.265,0.000,0.713 0.880
x2,0.0299,0.016,1.927,0.060,-0.001 0.061

0,1,2,3
Omnibus:,14.678,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.162
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [13]:
# Refit with only the intercept (column 0) and column 3 of X, dropping
# column 5, whose p-value was 0.060 in the previous summary.
kept_columns = [0, 3]
X_opt = X[:, kept_columns]
ols_model = sm.OLS(endog=y, exog=X_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Thu, 18 Jan 2018",Prob (F-statistic):,3.5000000000000004e-32
Time:,16:13:52,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,4.903e+04,2537.900,19.320,0.000,4.39e+04 5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795 0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.538
Skew:,-0.911,Prob(JB):,9.43e-05
Kurtosis:,5.361,Cond. No.,165000.0
