The below code imports a sample data, preprocesses it and develops an MLR by backward elimination

In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import dataset
dataset=pd.read_csv('50_Startups.csv')
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [2]:
dataset.groupby('State').agg([len, sum])

Unnamed: 0_level_0,R&D Spend,R&D Spend,Administration,Administration,Marketing Spend,Marketing Spend,Profit,Profit
Unnamed: 0_level_1,len,sum,len,sum,len,sum,len,sum
State,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
California,17.0,1099180.46,17.0,2052690.62,17.0,3103195.8,17.0,1766387.98
Florida,16.0,1291584.26,16.0,1948302.36,16.0,3957176.82,16.0,1900384.39
New York,17.0,1295316.06,17.0,2066239.0,17.0,3490882.27,17.0,1933859.59


In [3]:
# Let's say we wish to predict Profit
X=dataset.iloc[:,:-1].values
Y=dataset.iloc[:,-1].values
dataset.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
R&D Spend          50 non-null float64
Administration     50 non-null float64
Marketing Spend    50 non-null float64
State              50 non-null object
Profit             50 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.0+ KB


In [5]:
dataset.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [6]:
dataset.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [7]:
# Missing imputation not required
# Char variable (State) encoding
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
labelencoder=LabelEncoder()
X[:,3]=labelencoder.fit_transform(X[:,3])
onehotencoder=OneHotEncoder(categorical_features=[3])
X=onehotencoder.fit_transform(X).toarray()

In [8]:
X[0]

array([  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
         1.65349200e+05,   1.36897800e+05,   4.71784100e+05])

In [9]:
# Avoid dummy variable trap- i.e, leave one dummy variable after one hot encoding
X=X[:,1:]
# Split data into training and testing
from sklearn.cross_validation import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=0)

In [10]:
#feature scaling isn't required in MLR. Library takes care itself
# Make multiple linear regression model
from sklearn.linear_model import LinearRegression
linearregressor=LinearRegression()
linearregressor.fit(X_train,Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [13]:
# Predicting the test set result
Y_pred=linearregressor.predict(X_test)
# Building the optimal model using backward elimination
import statsmodels.formula.api as sm
#Statsmodels library doesn't account for constant term/intercept in the linear regression automatically, so you need to append column with 1 for 50 rows in data
X=np.append(arr=np.ones((50,1)).astype(int),values=X,axis=1)
# Next we specify all the indexes which are to be kept in the model. We will remove indexes one by one, so list all individually now
X_opt=X[:,[0,1,2,3,4,5]]
# Select significance level of 0.05, remove variables with P value> SL(0.05) one by one
# endog means the dependent variable and exog means the independent variables
regressor_ols=sm.OLS(endog=Y,exog=X_opt).fit()
regressor_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,205.0
Date:,"Fri, 31 Aug 2018",Prob (F-statistic):,2.9e-28
Time:,18:12:15,Log-Likelihood:,-526.75
No. Observations:,50,AIC:,1064.0
Df Residuals:,45,BIC:,1073.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,2.73e+04,3185.530,8.571,0.000,2.09e+04 3.37e+04
x1,2.73e+04,3185.530,8.571,0.000,2.09e+04 3.37e+04
x2,1091.1075,3377.087,0.323,0.748,-5710.695 7892.910
x3,-39.3434,3309.047,-0.012,0.991,-6704.106 6625.420
x4,0.8609,0.031,27.665,0.000,0.798 0.924
x5,-0.0527,0.050,-1.045,0.301,-0.154 0.049

0,1,2,3
Omnibus:,14.275,Durbin-Watson:,1.197
Prob(Omnibus):,0.001,Jarque-Bera (JB):,19.26
Skew:,-0.953,Prob(JB):,6.57e-05
Kurtosis:,5.369,Cond. No.,7.08e+17


In [15]:
# In the next step remove the variable with the highest p value, if it's greater than SLS. So remove x3 with p value of 0.991
X_opt=X[:,[0,1,2,4,5]]
# Select significance level of 0.05, remove variables with P value> SL(0.05) one by one
# endog means the dependent variable and exog means the independent variables
regressor_ols=sm.OLS(endog=Y,exog=X_opt).fit()
regressor_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,279.4
Date:,"Fri, 31 Aug 2018",Prob (F-statistic):,1.59e-29
Time:,18:13:43,Log-Likelihood:,-526.75
No. Observations:,50,AIC:,1062.0
Df Residuals:,46,BIC:,1069.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,2.729e+04,3060.993,8.917,0.000,2.11e+04 3.35e+04
x1,2.729e+04,3060.993,8.917,0.000,2.11e+04 3.35e+04
x2,1111.1761,2893.049,0.384,0.703,-4712.230 6934.582
x3,0.8609,0.031,28.127,0.000,0.799 0.922
x4,-0.0527,0.050,-1.057,0.296,-0.153 0.048

0,1,2,3
Omnibus:,14.252,Durbin-Watson:,1.197
Prob(Omnibus):,0.001,Jarque-Bera (JB):,19.188
Skew:,-0.953,Prob(JB):,6.81e-05
Kurtosis:,5.362,Cond. No.,4.5e+17


In [18]:
# In the next step remove the variable with the highest p value, if it's greater than SLS. So remove x2 with p value of 0.703
X_opt=X[:,[0,1,4,5]]
# Select significance level of 0.05, remove variables with P value> SL(0.05) one by one
# endog means the dependent variable and exog means the independent variables
regressor_ols=sm.OLS(endog=Y,exog=X_opt).fit()
regressor_ols.summary()
# Since all the p values are less than SLS. This is our final model.

0,1,2,3
Dep. Variable:,y,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,426.8
Date:,"Fri, 31 Aug 2018",Prob (F-statistic):,7.29e-31
Time:,18:23:31,Log-Likelihood:,-526.83
No. Observations:,50,AIC:,1060.0
Df Residuals:,47,BIC:,1065.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,2.744e+04,3008.359,9.122,0.000,2.14e+04 3.35e+04
x1,2.744e+04,3008.359,9.122,0.000,2.14e+04 3.35e+04
x2,0.8621,0.030,28.589,0.000,0.801 0.923
x3,-0.0530,0.049,-1.073,0.289,-0.152 0.046

0,1,2,3
Omnibus:,14.678,Durbin-Watson:,1.189
Prob(Omnibus):,0.001,Jarque-Bera (JB):,20.449
Skew:,-0.961,Prob(JB):,3.63e-05
Kurtosis:,5.474,Cond. No.,3.51e+17


In [27]:
# Backward elimination in an automated way
import statsmodels.formula.api as sm
def backwardElimination(X, sl):
    numVars = len(X[0])
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(Y, X).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    X = np.delete(X, j, 1)
    regressor_OLS.summary()
    return X
 
SL = 0.05
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
X_Modeled = backwardElimination(X_opt, SL)
