In [4]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [5]:
#Importing dataset
df = pd.read_csv("data/50_Startups.csv")
# Features: every column except the last. .copy() gives X its own data, so
# later in-place edits to X do not write into a slice of df (which would
# trigger pandas' SettingWithCopyWarning and could silently alter df).
X = df.iloc[:, :-1].copy()
# Target: the last column, selected with -1 to mirror the feature slice above
# instead of a hard-coded position.
y = df.iloc[:, -1]

In [6]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [7]:
# Count missing values per column. If any count is non-zero, impute the gaps before
# modelling (e.g. sklearn.impute.SimpleImputer, which replaced the removed Imputer class).
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [8]:
#check for any categorical variable (if any)
#Encoding categorical data in this case (independent variable)
# OneHotEncoder(categorical_features=...) was deprecated in scikit-learn 0.20 and
# removed in 0.22. ColumnTransformer is the supported way to one-hot encode a
# single column, and OneHotEncoder now accepts string categories directly, so no
# separate LabelEncoder pass is required.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
onehotencoder = ColumnTransformer(
    transformers=[("state", OneHotEncoder(), [3])],  # 3 = position of the State column in X
    remainder="passthrough",  # keep the other columns unchanged, placed after the dummies
    sparse_threshold=0,       # always return a dense array
)
# Result layout: one dummy column per category first, then the passthrough columns.
X = np.asarray(onehotencoder.fit_transform(X), dtype=float)

In [9]:
#Alert! Avoid the dummy variable trap by removing any one dummy variable
# Drops column 0 of X -- presumably the first State dummy, assuming the encoder
# placed the dummy columns first (verify against the encoding cell).
# NOTE: this cell is not idempotent; each re-run removes one more column from X.
X = X[:, 1:]

In [11]:
#No feature scaling is required: in ordinary least-squares Multiple Linear Regression the fitted coefficients absorb each feature's scale

In [30]:
# **** Build a model using Backward Elimination ****

#import required libraries
# OLS is exposed by statsmodels.api; in current statsmodels releases
# statsmodels.formula.api only provides the formula interface (lowercase `ols`),
# so `statsmodels.formula.api.OLS` raises AttributeError.
import statsmodels.api as sm

#add a leading column of ones because statsmodels' OLS does not add an intercept term by itself
# add_constant sizes the column from X itself (no hard-coded 50 rows). With
# has_constant="skip" it returns X unchanged when a constant column is already
# present, so re-running this cell does not stack extra intercept columns.
X = sm.add_constant(np.asarray(X, dtype=float), prepend=True, has_constant="skip")

In [53]:
def backwardElimination(x, y, sl):
    """Remove predictors by backward elimination.

    Repeatedly fits an OLS model of ``y`` on ``x`` and deletes the single
    column with the largest p-value, for as long as that p-value exceeds
    ``sl``.

    Parameters
    ----------
    x : array-like, 2-D
        Design matrix, one column per predictor. Include the intercept column
        yourself; OLS does not add one.
    y : array-like
        Response values, one per row of ``x``.
    sl : float
        Significance level; a column survives only if its p-value is <= ``sl``.

    Returns
    -------
    numpy.ndarray
        ``x`` with the eliminated columns removed (column order preserved).
    """
    x = np.asarray(x)
    # Each fit removes at most one column, so the starting column count bounds
    # the number of fits that can ever be needed.
    for _ in range(x.shape[1]):
        if x.shape[1] == 0:
            # Every predictor was eliminated; there is nothing left to fit.
            break
        reg_OLS = sm.OLS(endog=y, exog=x).fit()
        # Convert to a plain float array so positions line up with the columns
        # of x even when statsmodels returns a label-indexed pandas Series.
        pvalues = np.asarray(reg_OLS.pvalues, dtype=float)
        worst = int(np.argmax(pvalues))
        if not pvalues[worst] > sl:
            # All remaining predictors are significant (or the p-value is NaN):
            # stop instead of refitting the same model again.
            break
        # Delete exactly one column per fit; the p-values are stale as soon as
        # x changes, so the model is refitted before the next decision.
        x = np.delete(x, worst, axis=1)
    return x

In [55]:
# Significance level: predictors whose p-value exceeds this are eliminated.
SL = 0.05
X_opt = backwardElimination(X, y, SL)
# Refit on the surviving columns and display the regression summary.
# (The variable is X_opt; the previous lowercase x_opt was never defined and
# raised NameError on a fresh kernel.)
sm.OLS(endog=y, exog=X_opt).fit().summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Sun, 22 Dec 2019",Prob (F-statistic):,3.5000000000000004e-32
Time:,19:13:44,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04 5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795 0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


In [29]:
#Result: automatic Backward Elimination keeps only the intercept and one predictor (R&D Spend) as statistically significant for predicting Profit

In [56]:
#split data into train and test dataset
# train_test_split lives in sklearn.model_selection. The old sklearn.cross_validation
# module was deprecated in scikit-learn 0.18 and removed in 0.20; the difference is
# the scikit-learn version, not Python 2 vs Python 3.
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_opt, y, test_size=0.2, random_state=0)

In [57]:
# Train a linear regression on the training split, then predict the held-out rows.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression().fit(X_train, y_train)  # fit() returns the fitted estimator
y_pred = regressor.predict(X_test)

In [59]:
# Tabulate actual profit, predicted profit, and their difference for the test rows.
y_compare = pd.DataFrame({
    'Original Profit': y_test,
    'Predicted Profit': y_pred,
    'Residual Error': y_test - y_pred,
})
# Replace the shuffled original row labels with a clean 0..n-1 index.
y_compare = y_compare.reset_index(drop=True)
y_compare

Unnamed: 0,Original Profit,Predicted Profit,Residual Error
0,103282.38,104667.27806,-1384.89806
1,144259.4,134150.834106,10108.565894
2,146121.95,135207.800195,10914.149805
3,77798.83,72170.544289,5628.285711
4,191050.39,179090.586025,11959.803975
5,105008.31,109824.773866,-4816.463866
6,81229.06,65644.277738,15584.782262
7,97483.56,100481.432771,-2997.872771
8,110352.25,111431.752024,-1079.502024
9,166187.94,169438.148435,-3250.208435
