In [172]:
import pandas as pd

In [173]:
data=pd.read_csv("50_Startups.csv")

In [174]:
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [175]:
data.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [176]:
y=data["Profit"]

In [177]:
X=data[['R&D Spend', 'Administration', 'Marketing Spend', 'State']]

In [178]:
# state is a catagorical variable so we need to transfor/encode it.

In [179]:
X=pd.get_dummies(X,drop_first=True)  # this will encode all catagorical variables in X and then add them to X automatically
# drop_first will drop any one column so as to remove redundancy

In [180]:
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


In [181]:
from sklearn.model_selection import train_test_split

In [182]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [183]:
from sklearn.linear_model import LinearRegression

In [184]:
model=LinearRegression()

In [185]:
model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [186]:
y_pred=model.predict(X_test)

In [187]:
y_pred

array([126362.87908255,  84608.45383634,  99677.49425147,  46357.46068582,
       128750.48288504,  50912.4174188 , 109741.35032702, 100643.24281647,
        97599.27574594, 113097.42524432])

In [188]:
y_test

13    134307.35
39     81005.76
30     99937.59
45     64926.08
17    125370.37
48     35673.41
26    105733.54
25    107404.34
32     97427.84
19    122776.86
Name: Profit, dtype: float64

In [189]:
residual=134307-126362 #error #taken only 1 value

In [190]:
error=residual/134307*100   # error % 

In [191]:
error

5.9155516838288404

In [192]:
accuracy=100-error

In [193]:
accuracy

94.08444831617116

In [194]:
X.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State_Florida',
       'State_New York'],
      dtype='object')

# OLS Method

In [195]:
import statsmodels.formula.api as sm   # for OLS model

In [196]:
model_ols=sm.OLS(endog=y,exog=X).fit()    #create OLS model 
#also train since we've already passed our X and y
# endog= dependent variable , exog=independent variable

In [197]:
model_ols.summary() #summary

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.988
Model:,OLS,Adj. R-squared:,0.986
Method:,Least Squares,F-statistic:,727.1
Date:,"Thu, 16 Apr 2020",Prob (F-statistic):,7.87e-42
Time:,20:53:10,Log-Likelihood:,-545.15
No. Observations:,50,AIC:,1100.0
Df Residuals:,45,BIC:,1110.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
R&D Spend,0.7182,0.066,10.916,0.000,0.586,0.851
Administration,0.3113,0.035,8.885,0.000,0.241,0.382
Marketing Spend,0.0786,0.023,3.429,0.001,0.032,0.125
State_Florida,3464.4536,4905.406,0.706,0.484,-6415.541,1.33e+04
State_New York,5067.8937,4668.238,1.086,0.283,-4334.419,1.45e+04

0,1,2,3
Omnibus:,1.355,Durbin-Watson:,1.288
Prob(Omnibus):,0.508,Jarque-Bera (JB):,1.241
Skew:,-0.237,Prob(JB):,0.538
Kurtosis:,2.391,Cond. No.,828000.0


In [198]:
import numpy as np

In [199]:
ones=np.ones((50,1))

In [200]:
X_new=np.append(arr=ones,values=X,axis=1) # add ones and X
#axis=0 means row wise add
# axis=1 means col wise add

In [201]:
model_ols=sm.OLS(endog=y,exog=X_new).fit()  

In [202]:
model_ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Thu, 16 Apr 2020",Prob (F-statistic):,1.34e-27
Time:,20:53:11,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,0.8060,0.046,17.369,0.000,0.712,0.900
x2,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x3,0.0270,0.017,1.574,0.123,-0.008,0.062
x4,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x5,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [203]:
#thus by backward elemination we can see that only 
# const and x1 we can predict
# sowe will removeall variables with pvalue>significane level

In [204]:
X_new=X_new[:,0:5] #removed x5 i.e new york

In [205]:
model_ols=sm.OLS(endog=y,exog=X_new).fit()  

In [206]:
model_ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Thu, 16 Apr 2020",Prob (F-statistic):,8.49e-29
Time:,20:53:12,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,0.8060,0.046,17.606,0.000,0.714,0.898
x2,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x3,0.0270,0.017,1.592,0.118,-0.007,0.061
x4,220.1585,2900.536,0.076,0.940,-5621.821,6062.138

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [207]:
X_new=X_new[:,0:4] #removed x4 

In [208]:
model_ols=sm.OLS(endog=y,exog=X_new).fit()  

In [209]:
model_ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Thu, 16 Apr 2020",Prob (F-statistic):,4.53e-30
Time:,20:53:13,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [210]:
X_new=X_new[:,0:3] #removed x3 

In [211]:
model_ols=sm.OLS(endog=y,exog=X_new).fit()  

In [212]:
model_ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,426.8
Date:,"Thu, 16 Apr 2020",Prob (F-statistic):,7.29e-31
Time:,20:53:13,Log-Likelihood:,-526.83
No. Observations:,50,AIC:,1060.0
Df Residuals:,47,BIC:,1065.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.489e+04,6016.718,9.122,0.000,4.28e+04,6.7e+04
x1,0.8621,0.030,28.589,0.000,0.801,0.923
x2,-0.0530,0.049,-1.073,0.289,-0.152,0.046

0,1,2,3
Omnibus:,14.678,Durbin-Watson:,1.189
Prob(Omnibus):,0.001,Jarque-Bera (JB):,20.449
Skew:,-0.961,Prob(JB):,3.63e-05
Kurtosis:,5.474,Cond. No.,665000.0


In [213]:
X_new=X_new[:,[0,1,2]] 

In [214]:
model_ols=sm.OLS(endog=y,exog=X_new).fit()  

In [215]:
model_ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,426.8
Date:,"Thu, 16 Apr 2020",Prob (F-statistic):,7.29e-31
Time:,20:53:14,Log-Likelihood:,-526.83
No. Observations:,50,AIC:,1060.0
Df Residuals:,47,BIC:,1065.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.489e+04,6016.718,9.122,0.000,4.28e+04,6.7e+04
x1,0.8621,0.030,28.589,0.000,0.801,0.923
x2,-0.0530,0.049,-1.073,0.289,-0.152,0.046

0,1,2,3
Omnibus:,14.678,Durbin-Watson:,1.189
Prob(Omnibus):,0.001,Jarque-Bera (JB):,20.449
Skew:,-0.961,Prob(JB):,3.63e-05
Kurtosis:,5.474,Cond. No.,665000.0


In [216]:
X_new=X_new[:,0:2] #removed x2 

In [217]:
model_ols=sm.OLS(endog=y,exog=X_new).fit()  

In [218]:
model_ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Thu, 16 Apr 2020",Prob (F-statistic):,3.5000000000000004e-32
Time:,20:53:15,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0
