In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.formula.api as sm

In [2]:
df = pd.read_csv("50_Startups.csv")

In [3]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
X = df.iloc[0:, :-1].values
Y = df.iloc[0:, 4].values

In [5]:
print("The Shape of X is {}".format(X.shape))
print("The Shape of Y is {}".format(Y.shape))

The Shape of X is (50, 4)
The Shape of Y is (50,)


In [6]:
#Encoding The Coloun State into 0's & 1'S
labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])
onehotencoder = OneHotEncoder(categorical_features = [3])
X = onehotencoder.fit_transform(X).toarray().astype('int32')

In [7]:
#To avoid the dummy variable trap
X = X[:, 1:]

In [8]:
#Splitting The Dataset into Train and Test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [9]:
#Checking is the split was right
print("The Shape of X_train is {}".format(X_train.shape))
print("The Shape of X_test is {}".format(X_test.shape))
print("The Shape of Y_train is {}".format(y_train.shape))
print("The Shape of Y_test is {}".format(y_test.shape))

The Shape of X_train is (40, 5)
The Shape of X_test is (10, 5)
The Shape of Y_train is (40,)
The Shape of Y_test is (10,)


In [10]:
#Fitting The Model in the train set
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [11]:
#predicting The results
y_pred = regressor.predict(X_test)

In [12]:
#Comparing The Results
y_pred.astype('int32')

array([103015, 132581, 132448,  71975, 178537, 116161,  67851,  98791,
       113969, 167921])

In [13]:
y_test.astype('int32')

array([103282, 144259, 146121,  77798, 191050, 105008,  81229,  97483,
       110352, 166187])

In [14]:
print('Coefficients: \n', regressor.coef_)

Coefficients: 
 [-9.59283817e+02  6.99250317e+02  7.73467731e-01  3.28850211e-02
  3.66099735e-02]


In [15]:
# The mean squared error
print("The Mean Squared Error is: %.2f" % mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print("Variance Score: %.2f" % r2_score(y_test, y_pred))
#Accuracy
acc = (r2_score(y_test, y_pred)*100).astype('int32')
print("Accuracy is: {}%".format(acc))

The Mean Squared Error is: 83503182.37
Variance Score: 0.93
Accuracy is: 93%


Building the optimal model using backward elimination technique

In [17]:
#Before we make the optimal model out of it, we need to add a coloum at first of 1's so that b0 isn't empyt
X = np.append(arr = np.ones((50, 1)).astype(int), values = X, axis = 1)

From the below OLS Regression Results we can find out the Variable having high P_value and reject it untill we have two values

In [22]:
X_opt = X[:, [0,1,2,3,4,5]]
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Sun, 09 Jun 2019",Prob (F-statistic):,1.34e-27
Time:,00:43:12,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.855,7.281,0.000,3.63e+04,6.4e+04
x1,198.7542,3371.026,0.059,0.953,-6595.103,6992.611
x2,-42.0063,3256.058,-0.013,0.990,-6604.161,6520.148
x3,0.8060,0.046,17.368,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.783,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.267
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


<h2>After Calculating the result is</h2>
We got

In [30]:
X_opt = X[:, [0, 3]]
list(X_opt[0:5])

[array([     1, 165349]),
 array([     1, 162597]),
 array([     1, 153441]),
 array([     1, 144372]),
 array([     1, 142107])]