<h4>Importing modules

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import math

<h4>Reading dataset

In [2]:
df = pd.read_csv("dataset/Restaurant_Profit_Data.csv")
df.head()

Unnamed: 0,Miscellaneous_Expenses,Food_Innovation_Spend,Advertising,City,Profit
0,138671.8,167497.2,475918.1,Chicago,202443.83
1,153151.59,164745.7,448032.53,Mumbai,201974.06
2,102919.55,155589.51,412068.54,Tokyo,201232.39
3,120445.85,146520.41,387333.62,Chicago,193083.99
4,93165.77,144255.34,370302.42,Tokyo,176369.94


In [3]:
df.shape

(50, 5)

In [4]:
# One hot encoding of categorical column called city
city_dummy=pd.get_dummies(df['City'])
city_dummy.head()

Unnamed: 0,Chicago,Mumbai,Tokyo
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0
4,0,0,1


In [5]:
city_dummy=pd.get_dummies(df['City'], drop_first=True)
city_dummy.head()

Unnamed: 0,Mumbai,Tokyo
0,0,0
1,1,0
2,0,1
3,0,0
4,0,1


In [6]:
new_df = pd.concat([df,city_dummy], axis=1)
new_df.head()

Unnamed: 0,Miscellaneous_Expenses,Food_Innovation_Spend,Advertising,City,Profit,Mumbai,Tokyo
0,138671.8,167497.2,475918.1,Chicago,202443.83,0,0
1,153151.59,164745.7,448032.53,Mumbai,201974.06,1,0
2,102919.55,155589.51,412068.54,Tokyo,201232.39,0,1
3,120445.85,146520.41,387333.62,Chicago,193083.99,0,0
4,93165.77,144255.34,370302.42,Tokyo,176369.94,0,1


In [7]:
new_df.drop(['City'], axis=1, inplace=True)
new_df.head()

Unnamed: 0,Miscellaneous_Expenses,Food_Innovation_Spend,Advertising,Profit,Mumbai,Tokyo
0,138671.8,167497.2,475918.1,202443.83,0,0
1,153151.59,164745.7,448032.53,201974.06,1,0
2,102919.55,155589.51,412068.54,201232.39,0,1
3,120445.85,146520.41,387333.62,193083.99,0,0
4,93165.77,144255.34,370302.42,176369.94,0,1


In [8]:
new_df = new_df[["Miscellaneous_Expenses", "Food_Innovation_Spend", "Advertising", "Mumbai", "Tokyo", "Profit"]]
new_df.head()

Unnamed: 0,Miscellaneous_Expenses,Food_Innovation_Spend,Advertising,Mumbai,Tokyo,Profit
0,138671.8,167497.2,475918.1,0,0,202443.83
1,153151.59,164745.7,448032.53,1,0,201974.06
2,102919.55,155589.51,412068.54,0,1,201232.39
3,120445.85,146520.41,387333.62,0,0,193083.99
4,93165.77,144255.34,370302.42,0,1,176369.94


<h4>Preparing dependent and independent variable

In [9]:
# creating feature matrix and dependent variable vector
x=new_df.iloc[:,0:-1]
y=new_df.iloc[:,5]
x.head()

Unnamed: 0,Miscellaneous_Expenses,Food_Innovation_Spend,Advertising,Mumbai,Tokyo
0,138671.8,167497.2,475918.1,0,0
1,153151.59,164745.7,448032.53,1,0
2,102919.55,155589.51,412068.54,0,1
3,120445.85,146520.41,387333.62,0,0
4,93165.77,144255.34,370302.42,0,1


<h4>Split the Dataset into Training and Test Dataset

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0, shuffle=False)

In [11]:
len(X_train)

35

<h4> Creating and Training a Model for Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [13]:
print(model.intercept_)

59609.76074441006


In [14]:
print(model.coef_)

[-4.65661338e-02  8.07413705e-01  2.22656384e-02  5.93112256e+03
  1.78138518e+03]


In [15]:
prediction = model.predict(X_test)
prediction

array([ 99121.5245791 ,  84840.12163536, 104866.2556507 ,  78741.2770224 ,
        98449.63122739,  88828.46057989,  85373.42658791,  85188.81667687,
        68732.55187985,  78613.42420255,  56422.18776935,  65419.35882969,
        60978.337698  ,  59381.70336516,  62842.96951076])

In [35]:
y_test.values

array([106661.51, 100890.19, 100131.14,  91411.06,  91187.76,  88421.91,
        87980.83,  81680.49,  79940.98,  75382.33,  75108.08,  59672.75,
        52741.73,  45855.41,  24863.4 ])

In [40]:
print(y_test.shape,prediction.shape)

(15,) (15,)


In [43]:
compare_df=pd.DataFrame(data=y_test, columns=['y_test'])
compare_df['prediction'] = prediction
compare_df.head()

# compare_df=pd.DataFrame({'Actual':y_test.flatten(),'Predicted':prediction.flatten()})
# compare_df.head()

Unnamed: 0,y_test,prediction
0,,99121.524579
1,,84840.121635
2,,104866.255651
3,,78741.277022
4,,98449.631227


In [32]:
from sklearn.metrics import mean_squared_error
MSE = mean_squared_error(y_test, prediction)
MSE

185394179.48436058

In [19]:
RMSE = math.sqrt(MSE)
RMSE


13615.953124345007

In [20]:
from sklearn.metrics import mean_absolute_error
MAE = mean_absolute_error(y_test, prediction)
MAE

10226.239884469172

In [21]:
from sklearn.metrics import r2_score
r_square = r2_score(y_test, prediction)
print('R-Square Error of Simple Linear Regression:', r_square)

R-Square Error of Simple Linear Regression: 0.6167464436796346
