# Predicting Restaurant Food Cost (Dynamics Project -14)

In [41]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
import scipy
from scipy.stats import zscore
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso,Ridge
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.externals import joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
data1=pd.read_csv('restauranttrain.csv', sep='\t',na_values='-')
data2=pd.read_csv('restauranttest.csv', sep='\t',na_values='-')
# Here in this hackathon, we will be predicting the cost of the food served by the restaurants across different cities in India. 
# we will use our Data Science skills to investigate the factors that really affect the cost.
# Size of training set: 12,690 records
# Size of test set: 4,231 records
# FEATURES:
# TITLE: The feature of the restaurant which can help identify what and for whom it is suitable for.
# RESTAURANT_ID: A unique ID for each restaurant.
# CUISINES: The variety of cuisines that the restaurant offers.
# TIME: The open hours of the restaurant.
# CITY: The city in which the restaurant is located.
# LOCALITY: The locality of the restaurant.
# RATING: The average rating of the restaurant by customers.
# VOTES: The overall votes received by the restaurant.
# COST: The average cost of a two-person meal.(TARGET)

In [3]:
data1

Unnamed: 0,TITLE,RESTAURANT_ID,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST
0,CASUAL DINING,9438,"Malwani, Goan, North Indian","11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",Thane,Dombivali East,3.6,49 votes,1200
1,"CASUAL DINING,BAR",13198,"Asian, Modern Indian, Japanese",6pm – 11pm (Mon-Sun),Chennai,Ramapuram,4.2,30 votes,1500
2,CASUAL DINING,10915,"North Indian, Chinese, Biryani, Hyderabadi","11am – 3:30pm, 7pm – 11pm (Mon-Sun)",Chennai,Saligramam,3.8,221 votes,800
3,QUICK BITES,6346,"Tibetan, Chinese",11:30am – 1am (Mon-Sun),Mumbai,Bandra West,4.1,24 votes,800
4,DESSERT PARLOR,15387,Desserts,11am – 1am (Mon-Sun),Mumbai,Lower Parel,3.8,165 votes,300
...,...,...,...,...,...,...,...,...,...
12685,QUICK BITES,13228,"North Indian, Burger, Kebab","12noon – 12midnight (Mon, Tue, Wed, Thu, Sun)...",Hyderabad,Gachibowli,3.8,546 votes,500
12686,"CASUAL DINING,BAR",9686,"Goan, Continental","12noon – 1am (Mon-Fri),11am – 5pm, 7pm – 1am...",Mumbai,Bandra Kurla Complex,4.3,1214 votes,1800
12687,LOUNGE,11133,"Finger Food, Continental, Asian, Chinese",12noon – 12:30AM (Mon-Sun),Navi Mumbai,Vashi,4.0,608 votes,1300
12688,CASUAL DINING,6134,"North Indian, South Indian, Chinese, Street Food",6am – 10:45pm (Mon-Sun),Chennai,Maduravoyal,3.5,32 votes,400


In [4]:
data2

Unnamed: 0,TITLE,RESTAURANT_ID,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES
0,CASUAL DINING,4085,"North Indian, Chinese, Mughlai, Kebab",12noon – 12midnight (Mon-Sun),Noida,Sector 18,4.3,564 votes
1,QUICK BITES,12680,"South Indian, Fast Food, Pizza, North Indian",7am – 12:30AM (Mon-Sun),Mumbai,Grant Road,4.2,61 votes
2,CASUAL DINING,1411,"North Indian, Seafood, Biryani, Chinese",11am – 11:30pm (Mon-Sun),Mumbai,Marine Lines,3.8,350 votes
3,,204,Biryani,"9am – 10pm (Mon, Wed, Thu, Fri, Sat, Sun), 10:...",Faridabad,NIT,3.8,1445 votes
4,QUICK BITES,13453,"South Indian, Kerala",11am – 10pm (Mon-Sun),Kochi,Kaloor,3.6,23 votes
...,...,...,...,...,...,...,...,...
4226,CASUAL DINING,9057,"North Indian, Mughlai, Chinese",11:30am – 11:30pm (Mon-Sun),New Delhi,Punjabi Bagh,3.9,287 votes
4227,,1247,"Biryani, North Indian, Sandwich, Salad, Wraps",11am – 1am (Mon-Sun),Bangalore,HSR Layout,4.3,469 votes
4228,QUICK BITES,8617,"Continental, North Indian",9:30am – 10:30pm (Mon-Sun),Faridabad,Sector 86,3.7,53 votes
4229,QUICK BITES,6485,"Rolls, Beverages","11am – 11:30pm (Mon, Tue, Wed, Thu, Sat, Sun),...",Kochi,Kochi,,


In [5]:
dftrain=pd.DataFrame(data=data1)
dftrain

Unnamed: 0,TITLE,RESTAURANT_ID,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST
0,CASUAL DINING,9438,"Malwani, Goan, North Indian","11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)",Thane,Dombivali East,3.6,49 votes,1200
1,"CASUAL DINING,BAR",13198,"Asian, Modern Indian, Japanese",6pm – 11pm (Mon-Sun),Chennai,Ramapuram,4.2,30 votes,1500
2,CASUAL DINING,10915,"North Indian, Chinese, Biryani, Hyderabadi","11am – 3:30pm, 7pm – 11pm (Mon-Sun)",Chennai,Saligramam,3.8,221 votes,800
3,QUICK BITES,6346,"Tibetan, Chinese",11:30am – 1am (Mon-Sun),Mumbai,Bandra West,4.1,24 votes,800
4,DESSERT PARLOR,15387,Desserts,11am – 1am (Mon-Sun),Mumbai,Lower Parel,3.8,165 votes,300
...,...,...,...,...,...,...,...,...,...
12685,QUICK BITES,13228,"North Indian, Burger, Kebab","12noon – 12midnight (Mon, Tue, Wed, Thu, Sun)...",Hyderabad,Gachibowli,3.8,546 votes,500
12686,"CASUAL DINING,BAR",9686,"Goan, Continental","12noon – 1am (Mon-Fri),11am – 5pm, 7pm – 1am...",Mumbai,Bandra Kurla Complex,4.3,1214 votes,1800
12687,LOUNGE,11133,"Finger Food, Continental, Asian, Chinese",12noon – 12:30AM (Mon-Sun),Navi Mumbai,Vashi,4.0,608 votes,1300
12688,CASUAL DINING,6134,"North Indian, South Indian, Chinese, Street Food",6am – 10:45pm (Mon-Sun),Chennai,Maduravoyal,3.5,32 votes,400


# EDA

In [6]:
dftrain.info()
df1=dftrain.copy()
# Null values present in 'city','locality','rating' & 'votes' column
# datatypes - 7 object type & 2 integer type

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12690 entries, 0 to 12689
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   TITLE          12690 non-null  object
 1   RESTAURANT_ID  12690 non-null  int64 
 2   CUISINES       12690 non-null  object
 3   TIME           12690 non-null  object
 4   CITY           12578 non-null  object
 5   LOCALITY       12592 non-null  object
 6   RATING         12193 non-null  object
 7   VOTES          11486 non-null  object
 8   COST           12690 non-null  int64 
dtypes: int64(2), object(7)
memory usage: 892.4+ KB


In [7]:
df1['CUISINES'].unique()

array(['Malwani, Goan, North Indian', 'Asian, Modern Indian, Japanese',
       'North Indian, Chinese, Biryani, Hyderabadi', ...,
       'North Indian, Burger, Kebab', 'Goan, Continental',
       'Finger Food, Continental, Asian, Chinese'], dtype=object)

In [8]:
le=LabelEncoder()

In [9]:
df1['CUISINES']=le.fit_transform(df1['CUISINES'])

In [10]:
df1['TIME'].unique()

array(['11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)', '6pm – 11pm (Mon-Sun)',
       '11am – 3:30pm, 7pm – 11pm (Mon-Sun)', ...,
       '12:1pm – 4am (Mon-Sun)', '12:30pm – 9:45pm (Mon-Sun)',
       '12noon – 1am (Mon-Fri),11am – 5pm, 7pm – 1am...'], dtype=object)

In [11]:
df1['TIME']=le.fit_transform(df1['TIME'])

In [12]:
df1['CITY'].unique()

array(['Thane', 'Chennai', 'Mumbai', 'Bangalore', 'Gurgaon', 'Hyderabad',
       'Kochi', 'Thane West', 'Andheri Lokhandwala', 'New Delhi',
       'Andheri West', 'Malad East', '682036', 'Bangalor', 'Navi Mumbai',
       'Bandra West', 'Delhi', 'Noida', 'Bangalore-560066',
       'Secunderabad', nan, 'India', 'Madhuranagar', 'Chennai Teynampet',
       'Faridabad', 'Chembur.', 'Maharashtra', 'opp gurudwara Shakurpur',
       'Telagana Land Line:040-48507016', 'Ghaziabad', 'Karnataka',
       'Kerala', 'Edappally', 'Kadavanthra', 'Ernakulam Circle kochi',
       'Bengalore', 'Near Reliance Fresh', 'Kilpauk', 'Bengaluru',
       'Kothaguda', 'Goregaon West', 'Banglore', 'Tamil Nadu', 'Kakkanad',
       'Kochi Elamkulam', 'Outer Ring Road', 'Mulund East',
       'Secunderabad main road near signal NMREC COLLEGE', 'Telangana',
       'Ponnuruni Kochi', 'Gachibowli', 'Semmancheri',
       '5th Main Teachers Colony Koramangala Block 1 Bangalore 560034',
       'Mumbai Mahim', 'Powai (Next to

In [13]:
imp=SimpleImputer(strategy='most_frequent')
df1['CITY']=imp.fit_transform(df1['CITY'].values.reshape(-1,1))
df1['CITY']=le.fit_transform(df1['CITY'])

In [14]:
df1['LOCALITY'].unique()

array(['Dombivali East', 'Ramapuram', 'Saligramam', ..., 'Market Road',
       'Near Perambur', 'Near Malviya Nagar'], dtype=object)

In [15]:
df1['LOCALITY']=imp.fit_transform(df1['LOCALITY'].values.reshape(-1,1))
df1['LOCALITY']=le.fit_transform(df1['LOCALITY'])

In [16]:
df1['RATING'].unique()

array(['3.6', '4.2', '3.8', '4.1', '4.0', '4.3', '3.9', '3.3', '3.4', nan,
       '4.5', '3.5', '4.4', '2.7', '3.7', '4.7', 'NEW', '3.1', '2.5',
       '4.6', '2.8', '3.0', '3.2', '2.6', '2.9', '4.9', '4.8', '2.4',
       '2.3', '2.0', '2.1', '2.2'], dtype=object)

In [17]:
dr=df1[df1['RATING']=='NEW']
dr
# total 707 restaurants are categorised as 'New'

Unnamed: 0,TITLE,RESTAURANT_ID,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST
62,QUICK BITES,7841,2898,555,173,486,NEW,,200
69,,11096,3607,555,252,490,NEW,,200
111,CASUAL DINING,7095,2154,331,222,187,NEW,,800
117,QUICK BITES,8091,2946,555,252,293,NEW,,500
120,QUICK BITES,4993,3463,1979,44,1385,NEW,,150
...,...,...,...,...,...,...,...,...,...
12602,QUICK BITES,11176,3668,1156,222,636,NEW,,200
12613,CASUAL DINING,397,310,1123,119,264,NEW,,950
12630,CASUAL DINING,9959,4088,2254,119,364,NEW,,1200
12658,,9384,2549,758,173,233,NEW,,400


In [37]:
df1['RATING']=imp.fit_transform(df1['RATING'].values.reshape(-1,1))
df1['RATING']=le.fit_transform(df1['RATING'])

In [19]:
df1['VOTES'].unique()

array(['49 votes', '30 votes', '221 votes', ..., '2723 votes',
       '1426 votes', '1214 votes'], dtype=object)

In [21]:
df1n=df1.dropna(axis=0)
# dropping of the rows with missing values

In [38]:
df1n['VOTES']=le.fit_transform(df1n['VOTES'])

In [39]:
df11n=df1n.drop('TITLE',axis=1)
df11n
# the column 'Title' can be dropped for model training purpose

Unnamed: 0,RESTAURANT_ID,CUISINES,TIME,CITY,LOCALITY,RATING,VOTES,COST
0,9438,2627,739,320,285,16,1278,1200
1,13198,289,1885,67,1046,22,950,1500
2,10915,2976,661,67,1101,18,712,800
3,6346,4130,275,222,136,21,787,800
4,15387,1766,615,222,620,18,470,300
...,...,...,...,...,...,...,...,...
12685,13228,2934,1144,131,342,18,1349,500
12686,9686,2143,1177,222,133,23,191,1800
12687,11133,2088,1137,239,1342,20,1428,1300
12688,6134,3506,1837,67,631,15,996,400


# Model Training 

In [25]:
x=df11n.iloc[:,0:-1]
y=df11n.iloc[:,-1]

In [26]:
x.shape,y.shape

((11486, 7), (11486,))

In [27]:
sc=StandardScaler()
x=sc.fit_transform(x)

In [28]:
maxr=0
for i in range(41,100):
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=i,test_size=0.3)
    lm=LinearRegression()
    lm.fit(x_train,y_train)
    pred=lm.predict(x_test)
    score=lm.score(x_train,y_train)
    print('accuracy score correspondng to random state',i,'is:',score)
    if score>maxr:
        maxr=score
        finalr=i
print()
print('maximum score corresponding to random state',finalr,'is',maxr)

accuracy score correspondng to random state 41 is: 0.11262614688540307
accuracy score correspondng to random state 42 is: 0.10712027909904298
accuracy score correspondng to random state 43 is: 0.10695987398342632
accuracy score correspondng to random state 44 is: 0.10550075253421398
accuracy score correspondng to random state 45 is: 0.1080501275480098
accuracy score correspondng to random state 46 is: 0.11264359796716329
accuracy score correspondng to random state 47 is: 0.10849478920738331
accuracy score correspondng to random state 48 is: 0.11446653407634731
accuracy score correspondng to random state 49 is: 0.1063185143348152
accuracy score correspondng to random state 50 is: 0.10147317963683578
accuracy score correspondng to random state 51 is: 0.10917028444734878
accuracy score correspondng to random state 52 is: 0.10695827068400109
accuracy score correspondng to random state 53 is: 0.11518552069437206
accuracy score correspondng to random state 54 is: 0.10767177163410069
accuracy

In [29]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=98,test_size=0.3)
lm.fit(x_train,y_train)
## fitting the training part with algorithm. 
print('score:',lm.score(x_train,y_train))


score: 0.12073700132773035


In [30]:
ls=Lasso()
alphavalue={'alpha':[1.0,0.1,0.01,0.001,0]}
grid=GridSearchCV(ls,param_grid=alphavalue)
grid.fit(x,y)
print(grid)
print(grid.best_score_)
print(grid.best_params_)

GridSearchCV(cv=None, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [1.0, 0.1, 0.01, 0.001, 0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
0.10792380736798764
{'alpha': 1.0}


In [31]:
ls=Lasso(alpha=1.0)
ls.fit(x_train,y_train)
ls.score(x_train,y_train)

0.12071890685407329

In [32]:
rf=RandomForestRegressor(n_estimators=450,random_state=50)
rf.fit(x_train,y_train)
print(rf.score(x_train,y_train))
# used ensemble technique to achieve better score

0.9214091010479621


In [40]:
df2=pd.DataFrame(data=data2)
df2['CUISINES']=le.fit_transform(df2['CUISINES'])
df2['TIME']=le.fit_transform(df2['TIME'])
df2['CITY']=imp.fit_transform(df2['CITY'].values.reshape(-1,1))
df2['CITY']=le.fit_transform(df2['CITY'])
df2['LOCALITY']=imp.fit_transform(df2['LOCALITY'].values.reshape(-1,1))
df2['LOCALITY']=le.fit_transform(df2['LOCALITY'])
df2['RATING']=imp.fit_transform(df2['RATING'].values.reshape(-1,1))
df2['RATING']=le.fit_transform(df2['RATING'])
df2n=df2.dropna(axis=0)
df2n['VOTES']=le.fit_transform(df2n['VOTES'])
df22n=df2n.drop('TITLE',axis=1)


In [34]:
x_test=df22n
x_test=sc.fit_transform(x_test)

In [35]:
predr=rf.predict(x_test)
predr

array([1221.55555556,  496.35555556,  725.33333333, ...,  738.22222222,
        690.42666667,  531.93333333])

save the model 

In [36]:
joblib.dump(rf,'rfcost.obj')

['rfcost.obj']