# Restaurant


# Data Extraction

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
label_encoder =LabelEncoder()
from sklearn.model_selection import RepeatedStratifiedKFold

#reading the dataset
df = pd.read_csv('zomato.csv',header = 0)
dataset = df.copy()

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 17 columns):
url                            51717 non-null object
address                        51717 non-null object
name                           51717 non-null object
online_order                   51717 non-null object
book_table                     51717 non-null object
rate                           43942 non-null object
votes                          51717 non-null int64
phone                          50509 non-null object
location                       51696 non-null object
rest_type                      51490 non-null object
dish_liked                     23639 non-null object
cuisines                       51672 non-null object
approx_cost(for two people)    51371 non-null object
reviews_list                   51717 non-null object
menu_item                      51717 non-null object
listed_in(type)                51717 non-null object
listed_in(city)                51717 non-nul

# Data Preprocessing

In [4]:
# Removing '/5' from rate column
df['rate'] = df['rate'].apply(lambda x: str(x).split('/')[0])
df['rate'].value_counts()
df['rate'].isnull().sum()

0

### Label encoding

In [5]:
df['book_table']= label_encoder.fit_transform(df['book_table'])
df['online_order']= label_encoder.fit_transform(df['online_order'])
df.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,1,1,4.1,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,1,0,4.1,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,1,0,3.8,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,0,0,3.7,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,0,0,3.8,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [6]:
#Counting the number of dishes liked and label encoding
df['dish_count']=df['dish_liked'].str.split(',').str.len()
df['dish_count'] = df['dish_count'].replace(np.nan, 0)
df['dish_count'].unique()

array([7., 1., 2., 0., 3., 4., 6., 5.])

In [7]:
#Counting the number of cusines offered and label encoding
df['cusine_count']=df['cuisines'].str.split(',').str.len()

df['cusine_count'] = df['cusine_count'].replace(np.nan, 0)
df['cusine_count'].unique()

array([3., 2., 1., 4., 5., 8., 7., 6., 0.])

In [8]:
# deleting unnecessary data
del df['url']
del df['address']
del df['rest_type']
del df['dish_liked']
del df['cuisines']
del df['listed_in(type)']
del df['listed_in(city)']
del df['reviews_list']
del df['menu_item']
del df['phone']


In [9]:
df.head()

Unnamed: 0,name,online_order,book_table,rate,votes,location,approx_cost(for two people),dish_count,cusine_count
0,Jalsa,1,1,4.1,775,Banashankari,800,7.0,3.0
1,Spice Elephant,1,0,4.1,787,Banashankari,800,7.0,3.0
2,San Churro Cafe,1,0,3.8,918,Banashankari,800,7.0,3.0
3,Addhuri Udupi Bhojana,0,0,3.7,88,Banashankari,300,1.0,2.0
4,Grand Village,0,0,3.8,166,Basavanagudi,600,2.0,2.0


In [10]:
#renaming Column
df = df.rename(columns = {"approx_cost(for two people)" : "avg_cost"})
df = df.rename(columns = {"rate" : "ratings"})
df['ratings'] = pd.to_numeric(df['ratings'], errors='coerce')
df['ratings'] = pd.to_numeric(df['ratings'], errors='coerce')
df.head()


Unnamed: 0,name,online_order,book_table,ratings,votes,location,avg_cost,dish_count,cusine_count
0,Jalsa,1,1,4.1,775,Banashankari,800,7.0,3.0
1,Spice Elephant,1,0,4.1,787,Banashankari,800,7.0,3.0
2,San Churro Cafe,1,0,3.8,918,Banashankari,800,7.0,3.0
3,Addhuri Udupi Bhojana,0,0,3.7,88,Banashankari,300,1.0,2.0
4,Grand Village,0,0,3.8,166,Basavanagudi,600,2.0,2.0


In [11]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

df.head()

Unnamed: 0,name,online_order,book_table,ratings,votes,location,avg_cost,dish_count,cusine_count
0,Jalsa,1,1,4.1,775,Banashankari,800,7.0,3.0
1,Spice Elephant,1,0,4.1,787,Banashankari,800,7.0,3.0
2,San Churro Cafe,1,0,3.8,918,Banashankari,800,7.0,3.0
3,Addhuri Udupi Bhojana,0,0,3.7,88,Banashankari,300,1.0,2.0
4,Grand Village,0,0,3.8,166,Basavanagudi,600,2.0,2.0


In [12]:
#Records with null values are dropped

df['ratings'] = df['ratings'].astype(float)
df['votes'] = pd.to_numeric(df['votes'], errors='coerce')
df['avg_cost'] = pd.to_numeric(df['avg_cost'], errors='coerce')
df=df.dropna()


In [13]:
df.corr()

Unnamed: 0,online_order,book_table,ratings,votes,avg_cost,dish_count,cusine_count
online_order,1.0,0.065909,0.151704,0.056661,0.072401,0.162356,0.105323
book_table,0.065909,1.0,0.229809,0.243101,0.352069,0.267618,0.171986
ratings,0.151704,0.229809,1.0,0.347258,0.195634,0.533398,0.140001
votes,0.056661,0.243101,0.347258,1.0,0.273401,0.40958,0.182707
avg_cost,0.072401,0.352069,0.195634,0.273401,1.0,0.398381,0.336868
dish_count,0.162356,0.267618,0.533398,0.40958,0.398381,1.0,0.267622
cusine_count,0.105323,0.171986,0.140001,0.182707,0.336868,0.267622,1.0


## Linear Regression

In [14]:
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
x= df[['online_order','book_table','votes','avg_cost','dish_count','cusine_count']]
y=df[['ratings']]

Xtrain, Xtest, ytrain, ytest = train_test_split(x, y, random_state=0,test_size=0.2)

ytest.shape

(6953, 1)

## SCALING- Fit and Transforming Training Data ,  AND Transforming Test 

In [15]:
Xtrain_scaled = scaler.fit_transform(Xtrain)
Xtest_scaled=scaler.transform(Xtest)
Xtrain_scaled.max(axis=0)

array([1., 1., 1., 1., 1., 1.])

In [16]:
lin_regr = linear_model.LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
lin_regr.fit(Xtrain_scaled, ytrain)
y_pred = lin_regr.predict(Xtest_scaled)
y_pred.reshape(-1,1)
from sklearn.metrics import mean_absolute_error

result1=mean_absolute_error(y_pred,ytest)


In [17]:
ytest.head()

Unnamed: 0,ratings
27410,3.5
47642,3.3
27074,4.0
21479,3.7
47980,3.8


In [18]:
y_pred[:5]

array([[3.48406772],
       [3.48905481],
       [3.88100604],
       [3.44311892],
       [3.54021464]])

In [19]:
result1=mean_absolute_error(y_pred,ytest)
result1

0.2549452431361428

## Support Vector Machine


In [21]:
from sklearn.svm import SVR # "Support vector classifier"
SVR_model = SVR()


In [22]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
from sklearn.model_selection import GridSearchCV 
param_grid = {'C': [  10, 100],  
              'gamma': [1, 0.1], 
              'kernel': ['rbf']}
grid = GridSearchCV(SVR(), param_grid, refit = True, verbose = 3 ,n_jobs=7, )
grid.fit(Xtrain_scaled, ytrain)

[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=7)]: Done   4 out of  12 | elapsed:   38.2s remaining:  1.3min
[Parallel(n_jobs=7)]: Done   9 out of  12 | elapsed:  1.2min remaining:   24.3s
[Parallel(n_jobs=7)]: Done  12 out of  12 | elapsed:  1.4min finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='auto_deprecated', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='warn', n_jobs=7,
             param_grid={'C': [10, 100], 'gamma': [1, 0.1], 'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [23]:
print(grid.best_params_) 
grid_predictions = grid.predict(Xtest_scaled)

{'C': 100, 'gamma': 1, 'kernel': 'rbf'}


In [24]:
#y_pred1=SVR_model.predict(Xtest)
result2=mean_absolute_error(grid_predictions,ytest)
result2

0.23338189452818234

## decision Tree

In [25]:
from sklearn import tree
from sklearn.metrics import make_scorer
from sklearn.pipeline import make_pipeline
rmse_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
pipe_tree = make_pipeline(tree.DecisionTreeRegressor(random_state=1))
depths = np.arange(20, 30)
num_leafs = [1,5,10]
param_grid = [{'decisiontreeregressor__max_depth':depths,
              'decisiontreeregressor__min_samples_leaf':num_leafs}]
gs = GridSearchCV(estimator=pipe_tree, param_grid=param_grid, scoring=rmse_scorer, cv=10)

gs.fit(Xtrain_scaled, ytrain)


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('decisiontreeregressor',
                                        DecisionTreeRegressor(criterion='mse',
                                                              max_depth=None,
                                                              max_features=None,
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split=None,
                                                              min_samples_leaf=1,
                                                              min_samples_split=2,
                                                              min_weight_fraction_leaf=0.0,
                                                              presort=False,
      

In [26]:
print(gs.best_params_) 

{'decisiontreeregressor__max_depth': 27, 'decisiontreeregressor__min_samples_leaf': 1}


In [27]:

y_pred2=gs.predict(Xtest_scaled)
result3=mean_absolute_error(y_pred2,ytest)
result3

0.0893824591007898

## Random forrest

In [35]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor()
# Train the model on training data
#rf.fit(Xtrain, ytrain)



n_estimators = [775,780]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [18,19,20]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1]
# Method of selecting samples for training each tree
bootstrap = [ False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
CV_rfc = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
CV_rfc.fit(Xtrain_scaled, ytrain)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  6.3min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_state=N

In [36]:
CV_rfc.best_params_

{'n_estimators': 775,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': False}

In [37]:
y_pred4=CV_rfc.predict(Xtest_scaled)
result4=mean_absolute_error(y_pred4,ytest)
result4

0.09058183168445788

In [38]:
y_pred4[:5]

array([3.49967604, 3.34163748, 3.99675055, 3.24392075, 3.77908553])

## KNN

In [39]:
from sklearn import neighbors
from sklearn.neighbors import KNeighborsRegressor
from math import sqrt
from sklearn.metrics import mean_squared_error
rmse_val = []
for K in range(20):
    K = K+1
    model = neighbors.KNeighborsRegressor(n_neighbors = K)

    model.fit(Xtrain_scaled, ytrain)  #fit the model
    pred=model.predict(Xtest_scaled) #make prediction on test set
    error = (mean_absolute_error(ytest,pred)) #calculate rmse
    rmse_val.append(error) #store rmse values
    print('RMSE value for k= ' , K , 'is:', error)

RMSE value for k=  1 is: 0.09106860348051202
RMSE value for k=  2 is: 0.10417805263914856
RMSE value for k=  3 is: 0.1183757610623712
RMSE value for k=  4 is: 0.13027829713792607
RMSE value for k=  5 is: 0.1413461815043866
RMSE value for k=  6 is: 0.1504842034613357
RMSE value for k=  7 is: 0.15858519446898567
RMSE value for k=  8 is: 0.16630770890263194
RMSE value for k=  9 is: 0.17373156271473544
RMSE value for k=  10 is: 0.17976556881921474
RMSE value for k=  11 is: 0.18545428395852673
RMSE value for k=  12 is: 0.19048252552854883
RMSE value for k=  13 is: 0.1947239155206939
RMSE value for k=  14 is: 0.19839740297096833
RMSE value for k=  15 is: 0.2014995925020375
RMSE value for k=  16 is: 0.20420771609377245
RMSE value for k=  17 is: 0.20683073747261022
RMSE value for k=  18 is: 0.209026479377407
RMSE value for k=  19 is: 0.21079579431824202
RMSE value for k=  20 is: 0.21241766144110452


In [40]:
from sklearn.neighbors import KNeighborsRegressor
reg = KNeighborsRegressor(n_neighbors=1)
reg.fit(Xtrain_scaled, ytrain)
y_pred5=reg.predict(Xtest_scaled)
result5=mean_absolute_error(y_pred5,ytest)
result5

0.09106860348051202

## ADA Boost

In [41]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
base = DecisionTreeRegressor(max_depth=27)
ada=AdaBoostRegressor(base_estimator=base)
search_grid={'n_estimators':[10, 50, 100, 500],'learning_rate':[.001,0.01,.1]}
search=GridSearchCV(estimator=ada,param_grid=search_grid,scoring=rmse_scorer,n_jobs=6,cv=3, verbose = 3 )
search.fit(Xtrain_scaled,ytrain)
search.best_params_

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:   37.0s
[Parallel(n_jobs=6)]: Done  36 out of  36 | elapsed:  1.1min finished
  y = column_or_1d(y, warn=True)


{'learning_rate': 0.001, 'n_estimators': 100}

In [42]:
y_pred6=search.predict(Xtest_scaled)
result6=mean_absolute_error(y_pred6,ytest)
result6

0.08231928766267105

In [43]:
y_pred6[:5]

array([3.5       , 3.34137931, 4.        , 3.1       , 3.8       ])

## XGBOOST

In [44]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn import tree
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import make_scorer
from sklearn.pipeline import make_pipeline

from sklearn.metrics import mean_absolute_error
rmse_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

In [45]:
params = {
    
    'max_depth':[ 27,30],
    'min_child_weight': [1],
    'eta':[.12, 0.1],
    'subsample': [1],
    'colsample_bytree': [1],
    'n_estimators': [450, 400,500],
    # Other parameters
    'objective':['reg:linear']
}

xgb = XGBRegressor() 

grid = GridSearchCV(xgb, params ,scoring=rmse_scorer,n_jobs=6,cv=3, verbose = 3)
grid.fit(Xtrain_scaled, ytrain)
grid.best_params_

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  1.7min
[Parallel(n_jobs=6)]: Done  36 out of  36 | elapsed:  2.9min finished




{'colsample_bytree': 1,
 'eta': 0.12,
 'max_depth': 30,
 'min_child_weight': 1,
 'n_estimators': 450,
 'objective': 'reg:linear',
 'subsample': 1}

In [46]:
y_pred7=grid.predict(Xtest_scaled)
result7=mean_absolute_error(y_pred7,ytest)
result7

  "because it will generate extra copies and increase " +


0.0860856859148723