In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

The basic idea of analyzing the Zomato dataset is to get a fair idea about the factors affecting the aggregate rating of each restaurant and the establishment of different types of restaurants at different places. Bengaluru is one such city, with more than 12,000 restaurants serving dishes from all over the world. With new restaurants opening each day, the industry hasn't been saturated yet and the demand is increasing day by day. In spite of the increasing demand, however, it has become difficult for new restaurants to compete with established restaurants, most of them serving the same food. Bengaluru is the IT capital of India, and most of the people here depend mainly on restaurant food as they don't have time to cook for themselves. With such an overwhelming demand for restaurants, it has therefore become important to study the demography of a location. What kind of food is more popular in a locality? Does the entire locality love vegetarian food? If yes, is that locality populated by a particular group of people, e.g. Jains, Marwaris or Gujaratis, who are mostly vegetarian? This kind of analysis can be done using the data, by studying different factors.

# Problem Statement

In this challenge, we analyse the Zomato restaurant dataset to find more insights about the restaurant business.

# Data Description

url : contains the url of the restaurant in the zomato website

address : contains the address of the restaurant in Bengaluru

name : contains the name of the restaurant

online_order : whether online ordering is available in the restaurant or not

book_table : table book option available or not

rate : contains the overall rating of the restaurant out of 5

votes : contains the total number of ratings for the restaurant as of the date the data was collected

phone : contains the phone number of the restaurant

location : contains the neighborhood in which the restaurant is located

rest_type : restaurant type

dish_liked : dishes people liked in the restaurant

cuisines : food styles, separated by comma

approx_cost(for two people) : contains the approximate cost for meal for two people

reviews_list : list of tuples containing reviews for the restaurant, each tuple consists of two values, rating and review by the customer

menu_item : contains list of menus available in the restaurant

listed_in(type) : type of meal

listed_in(city) : contains the neighborhood in which the restaurant is listed

In [None]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (12, 5);
sns.set_style('whitegrid')
import matplotlib.colors as mcolors

!pip install geopandas
import geopandas as gpd
import folium
from folium.plugins import HeatMap

import os
import plotly.express as px 
import plotly.graph_objs as go
from plotly.offline import iplot
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=False)
from geopy.geocoders import Nominatim

! pip install dexplot
import dexplot as dxp

import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import plotly.offline as py
#from wordcloud import WordCloud
#from nltk.tokenize import sent_tokenize, word_tokenize
#from nltk.util import ngrams
#from sklearn.feature_extraction.text import CountVectorizer

# Real-world/Business Objectives and Constraints :

The cost of a mis-classification can be high.

No strict latency concerns.

It will help everyone to understand the insights of the restaurant business.

# Breakdown of this notebook:

1.Loading the dataset: Load the data and import the libraries.

2.Data Cleaning:

* Deleting redundant columns.
* Renaming the columns.
* Dropping duplicates.
* Cleaning individual columns. 

3.Data Visualization: Using plots to find relations between the features.

4.Finding the best cheap restaurants:

* Cheapest, Highest rated and largely voted.
* Is there a relation between cuisine,location and the cost? 

5.Exploring the best expensive restaurants.
* Restaurants that are expensive, Highest rated and largely voted.
* Is there a relation between restaurant type,location and the cost?

In [None]:
import matplotlib.ticker as mtick
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Read the Zomato Bangalore restaurants CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv('/kaggle/input/zomato-bangalore-restaurants/zomato.csv')

In [None]:
#topmost values in the dataset
df.head()

In [None]:
#shape of the dataset
df.shape

51717 rows and 17 columns

In [None]:
#gives information of dataset 
df.info()

In [None]:
#dataset description
df.describe()

In [None]:
# Number of missing values in each column
df.isnull().sum()

# rate, phone, location, rest_type, dish_liked, cuisines and approx_cost(for two people) are the features that have missing values; dish_liked is missing the most.


In [None]:
#dataset feature data-dtypes
df.dtypes

In [None]:
# Percentage of missing values per column.
# isnull() and isna() are aliases of each other, so a single call covers both.

print("Checking the Null or na percentage")
df.isna().sum() * 100 / len(df)

dish_liked is missing about 54% of its values, followed by rate at about 15%, and so on.



In [None]:
#the columns in the dataset
df.columns

In [None]:
#top 10 most url's  in our dataset
df['url'][:10]

In [None]:
#top 10 address in our dataset
df['address'][:10]

In [None]:
#the topmost names in our dataset
df['name'][:10]

In [None]:
#the topmost phone in our dataset
df['phone'][:10]

In [None]:
# Drop the columns that are not needed for the analysis
df = df.drop(columns=['url', 'phone'])

# Checking for duplicate values

In [None]:
df.duplicated().sum()

In [None]:
# Remove fully duplicated rows (rebinding df instead of mutating it in place)
df = df.drop_duplicates()

In [None]:
df.duplicated().sum()

In [None]:
#Drop null values

In [None]:
# Remove every row that has at least one missing value, then confirm that no nulls remain.
# NOTE(review): dish_liked alone is reported above as ~54% missing, so this discards more
# than half of the rows — confirm that this is intended for the whole analysis.
df = df.dropna(how='any')
df.isna().sum()

# Renaming columns appropriately

In [None]:
df.columns

In [None]:
df.shape

In [None]:
# Replace the awkward original column names with short, code-friendly ones
column_renames = {
    'approx_cost(for two people)': 'cost',
    'listed_in(type)': 'type',
    'listed_in(city)': 'city',
}
df = df.rename(columns=column_renames)
df.columns

In [None]:
df.head()

# Cleaning the dataset

In [None]:
df['cost'].unique()

# cleaning the dataset

In [None]:
df['cost'].unique()

In [None]:

# Convert the cost column from text such as '1,200' to a float.
# The vectorised .str accessor on a string-cast column is used instead of
# .apply(lambda x: x.replace(...)), which raises AttributeError on non-string values
# (e.g. when this cell is re-run after the column has already been converted).
df['cost'] = (
    df['cost']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [None]:
df['cost'].dtypes

In [None]:
print(df['cost'].unique())

print('---'*10)

df.dtypes

In [None]:
df.columns

In [None]:
df['rate'].unique()

In [None]:
# Keep only restaurants whose rating is not the placeholder 'NEW'
is_rated = df['rate'] != 'NEW'
df = df.loc[is_rated]

In [None]:
#df['rate'] = df['rate'].replace('New', np.nan)

In [None]:
# Strip the '/5' suffix from the ratings and convert them to floats right away, so that
# the distribution plot and the min()/max() cells further down operate on numbers
# rather than on strings.
# Values that are not numeric after stripping become NaN and are dropped, because
# 'rate' is later used as the regression target and must not contain missing values.
# Casting to str first keeps the cell safe to re-run once the column is already numeric.
rate_text = df['rate'].astype(str).str.replace('/5', '', regex=False).str.strip()
df = df.assign(rate=pd.to_numeric(rate_text, errors='coerce')).dropna(subset=['rate'])

In [None]:
df['rate'].unique()

In [None]:
df.columns

In [None]:
df['reviews_list']

# Visualisations

In [None]:
# Horizontal bar chart of the 20 restaurant names that appear most often (outlets per chain)
top_chains = df['name'].value_counts().head(20)
fig, ax = plt.subplots(figsize=(17, 10))
sns.barplot(x=top_chains, y=top_chains.index, palette='deep', ax=ax)
ax.set_title("Most famous restaurants chains in Bangaluru")
ax.set_xlabel("Number of outlets")
plt.show()

In [None]:
df['book_table'].value_counts()

In [None]:
# Number of restaurants with and without table booking.
# The column is passed as the keyword argument `x`: seaborn 0.12+ interprets the first
# positional argument as `data`, so the positional form is not portable across versions.
plt.figure(figsize = (12, 9))
sns.countplot(x=df['book_table'])
plt.xlabel('Book table', fontsize=20)
plt.ylabel('Count', fontsize=20)
plt.title("Distribution of book table", fontsize=30)


In [None]:
# Pie chart of how many restaurants do / do not offer table booking
booking_counts = df['book_table'].value_counts()
slice_style = dict(
    colors=['#800080', '#0000A0'],
    line=dict(color='#001000', width=2),
)
booking_pie = go.Pie(
    labels=booking_counts.index,
    values=booking_counts,
    textinfo="value",
    marker=slice_style,
)
fig = go.Figure(
    data=[booking_pie],
    layout=go.Layout(title="Table booking", width=600, height=600),
)
py.iplot(fig, filename='pie_chart_subplots')

# Most of the Restaurants do not offer table booking

In [None]:
# Number of restaurants that do / do not accept online orders.
# The column is passed as the keyword argument `x`: seaborn 0.12+ interprets the first
# positional argument as `data`, so the positional form is not portable across versions.
fig = plt.gcf()
fig.set_size_inches(10,10)
sns.countplot(x=df['online_order'])
plt.xlabel('online orders', fontsize=20)
plt.ylabel('Count', fontsize=20)
plt.title("Distribution of online orders", fontsize=30)

# Most Restaurants offer option for online order and delivery

# Rating Distributions

In [None]:
# How the ratings are distributed: distribution plot of the 'rate' column using 20 bins,
# drawn on the current figure resized to 10x10 inches
fig = plt.gcf()
fig.set_size_inches(10,10)

sns.distplot(df['rate'], bins=20)

We can infer from the plot above that most of the ratings are between 3.5 and 4.5.

In [None]:
df['rate'].unique()

In [None]:
df['rate'].min()

In [None]:
df['rate'].max()

In [None]:
df['rate']=df['rate'].astype(float)

In [None]:
((df['rate']>=1) & (df['rate']<2)).sum()

In [None]:
((df['rate']>=2) & (df['rate']<3)).sum()

In [None]:
((df['rate']>=3) & (df['rate']<4)).sum()

In [None]:
# Share of restaurants per rating band.
# Bands are half-open intervals [low, high) so every rating is counted exactly once,
# and the labels state the same bounds that the conditions use.
rate_values = df['rate'].astype(float)  # tolerate ratings that are still stored as text
slices = [
    ((rate_values >= low) & (rate_values < high)).sum()
    for low, high in [(1, 2), (2, 3), (3, 4)]
]
slices.append((rate_values >= 4).sum())

labels = ['1<=rate<2', '2<=rate<3', '3<=rate<4', '>=4']
# One colour per slice; with only three colours matplotlib would reuse the first one
# for the fourth slice, making two bands indistinguishable.
colors = ['#ff3333', '#c2c2d6', '#6699ff', '#66cc99']
plt.pie(slices, colors=colors, labels=labels, autopct='%1.0f%%',
        pctdistance=.5, labeldistance=1.2, shadow=True)
fig = plt.gcf()
plt.title("Percentage of Restaurants according to their ratings")

fig.set_size_inches(10,10)
plt.show()

In [None]:
#Types of Services
# Draw the count plot once and rotate its tick labels. Previously countplot was called
# twice, drawing the bars a second time on the same axes just to read the tick labels.
# The column is passed as keyword `x` because seaborn 0.12+ treats the first positional
# argument as `data`.
ax = sns.countplot(x=df['type'])
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
fig = plt.gcf()
fig.set_size_inches(12,12)
plt.title('Type of Service')

Here the two main service types are Delivery and Dine-out

In [None]:
from plotly.offline import iplot


In [None]:
# Interactive box plot of the approximate cost column.
# NOTE(review): the trace is named 'accepting online orders' although the data is not
# filtered by online_order — confirm whether a filter or a different label was intended.
cost_box = go.Box(
    y=df['cost'],
    name='accepting online orders',
    marker=dict(color='rgb(113, 10 , 100)'),
)
box_layout = go.Layout(
    title="Box plot of approximate cost",
    width=800,
    height=800,
    yaxis=dict(title='Price'),
)
fig = go.Figure(data=[cost_box], layout=box_layout)
py.iplot(fig)

# Distribution of charges

In [None]:
plt.figure(figsize=(12, 12))
sns.distplot(df['cost'])
plt.show()

In [None]:
df['dish_liked'][1]

In [None]:
df['dish_liked'][:10]

In [None]:
range(df.shape[0])

In [None]:
# Build a flat list with every liked dish of every restaurant.
# The index is reset to 0..n-1 because the earlier filtering steps left gaps in it.
df.index = range(df.shape[0])

# dish_liked holds comma-separated dish names. Surrounding whitespace is stripped so that
# 'Pasta' and ' Pasta' are counted as the same dish; empty fragments are skipped.
likes = [
    dish.strip()
    for dishes in df['dish_liked']
    for dish in dishes.split(',')
    if dish.strip()
]

In [None]:
df.index = range(df.shape[0])

In [None]:
df.index

In [None]:
print("Count of Most liked dishes in Banglore")
favourite_food = pd.Series(likes).value_counts()
favourite_food.head(30)

In [None]:
# Bar chart of the most frequently liked dishes.
# The number of bars and the title share one constant so they cannot drift apart
# (the title previously said "Top 30" while 20 bars were plotted).
TOP_N_DISHES = 20
ax = favourite_food.nlargest(n=TOP_N_DISHES, keep='first').plot(
    kind='bar',
    figsize=(18, 10),
    title=f'Top {TOP_N_DISHES} Favourite Food Counts',
)

# Write each bar's count just above its top-left corner
for bar in ax.patches:
    ax.annotate(str(bar.get_height()),
                (bar.get_x() * 1.005, bar.get_height() * 1.005))

In [None]:
# Horizontal bar chart of the 20 most common restaurant types.
# x and y are passed by keyword, matching the other barplot calls in this notebook;
# seaborn 0.12+ accepts only `data` positionally, so the two-positional form fails there.
plt.figure(figsize=(15, 7))
rest = df['rest_type'].value_counts()[:20]
sns.barplot(x=rest, y=rest.index)
plt.title("Restaurant Types")
plt.xlabel('Count')

In [None]:
# Same restaurant-type chart on a square 10x10 inch figure.
# x and y are passed by keyword, matching the other barplot calls in this notebook;
# seaborn 0.12+ accepts only `data` positionally, so the two-positional form fails there.
fig = plt.gcf()
fig.set_size_inches(10,10)
rest = df['rest_type'].value_counts()[:20]
sns.barplot(x=rest, y=rest.index)
plt.title("Restuarant types")
plt.xlabel('Count')

# Most famous Restaurants



In [None]:
# Horizontal bar chart of the 20 restaurant names with the most listings
outlet_counts = df['name'].value_counts().head(20)
fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(x=outlet_counts, y=outlet_counts.index, palette='Set1', ax=ax)
ax.set_title("Most famous restaurant chains in bangaluru", size=20, pad=20)
ax.set_xlabel("Number of outlets", size=15)

In [None]:
df.head()

# Convert the online categorical variables into a numeric format

In [None]:
# Encode online_order as 1 (Yes) / 0 (No).
# A single .loc assignment replaces the chained form df.online_order[mask] = value,
# which can write to a temporary copy and leave df unchanged.
df.loc[df['online_order'] == 'Yes', 'online_order'] = 1
df.loc[df['online_order'] == 'No', 'online_order'] = 0

In [None]:
df.online_order.value_counts()

In [None]:
df.online_order = pd.to_numeric(df.online_order)

# Change the string categorical variable into an integer categorical variable

In [None]:
# Encode book_table as 1 (Yes) / 0 (No).
# A single .loc assignment replaces the chained form df.book_table[mask] = value,
# which can write to a temporary copy and leave df unchanged.
df.loc[df['book_table'] == 'Yes', 'book_table'] = 1
df.loc[df['book_table'] == 'No', 'book_table'] = 0

In [None]:
df.book_table = pd.to_numeric(df.book_table)

In [None]:
df.book_table.value_counts()

Label-encode the categorical variables to make it easier to build the models.

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()  

In [None]:
# Replace each text column with integer codes; the shared encoder is refitted for every column
for text_column in ('location', 'rest_type', 'cuisines', 'menu_item'):
    df[text_column] = le.fit_transform(df[text_column])

In [None]:
# Select a subset of columns by position.
# NOTE(review): assuming the column order given in the Data Description with 'url' and
# 'phone' dropped, these positions are online_order, book_table, rate, votes, location,
# rest_type, cuisines, cost and menu_item — confirm with df.columns.
# In the visible notebook `data` is only referenced by the commented-out export below.
data = df.iloc[:, [2,3,4,5,6,7,9,10,12]]
#data.to_csv('zomato_df.csv')

In [None]:
# Build the feature matrix by column name rather than by position.
# The positional selection [2,3,4,5,6,7,9,10,12] included position 4, the 'rate' column,
# which is also the prediction target: every model could read the answer straight from
# its inputs (target leakage), making the reported R^2 scores meaningless.
feature_cols = ['online_order', 'book_table', 'votes', 'location',
                'rest_type', 'cuisines', 'cost', 'menu_item']
x = df[feature_cols]
x.head()

In [None]:
y = df['rate']
y

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows as a test set; random_state is fixed so the split is reproducible
x_train , x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                    random_state=10)

In [None]:
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)

In [None]:
from sklearn.metrics import r2_score
y_pred = lr_model.predict(x_test)
r2_score(y_test, y_pred)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with 650 trees and a fixed seed.
# min_samples_leaf is given as a float, which scikit-learn interprets as a fraction of
# the training samples (ceil(fraction * n_samples)) rather than as an absolute count.
rf_model = RandomForestRegressor(n_estimators=650, 
                                random_state=245, 
                                min_samples_leaf=.0001)
rf_model.fit(x_train, y_train)
# Score the held-out test set with the coefficient of determination (R^2)
y_predict = rf_model.predict(x_test)
r2_score(y_test, y_predict)

In [None]:
#Preparing Extra Tree Regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score

# Fix the seed so the reported score is reproducible, in line with the seeded
# train/test split and random forest earlier in the notebook.
ET_Model = ExtraTreesRegressor(n_estimators=120, random_state=10)
ET_Model.fit(x_train, y_train)
y_predict = ET_Model.predict(x_test)

# Score the held-out test set with the coefficient of determination (R^2)
r2_score(y_test, y_predict)

In [None]:
# Installing the CatBoost
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [None]:
from catboost import CatBoostRegressor

# Building a catboost regressor model:
# 200 boosting iterations, trained with the MAE loss, with RMSE set as the evaluation metric
CBR_model = CatBoostRegressor(
    n_estimators = 200,
    loss_function = 'MAE',
    eval_metric = 'RMSE'
    )
CBR_model.fit(x_train, y_train) # Training
y_predict = CBR_model.predict(x_test) # Predicting

In [None]:
r2_score(y_test,y_predict) #Calculating the r2_score

> 