In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, entry) for entry in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**About Zomato**

Zomato is an Indian multinational restaurant aggregator and food delivery company founded by Pankaj Chaddah and Deepinder Goyal in 2008. Zomato provides information, menus and user-reviews of restaurants as well as food delivery options from partner restaurants in select cities.

src: wikipedia

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Read the Zomato Bangalore restaurant listing from the Kaggle input directory
# and display it (pandas truncates the rendering of large frames).
zomato_csv_path = "/kaggle/input/zomato-bangalore-restaurants/zomato.csv"
zomato_data = pd.read_csv(zomato_csv_path)
zomato_data

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
zomato_data.info()

In [None]:
# Count the missing values in each column.
zomato_data.isna().sum()

The features that are included in this dataset:

* **url** contains the url of the restaurant in the zomato website

* **address** contains the address of the restaurant in Bengaluru
 
* **name** contains the name of the restaurant
 
* **online_order** whether online ordering is available in the restaurant or not
 
* **book_table** table book option available or not
 
* **rate** contains the overall rating of the restaurant out of 5
 
* **votes** contains the total number of ratings for the restaurant as of the above mentioned date

* **phone** contains the phone number of the restaurant
 
* **location** contains the neighborhood in which the restaurant is located
 
* **rest_type** restaurant type
 
* **dish_liked** dishes people liked in the restaurant
 
* **cuisines** food styles, separated by comma
 
* **approx_cost(for two people)** contains the approximate cost for meal for two people
 
* **reviews_list** list of tuples containing reviews for the restaurant, each tuple holding a rating and the review text
 
* **menu_item** contains list of menus available in the restaurant
 
* **listed_in(type)** type of meal
 
* **listed_in(city)** contains the neighborhood in which the restaurant is listed

In [None]:
# Convert the cost column to a numeric "approx_cost" column.
# The raw values are cast to text so that any thousands separators ("," characters)
# can be removed before the float conversion; missing values become the text "nan",
# which converts back to NaN.
# The membership guard keeps the cell re-runnable: once the raw column has been
# replaced there is nothing left to convert, instead of failing with a KeyError.
raw_cost_col = "approx_cost(for two people)"
if raw_cost_col in zomato_data.columns:
    zomato_data["approx_cost"] = (
        zomato_data[raw_cost_col]
        .astype(str)
        .str.replace(",", "", regex=False)
        .astype(float)
    )
    zomato_data = zomato_data.drop(columns=[raw_cost_col])

# Report which "rate" entries are not numeric. The text before the "/" is the score;
# anything that cannot be parsed as a number is coerced to NaN in a single pass
# (no retry loop driven by exception messages, which could spin forever on an
# entry such as an empty string).
rate_text = zomato_data["rate"].astype(str).str.split("/").str[0].str.strip()
rate_parsed = pd.to_numeric(rate_text, errors="coerce")

# "nan" is the text form of an already-missing value, not a noisy label, so skip it.
for noise_entry in rate_text[rate_parsed.isna() & rate_text.ne("nan")].unique():
    print(f'Threating noisy entrance on rate: {noise_entry}')

# The parsed ratings are used for the diagnostic only; no parsed-rating column is
# stored on the frame, so "rate" keeps its raw text form for later cells.
zomato_data.head()

**Famous cuisines in Zomato**

Let us check what are the most popular cuisines that are being sold in Bangalore.




In [None]:
# Cuisine combinations that appear on more than 500 listings, most frequent first.
famous_cuisine = zomato_data["cuisines"].value_counts().loc[lambda counts: counts > 500]
famous_cuisine

Bangalore is the capital of India's southern Karnataka state, so it is no surprise that Indian food is the most popular cuisine on the platform. The interesting part is that far more restaurants sell North Indian cuisine than South Indian cuisine. This raises some questions: is North Indian cuisine tastier than South Indian cuisine and more welcomed by the people of Bangalore? Or are there simply more people who know how to cook North Indian cuisine?

In [None]:
# Horizontal bar chart of the most common cuisine combinations on a 20x9 inch figure.
cuisine_ax = plt.figure(figsize=(20,9)).gca()
famous_cuisine.plot.barh(ax=cuisine_ax)

In [None]:
# Restaurant names with more than 55 listings, ordered from most to fewest,
# drawn as a horizontal bar chart.
famous_rest = (
    zomato_data["name"]
    .value_counts()
    .loc[lambda listings: listings > 55]
    .sort_values(ascending=False)
)
plt.figure(figsize=(15,8))
famous_rest.plot.barh()

In [None]:
# Cuisines recorded for every listing named "Cafe Coffee Day".
zomato_data.loc[zomato_data["name"] == "Cafe Coffee Day", "cuisines"]

In [None]:
# Cuisines recorded for every listing named "Onesta".
zomato_data.loc[zomato_data["name"] == "Onesta", "cuisines"]

We can see that Cafe Coffee Day & Onesta have the most outlets in Bangalore. Both are cafe-style and serve non-Indian cuisine; this dining style might indicate that the population of Bangalore is mostly young adults and teenagers, who are more adaptable to western-style cuisine.


I did some research on Bangalore: it is the center of India's high-tech industry, and the city is also known for its parks and nightlife. We can therefore infer that a lot of young adults work and live here.

In [None]:
# Restaurant types that occur on more than 100 listings, shown as a horizontal bar chart.
famous_rest_type = zomato_data["rest_type"].value_counts().loc[lambda counts: counts > 100]
rest_type_ax = plt.figure(figsize=(20,9)).gca()
famous_rest_type.plot.barh(ax=rest_type_ax)

As mentioned earlier, Bangalore is the center of India's high-tech industry, so the pace of life in this city is very fast and people spend more time at work. As a result, Quick Bites are a better choice for their daily meals, as they can be eaten quickly.

**Percentage of online orders available in Bangalore**

In [None]:
# Number of listings for each value of the online_order flag.
order_mode = zomato_data.loc[:, "online_order"].value_counts()
order_mode

In [None]:
# Pie chart of the online_order counts, labelled with one-decimal percentages.
order_mode.plot(kind="pie", autopct="%.1f%%", figsize=(14, 8))

More than 50% of the restaurants in Bangalore accept online orders.

**Percentage of table booking available in Bangalore**

In [None]:
# Number of listings for each value of the book_table flag.
table_book = zomato_data.loc[:, "book_table"].value_counts()
table_book

In [None]:
# Pie chart of the book_table counts, labelled with one-decimal percentages.
table_book.plot(kind="pie", autopct="%.1f%%", figsize=(14, 8))

There were more than 80% restaurants in Bangalore accepting table booking

**Online order, approximate cost and rating**

In [None]:
# Build the frame used for the rating/cost plots: listings with a non-null "rate",
# restricted to the three columns of interest, with "rate" converted to a number.
# Selecting with .loc and deriving the column through .assign produces a new frame,
# so nothing is written into a filtered view of zomato_data (the previous chained
# assignment triggered pandas' SettingWithCopyWarning).
# Rate strings longer than three characters are parsed as the number before "/";
# shorter entries are scored 0.
# NOTE(review): scoring short placeholder entries as 0 places them at rate 0 on the
# plots below — confirm that is intended rather than treating them as missing.
rated_rest = (
    zomato_data.loc[zomato_data["rate"].notnull(), ["online_order", "approx_cost", "rate"]]
    .assign(
        rate=lambda frame: frame["rate"].apply(
            lambda x: float(x.split('/')[0]) if len(x) > 3 else 0
        )
    )
    .dropna()
)
rated_rest

In [None]:
# Scatter of approximate cost against rating, coloured by online-order availability.
scatter_ax = plt.figure(figsize=(20,9)).gca()
sns.scatterplot(
    data=rated_rest,
    x="rate",
    y="approx_cost",
    hue="online_order",
    alpha=0.5,
    ax=scatter_ax,
)

In [None]:
# Distribution of the approximate cost for two among the rated restaurants.
# NOTE(review): recent seaborn releases flag distplot as deprecated in favour of
# histplot/displot — consider migrating once the target seaborn version is confirmed.
cost_ax = plt.figure(figsize=(20,9)).gca()
sns.distplot(rated_rest["approx_cost"], ax=cost_ax)

In [None]:
# Box plot of vote counts, split by whether the restaurant takes online orders.
votes_ax = plt.figure(figsize=(20,9)).gca()
sns.boxplot(data=zomato_data, x="online_order", y="votes", ax=votes_ax)

**Sentiment Analysis - reviews**

I am going to do sentiment analysis based on the reviews to identify what makes a good or bad restaurant for the people of Bangalore, starting by cleaning the reviews list for each restaurant.

In [None]:
import ast
import re

from tqdm import tqdm


def _drop_prefix(text, prefix):
    """Return ``text`` without ``prefix`` at its start; unchanged if it does not start with it."""
    return text[len(prefix):] if text.startswith(prefix) else text


# Everything except ASCII letters, digits and whitespace is removed from the review text.
_NON_ALNUM = re.compile(r'[^a-zA-Z0-9\s]')

all_ratings = []

# zip() has no length, so the row count is passed explicitly to give tqdm a total.
for name, ratings in tqdm(zip(zomato_data["name"], zomato_data["reviews_list"]),
                          total=len(zomato_data)):
    # Each cell holds the text form of a list of (score, review) tuples.
    # literal_eval parses Python literals only, so text from the dataset is never
    # executed as code (unlike eval).
    ratings = ast.literal_eval(ratings)
    for score, doc in ratings:
        # Entries without a score are skipped.
        if score:
            # Remove the "Rated"/"RATED" markers as whole prefixes. str.strip("RATED")
            # would strip any of the characters R, A, T, E, D from both ends and so
            # could also eat capital letters at the start or end of the review itself.
            score = float(_drop_prefix(score.strip(), "Rated").strip())
            doc = _drop_prefix(doc.strip(), "RATED").strip()
            all_ratings.append([name, score, doc])

reviews_df = pd.DataFrame(all_ratings, columns = ['name', 'rating', 'review'])
reviews_df['review'] = reviews_df['review'].apply(lambda x : _NON_ALNUM.sub("", x))
reviews_df

In [None]:
# Restaurant names with more than 3000 parsed reviews.
reviews_df["name"].value_counts().loc[lambda n_reviews: n_reviews > 3000]

Based on the value counts of the reviews, the restaurants Hammered, Mast Kalandar & Truffles receive the most reviews from customers. What left such an impression on their customers? I use wordclouds below to see the keywords that are mentioned most often in the reviews of each restaurant.

**Hammered restaurant review wordcloud**

In [None]:
# Collect every review of "Hammered" and merge them into one text for the word cloud.
# The reviews are joined with a space so the last word of one review does not fuse
# with the first word of the next; a single join also avoids re-copying the growing
# string on every iteration.
hammered_rvw_list = reviews_df[reviews_df["name"]=="Hammered"]["review"].astype(str)
hammered_rvw = " ".join(hammered_rvw_list)

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Render a word cloud of the merged Hammered reviews on a white background,
# ignoring the package's default stop words.
cloud_settings = dict(
    width=3000,
    height=2000,
    random_state=1,
    background_color='white',
    colormap='Pastel1',
    collocations=False,
    stopwords=STOPWORDS,
)
wordcloud = WordCloud(**cloud_settings).generate(hammered_rvw)
plt.figure(figsize=(20, 15))
plt.imshow(wordcloud)
plt.axis("off");

In [None]:
# Mean review rating for "Hammered".
reviews_df.loc[reviews_df["name"] == "Hammered", "rating"].mean()

The wordcloud shows that the reviews for Hammered contain mostly positive words like "good", "amazing", "great" and "perfect", and the mean rating is 4.03, which is above average (assuming that the average is 2.5). In conclusion, Hammered should be considered a famous and good restaurant in Bangalore.

In [None]:
from collections import defaultdict

# Tally how often each whitespace-separated token occurs across the Hammered reviews.
word_freq = defaultdict(int)

for token in (word for text in hammered_rvw_list for word in text.split()):
    word_freq[token] += 1

# Show the tallies as a one-column table, most frequent token first.
(
    pd.DataFrame.from_dict(word_freq, orient='index')
    .sort_values(0, ascending=False)
    .rename(columns={0: 'abs_freq'})
)


**Mast Kalandar restaurant review wordcloud**

In [None]:
# Collect every review of "Mast Kalandar" and merge them into one text for the word cloud.
# Joining with a space keeps the last word of one review from fusing with the first
# word of the next, and avoids rebuilding the string on every iteration.
mast_kalandar_rvw_list = reviews_df[reviews_df["name"]=="Mast Kalandar"]["review"].astype(str)
mast_kalandar_rvw = " ".join(mast_kalandar_rvw_list)

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Render a word cloud of the merged Mast Kalandar reviews on a black background,
# ignoring the package's default stop words.
wordcloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color='black',
    colormap='Pastel1',
    collocations=False,
    stopwords=STOPWORDS,
)
wordcloud = wordcloud.generate(mast_kalandar_rvw)
plt.figure(figsize=(20, 15))
plt.imshow(wordcloud)
plt.axis("off");

In [None]:
# Mean review rating for "Mast Kalandar".
reviews_df.loc[reviews_df["name"] == "Mast Kalandar", "rating"].mean()

The wordcloud shows that the reviews for Mast Kalandar contain a mix of positive and negative words like "good", "horrible", "worst", "slow", "disappointed" and "pathetic", and the mean rating is only 2.38, which is below average (assuming that the average is 2.5).

The restaurant receives a lot of reviews, and many of them still feature the keyword "good" prominently; in my opinion this mix might show that the restaurant is either improving or declining. Another frequently mentioned keyword is "disappointed": people only feel disappointed when something falls below their expectations, so either they had heard good reviews of this restaurant or they had a good dining experience there before. Therefore I assume that Mast Kalandar receives a below-average rating because it is declining.

In conclusion, Mast Kalandar should currently be considered a bad restaurant in Bangalore.

**Truffle restaurant review wordcloud**

In [None]:
# Collect every review of "Truffles" and merge them into one text for the word cloud.
# Joining with a space keeps the last word of one review from fusing with the first
# word of the next, and avoids rebuilding the string on every iteration.
truffle_revw_list = reviews_df[reviews_df["name"] == "Truffles"]["review"].astype(str)
truffle_revw = " ".join(truffle_revw_list)

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Render a word cloud of the merged Truffles reviews on a white background,
# ignoring the package's default stop words.
truffle_cloud_settings = {
    "width": 3000,
    "height": 2000,
    "random_state": 1,
    "background_color": 'white',
    "colormap": 'Pastel1',
    "collocations": False,
    "stopwords": STOPWORDS,
}
wordcloud = WordCloud(**truffle_cloud_settings).generate(truffle_revw)
plt.figure(figsize=(20, 15))
plt.imshow(wordcloud)
plt.axis("off");

In [None]:
# Mean review rating for "Truffles".
reviews_df.loc[reviews_df["name"] == "Truffles", "rating"].mean()

The wordcloud shows that the reviews for Truffles contain mostly positive words like "good", "best", "great" and "awesome", and the mean rating is 4.39, which is above average (assuming that the average is 2.5) and can be considered quite good.

As analysed above, many young adults live in this city and they have a preference for western cuisine, so "burger", "steak", "pasta" and "cheese" appear as keywords in the reviews.

In conclusion, Truffles should be considered a famous and good restaurant in Bangalore, and western cuisine is loved by the young adults of Bangalore.

**Comparison of the reviews between high rating restaurants and low rating restaurants**

In [None]:
# Merge all reviews rated above 2.5 into one text for the word cloud.
# Joining with a space keeps the last word of one review from fusing with the first
# word of the next; a single join also avoids re-copying the growing string for
# every one of the selected reviews.
high_rating_rvw_list = reviews_df[reviews_df["rating"] > 2.5]["review"].astype(str)
high_rating_rvw = " ".join(high_rating_rvw_list)

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Render a word cloud of the reviews rated above 2.5 on a white background,
# ignoring the package's default stop words.
high_cloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color='white',
    colormap='Pastel1',
    collocations=False,
    stopwords=STOPWORDS,
)
wordcloud = high_cloud.generate(high_rating_rvw)
plt.figure(figsize=(20, 15))
plt.imshow(wordcloud)
plt.axis("off");

In [None]:
# Merge all reviews rated below 2.5 into one text for the word cloud.
# Joining with a space keeps the last word of one review from fusing with the first
# word of the next; a single join also avoids re-copying the growing string for
# every one of the selected reviews.
low_rating_rvw_list = reviews_df[reviews_df["rating"] < 2.5]["review"].astype(str)
low_rating_rvw = " ".join(low_rating_rvw_list)

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Render a word cloud of the reviews rated below 2.5 on a white background,
# ignoring the package's default stop words.
low_cloud_settings = dict(
    width=3000,
    height=2000,
    random_state=1,
    background_color='white',
    colormap='Pastel1',
    collocations=False,
    stopwords=STOPWORDS,
)
wordcloud = WordCloud(**low_cloud_settings).generate(low_rating_rvw)
plt.figure(figsize=(20, 15))
plt.imshow(wordcloud)
plt.axis("off");



Comparing the two wordclouds, these are the main keywords that appear: 

        * food
        * place
        * service
        * chicken

We can see that chicken is the main dish and favourite food in Bangalore, and that people pay attention to the food, service, place and ambience when they dine.