In [None]:
! pip install pgeocode

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.graph_objects as go
from fbprophet import Prophet
import pycountry
import plotly.express as px
from collections import namedtuple
import pandas_profiling
from IPython.display import display
import collections, numpy
import pgeocode

## Importing Data

In [None]:
# Parse the CSV once: `df_original` stays untouched for reference, `df` is the working copy.
df_original = pd.read_csv('../input/mumbai-restaurants-trip-advisor-dataset/tripadvisor.csv')
df = df_original.copy()

In [None]:
# Build a pandas-profiling report for the working frame and render it inline.
report = pandas_profiling.ProfileReport(df)
display(report)

# Data cleaning


### From the above pandas profiling report I understand that the data set needs the following work:
(a) Duplicates (judged on Name and Address together, not on Name by itself).

(b) There are not many columns with null data; the columns that are not required will be removed.

(c) Cuisines will need to be separated into multiple columns.

(d) The area/pincode will need to be pulled out of the address column, as this helps with map plotting and area-wise sampling.

(e) Combining the datasets of (c) and (d) just to see if there is any correlation.

(f) Showing the different dataframes we will work with.

## (a) Looking for duplicates

In [None]:
# Flag every row whose (name, address) pair occurs more than once;
# keep=False marks all members of a repeated pair, not just the later ones.
is_repeated = df.duplicated(subset=['name', 'address'], keep=False)
duplicate = df.loc[is_repeated]
print("Below are the duplicate rows according to Name and Address:")
duplicate

## There are no duplicates in the data set according to name and address together but we have null rows which will be removed in the next cells.

In [None]:
# Rows in which BOTH the 2nd and 3rd columns are missing
# (presumably name and address, as the message below says — TODO confirm the column order).
# The mask is computed once, vectorised, instead of twice with a row-wise Python lambda.
both_missing = df.iloc[:, [1, 2]].isna().all(axis=1)
null_rows = df[both_missing]
# .copy() detaches the filtered frame so later edits do not trigger SettingWithCopyWarning.
df = df[~both_missing].copy()
print("Below are the Missing values for the columns name and address:")
null_rows

## (b) Removing the unwanted columns.

In [None]:
# Reassign instead of mutating in place: `df` is a filtered slice, and in-place edits on a
# slice can raise SettingWithCopyWarning. errors='ignore' keeps the cell re-runnable.
df = df.drop(columns=['price_range_from', 'price_range_to', 'Phone'], errors='ignore')

## (c) Transforming cuisines into individual columns.

In [None]:
# One 0/1 indicator column per entry of the comma-separated `cuisines` field.
dummies = df['cuisines'].str.get_dummies(sep=', ')
df_cuisines = pd.concat([df, dummies], axis=1)

# Tags removed before the cuisine analysis.
dropped_tags = ['Bar', 'Barbecue', 'Brew Pub', 'Cafe', 'Contemporary', 'Deli', 'Diner', 'Dining bars',
                'Fast food', 'Fusion', 'Gastropub', 'Gluten Free Options', 'Grill', 'Healthy', 'Pub',
                'Seafood', 'Soups', 'Sports bars', 'Steakhouse', 'Street Food', 'Vegan Options',
                'Vegetarian Friendly', 'Wine Bar', 'International']
df_cuisines = df_cuisines.drop(columns=dropped_tags)

# Related indicator columns that are folded into a single umbrella cuisine.
cuisine_groups = {
    'American': ['American', 'Central American', 'Cajun & Creole', 'Southwestern'],
    'Italian': ['Central-Italian', 'Italian', 'Northern-Italian', 'Pizza', 'Southern-Italian'],
    'Japanese': ['Japanese', 'Sushi'],
    'European': ['Central European', 'European'],
}

# Row-wise max of 0/1 flags is 1 when ANY member is set. Summing and then replacing selected
# totals left totals of 4 or 5 untouched for the five Italian columns, producing non-binary flags.
merged_flags = {
    umbrella: df_cuisines[members].max(axis=1)
    for umbrella, members in cuisine_groups.items()
}

# Drop the member columns (plus 'Asian' and 'Central Asian', which are discarded without merging),
# then append the merged flags under the umbrella names.
absorbed = [member for members in cuisine_groups.values() for member in members]
df_cuisines = df_cuisines.drop(columns=absorbed + ['Asian', 'Central Asian'])
for umbrella, flag in merged_flags.items():
    df_cuisines[umbrella] = flag

## (d) Extracting pincode from address and converting pincode to lat & long.

In [None]:
# Work on a copy so the geocoding columns do not leak into `df`.
df_pincode = df.copy()

# The pincode is taken as the second-to-last space-separated token of the address.
# `.str[-2]` yields NaN (instead of raising) for a missing or too-short address.
df_pincode['pincode'] = df_pincode['address'].str.split(' ').str[-2]

nomi = pgeocode.Nominatim('IN')

# One vectorised lookup for the whole column instead of three separate lookups per row.
# The response holds one row per queried code, in input order.
geo = nomi.query_postal_code(df_pincode['pincode'].astype(str).tolist())

# Assign by position (.to_numpy()). `df` keeps its original row labels after the null rows were
# filtered out, so assigning a freshly indexed Series would align on labels and attach
# coordinates to the wrong restaurants.
df_pincode['lat'] = geo['latitude'].to_numpy()
df_pincode['long'] = geo['longitude'].to_numpy()
df_pincode['area'] = geo['place_name'].to_numpy()

# Discard rows with any missing value (including pincodes that could not be geocoded).
df_pincode = df_pincode.dropna()

# Maps the place names found in the `area` column to short neighbourhood names.
# Keys are the existing values, values are their replacements; unlisted areas are left as they are.
area_names = {
    'Liberty Garden, Malad, Malad West Dely, Orlem': 'Malad West',
    'Andheri H.O, Azad Nagar (Mumbai)': 'Andheri West',
    'Mumbai G.P.O., Bazargate, Town Hall (Mumbai), M.P.T., Stock Exchange, Tajmahal': 'Fort',
    'Marol Naka, Marol Bazar, J.B. Nagar': 'Marol',
    'Santacruz(West)': 'Santacruz West',
    'Powai Iit': 'Powai',
    'Marine Lines, Central Building, Churchgate': 'Churchgate',
    'Kandivali East, Kandivali East Extn Counter': 'Kandivali East',
    'Thane H.O, Thane R.S., Thane Bazar': 'Thane West',
    'IRLA, Vileparle(West)': 'Vile Parle West',
    'Netajinagar, Kurla, Kurla North': 'Kurla',
    'Delisle Road': 'Lower Parel',
    'Santacruz(West), Santacruz Central': 'Santacruz West',
    'V J B Udyan': 'Byculla East',
    'Vihar Road, Sakinaka': 'Sakinaka',
    'Kandivali RS, Kandivali West, Charkop': 'Kandivali West',
    'Kherwadi, Audit Bhavan, B.N. Bhavan, Bandra(East)': 'Bandra East',
    'Vileparle Railway Station, Vileeparle (East), Hanuman Road': 'Vile Parle East',
    'NITIE': 'Powai',
    'Aareymilk Colony, Nagari Niwara, S R P F Camp': 'Aarey Colony',
    'Mandapeshwar': 'Borivali West',
    'Shivaji Park (Mumbai), Gokhale Road (Mumbai), Bhawani Shankar, S V S Marg, Ranade Road, Bhawani Shankar Rd': 'Shivaji Park',
    'Sahakar Bhavan, Rifle Range, Ghatkopar West': 'Ghatkopar West',
    'Sindhi Society, Chembur H.O': 'Chembur East',
    'Rajawadi': 'Ghatkopar East',
    'Nahur, Mulund Dd Road, Mulund West, Nehru Road (Mumbai), S.B. Road': 'Mulund West',
    'Parel, BEST STaff Quarters, Lal Baug, Chamarbaug, Haffkin Institute, Parel Naka, Parel Rly Work Shop': 'Parel East',
    'Mazgaon, Mazgaon Road, Mazgaon Dock, V K Bhavan, Dockyard Road': 'Mazgaon',
    'Andheri Railway Station, H.M.P. School': 'Andheri West',
    'Worli, Worli Naka': 'Worli',
    'Sharma Estate, Goregaon East': 'Goregaon East',
    'Danda, Khar Delivery, V.P. Road, Khar Colony': 'Khar West',
    'Borivali East, Rajendra Nagar (Mumbai), Daulat Nagar (Mumbai), S. K.Nagar, Magthane': 'Borivali East',
    'New Yogakshema, Nariman Point, Elephanta Caves Po': 'Churchgate',
    'Girgaon, Ambewadi (Mumbai), Charni Road, Chaupati, Madhavbaug, Opera House': 'Girgaon',
    'Falkland Road, M A Marg, Kamathipura, J.J.Hospital, Mumbai Central H.O': 'Mumbai Central',
    'Vesava, Madh': 'Versova / Madh',
    'Dadar Colony, Dadar H.O, Naigaon (Mumbai)': 'Dadar East',
    'Thakurdwar, S. C. Court, Kalbadevi H.O': 'Kalbadevi',
    'Mahim East, Mahim Bazar, Mori Road, Kapad Bazar, Mahim H.O': 'Mahim West',
    'New Prabhadevi Road, Prabhadevi': 'Prabhadevi',
    'Wadala, Kidwai Nagar (Mumbai), Wadala Rs': 'Wadala West',
    'Borivali H.O': 'Borivali West',
    'Vashi (Thane), Vashi Sec-26, Turbhe, Sanpada, Turbhe Market, Vashi-VII, K.U.Bazar': 'Vashi',
    'Malabar Hill': 'Malabar Hill',
    'Mandvi (Mumbai), Masjid, B.P.Lane, Null Bazar': 'Masjid',
    'Colaba, Holiday Camp, V.W.T.C., Asvini, Colaba Bazar': 'Colaba',
    'Airport (Mumbai), Sahar P & T Colony, International Airport, Sahargaon': 'Andheri East',
    'J.M. Road, P.H. Colony, Bhandup Ind. Estate, Bhandup West, Usha Nagar': 'Bhandup West',
    'Chakala Midc': 'Andheri East',
    'Mulund East, Mhada Colony': 'Mulund East',
    'Goregaon (Mumbai), Bangur Nagar, Goregaon RS, Motilal Nagar': 'Goregaon West',
    'Vadi, Mangrul, Pisavli, Ganeshwadi (Thane), Katemanivali, Dwarli': 'Dombivli East',
    'Tardeo, Bharat Nagar (Mumbai), Grant Road, N.S.Patkar Marg, S V Marg': 'Grant Road',
    'Andheri East, Nagardas Road': 'Andheri East',
    'Kharodi, Ins Hamla': 'Malad West',
    'Sasun Navghar, Kaman, Satiwai, Gokhiware, Pelhar, Valiv, Rajawali, Vasai East IE, Juchandra': 'Vasai East',
    'Mahul Road, Chembur Extension, FCI': 'Chembur',
    'Malad East, Rani Sati Marg': 'Malad East',
    'Vakola, Santacruz(East)': 'Santacruz East',
    'Kashi, Mira, Ghodbander, Mira Road': 'Mira Road East',
    'Chinchpokli, Haines Road, BPC  Jacob Circle, Agripada, Jacob Circle': 'Byculla East',
    'Chinchbunder, Noor Baug, Princess Dock': 'Mazgaon',
    'Worli Police Camp, Worli Colony, Century Mill, Worli Sea Face': 'Worli',
    'Haji Ali, Tulsiwadi': 'Mahalakshmi',
    'Bhayander West, Rai': 'Bhayander West',
    'Sion, Chunabhatti, Raoli Camp, Transit Camp': 'Sion West',
    'Cumballa Sea Face, Cumballa Hill, Dr Deshmukh Marg, Gowalia Tank': 'Cumballa Hill',
    'Gokhale Road (Thane), Naupada (Thane)': 'Thane West',
    'Trombay, T.F.Donar, Govandi': 'Trombay',
    'Nerul Node-II, Darave, Nerul Node-III, Nerul Sec-48': 'Nerul',
    'Wadala Truck Terminal, C G S Colony, B P T Colony, Antop Hill': 'Wadala East',
    'Bhandup Complex, Mulund Colony': 'Mulund West',
    'Airoli, Airoli': 'Airoli',
    'Psm Colony, Vikhroli': 'Vikhroli',
    'Chembur Rs, Tilak Nagar (Mumbai)': 'Chembur West',
    'Dahisar RS, Ketkipada, Dahisar': 'Dahisar',
    'Tagore Nagar, Kannamwar Nagar': 'Vikhroli',
    'R.A.Nagar, Pant Nagar, Best Staff Colony': 'Ghatkopar East',
    'Vidyanagari': 'Santacruz East',
    'Dombivali I.A.': 'Dombivli East',
    'Wagle I.E.': 'Thane West',
    'Bassein Road, Vasai Road E, Umela': 'Vasai East',
    'Santacruz P&t Colony, A I Staff Colony': 'Santacruz East',
    'Jekegram': 'Thane West',
    'Belapur Node-- III, Konkan Bhavan, Belapur Node- V': 'CBD Belapur',
    'Dharavi, Dharavi Road': 'Mahim East',
    'Dombivali, Ramnagar (Thane), Thakurli, Tilaknagar (Thane)': 'Dombivali West',
    'High Court Building (Mumbai), Secretariate, Mantralaya (Mumbai)': 'Fort',
    'Rajbhavan (Mumbai)': 'Malabar Hill',
    'Chitalsar Manpada, Sandozbaugh': 'Thane West',
}

df_pincode['area'] = df_pincode['area'].replace(area_names)

## (e) Combining df (c) & (d).

In [None]:
# Both frames start from `df`, so concatenating them whole duplicates every shared column
# (name, address, rating, ...). Attach only the geocoding columns to the cuisine frame instead;
# rows dropped from df_pincode keep their cuisine data and get NaN for the geo columns.
geo_columns = ['pincode', 'lat', 'long', 'area']
df_combine = pd.concat([df_cuisines, df_pincode[geo_columns]], axis=1)

# Zeros become NaN across the whole frame.
df_combine = df_combine.replace(0, np.nan)

## (f) Showing all the dataframes we will use.

#### Original Dataframe

In [None]:
# First five rows of the frame exactly as read from the CSV.
df_original.head()

#### Default Dataframe

In [None]:
# First five rows of the working frame.
df.head()

#### Dataframe with Cuisine

In [None]:
# First five rows of the frame carrying the cuisine indicator columns.
df_cuisines.head()

#### Dataframe with Pin Code

In [None]:
# First five rows of the frame carrying the pincode, lat, long and area columns.
df_pincode.head()

#### Dataframe with Cuisine and Pin Code combined

In [None]:
# First five rows of the combined pincode + cuisine frame.
df_combine.head()

## Data set cleaning has been completed

# Start of EDA

## After cleaning the data, let's understand it more.

(a) The top 10 (5 / 4.5 / 4 / 3.5 star) restaurants in Mumbai by number of reviews.

(b) The top 10 highest reviewed restaurants in Mumbai.

(c) The top 10 cuisines in Mumbai.

(d) The top restaurants for each of the top 10 cuisines.

(e) Geo plotting restaurants with their pin code.

(f) Top 10 areas with highest count of restaurants.

## (a) Top 10 (5 / 4.5 / 4 / 3.5 star) Restaurants in Mumbai by number of reviews.

In [None]:
# Rating, name and review count only; renumber the rows, drop incomplete ones and order by
# rating first, review count second (both descending).
rest = df[['rating', 'name', 'review_count']]
rest_top = (
    rest.reset_index(drop=True)
        .dropna()
        .sort_values(by=['rating', 'review_count'], ascending=False)
)

# Group once and reuse. A scalar key (rather than a one-element list) keeps
# get_group(<scalar>) valid on newer pandas releases.
by_rating = rest_top.groupby('rating')
rest_5 = by_rating.get_group(5.0)
rest_4half = by_rating.get_group(4.5)
rest_4 = by_rating.get_group(4.0)
rest_3half = by_rating.get_group(3.5)

### Top 10 5 star Restaurants in Mumbai.

In [None]:
# Bar chart of review counts for the first ten rows of rest_5.
top_rows = rest_5.head(10)
axis_labels = {
    "review_count": "Highest rating based on review count",
    "name": "Restaurant Name",
}
fig = px.bar(
    top_rows,
    x='name',
    y='review_count',
    labels=axis_labels,
    title='Top 10 5 star Restaurants in Mumbai.',
)
fig.show()

### Top 10 4.5 star Restaurants in Mumbai.

In [None]:
# Bar chart of review counts for the first ten rows of rest_4half.
top_rows = rest_4half.head(10)
axis_labels = {
    "review_count": "Highest rating based on review count",
    "name": "Restaurant Name",
}
fig = px.bar(
    top_rows,
    x='name',
    y='review_count',
    labels=axis_labels,
    title='Top 10 4.5 star Restaurants in Mumbai',
)
fig.show()

### Top 10 4 star Restaurants in Mumbai.

In [None]:
# Bar chart of review counts for the first ten rows of rest_4.
top_rows = rest_4.head(10)
axis_labels = {
    "review_count": "Highest rating based on review count",
    "name": "Restaurant Name",
}
fig = px.bar(
    top_rows,
    x='name',
    y='review_count',
    labels=axis_labels,
    title='Top 10 4 star Restaurants in Mumbai',
)
fig.show()

### Top 10 3.5 star Restaurants in Mumbai.

In [None]:
# Bar chart of review counts for the first ten rows of rest_3half.
top_rows = rest_3half.head(10)
axis_labels = {
    "review_count": "Highest rating based on review count",
    "name": "Restaurant Name",
}
fig = px.bar(
    top_rows,
    x='name',
    y='review_count',
    labels=axis_labels,
    title='Top 10 3.5 star Restaurants in Mumbai',
)
fig.show()

## (b) Top 10 highest reviewed restaurant in Mumbai.

In [None]:
# Re-rank all restaurants by review count alone (descending) and chart the first ten.
rest_top_review = rest_top.sort_values(by=['review_count'], ascending=False)
most_reviewed = rest_top_review.head(10)
axis_labels = {"review_count": "Review Count", "name": "Restaurant Name"}
fig = px.bar(
    most_reviewed,
    x='name',
    y='review_count',
    labels=axis_labels,
    title='Top 10 highest reviewed Restaurants in Mumbai',
)
fig.show()

## (c) The top 10 cuisine in Mumbai.

In [None]:
# Columns that are not cuisine indicators and must stay out of the totals.
non_cuisine_columns = ['name', 'address', 'cuisines', 'rating', 'review_count', 'excellent_count',
                       'very_good_count', 'average_count', 'poor_count', 'terrible_count']

# Column totals of the remaining indicators, largest first, as a two-column frame.
df_cuisine_count = (
    df_cuisines.drop(columns=non_cuisine_columns)
    .sum()
    .sort_values(ascending=False)
    .reset_index()
    .set_axis(['Cuisine', 'Count'], axis=1)
)

In [None]:
# Bar chart of the first ten rows of the cuisine totals.
top_cuisine_counts = df_cuisine_count.head(10)
fig = px.bar(
    top_cuisine_counts,
    x='Cuisine',
    y='Count',
    title='Top 10 cuisine in Mumbai',
)
fig.show()

## (d) The top restaurants according to the top 10 cuisine.

In [None]:
# Rating, name and review count plus the indicator columns of the ten cuisines charted below,
# renumbered and ordered by review count first, rating second (both descending).
top_cuisines = ['Indian', 'Chinese', 'Italian', 'European', 'Thai', 'American', 'Mexican',
                'Mediterranean', 'Middle Eastern', 'Japanese']
rest_cuis = (
    df_cuisines[['rating', 'name', 'review_count'] + top_cuisines]
    .reset_index(drop=True)
    .sort_values(by=['review_count', 'rating'], ascending=False)
)

# One frame per cuisine, selected with a boolean mask instead of groupby(...).get_group(1.0).
# The mask does not raise when no restaurant carries the flag, keeps rows whose flag is
# larger than 1, and avoids the list-key/scalar-name form that newer pandas deprecates.
indian = rest_cuis.loc[rest_cuis['Indian'] > 0]
chinese = rest_cuis.loc[rest_cuis['Chinese'] > 0]
italian = rest_cuis.loc[rest_cuis['Italian'] > 0]
european = rest_cuis.loc[rest_cuis['European'] > 0]
thai = rest_cuis.loc[rest_cuis['Thai'] > 0]
american = rest_cuis.loc[rest_cuis['American'] > 0]
mexican = rest_cuis.loc[rest_cuis['Mexican'] > 0]
mediterranean = rest_cuis.loc[rest_cuis['Mediterranean'] > 0]
middle_eastern = rest_cuis.loc[rest_cuis['Middle Eastern'] > 0]
japanese = rest_cuis.loc[rest_cuis['Japanese'] > 0]

In [None]:
# Bar chart of review counts for the first ten rows of `indian`.
top_rows = indian.head(10)
axis_labels = {
    "review_count": "Highest rating based on review count",
    "name": "Restaurant Name",
}
fig = px.bar(
    top_rows,
    x='name',
    y='review_count',
    labels=axis_labels,
    title='Top 10 Indian Restaurants in Mumbai',
)
fig.show()

* Mahesh Lunch Home has 2 outlets in Mumbai, so the above chart shows both outlets together: each has a high enough star rating and enough reviews to be in the top 10 Indian restaurants.

In [None]:
# Bar chart of review counts for the first ten rows of `chinese`.
top_rows = chinese.head(10)
axis_labels = {
    "review_count": "Highest rating based on review count",
    "name": "Restaurant Name",
}
fig = px.bar(
    top_rows,
    x='name',
    y='review_count',
    labels=axis_labels,
    title='Top 10 Chinese Restaurants in Mumbai',
)
fig.show()

In [None]:
# Bar chart of review counts for the first ten rows of `italian`.
top_rows = italian.head(10)
axis_labels = {
    "review_count": "Highest rating based on review count",
    "name": "Restaurant Name",
}
fig = px.bar(
    top_rows,
    x='name',
    y='review_count',
    labels=axis_labels,
    title='Top 10 Italian Restaurants in Mumbai',
)
fig.show()

In [None]:
# Bar chart of review counts for the first ten rows of `european`.
top_rows = european.head(10)
axis_labels = {
    "review_count": "Highest rating based on review count",
    "name": "Restaurant Name",
}
fig = px.bar(
    top_rows,
    x='name',
    y='review_count',
    labels=axis_labels,
    title='Top 10 European Restaurants in Mumbai',
)
fig.show()

In [None]:
# Bar chart of review counts for the first ten rows of `thai`.
top_rows = thai.head(10)
axis_labels = {
    "review_count": "Highest rating based on review count",
    "name": "Restaurant Name",
}
fig = px.bar(
    top_rows,
    x='name',
    y='review_count',
    labels=axis_labels,
    title='Top 10 Thai Restaurants in Mumbai',
)
fig.show()

In [None]:
# Bar chart of review counts for the first ten rows of `american`.
top_rows = american.head(10)
axis_labels = {
    "review_count": "Highest rating based on review count",
    "name": "Restaurant Name",
}
fig = px.bar(
    top_rows,
    x='name',
    y='review_count',
    labels=axis_labels,
    title='Top 10 American Restaurants in Mumbai',
)
fig.show()

In [None]:
# Bar chart of review counts for the first ten rows of `mexican`.
top_rows = mexican.head(10)
axis_labels = {
    "review_count": "Highest rating based on review count",
    "name": "Restaurant Name",
}
fig = px.bar(
    top_rows,
    x='name',
    y='review_count',
    labels=axis_labels,
    title='Top 10 Mexican Restaurants in Mumbai',
)
fig.show()

In [None]:
# Bar chart of review counts for the first ten rows of `mediterranean`.
top_rows = mediterranean.head(10)
axis_labels = {
    "review_count": "Highest rating based on review count",
    "name": "Restaurant Name",
}
fig = px.bar(
    top_rows,
    x='name',
    y='review_count',
    labels=axis_labels,
    title='Top 10 Mediterranean Restaurants in Mumbai',
)
fig.show()

* Cafe Moshe's has 2 outlets in Mumbai therefore the above chart is showing both the outlets together as both the outlets have a high enough star rating and reviews to be in the top 10 Mediterranean restaurants.

In [None]:
# Bar chart of review counts for the first ten rows of `middle_eastern`.
top_rows = middle_eastern.head(10)
axis_labels = {
    "review_count": "Highest rating based on review count",
    "name": "Restaurant Name",
}
fig = px.bar(
    top_rows,
    x='name',
    y='review_count',
    labels=axis_labels,
    title='Top 10 Middle Eastern Restaurants in Mumbai',
)
fig.show()

* Persiaan Darbar has 2 outlets in Mumbai therefore the above chart is showing both the outlets together as both the outlets have a high enough star rating and reviews to be in the top 10 Middle Eastern restaurants.

* Restaurants No. 1 and No. 2 have the same name, but this has been checked: the No. 1 restaurant has 2 outlets called Persian Darbar, while the No. 2 restaurant has only one outlet. Restaurants No. 1 and No. 2 are not the same.

In [None]:
# Bar chart of review counts for the first ten rows of `japanese`.
top_rows = japanese.head(10)
axis_labels = {
    "review_count": "Highest rating based on review count",
    "name": "Restaurant Name",
}
fig = px.bar(
    top_rows,
    x='name',
    y='review_count',
    labels=axis_labels,
    title='Top 10 Japanese Restaurants in Mumbai',
)
fig.show()

## (e) Geo plotting restaurants with their pin code.

In [None]:
# Density map of ratings at each restaurant's pincode coordinates, centred on Mumbai.
# "open-street-map" needs no access token; the Stamen styles are now served through
# Stadia Maps and require an API key, so the base map would not load without one.
fig = px.density_mapbox(df_pincode, lat='lat', lon='long', z='rating', hover_name='pincode', radius=10,
                        center=dict(lat=19.076090, lon=72.877426), zoom=10,
                        mapbox_style="open-street-map", height=800, width=800)
fig.show()

## (f) Top 10 areas with highest count of restaurants.

In [None]:
# Number of restaurants per area, most frequent first, as an Area/Count table.
area_counts = df_pincode['area'].value_counts()
pincode = area_counts.rename_axis('Area').reset_index(name='Count')

In [None]:
# Bar chart of the first ten rows of the Area/Count table.
busiest_areas = pincode.head(10)
fig = px.bar(
    busiest_areas,
    x='Area',
    y='Count',
    title='Top 10 Areas with highest Restaurants in Mumbai',
    height=800,
)
fig.show()

# Thank you for going through the dataset

This dataset has shown a lot of insights into the restaurant industry in Mumbai. Please let me know what you think of this.