### Importing required libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv('../input/zomato-bangalore-restaurants/zomato.csv')

In [None]:
df.head()

In [None]:
## data frame columns
df.columns

In [None]:
## Data types of the data frame
df.dtypes

In [None]:
## Shape of data frame
df.shape

### Data preprocessing

In [None]:
## Checking the null values
df.isnull().sum()

In [None]:
## names of the columns that contain at least one null value
has_null = df.isnull().any()
feature_na = df.columns[has_null].tolist()
feature_na

In [None]:
## checking the % missing values in the null features
for feature in feature_na:
    # share of rows where this column is null, expressed as a percentage
    missing_pct = np.round((df[feature].isnull().sum() / len(df)) * 100, 3)
    # the printed number is a percentage, so say so in the message
    print(' {} has {}% missing values'.format(feature, missing_pct))

In [None]:
## Dealing with null values 
#  in rate column
df['rate'].unique()

In [None]:
## the rate is kind of messy as we can see
## it simply means 4 out of 5 i.e, 4/5
## removing the na and null values
df.dropna(axis = 'index', subset = ['rate'], inplace = True)
df.shape

In [None]:
#  we need to split the values
#  defining the function to split the ratings
def split(x):
    """Return the part of a rating string that precedes the first ``'/'``.

    Args:
        x: Rating text such as ``'4.1/5'``.

    Returns:
        The text before the first ``'/'`` with surrounding whitespace
        removed. A string without ``'/'`` (for example ``'NEW'``) is
        returned stripped but otherwise unchanged.
    """
    # strip so that '3.8 /5' and '3.8/5' both produce '3.8'
    return x.split('/')[0].strip()

In [None]:
df['rate'] = df['rate'].apply(split)

In [None]:
## Checking the data frame changes
df.head()

In [None]:
## Checking the unique values
df['rate'].unique()

In [None]:
## Replacing NEW with 0
#  restricted to the rate column so a literal 'NEW' in any other column is left untouched
df['rate'] = df['rate'].replace('NEW', 0)

In [None]:
## Replacing - with 0
#  restricted to the rate column so a literal '-' in any other column is left untouched
df['rate'] = df['rate'].replace('-', 0)

In [None]:
## Checking the data type of Rate column
df['rate'].dtype

In [None]:
## Changing Rate to float
df['rate'] = df['rate'].astype(float)
df['rate'].dtype

### In-depth analysis of Restaurants

In [None]:
## Grouping by on basis of ratings
df_rate = df.groupby('name')['rate'].mean().to_frame().reset_index()
df_rate.columns = ['restaurant', 'avg_rating']
df_rate.head(20)

In [None]:
## Checking the distribution of the average rating per restaurant
sns.distplot(df_rate['avg_rating']) 
# the peak at 0 comes from the 'NEW' and '-' ratings that were replaced with 0 earlier
# (rows with a missing rate were dropped, not imputed)

In [None]:
## Top Restaurant in Bangalore
chains = df['name'].value_counts()[0:20]
sns.barplot(x = chains, y = chains.index)
plt.title('Most Famous Restaurants chains in Bangalore')
plt.xlabel('Number of Outlets')

In [None]:
## How many Restaurants do not accept online orders
#  Checking the Restaurants that accepts online order
online_order_count = df['online_order'].value_counts()
online_order_count

In [None]:
# value_counts() orders its result by frequency, so the labels are derived from the
# counted values instead of assuming which one comes first
# assumes the column holds 'Yes'/'No' (TODO confirm); any other value keeps its raw text
online_order_label_map = {'Yes': 'accepted', 'No': 'Not accepted'}
online_order_labels = [online_order_label_map.get(value, value) for value in online_order_count.index]

In [None]:
 import plotly.express as px

In [None]:
# slice names go in `names`; `labels` in plotly express is a dict for renaming columns.
# df is not passed because the counts are a separate, much shorter array.
px.pie(values = online_order_count.values, names = online_order_labels, title = 'Pie Chart')

In [None]:
## Restaurants that provide table and that does not provide table
#  Book table is the parameter that provide us the info
book_table_count = df['book_table'].value_counts()
book_table_count

In [None]:
# value_counts() orders its result by frequency, so the labels are derived from the
# counted values instead of assuming which one comes first
# assumes the column holds 'Yes'/'No' (TODO confirm); any other value keeps its raw text
book_table_label_map = {'Yes': 'Available', 'No': 'Not available'}
book_table_label = [book_table_label_map.get(value, value) for value in book_table_count.index]

In [None]:
# slice names go in `names`; `labels` in plotly express is a dict for renaming columns.
# df is not passed because the counts are a separate, much shorter array.
px.pie(values = book_table_count.values, names = book_table_label, title = 'Pie Chart')

In [None]:
import plotly.graph_objs as go
from plotly.offline import iplot

In [None]:
trace = go.Pie(labels = book_table_label, values = book_table_count, hoverinfo = 'label+percent', textinfo = 'value')
iplot([trace])

In [None]:
## Types of Restraunts 
#  It is in the rest_type 
#  But first checking the null values

df['rest_type'].isna().sum()

#  151 null values we need to deal with it

In [None]:
column_names = ['rest_type']
df2 = pd.DataFrame(columns = column_names)

In [None]:
df2['rest_type'] = df['rest_type']

In [None]:
## Dropping the null values
df2.dropna(inplace=True)

In [None]:
df2['rest_type'].isna().sum()

In [None]:
# 20 most frequent restaurant types: names on the x axis, their counts on the y axis
rest_type_top = df2['rest_type'].value_counts().nlargest(20)
trace1 = go.Bar(x = rest_type_top.index,
                y = rest_type_top.values
               )

In [None]:
iplot([trace1])

#### Highest rated Restaurants (by total votes)

In [None]:
df.groupby('name')['votes'].sum().nlargest(20).plot.bar()

In [None]:
## Interactive bar graph
#  total votes per restaurant name, keeping the 20 largest totals
votes_top20 = df.groupby('name')['votes'].sum().nlargest(20)
trace2 = go.Bar(x = votes_top20.index, y = votes_top20)
iplot([trace2])

### Total Restaurants at Different Locations of Bangalore

In [None]:
# parallel lists: one entry per location and its number of distinct restaurant names
location, restaurant = [], []
for key, location_df in df.groupby('location'):
    distinct_names = location_df['name'].unique()
    location.append(key)
    restaurant.append(len(distinct_names))

In [None]:
# one row per location with its distinct-restaurant count
df_total = pd.DataFrame({'location': location, 'restaurant': restaurant})
df_total.head()

In [None]:
# 10 locations with the most restaurants; x is set explicitly so the bars are
# labelled with the location name rather than the row index
df_total.sort_values(by = 'restaurant').tail(10).plot.bar(x = 'location', y = 'restaurant')

In [None]:
## Interactive bar graph
#  top 10 locations by number of distinct restaurants: location on x, count on y
#  (value_counts() on the count column would plot how often each count occurs instead)
top_locations = df_total.nlargest(10, 'restaurant')
trace3 = go.Bar(x = top_locations['location'],
                y = top_locations['restaurant']
               )
iplot([trace3])

#### Total variety of Restaurants in Bangalore

In [None]:
## we have different types of cusines
cuisines = df['cuisines'].value_counts()[:10]
cuisines

In [None]:
# the 10 most common cuisines values and how many rows carry each
top_cuisines = df['cuisines'].value_counts()[:10]
trace6 = go.Bar(x = top_cuisines.index, y = top_cuisines)
iplot([trace6])

#### Approx cost of 2 people 

In [None]:
## Checking the columns in the dataframe

df.columns
## approx_cost(for two people) have cost of approx 2 people

In [None]:
## Checking the null values
df['approx_cost(for two people)'].isna().sum()

In [None]:
## Dropping the null values
df.dropna(axis = 'index', subset = ['approx_cost(for two people)'], inplace = True)

In [None]:
## checking the null values
df['approx_cost(for two people)'].isna().sum()

In [None]:
## Cheking the data type of the variable
df['approx_cost(for two people)'].dtype
#  it is string but it should be numeric 

In [None]:
## checking the unique values
df['approx_cost(for two people)'].unique()

#  as we can observe that we have , in between of the number
#  we need to remove it

In [None]:
## Removing the thousands separator (,) from the cost strings
cost_text = df['approx_cost(for two people)']
df['approx_cost(for two people)'] = cost_text.str.replace(',', '', regex=False)

In [None]:
## checking the unique values
df['approx_cost(for two people)'].unique()

In [None]:
## changing the cost strings to integers
df['approx_cost(for two people)'] = df['approx_cost(for two people)'].astype(int)

In [None]:
## Checking the datatype
## checking the unique values
df['approx_cost(for two people)'].dtype

In [None]:
## Plotting the distribution curve
sns.distplot(df['approx_cost(for two people)'])

# right-skewed data: most of the cost for two people lies between 0 and 1000,
# which suggests that restaurants in Bangalore are affordable

#### Approx cost of 2 people vs rating

In [None]:
sns.scatterplot(x = 'rate', y ='approx_cost(for two people)',hue = 'online_order',data =df )

#### Relationship between votes of restaurants accepting and not accepting online orders

In [None]:
sns.boxplot(x = 'online_order', y = 'votes',data =df)

In [None]:
## Plotly 
px.box(df,x = 'online_order', y = 'votes')

#  Online order restaurants have more number of votes

#### Difference in price between restaurants accepting and not accepting online orders

In [None]:
px.box(df,x = 'online_order', y = 'approx_cost(for two people)')

##### Most luxurious restaurants of Bangalore

In [None]:
## Checking minimum cost of 2 people
df['approx_cost(for two people)'].min()

In [None]:
## Checking maximum cost of 2 people
df['approx_cost(for two people)'].max()

#  It becomes the filter for checking the most luxurious restaurants

In [None]:
df[df['approx_cost(for two people)']>= 5000]['name'] 

##### Top 10 most expensive restaurant with approx cost of 2 people

In [None]:
## creating copy of original data
data_copy = df.copy()

In [None]:
data_copy.set_index('name', inplace = True)

In [None]:
data_copy.head()

In [None]:
data_copy['approx_cost(for two people)'].nlargest(10).plot.bar()

In [None]:
## 10 Cheap restaurants
data_copy['approx_cost(for two people)'].nsmallest(10).plot.bar()

In [None]:
## Budget hotel i.e, restaurants below 500
df[df['approx_cost(for two people)']<= 500]['name'] 

#### Restaurants with rating greater than 4 that are affordable

In [None]:
df[(df['rate']>4) & (df['approx_cost(for two people)'] <= 500)].shape

In [None]:
len(df[(df['rate']>4) & (df['approx_cost(for two people)'] <= 500)]['name'].unique())

In [None]:
## Number of affordable restaurants at each location of Bangalore

In [None]:
df_new = df[(df['rate']>4) & (df['approx_cost(for two people)'] <= 500)]

In [None]:
df_new.head()

In [None]:
# parallel lists: each location in df_new and its number of distinct restaurant names
location, total = [], []

for loc, location_df in df_new.groupby('location'):
    names_here = location_df['name'].unique()
    location.append(loc)
    total.append(len(names_here))

In [None]:
location_df = pd.DataFrame(zip(location, total))
location_df.head()

In [None]:
location_df.columns = ['location', 'restaurant']
location_df.head()

#### Finding best budget Restaurants in any location

In [None]:
## defining a function
def return_budget(location, restaurant, max_cost=400, min_rating=4):
    """Return the names of well-rated, low-cost restaurants in one location.

    Reads the module-level ``df`` and keeps the rows that satisfy all four
    conditions below.

    Args:
        location: Value the ``location`` column must equal.
        restaurant: Value the ``rest_type`` column must equal.
        max_cost: Inclusive upper bound on ``approx_cost(for two people)``.
        min_rating: Exclusive lower bound on ``rate``.

    Returns:
        Array of the distinct ``name`` values of the matching rows, in order
        of first appearance; empty when nothing matches.
    """
    affordable = df['approx_cost(for two people)'] <= max_cost
    in_location = df['location'] == location
    well_rated = df['rate'] > min_rating
    right_type = df['rest_type'] == restaurant
    budget = df[affordable & in_location & well_rated & right_type]
    return budget['name'].unique()

In [None]:
## best restaurants in BTM
return_budget('BTM','Quick Bites')

In [None]:
#### Which are the foodie areas
restaurant_location = df['location'].value_counts()[0:20]
# x/y are passed by keyword, as in the chains bar plot; positional x/y is not
# accepted by every seaborn release
sns.barplot(x = restaurant_location, y = restaurant_location.index)

## Geographical Analysis

#### Latitudes and Longitudes for each location in Bangalore

In [None]:
locations = pd.DataFrame({'Name':df['location'].unique()})
locations.head()

In [None]:
from geopy.geocoders import Nominatim

In [None]:
geolocator = Nominatim(user_agent = 'app')

In [None]:
from geopy.extra.rate_limiter import RateLimiter

# Throttle the lookups to one per second. With its defaults RateLimiter also retries
# a failed request and then returns None instead of raising, so a single timeout
# does not abort the whole loop.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds = 1)

# one entry per row of locations: (latitude, longitude), or NaN when nothing was found
lat_lon = []
for place_name in locations['Name']:
    # a missing location name cannot be geocoded; keep the row aligned with NaN
    place = None if pd.isna(place_name) else geocode(place_name)
    if place is None:
        lat_lon.append(np.nan)
    else:
        lat_lon.append((place.latitude, place.longitude))
# NOTE(review): the bare area name is sent as the query; adding city/country context
# may give more reliable matches — confirm against the geocoded results

In [None]:
locations['geo_loc'] = lat_lon

In [None]:
locations.head()

In [None]:
locations.shape

In [None]:
Rest_locations = pd.DataFrame(df['location'].value_counts().reset_index())
Rest_locations.head()

In [None]:
Rest_locations.columns = ['Name','Count']
Rest_locations.head()

In [None]:
## Combining both the data frames
Rest_locations = Rest_locations.merge(locations , on = 'Name', how = 'left').dropna()
Rest_locations.head()

In [None]:
## Changing geo_loc to latitue and longatude
# transpose the (lat, lon) pairs into one tuple of latitudes and one of longitudes
geo_pairs = Rest_locations['geo_loc'].tolist()
lat, lon = zip(*geo_pairs)

In [None]:
type(lat)

In [None]:
Rest_locations['lat'] = lat
Rest_locations['lon'] = lon

In [None]:
Rest_locations.head()

In [None]:
Rest_locations.drop('geo_loc',axis = 1, inplace = True)

In [None]:
Rest_locations.head()

In [None]:
import folium
from folium.plugins import HeatMap

In [None]:
def generatebasemap(default_location = (12.97, 77.59), default_zoom_start = 12):
    """Create a new folium map.

    Args:
        default_location: ``(latitude, longitude)`` pair the map is centred
            on. The default is an immutable tuple so it cannot be altered
            between calls. (Presumably central Bangalore; confirm.)
        default_zoom_start: Initial zoom level passed to folium.

    Returns:
        The newly created ``folium.Map``.
    """
    basemap = folium.Map(location = default_location, zoom_start = default_zoom_start)
    return basemap

In [None]:
basemap = generatebasemap()

In [None]:
basemap

#### HeatMap of Restaurants

In [None]:
HeatMap(Rest_locations[['lat','lon','Count']].values.tolist(),zoom = 20, radius = 15).add_to(basemap)

In [None]:
basemap

#### Heat map of North Indian restaurants

In [None]:
df.head()

In [None]:
# keeps only the rows whose cuisines value is exactly 'North Indian'
# NOTE(review): a row that lists it together with other cuisines does not pass this
# equality test — confirm that excluding such rows is intended
north_cuisines = df[df['cuisines'] == 'North Indian']
north_cuisines.head()

In [None]:
north_india = north_cuisines.groupby(['location'],as_index = False)['url'].agg('count')

In [None]:
north_india.head()

In [None]:
north_india.columns = ['Name','count']

In [None]:
north_india.head()

In [None]:
## North indian
north_india = north_india.merge(locations,on='Name',how = 'left').dropna()
north_india.head()

In [None]:
north_india['lat'],north_india['lon'] = zip(*north_india['geo_loc'].values)

In [None]:
north_india.head()

In [None]:
north_india.drop('geo_loc',axis = 1, inplace = True)

In [None]:
north_india.head()

In [None]:
basemap = generatebasemap()
HeatMap(north_india[['lat','lon','count']].values.tolist(),zoom = 20, radius = 15).add_to(basemap)
basemap

#### Most casual dining Restaurant chains

In [None]:
df_casual = df.groupby(['rest_type','name']).agg('count')

In [None]:
df_casual.sort_values(['url'],ascending = False).groupby(['rest_type'], as_index = False).apply(lambda x: x.sort_values(by = 'url',ascending = False))

In [None]:
df_casual.sort_values(['url'],ascending = False).groupby(['rest_type'], as_index = False).apply(lambda x: x.sort_values(by = 'url',ascending = False))['url'].reset_index()

In [None]:
## df_casual holds per-(rest_type, name) non-null counts of every column; the 'url'
## count is used here as the number of outlets.
## Rows are sorted by that count (descending) within each rest_type, the index is
## turned back into columns, and 'url' is renamed to 'count'.
df_casual_dataset = df_casual.sort_values(['url'],ascending = False).groupby(['rest_type'], as_index = False).apply(lambda x: x.sort_values(by = 'url',ascending = False))['url'].reset_index().rename(columns = {'url':'count'})

In [None]:
df_casual_dataset

In [None]:
casual_dining = df_casual_dataset[df_casual_dataset['rest_type'] == 'Casual Dining']

In [None]:
casual_dining.head()