In [None]:
import ast
import itertools
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import skewnorm
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides any deprecation / copy warnings raised by later cells - confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Read the Chennai Zomato listing and drop the link column in one expression,
# so re-running the cell always starts again from the file on disk.
df = (
    pd.read_csv("../input/chennai-zomato-restaurants-data/Zomato Chennai Listing 2020.csv")
    .drop(columns=['Zomato URL'])
)

In [None]:
# Count restaurants by which of the two ratings is missing ('None' in the raw data).
no_delivery_rating = df['Delivery Rating'] == 'None'
no_dining_rating = df['Dining Rating'] == 'None'
print(int((~no_delivery_rating & no_dining_rating).sum()))   # delivery rating only
print(int((no_delivery_rating & ~no_dining_rating).sum()))   # dining rating only
print(int((no_delivery_rating & no_dining_rating).sum()))    # neither rating
print(int((~no_delivery_rating & ~no_dining_rating).sum()))  # both ratings

In [None]:
#Cleaning location by including only the area
def clean_loc(text):
    """Return only the area part of a location string.

    The area is whatever follows the last comma ("Mall Name, Area" -> "Area");
    a string without a comma is treated as being the area already.

    Surrounding whitespace is stripped from the result. The previous
    implementation removed the *first space anywhere* in the last segment,
    which corrupted areas that had no leading space ("X,T Nagar" -> "TNagar").

    Parameters
    ----------
    text : str
        Raw value of the 'Location' column.

    Returns
    -------
    str
        The trimmed area name.
    """
    # rsplit with maxsplit=1 yields [text] when there is no comma, so [-1] covers both cases.
    return text.rsplit(",", 1)[-1].strip()

# Reduce every location to its area name, element by element.
df['Location'] = df['Location'].map(clean_loc)

In [None]:
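# TODO: no 'Cost Category' column is created in this cell; prices are bucketed further down via categorize_prices (column 'Price Category').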

In [None]:
# (rows, columns) of the listing at this point in the notebook
df.shape

In [None]:
# Convert the stringified Python lists stored in the CSV into real list objects.
# ast.literal_eval accepts literals only, so an unexpected cell raises an error
# instead of being executed as code the way eval() would.
df['Cuisine'] = df['Cuisine'].apply(ast.literal_eval)
df['Features'] = df['Features'].apply(ast.literal_eval)
# 'Top Dishes' marks missing entries with the bare word Invalid; swap it for an
# empty-list literal first so those rows parse to [].
df['Top Dishes'] = df['Top Dishes'].apply(lambda x: ast.literal_eval(x.replace("Invalid", "[]")))

In [None]:
# Preview the frame after the list columns have been parsed
df.head()

In [None]:
# Length of each restaurant's feature list
df['Number Of Features'] = df['Features'].map(len)

In [None]:
# Force the four rating columns to text so the placeholder strings can be matched uniformly below.
for rating_column in ('Dining Rating', 'Dining Rating Count', 'Delivery Rating', 'Delivery Rating Count'):
    df[rating_column] = df[rating_column].astype(str)

In [None]:
# How many delivery ratings are still the 'None' placeholder
int((df['Delivery Rating'] == 'None').sum())

In [None]:
# Code 0 = the restaurant does not offer that service at all.
# For each service: first the rating ('None' -> '0'), then the count text -> '0'
# on the rows whose rating is '0' after the first step.
for service in ('Delivery', 'Dining'):
    rating_col = f'{service} Rating'
    count_col = f'{service} Rating Count'
    not_offered = df[count_col] == f'Does not offer {service}'
    df[rating_col] = df[rating_col].mask((df[rating_col] == 'None') & not_offered, '0')
    df[count_col] = df[count_col].mask((df[rating_col] == '0') & not_offered, '0')


In [None]:
# Delivery ratings still 'None' after the "not offered" rows were coded
int((df['Delivery Rating'] == 'None').sum())

In [None]:
# Code -1 = the restaurant offers the service but has too few reviews for a rating.
# Same two-step pattern per service: rating first, then the matching count text.
for service in ('Delivery', 'Dining'):
    rating_col = f'{service} Rating'
    count_col = f'{service} Rating Count'
    too_few_reviews = df[count_col] == f'Not enough {service} Reviews'
    df[rating_col] = df[rating_col].mask((df[rating_col] == 'None') & too_few_reviews, '-1')
    df[count_col] = df[count_col].mask((df[rating_col] == '-1') & too_few_reviews, '-1')


In [None]:
# Dining ratings that are still the 'None' placeholder
int((df['Dining Rating'] == 'None').sum())

In [None]:
# Code -2 = every 'None' that is still left in the four rating columns.
# NOTE(review): the original comment reserved -2 for home-delivery-only restaurants
# without enough reviews, but no feature check is applied here - confirm the intent.
for rating_column in ('Delivery Rating', 'Delivery Rating Count', 'Dining Rating', 'Dining Rating Count'):
    df[rating_column] = df[rating_column].replace("None", '-2')

In [None]:
# Dining ratings still equal to 'None' after the -2 replacement
int((df['Dining Rating'] == 'None').sum())

In [None]:
# Turn the rating columns (now digit-only strings, including the 0 / -1 / -2 codes)
# into numbers. pd.to_numeric is called once per column; the previous
# Series.apply(pd.to_numeric) invoked it separately for every single row.
for rating_column in ('Dining Rating', 'Dining Rating Count', 'Delivery Rating', 'Delivery Rating Count'):
    df[rating_column] = pd.to_numeric(df[rating_column])

In [None]:
# Min / mean / max per rating column. The 0, -1 and -2 codes are still present in df,
# so the minimum and the mean printed here include them.
for rating_column in ('Dining Rating', 'Delivery Rating'):
    ratings = df[rating_column]
    print(ratings.min(), ratings.mean(), ratings.max())

# Creating a dataset that has both dining and delivery reviews

In [None]:
# Keep only restaurants with real dining AND delivery figures, i.e. rows where none
# of the four rating columns holds one of the codes 0, -1 or -2.
sentinel_codes = [0, -1, -2]
rating_columns = ['Delivery Rating', 'Dining Rating', 'Delivery Rating Count', 'Dining Rating Count']
has_sentinel = df[rating_columns].isin(sentinel_codes).any(axis=1)
# .copy() makes df_clean an independent frame, so the columns added to it by later
# cells are not assignments on a slice of df (SettingWithCopyWarning).
df_clean = df[~has_sentinel].copy()
df_clean.head()

In [None]:
# Number of restaurants left after removing the coded rows
len(df_clean)

In [None]:
# Restaurants whose dining rating exceeds 4.5
top_dine = df_clean[df_clean['Dining Rating'] > 4.5]
# One row per (Location, Dining Rating) pair; the size lands in the column named 0
vis = (
    top_dine
    .groupby(['Location', 'Dining Rating'])
    .size()
    .reset_index()
)
# Keep the pairs shared by at least two restaurants
data = vis[vis[0] >= 2]
data

In [None]:
# Grouped bars: per location, how many top-rated restaurants share each dining rating
plt.figure(figsize=(8, 5))
sns.set(style="whitegrid")
bar_ax = sns.barplot(data=data, x='Location', y=0, hue='Dining Rating', palette='muted')
plt.xticks(rotation=90)
bar_ax.set_ylabel("Count")
bar_ax.legend(title="Ratings", loc=1, fontsize='small', fancybox=True)

In [None]:
#Gathering data for top cuisines,dishes and features for the entire dataset

def make_dict(column,dataframe):
    """Tally how often each label occurs in a column whose cells are iterables of strings.

    Labels are stripped of surrounding spaces before counting.

    Returns
    -------
    tuple
        (counts, frame): `counts` is a dict ordered by descending count (ties keep
        first-seen order); `frame` is the same data as a DataFrame with the label
        in column 'index' and the tally in column 'Count'.
    """
    tally = {}
    for raw_label in itertools.chain.from_iterable(dataframe[column]):
        label = raw_label.strip(" ")
        tally[label] = tally.get(label, 0) + 1

    # sorted() is stable, so equal counts stay in insertion order
    ranked = dict(sorted(tally.items(), key=lambda item: item[1], reverse=True))
    counts_frame = pd.DataFrame.from_dict(ranked, orient='index', columns=['Count']).reset_index()

    return ranked, counts_frame

# Frequency tables over the entire listing, built in the order dishes, cuisines, features
(
    (top_dishes, top_dishes_df),
    (top_cuisines, top_cuisines_df),
    (top_features, top_features_df),
) = (make_dict(list_column, df) for list_column in ('Top Dishes', 'Cuisine', 'Features'))

top_dishes_df.head()

In [None]:
# Spread of the 'Price for 2' column in the trimmed dataset
# (the label of the first line previously read "Mininmum").
price_for_two = df_clean['Price for 2']
print(f"Minimum Price for meal {price_for_two.min()}")
print(f"Average Price for meal {price_for_two.mean()}")
print(f"Maximum Price for meal {price_for_two.max()}")

**IF WE FOCUS ON THE DATASET THAT HAS BOTH DINING AND DELIVERY REVIEWS, WE FIND THAT THE NUMBER OF REVIEWS IN THE ORIGINAL DATASET IS SLIGHTLY HIGHER**

In [None]:
# Total dining / delivery rating counts over the entire dataset
# (a person who rated several times is counted each time).
# df still carries the codes -1 and -2 in the count columns; they are markers, not
# review counts, so negative values are clipped to 0 before summing. Summing them
# as-is subtracted one or two from the total for every coded restaurant.
num_dined = df['Dining Rating Count'].clip(lower=0).sum()
num_delivery = df['Delivery Rating Count'].clip(lower=0).sum()
print(f"Number of people Dined {num_dined}")
print(f"Number of people ordered {num_delivery}")

In [None]:
# Same totals, restricted to the trimmed dataset (repeat raters are counted each time)
num_dined, num_delivery = (
    df_clean[count_column].sum()
    for count_column in ('Dining Rating Count', 'Delivery Rating Count')
)
print(f"Number of people Dined {num_dined}")
print(f"Number of people ordered {num_delivery}")

In [None]:
# Frequency tables for the trimmed dataset, then the ten most frequent rows of each table
(
    (top_dishes, top_dishes_df),
    (top_cuisines, top_cuisines_df),
    (top_features, top_features_df),
) = (make_dict(list_column, df_clean) for list_column in ('Top Dishes', 'Cuisine', 'Features'))

top_dishes_df = top_dishes_df.iloc[:10]
top_cuisines_df = top_cuisines_df.iloc[:10]
top_features_df = top_features_df.iloc[:10]


In [None]:
# Three side-by-side bar charts: top 10 dishes, cuisines and features.
# The figure title previously read "Featuresin" (missing space).
fig, axes = plt.subplots(1, 3, figsize=(18, 8))
sns.set_palette(palette='husl', n_colors=10)
fig.suptitle("Top 10 Dishes, Cuisines and Features in Chennai's Restaurants")
panels = (
    ("Dishes", top_dishes_df),
    ("Cuisines", top_cuisines_df),
    ("Features", top_features_df),
)
for ax, (axis_label, counts_frame) in zip(axes, panels):
    ax.tick_params(labelrotation=90)  # labels are long; rotate so they do not overlap
    sns.barplot(ax=ax, x=counts_frame['index'], y=counts_frame['Count'])
    ax.set_xlabel(axis_label)


In [None]:
# Pie chart of the ten most frequent dishes, each slice labelled with its share.
# The title used to be "Delivery Tips by type", which does not describe this data.
pie, ax = plt.subplots(figsize=[10,6])
labels = top_dishes_df['index']
plt.pie(x=top_dishes_df['Count'], autopct="%.1f%%", labels=labels, pctdistance=0.5)
plt.title("Share of Mentions Among the Top 10 Dishes", fontsize=14);

In [None]:
# Number of restaurants priced strictly below / strictly above the average price for two
price_for_two = df_clean['Price for 2']
lower_than_avg = int((price_for_two < price_for_two.mean()).sum())
higher_than_avg = int((price_for_two > price_for_two.mean()).sum())
print(f"{lower_than_avg} restaurants offer food that costs less than {price_for_two.mean()}")
print(f"{higher_than_avg} restaurants offer food that costs more than {price_for_two.mean()}")

In [None]:
# Restaurants rated above 4.5 for dining whose price for two is below the average
below_average_price = df_clean['Price for 2'] < df_clean['Price for 2'].mean()
top_rated_dining = df_clean['Dining Rating'] > 4.5
df_clean[below_average_price & top_rated_dining]

In [None]:
# 2x2 grid: rating vs. rating count on the top row, rating histograms with a
# fitted skew-normal curve on the bottom row (delivery left, dining right).
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 8))
fig.suptitle('Showing Various Distributions')
(delivery_scatter_ax, dining_scatter_ax), (delivery_hist_ax, dining_hist_ax) = axes
sns.scatterplot(data=df_clean, x="Delivery Rating", y="Delivery Rating Count", ax=delivery_scatter_ax)
sns.scatterplot(data=df_clean, x="Dining Rating", y="Dining Rating Count", ax=dining_scatter_ax)
sns.distplot(df_clean['Delivery Rating'], fit=skewnorm, kde=False, ax=delivery_hist_ax)
sns.distplot(df_clean['Dining Rating'], fit=skewnorm, kde=False, ax=dining_hist_ax)

In [None]:
# Distribution of the price for two in the trimmed dataset
sns.distplot(df_clean['Price for 2']) 

In [None]:
# Number of restaurants at each distinct price point (the size lands in the column named 0)
grouped_price = df_clean.groupby(['Price for 2']).size().reset_index()
price_points = grouped_price['Price for 2']
restaurants_per_price = grouped_price[0]
sns.barplot(x=price_points, y=restaurants_per_price)
plt.xticks(rotation=90)
plt.title("")

In [None]:
# Pairwise correlation of the numeric columns only. df_clean also holds text and list
# columns, on which DataFrame.corr() raises in pandas >= 2.0 (older versions dropped
# them silently), so the numeric columns are selected explicitly.
sns.heatmap(df_clean.select_dtypes(include='number').corr(), annot=True, fmt='.1g')

In [None]:
#Lets separate the price between cheap, moderate and expensive
def categorize_prices(val, cheap_max=350, average_max=550):
    """Bucket a price into 'Cheap', 'Average' or 'Expensive'.

    Parameters
    ----------
    val : number
        The price to classify.
    cheap_max : number, default 350
        Highest price (inclusive) still labelled 'Cheap'.
    average_max : number, default 550
        Highest price (inclusive) still labelled 'Average'.

    Returns
    -------
    str
        'Cheap' if val <= cheap_max, 'Average' if cheap_max < val <= average_max,
        otherwise 'Expensive'. A NaN fails both comparisons and therefore
        falls through to 'Expensive'.
    """
    if val <= cheap_max:
        return 'Cheap'
    if val <= average_max:
        return 'Average'
    return 'Expensive'

# Attach the price bucket of every restaurant as the column 'Price Category'
df_clean = df_clean.assign(**{'Price Category': df_clean['Price for 2'].map(categorize_prices)})

In [None]:
# Per-category totals of the numeric columns. numeric_only=True keeps text and list
# columns out of the sum; in pandas >= 2.0 a bare .sum() tries to add those as well.
df_clean.groupby(['Price Category']).sum(numeric_only=True)

In [None]:
# Restaurants per (price category, dining rating), keeping ratings above 3.0 only
price_cat_dining = (
    df_clean
    .groupby(['Price Category', 'Dining Rating'])
    .size()
    .reset_index()
    .loc[lambda counts: counts['Dining Rating'] > 3.0]
)
plt.figure(figsize=(12, 8))
dining_ax = sns.barplot(
    x=price_cat_dining['Dining Rating'],
    y=price_cat_dining[0],
    hue=price_cat_dining['Price Category'],
    palette='husl',
)
dining_ax.set_ylabel("Count")

In [None]:
# First rows of the (price category, dining rating) count table
price_cat_dining.head()

In [None]:
# Restaurants per (price category, delivery rating), keeping ratings above 3.0 only
price_cat_delivery = (
    df_clean
    .groupby(['Price Category', 'Delivery Rating'])
    .size()
    .reset_index()
    .loc[lambda counts: counts['Delivery Rating'] > 3.0]
)
plt.figure(figsize=(12, 8))
delivery_ax = sns.barplot(
    x=price_cat_delivery['Delivery Rating'],
    y=price_cat_delivery[0],
    hue=price_cat_delivery['Price Category'],
    palette='husl',
)
delivery_ax.set_ylabel("Count")

In [None]:
# Restaurant names that occur more than 10 times in the trimmed dataset
group_name = (
    df_clean
    .groupby(['Name of Restaurant'])
    .size()
    .reset_index()
    .loc[lambda counts: counts[0] > 10]
)
group_name.head()

In [None]:
# Count rows per (name, price category, delivery rating); display the groups larger than 10.
# group_name itself keeps the unfiltered table.
group_name = (
    df_clean
    .groupby(['Name of Restaurant', 'Price Category', 'Delivery Rating'])
    .size()
    .reset_index()
)
group_name.loc[group_name[0] > 10]

In [None]:
# New column: dining rating count plus delivery rating count
df_clean = df_clean.assign(
    **{'Total Rating Count': df_clean['Dining Rating Count'] + df_clean['Delivery Rating Count']}
)

In [None]:
# Preview the trimmed dataset including the columns added above
df_clean.head()

In [None]:
# Per restaurant name: total rating count ('sum') and number of rows ('count'),
# shown with the largest totals first
top_restaurants = (
    df_clean
    .groupby(['Name of Restaurant'])['Total Rating Count']
    .agg(['sum', 'count'])
    .reset_index()
)
top_restaurants.sort_values(by='sum', ascending=False).head()

In [None]:
# Per location: total rating count ('sum') and number of rows ('count').
# NOTE(review): the original comment asks where the most orders are received, yet the
# table is ordered by 'count' (rows per location), not by 'sum' - confirm which is meant.
top_restaurants_location = (
    df_clean
    .groupby(['Location'])['Total Rating Count']
    .agg(['sum', 'count'])
    .reset_index()
)
top_restaurants_location.sort_values(by='count', ascending=False).head()

In [None]:
# top_dine.boxplot(by='Location',column=['Dining Rating Count'],grid=False)
# plt.xticks(rotation = 90)
# plt.ylabel("Count")
# plt.title("")