In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Reading CSV

In [None]:
# Load zomato.csv into a DataFrame; the path is resolved relative to the current working directory.
df=pd.read_csv('zomato.csv')

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape


In [None]:
# List the column names before deciding which ones to drop.
df.columns

# Deleting unnecessary columns


In [None]:
# Columns treated as unnecessary for this analysis.
unused_columns = ['url', 'address', 'phone', 'dish_liked', 'reviews_list', 'menu_item']
df = df.drop(columns=unused_columns)
df.head()

In [None]:
# Column dtypes and non-null counts after removing the unused columns.
df.info()

# Dropping Duplicates

In [None]:
# Keep the first occurrence of each fully identical row, then report the new shape.
df = df.drop_duplicates()
df.shape

# Cleaning Rate Column

In [None]:
# Inspect the distinct raw rating values to see which formats need cleaning.
df['rate'].unique()

# Removing "NEW","-" and "/5" from Rate Column

In [None]:
def handlerate(value):
    """Convert a raw rating such as ``'4.1/5'`` to a float.

    Parameters
    ----------
    value : str, float or None
        Raw cell from the ``rate`` column.

    Returns
    -------
    float
        The number in front of the ``/``. ``np.nan`` is returned when the
        cell is missing, blank, or one of the placeholders ``'NEW'`` / ``'-'``.

    Raises
    ------
    ValueError
        If the text in front of the ``/`` is not a valid number.
    """
    # None would stringify to 'None' and make float() raise, so treat it as missing.
    if value is None:
        return np.nan

    text = str(value).strip()
    # Strip first so padded placeholders (e.g. ' NEW') and blank cells are caught too.
    if text in ("", "NEW", "-"):
        return np.nan

    # float() tolerates the inner space in values like '3.9 /5';
    # a float NaN stringifies to 'nan', which float() parses back to NaN.
    return float(text.split('/')[0])
# Clean the column element-wise; handlerate returns floats (NaN for 'NEW' / '-'), so the column becomes numeric.
df['rate']=df['rate'].apply(handlerate)
        

In [None]:
# Spot-check the cleaned ratings.
df['rate'].head()

# Filling Null Values in Rate Column with Mean

In [None]:
# Number of ratings that are missing after cleaning.
df.rate.isnull().sum()

In [None]:
# Impute missing ratings with the column mean.
# The result is assigned back rather than calling fillna(..., inplace=True) on
# df['rate']: that form is chained assignment, which emits a FutureWarning on
# pandas 2.x and leaves df unchanged once Copy-on-Write is active.
df['rate'] = df['rate'].fillna(df['rate'].mean())
# Should now be 0.
df['rate'].isnull().sum()

In [None]:
# Re-check non-null counts to see which columns still contain missing values.
df.info()

# Dropping Null Values

In [None]:
# Discard every row that still has a missing value in any column.
df = df.dropna()
df.head()

In [None]:
# Shorter, identifier-friendly names for the two awkwardly named columns.
column_renames = {
    'approx_cost(for two people)': 'Cost2plates',
    'listed_in(type)': 'Type',
}
df = df.rename(columns=column_renames)
df.head()

In [None]:
# Distinct values of the location column.
df['location'].unique()

In [None]:
# Distinct values of listed_in(city), for comparison with location.
df['listed_in(city)'].unique()

# Both `listed_in(city)` and `location` are present; let's keep only `location`

In [None]:
# location is kept; listed_in(city) is removed.
df = df.drop(columns=['listed_in(city)'])

In [None]:
# Inspect the raw cost values to see how they are formatted.
df['Cost2plates'].unique()

In [None]:
# Preview the frame before cleaning the cost column.
df.head()

# Removing commas from the Cost2plates column

In [None]:
def handlecomma(value):
    """Parse a cost value into a float, ignoring any commas.

    Parameters
    ----------
    value : str or number
        Raw cost, e.g. ``'1,200'`` or ``'800'``.

    Returns
    -------
    float
        The numeric value with every ``,`` removed.
    """
    # replace() is a no-op when there is no comma, so no membership check is needed.
    return float(str(value).replace(',', ''))

# Convert every cost to float, then print the distinct values to confirm the conversion.
df['Cost2plates'] = df['Cost2plates'].apply(handlecomma)
unique_values = df['Cost2plates'].unique()
print(unique_values)

In [None]:
# Preview the frame after the cost column was converted to float.
df.head()

# Clean Rest Type Column

In [None]:
# Frequency of each restaurant type.
df['rest_type'].value_counts()

In [None]:
# Frequency of each restaurant type, most common first (value_counts sorts descending by default).
rest_types = df['rest_type'].value_counts()
rest_types

In [None]:
# Restaurant types that occur fewer than 1000 times.
is_rare_rest_type = rest_types.lt(1000)
rest_types_lessthan1000 = rest_types.loc[is_rare_rest_type]
rest_types_lessthan1000

# Grouping rest types with a frequency below 1000 as 'others'

In [None]:
def handle_rest_type(value):
    """Return 'others' for a rare restaurant type, otherwise the value unchanged.

    A type counts as rare when it is an index label of ``rest_types_lessthan1000``.
    """
    # `in` on a Series tests its index labels; going through .index makes that explicit.
    return 'others' if value in rest_types_lessthan1000.index else value


df['rest_type'] = df['rest_type'].apply(handle_rest_type)
df['rest_type'].value_counts()

# Cleaning Location Column

In [None]:
# Frequency of each location before rare ones are grouped.
df['location'].value_counts()

In [None]:
# Locations that occur fewer than 300 times are folded into a single 'others' bucket.
location = df['location'].value_counts()
location_lessthan300 = location.loc[location.lt(300)]


def handle_location(value):
    """Return 'others' when ``value`` is an index label of ``location_lessthan300``, else ``value``."""
    # `in` on a Series tests its index labels; going through .index makes that explicit.
    return 'others' if value in location_lessthan300.index else value


df['location'] = df['location'].apply(handle_location)

In [None]:
# Frequency of each location after rare ones were replaced by 'others'.
df['location'].value_counts()

In [None]:
# Cuisine strings that occur fewer than 100 times are folded into a single 'others' bucket.
cuisines = df['cuisines'].value_counts()
cuisines_lessthan100 = cuisines.loc[cuisines.lt(100)]


def handle_cuisines(value):
    """Return 'others' when ``value`` is an index label of ``cuisines_lessthan100``, else ``value``."""
    # `in` on a Series tests its index labels; going through .index makes that explicit.
    return 'others' if value in cuisines_lessthan100.index else value


df['cuisines'] = df['cuisines'].apply(handle_cuisines)
df['cuisines'].value_counts()


# The data is clean; let's visualize it

# visualizing online order vs rate

In [None]:
# Rating distribution split by whether the restaurant accepts online orders.
fig, ax = plt.subplots(figsize=(6, 6))
sns.boxplot(data=df, x='online_order', y='rate', ax=ax)

# Visualizing Book Table vs Rate

In [None]:
# Rating distribution split by whether the restaurant offers table booking.
fig, ax = plt.subplots(figsize=(6, 6))
sns.boxplot(data=df, x='book_table', y='rate', ax=ax)

# Count Plot of Various Locations

In [None]:
# Bar chart of how many rows fall into each location bucket.
location_counts = df['location'].value_counts()

plt.figure(figsize=(16, 10))

# A colormap called with integers indexes its lookup table directly, so
# range(n) would pick n adjacent, near-identical dark shades. Sampling the
# [0, 1] interval evenly spreads the bars across the whole plasma palette.
bar_colors = plt.cm.plasma(np.linspace(0, 1, len(location_counts)))
bars = plt.bar(location_counts.index, location_counts.values, color=bar_colors)

# Adding grid lines
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Changing background color to white
plt.gca().set_facecolor('white')

plt.xlabel('Location', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.title('Count of Various Locations', fontsize=16)
plt.xticks(rotation=90, fontsize=12)
plt.yticks(fontsize=12)

# Adding data labels on top of each bar; heights are counts, so show them as integers.
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval, int(yval), va='bottom', ha='center', fontsize=10)

plt.tight_layout()
plt.show()

# Visualizing Online Order

In [None]:
# Bar chart of how many rows have each online_order value.
online_order_counts = df['online_order'].value_counts()

plt.figure(figsize=(8, 6))
bars = plt.bar(
    online_order_counts.index,
    online_order_counts.values,
    color=['skyblue', 'orange'],
)

plt.title('Count of Online vs. Offline Orders', fontsize=16)
plt.xlabel('Online Order', fontsize=14)
plt.ylabel('Count', fontsize=14)

# Write each bar's height just above it, centred horizontally.
for rect in bars:
    height = rect.get_height()
    center_x = rect.get_x() + rect.get_width() / 2
    plt.text(center_x, height, round(height, 1), va='bottom', ha='center', fontsize=12)

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()
plt.show()

# Visualizing Online Order Facility,Location Wise

In [None]:
# Count rows (restaurants) per location, split by the online_order value.
# aggfunc='size' counts rows, so the cells are restaurant counts, not order counts.
pivot_table = pd.pivot_table(df, index='location', columns='online_order', aggfunc='size', fill_value=0)

# Plotting the pivot table
pivot_table.plot(kind='bar', stacked=True, figsize=(12, 8))

plt.xlabel('Location', fontsize=14)
# The original label said "Number of Orders", but the bars count restaurants.
plt.ylabel('Number of Restaurants', fontsize=14)
plt.title('Online Order Facility Availability Location-wise', fontsize=16)

plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)

plt.legend(title='Online Order', fontsize=12)

plt.tight_layout()
plt.show()

# Visualizing Book Table Facility, Location Wise

In [None]:
# Count rows (restaurants) per location, split by the book_table value.
# aggfunc='size' counts every row, whichever book_table value it has.
pivot_table = pd.pivot_table(df, index='location', columns='book_table', aggfunc='size', fill_value=0)

# Plotting the pivot table
pivot_table.plot(kind='bar', stacked=True, figsize=(12, 8))

plt.xlabel('Location', fontsize=14)
# The stacked bars include restaurants without table booking, so label the axis as a restaurant count.
plt.ylabel('Number of Restaurants', fontsize=14)
plt.title('Online Book Table Facility Availability Location-wise', fontsize=16)

plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)

plt.legend(title='Online Book Table Facility', fontsize=12)

plt.tight_layout()
plt.show()

# Visualizing Types of Restaurants vs Rate

In [None]:
# Rating distribution for each value of the Type column.
# NOTE(review): newer seaborn releases may warn when `palette` is given without `hue` — confirm against the installed version.
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=df, x='Type', y='rate', palette='inferno', ax=ax)

# No. of Votes,Location Wise

In [None]:
# Total votes per location.
# barplot aggregates with the mean by default, which would show average votes
# per restaurant; the title and axis promise the number of votes, so sum instead.
# NOTE(review): `ci=None` is kept as written; newer seaborn prefers `errorbar=None` — confirm against the installed version.
plt.figure(figsize=(12, 8))
sns.barplot(data=df, x='location', y='votes', estimator=np.sum, ci=None)

plt.xlabel('Location', fontsize=14)
plt.ylabel('Number of Votes', fontsize=14)
plt.title('Number of Votes vs. Location', fontsize=16)

plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()
plt.show()

# Top Cuisines

In [None]:
# The ten cuisine labels with the most votes in total, largest first.
# NOTE(review): if an earlier step replaced rare cuisines with 'others', that bucket can appear here — confirm whether it should be excluded.
top_cuisines = df.groupby('cuisines')['votes'].sum().nlargest(10).index.tolist()

# Filter the DataFrame to include only the top cuisines
df_top_cuisines = df[df['cuisines'].isin(top_cuisines)]

plt.figure(figsize=(12, 8))
# The ranking above is by summed votes, so plot the same statistic (barplot
# defaults to the mean) and keep the bars in ranking order.
# NOTE(review): `ci=None` is kept as written; newer seaborn prefers `errorbar=None` — confirm against the installed version.
sns.barplot(
    data=df_top_cuisines,
    x='cuisines',
    y='votes',
    estimator=np.sum,
    order=top_cuisines,
    ci=None,
)

plt.xlabel('Cuisine', fontsize=14)
plt.ylabel('Number of Votes', fontsize=14)
plt.title('Top Cuisine vs. Number of Votes', fontsize=16)

plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()
plt.show()