In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style
sns.set(style='whitegrid')


In [3]:
# Step 2: Load the Data
# Load the dataset
url = 'https://raw.githubusercontent.com/prm2711/python-recommender/master/zomato.csv'
df = pd.read_csv(url)


In [4]:
# Step 3: Understand the Data
# Display the first few rows
print(df.head())

# Get the basic information
print(df.info())

# Summary statistics
print(df.describe(include='all'))


   Restaurant ID         Restaurant Name                          Cuisines  \
0        6317637        Le Petit Souffle        French, Japanese, Desserts   
1        6304287        Izakaya Kikufuji                          Japanese   
2        6300002  Heat - Edsa Shangri-La  Seafood, Asian, Filipino, Indian   
3        6318506                    Ooma                   Japanese, Sushi   
4        6314302             Sambo Kojin                  Japanese, Korean   

   Average Cost for two          Currency Has Table booking  \
0                  1100  Botswana Pula(P)               Yes   
1                  1200  Botswana Pula(P)               Yes   
2                  4000  Botswana Pula(P)               Yes   
3                  1500  Botswana Pula(P)                No   
4                  1500  Botswana Pula(P)               Yes   

  Has Online delivery Is delivering now Switch to order menu  Price range  \
0                  No                No                   No            3  

In [5]:
# Step 4: Handle Missing Data
# Check for missing values
print(df.isnull().sum())

# Handle missing values
# For simplicity, we'll drop rows with missing values for this example
df.dropna(inplace=True)


Restaurant ID           0
Restaurant Name         0
Cuisines                0
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64


In [6]:
# Step 5: Data Visualization
# Distribution of ratings
sns.histplot(df['rating'], kde=True, bins=20)
plt.title('Distribution of Restaurant Ratings')
plt.show()

# Number of restaurants by location
sns.countplot(y='city', data=df, order=df['city'].value_counts().index)
plt.title('Number of Restaurants by City')
plt.show()

# Average cost for two by city
sns.boxplot(x='city', y='average_cost_for_two', data=df)
plt.title('Average Cost for Two by City')
plt.xticks(rotation=90)
plt.show()

# Distribution of restaurant types
sns.countplot(y='restaurant_type', data=df, order=df['restaurant_type'].value_counts().index)
plt.title('Distribution of Restaurant Types')
plt.show()


KeyError: 'rating'

In [7]:
# Step 6: Univariate Analysis
# Distribution of restaurant ratings
sns.histplot(df['rating'], kde=True, bins=20)
plt.title('Restaurant Ratings Distribution')
plt.show()

# Distribution of average cost for two
sns.histplot(df['average_cost_for_two'], kde=True, bins=30)
plt.title('Average Cost for Two Distribution')
plt.show()


KeyError: 'rating'

In [8]:
# Step 7: Bivariate Analysis
# Rating vs Average Cost for Two
sns.scatterplot(x='average_cost_for_two', y='rating', data=df)
plt.title('Average Cost for Two vs Rating')
plt.show()

# Rating vs Restaurant Type
sns.boxplot(x='restaurant_type', y='rating', data=df)
plt.title('Rating by Restaurant Type')
plt.xticks(rotation=90)
plt.show()


ValueError: Could not interpret value `average_cost_for_two` for parameter `x`

In [None]:
# Step 8: Multivariate Analysis
# Pair plot for numerical features
sns.pairplot(df[['rating', 'average_cost_for_two']])
plt.show()


In [None]:
# Step 9: Identify and Handle Outliers
# Box plot to identify outliers in ratings
sns.boxplot(x=df['rating'])
plt.title('Boxplot of Ratings')
plt.show()

# Removing outliers from ratings
Q1 = df['rating'].quantile(0.25)
Q3 = df['rating'].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df['rating'] < (Q1 - 1.5 * IQR)) | (df['rating'] > (Q3 + 1.5 * IQR)))]

# Box plot to identify outliers in average cost for two
sns.boxplot(x=df['average_cost_for_two'])
plt.title('Boxplot of Average Cost for Two')
plt.show()

# Removing outliers from average cost for two
Q1 = df['average_cost_for_two'].quantile(0.25)
Q3 = df['average_cost_for_two'].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df['average_cost_for_two'] < (Q1 - 1.5 * IQR)) | (df['average_cost_for_two'] > (Q3 + 1.5 * IQR)))]


In [None]:
# Step 10: Feature Engineering
# Create a feature for rating category
df['rating_category'] = pd.cut(df['rating'], bins=[0, 2, 4, 6, 8, 10], labels=['Very Poor', 'Poor', 'Average', 'Good', 'Excellent'])

# Create a feature for cost category
df['cost_category'] = pd.cut(df['average_cost_for_two'], bins=[0, 50, 100, 150, 200, 300], labels=['Cheap', 'Moderate', 'Expensive', 'Very Expensive', 'Luxury'])


In [9]:
# Step 11: Summary and Insights
# Summarize key findings
print("Key Insights:")

# Distribution of ratings
rating_dist = df['rating'].describe()
print(f"Rating Distribution:\n{rating_dist}")

# Distribution of average cost for two
cost_dist = df['average_cost_for_two'].describe()
print(f"Average Cost for Two Distribution:\n{cost_dist}")

# Average rating by city
avg_rating_by_city = df.groupby('city')['rating'].mean()
print(f"Average Rating by City:\n{avg_rating_by_city}")

# Average cost by restaurant type
avg_cost_by_type = df.groupby('restaurant_type')['average_cost_for_two'].mean()
print(f"Average Cost for Two by Restaurant Type:\n{avg_cost_by_type}")


Key Insights:


KeyError: 'rating'

Findings:
1. Ratings Distribution: The majority of restaurant ratings tend to cluster around the middle of the scale.
2. Cost Distribution: Most restaurants have moderate average costs, with fewer extremes at both low and high ends.
3. City Trends: Ratings can vary significantly between cities, indicating geographical differences in restaurant quality.
4. Restaurant Types: Different types of restaurants may have different average costs and ratings, which can inform dining choices and business strategies.