In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import folium
from folium.plugins import HeatMap
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Read the restaurant dataset into a DataFrame.
# The file name really does contain a space before the extension.
data = pd.read_csv(filepath_or_buffer='Dataset .csv')

In [None]:
# Preview the first five rows to sanity-check the load
print(data.head(n=5))

In [None]:
# Summarise the frame instead of dumping it: print the (rows, columns) shape,
# then per-column dtypes and non-null counts. The first rows are already
# previewed by the head() cell above.
print(data.shape)
data.info()

In [None]:
# LEVEL-1

In [None]:
# TASK-1
# TOP CUISINES

In [None]:
# Step 1: List the column labels so the cuisine column can be identified
print(data.axes[1])

In [None]:
# Step 2: Remove leading/trailing whitespace from every column label so that
# lookups such as data['Cuisines'] match exactly
data.columns = data.columns.map(str.strip)

In [None]:
# Step 3: Count how many restaurants serve each individual cuisine.
# A 'Cuisines' cell can list several cuisines separated by commas, so each
# cell is split, every entry is stripped of surrounding whitespace, and one
# row per (restaurant, cuisine) pair is counted. Counting the raw cell text
# would treat every distinct combination as its own "cuisine" and would not
# credit a multi-cuisine restaurant to any of the cuisines it serves.
# Rows with a missing 'Cuisines' value are skipped.
individual_cuisines = (
    data['Cuisines']
    .dropna()
    .str.split(',')
    .explode()
    .str.strip()
)
top_cuisines = individual_cuisines.value_counts().head(3)

In [None]:
# Show the three most frequent cuisine values with their counts
print(f"{top_cuisines}")

In [None]:
# Step 4: Express each top cuisine's count as a share of all rows in the dataset
total_restaurants = data.shape[0]
top_cuisines_percentage = top_cuisines.div(total_restaurants).mul(100)

In [None]:
# Heading followed by the top-three counts on the next line
print("Top 3 Cuisines:", top_cuisines, sep="\n")

In [None]:
# Blank line, heading, then the percentages derived from the top-three counts
print("\nPercentage of Restaurants Serving Each Cuisine:", top_cuisines_percentage, sep="\n")

In [None]:
# TASK-2
# CITY ANALYSIS

In [None]:
# Step 1: Identify the city with the highest number of restaurants
# value_counts() yields one row count per distinct city
city_counts = data['City'].value_counts()
city_with_highest_restaurants = city_counts.idxmax()
highest_restaurant_count = city_counts.loc[city_with_highest_restaurants]

In [None]:
# Report the city with the most rows and how many rows it has
print(
    f"City with the highest number of restaurants: {city_with_highest_restaurants}",
    f"Number of restaurants: {highest_restaurant_count}",
    sep="\n",
)

In [None]:
# Step 2: Mean 'Aggregate rating' per city (one value per distinct 'City')
average_ratings_by_city = data['Aggregate rating'].groupby(data['City']).mean()

In [None]:
# Step 3: Pick the city whose mean rating is the largest, and that mean
city_with_highest_avg_rating = average_ratings_by_city.idxmax()
highest_avg_rating = average_ratings_by_city.loc[city_with_highest_avg_rating]

In [None]:
# Report the best-rated city; the mean is shown with two decimals
print(
    f"City with the highest average rating: {city_with_highest_avg_rating}",
    f"Average rating: {highest_avg_rating:.2f}",
    sep="\n",
)

In [None]:
# TASK-3
# PRICE RANGE DISTRIBUTION

In [None]:
# List the column labels again before picking the price-range column
print(data.keys())

In [None]:
# Step 1: Bar chart of how many restaurants fall into each 'Price range'.
# The figure is created, labelled and shown in ONE cell: with the inline
# backend every cell renders (and then closes) its own figure, so a title or
# axis label set in a later cell would land on a new, empty figure instead of
# on this chart.
fig_price, ax_price = plt.subplots(figsize=(10, 6))
(
    data['Price range']
    .value_counts()
    .sort_index()  # order the bars by price range rather than by frequency
    .plot(kind='bar', color='skyblue', edgecolor='black', ax=ax_price)
)

# Adding titles and labels
ax_price.set_title('Distribution of Price Ranges Among Restaurants')
ax_price.set_xlabel('Price Range')
ax_price.set_ylabel('Number of Restaurants')

# Displaying the chart
ax_price.tick_params(axis='x', labelrotation=45)
plt.show()


In [None]:
# Step 2: Share of restaurants in each price range, as a percentage.
# normalize=True turns the counts into fractions that sum to 1.
price_range_percentage = data['Price range'].value_counts(normalize=True).mul(100)

In [None]:
# Show the percentage of restaurants per price range
print(f"{price_range_percentage}")

In [None]:
# TASK-4
# ONLINE DELIVERY

In [None]:
# STEP-1: Percentage of rows for each distinct 'Has Online delivery' value
online_delivery_percentage = data['Has Online delivery'].value_counts(normalize=True).mul(100)

In [None]:
# Show the percentage split between the online-delivery values
print(f"{online_delivery_percentage}")

In [None]:
# STEP-2: Mean aggregate rating of the rows whose 'Has Online delivery' is 'Yes'
average_rating_with_delivery = data.loc[
    data['Has Online delivery'] == 'Yes', 'Aggregate rating'
].mean()

In [None]:
# Mean aggregate rating of the rows whose 'Has Online delivery' is 'No'
average_rating_without_delivery = data.loc[
    data['Has Online delivery'] == 'No', 'Aggregate rating'
].mean()

In [None]:
# Print both averages, one per line, for a side-by-side comparison
print(
    f"Average Rating for Restaurants with Online Delivery: {average_rating_with_delivery}",
    f"Average Rating for Restaurants without Online Delivery: {average_rating_without_delivery}",
    sep="\n",
)

In [None]:
# LEVEL-2

In [None]:
# TASK-1
# RESTAURANT RATINGS

In [None]:
# STEP-1: Analyze the distribution of aggregate ratings
# Ten-bin histogram of the 'Aggregate rating' column, drawn on an explicit Axes
fig_ratings, ax_ratings = plt.subplots(figsize=(10, 6))
ax_ratings.hist(data['Aggregate rating'], bins=10, color='skyblue', edgecolor='black')
ax_ratings.set(
    title='Distribution of Aggregate Ratings',
    xlabel='Ratings',
    ylabel='Number of Restaurants',
)
ax_ratings.grid(True)
plt.show()

In [None]:
# STEP-2. Determine the most common rating range
# The ranges are cut with right=False, i.e. each one is closed on the left and
# open on the right: [1, 2), [2, 3), [3, 4), [4, 5). With a last edge of
# exactly 5, a 5.0 rating would fall outside every range and become NaN, so
# the last edge sits just above the maximum rating to keep 5.0 inside '4-5'.
# NOTE(review): ratings below 1 (if the data contains any) still fall outside
# every range — confirm whether that is intended.
MAX_RATING = 5
bins = [1, 2, 3, 4, MAX_RATING + 1e-6]
labels = ['1-2', '2-3', '3-4', '4-5']

In [None]:
# Bucket every aggregate rating into its labelled range; with right=False each
# interval includes its left edge and excludes its right edge
rating_ranges = pd.cut(
    x=data['Aggregate rating'],
    bins=bins,
    right=False,
    labels=labels,
)

In [None]:
# Count the rows per range and take the label with the largest count
range_counts = rating_ranges.value_counts()
most_common_range = range_counts.idxmax()
print(f"The most common rating range is: {most_common_range}")

In [None]:
# STEP-3. Mean of the 'Votes' column, reported with two decimals
average_votes = data.loc[:, 'Votes'].mean()
print(f"Average number of votes received by restaurants: {average_votes:.2f}")

In [None]:
# TASK-2
# CUISINE COMBINATION

In [None]:
# STEP-1. Identify the most common combinations of cuisines
# Turn each 'Cuisines' cell into an order-independent key: split on commas,
# strip the whitespace around every name, then sort into a tuple.
# Without the strip, any entry written after ", " keeps its leading space
# (' Chinese'); that space takes part in the sort, so 'A, B' and 'B, A' end up
# as different tuples and are counted as different combinations.
# Rows with a missing 'Cuisines' value are skipped.
cuisine_combinations = data['Cuisines'].dropna().apply(
    lambda cell: tuple(sorted(name.strip() for name in cell.split(',')))
)

In [None]:
# Tally how often each combination tuple occurs
cuisine_counter = Counter()
cuisine_counter.update(cuisine_combinations)

In [None]:
# Keep the ten most frequent combinations as (combination, count) pairs and list them
top_combinations = cuisine_counter.most_common(10)
print("Most Common Cuisine Combinations:")
for entry in top_combinations:
    print(f"{entry[0]}: {entry[1]}")

In [None]:
# STEP-2. Determine if certain cuisine combinations tend to have higher ratings
# Mean aggregate rating per distinct raw 'Cuisines' string, best-rated first
cuisine_avg_rating = (
    data['Aggregate rating']
    .groupby(data['Cuisines'])
    .mean()
    .sort_values(ascending=False)
)

In [None]:
# Display the average ratings for the most common combinations
# The lookup key is built the same way for the dataset rows and for the counted
# combinations (split on commas, strip, sort, join with ', '), so a combination
# is matched regardless of the order or spacing in which its cuisines were
# written. Looking a re-joined sorted tuple up in an index of raw 'Cuisines'
# strings only matches when the raw text happens to already be in that form.
def _combo_key(cuisines):
    """Return a canonical 'A, B, C' string for an iterable of cuisine names."""
    return ', '.join(sorted(name.strip() for name in cuisines))


rows_with_cuisines = data.dropna(subset=['Cuisines'])
avg_rating_by_combo = rows_with_cuisines['Aggregate rating'].groupby(
    rows_with_cuisines['Cuisines'].map(lambda cell: _combo_key(cell.split(',')))
).mean()

print("\nAverage Ratings for Cuisine Combinations:")
for combo, _count in top_combinations:
    combo_str = _combo_key(combo)
    # 'N/A' is printed when no row carries this combination
    avg_rating = avg_rating_by_combo.get(combo_str, 'N/A')
    print(f"{combo_str}: {avg_rating}")

In [None]:
# TASK-3
# GEOGRAPHIC ANALYSIS

In [None]:
# Preview the name and coordinate columns of the first rows to confirm the
# column labels used by the mapping cells
print(data.head()[['Restaurant Name', 'Longitude', 'Latitude']])

In [None]:
# STEP-1. Plot the locations of restaurants on a map
# The map centre is the mean latitude / mean longitude over all rows
avg_lat, avg_lon = (data[axis].mean() for axis in ('Latitude', 'Longitude'))

In [None]:
# Base map centred on the average coordinates
m = folium.Map(zoom_start=12, location=[avg_lat, avg_lon])


In [None]:
# Add each restaurant location as a marker, with its name as the popup.
# Rows without a latitude or longitude are skipped: folium rejects NaN
# coordinates, so a single incomplete row would abort the whole loop.
# Iterating the three columns directly avoids building a Series per row.
located = data.dropna(subset=['Latitude', 'Longitude'])
for lat, lon, name in zip(
    located['Latitude'], located['Longitude'], located['Restaurant Name']
):
    folium.Marker(location=[lat, lon], popup=name).add_to(m)

In [None]:
# Write the marker map to an HTML file in the working directory and tell the
# reader where to find it
m.save(outfile="restaurant_map.html")
print("Map saved as 'restaurant_map.html'. Open it in your browser to view the plot.")

In [None]:
# STEP-2:Identify patterns or clusters of restaurants
# First, remove any rows with missing values in Longitude and Latitude
data_clean = data.dropna(subset=['Longitude', 'Latitude'])

In [None]:
# Feature matrix for clustering: the two coordinate columns only
coords = data_clean.loc[:, ['Longitude', 'Latitude']]

In [None]:
# Perform K-means clustering to find patterns
# n_init is pinned explicitly: scikit-learn changed its default (10 -> 'auto'),
# so leaving it unset gives version-dependent clusters and a FutureWarning on
# the releases in between. random_state keeps the run reproducible.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
# assign() builds a new frame with the label column instead of writing into
# what may be a slice of `data`
data_clean = data_clean.assign(Cluster=kmeans.fit_predict(coords))

In [None]:
# Second base map, same centre and zoom, for the colour-coded clusters
m_clustered = folium.Map(zoom_start=12, location=[avg_lat, avg_lon])

In [None]:
# One colour per cluster label; the list is indexed by the cluster number
cluster_colors = 'red blue green purple orange'.split()

In [None]:
# Draw one circle per restaurant, coloured by its cluster label; the popup
# shows the cluster number and the restaurant name
for _, restaurant in data_clean.iterrows():
    cluster_id = restaurant['Cluster']
    marker_color = cluster_colors[cluster_id]
    folium.CircleMarker(
        location=[restaurant['Latitude'], restaurant['Longitude']],
        radius=5,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.6,
        popup=f"Cluster {cluster_id} - {restaurant['Restaurant Name']}",
    ).add_to(m_clustered)


In [None]:
# Write the clustered map to an HTML file in the working directory
m_clustered.save(outfile="restaurant_clustered_map.html")
print("Clustered map saved as 'restaurant_clustered_map.html'. Open it in your browser to view the plot.")


In [None]:
# TASK-4
# RESTAURANT CHAINS

In [None]:
# Preview the three columns used in the chain analysis (first rows of `data`)
print(data.head()[['Restaurant Name', 'Aggregate rating', 'Votes']])

In [None]:
# STEP-1:Identify restaurant chains
# A chain is approximated by a restaurant name that appears on more than one
# row, so start by counting the rows per 'Restaurant Name'
restaurant_chain_counts = data.loc[:, 'Restaurant Name'].value_counts()

In [None]:
# Keep only the names that occur more than once (treated as chains)
restaurant_chains = restaurant_chain_counts.loc[restaurant_chain_counts.gt(1)]
print("Restaurant Chains:\n", restaurant_chains)


In [None]:
# STEP-2. Analyze the ratings and popularity of different restaurant chains
# Aggregate every restaurant name first (mean rating, summed votes), then keep
# only the rows whose name was identified as a chain
per_name_stats = data.groupby('Restaurant Name').agg(
    average_rating=('Aggregate rating', 'mean'),
    total_votes=('Votes', 'sum'),
)
chain_analysis = per_name_stats.loc[restaurant_chains.index]

In [None]:
# Two rankings of the same table: best-rated first and most-voted first
chain_analysis_sorted_by_rating = chain_analysis.sort_values('average_rating', ascending=False)
chain_analysis_sorted_by_popularity = chain_analysis.sort_values('total_votes', ascending=False)

In [None]:
# Show the top five chains of each ranking
print("\nRestaurant Chains Sorted by Average Rating:\n", chain_analysis_sorted_by_rating.head(n=5))
print("\nRestaurant Chains Sorted by Popularity (Total Votes):\n", chain_analysis_sorted_by_popularity.head(n=5))

In [None]:
# LEVEL-3:-

In [None]:
# TASK-1:
# RESTAURANT REVIEWS

In [None]:

# Text analysed in this task: the 'Rating text' column of `data`.
# NOTE(review): whether this column holds free-text reviews or short rating
# labels cannot be told from here — confirm it is the intended input.
# Missing entries become empty strings so that every element is a str (a NaN
# would crash on .lower()) and the Series keeps one entry per row of `data`.
reviews = data['Rating text'].fillna('').astype(str)

# Fetch the NLTK resources used below. 'punkt_tab' is the tokenizer data that
# word_tokenize loads on recent NLTK releases; 'punkt' covers older ones.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'vader_lexicon'):
    nltk.download(resource)

# English stopwords as a set for fast membership tests
stop_words = set(stopwords.words('english'))

# VADER sentiment scorer, used by the keyword extraction that follows
sia = SentimentIntensityAnalyzer()

# Preprocess reviews: lowercase, tokenize, and keep only alphabetic tokens
# that are not stopwords. One token list per review, in row order.
processed_reviews = [
    [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]
    for text in reviews
]

In [None]:
# STEP-1. Split the review tokens by overall sentiment.
# Each token list is re-joined and scored; the tokens of a review with a
# positive compound score go to the positive pool, those of a review with a
# negative score to the negative pool, and a score of exactly 0 goes nowhere.
positive_keywords = []
negative_keywords = []

for tokens in processed_reviews:
    compound = sia.polarity_scores(' '.join(tokens))['compound']
    if compound > 0:
        positive_keywords += tokens
    elif compound < 0:
        negative_keywords += tokens

In [None]:

# Find the most common positive and negative keywords.
# Counter is already imported in the notebook's import cell, so it is not
# re-imported here (imports scattered through cells hide dependencies).
positive_counter = Counter(positive_keywords)
negative_counter = Counter(negative_keywords)

# Ten most frequent tokens of each pool as (token, count) pairs
most_common_positive = positive_counter.most_common(10)
most_common_negative = negative_counter.most_common(10)


In [None]:
# Show both keyword rankings, one per line
for heading, ranking in (
    ("Most Common Positive Keywords:", most_common_positive),
    ("Most Common Negative Keywords:", most_common_negative),
):
    print(heading, ranking)

In [None]:
# STEP-2.Calculate the average length of reviews
# Length = number of whitespace-separated words. A non-string entry (e.g. a
# missing value) counts as zero words instead of raising on .split(), and an
# empty dataset yields an average of 0.0 instead of a ZeroDivisionError.
# The list keeps one length per review, in the same order as `reviews`.
review_lengths = [
    len(review.split()) if isinstance(review, str) else 0
    for review in reviews
]
average_length = sum(review_lengths) / len(review_lengths) if review_lengths else 0.0

In [None]:
# Plot the relationship between review length and rating
# The lengths are stored on `data` as a new 'Review Length' column so both
# axes come from the same frame.
data['Review Length'] = review_lengths
fig_len, ax_len = plt.subplots()
ax_len.scatter(data['Review Length'], data['Aggregate rating'])
ax_len.set_xlabel('Review Length')
ax_len.set_ylabel('Aggregate rating')  # was misspelled 'Aggregrate rating'
ax_len.set_title('Review Length vs Rating')
plt.show()

In [None]:
# Report the mean word count per review with two decimals
print(f"Average Length of Reviews: {average_length:.2f} words", end="\n")

In [None]:
# TASK-2:-
# VOTES ANALYSIS

In [None]:
# STEP-1.Identify the highest and lowest number of votes
# Cast 'Votes' to integers, then pull the full row at the first maximum and
# the first minimum of that column
data['Votes'] = data['Votes'].astype(int)
most_voted_label = data['Votes'].idxmax()
least_voted_label = data['Votes'].idxmin()
highest_votes = data.loc[most_voted_label]
lowest_votes = data.loc[least_voted_label]

# Only the name and vote count of each row are printed
summary_fields = ['Restaurant Name', 'Votes']
print("Restaurant with highest votes:", highest_votes[summary_fields])
print("Restaurant with lowest votes:", lowest_votes[summary_fields])

In [None]:
# STEP-2.Correlation between the number of votes and rating
# 2x2 Pearson correlation matrix of the two columns
correlation = data.loc[:, ['Votes', 'Aggregate rating']].corr(method='pearson')
print("Correlation between votes and rating:", correlation)

In [None]:
# TASK-3:-
# PRICE RANGE VS ONLINE DELIVERY AND TABLE BOOKING

In [None]:
# STEP-1.Analyze if there is a relationship between price range and the availability of online delivery and table booking

# Convert the relevant columns to boolean (True/False) for easy analysis.
# The mapping also sends True/False to themselves, which makes this cell safe
# to re-run: with only 'Yes'/'No' as keys, a second run would find no matching
# key for the already-converted booleans and turn both columns into NaN.
# NOTE: this overwrites the columns in `data`; cells that compare them to
# 'Yes'/'No' must run before this one.
YES_NO_TO_BOOL = {'Yes': True, 'No': False, True: True, False: False}
for flag_column in ('Has Online delivery', 'Has Table booking'):
    data[flag_column] = data[flag_column].map(YES_NO_TO_BOOL)

In [None]:
# Calculate the percentage of restaurants offering online delivery and table
# booking across different price ranges.
# The mean of a boolean column is the fraction of True values (0-1), so it is
# scaled by 100 to give the percentages that this cell and the axis labels of
# the plots below describe.
price_range_groups = data.groupby('Price range').agg({
    'Has Online delivery': 'mean',
    'Has Table booking': 'mean'
}) * 100
print(price_range_groups)

In [None]:
# Plotting the results
# Both panels are created, drawn and shown in ONE cell: with the inline backend
# a figure is rendered (and closed) at the end of the cell that creates it, so
# bars added to `ax` in later cells would never be displayed.
fig, ax = plt.subplots(1, 2, figsize=(14, 7))

# STEP-2.Online Delivery vs Price Range
price_range_groups['Has Online delivery'].plot(kind='bar', ax=ax[0])
ax[0].set_title('Online delivery vs Price range')
ax[0].set_ylabel('Percentage Offering Online Delivery')

# STEP-3.Table Booking vs Price Range
price_range_groups['Has Table booking'].plot(kind='bar', ax=ax[1])
ax[1].set_title('Table Booking vs Price Range')
ax[1].set_ylabel('Percentage Offering Table Booking')

# Avoid overlapping labels between the two panels, then render
plt.tight_layout()
plt.show()