In [1]:
# location_based_analysis.py
# --------------------------------------------------
# Location-Based Restaurant Analysis (Cognifyz)
# --------------------------------------------------
# What this script does:
# 1) Loads the provided Zomato-style dataset ("Dataset .csv" or "Dataset.csv")
# 2) Cleans coordinates and basic fields
# 3) Creates interactive maps (MarkerCluster + Heatmap) with Folium
# 4) Builds city & locality summaries (counts, avg rating, avg price range)
# 5) Extracts cuisine insights (overall & per city)
# 6) Saves clean CSV summaries and Matplotlib charts for your report
#
# Outputs saved to the current folder:
# - map_cluster_top1000.html
# - map_cluster_full.html (may be large; use locally)
# - map_heat.html
# - map_heat_top500.html
# - city_summary.csv
# - locality_summary.csv
# - top_localities_per_city.csv
# - top_cuisines_overall.csv
# - top3_cuisines_by_city.csv
# - top_cities_by_count.png
# - top_cities_by_rating_threshold.png
#
# How to run:
# 1) Put this file and the dataset CSV in the same folder.
# 2) Make sure you have packages:
#       pip install pandas folium matplotlib
# 3) Run:
#       python location_based_analysis.py
# --------------------------------------------------

import os
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Optional: import folium only when needed to avoid errors if not installed
try:
    import folium
    from folium.plugins import MarkerCluster, HeatMap
    FOLIUM_AVAILABLE = True
except Exception as _:
    FOLIUM_AVAILABLE = False
    print("Note: folium is not installed. Maps will be skipped. Install with: pip install folium")

# --------------------------------
# Load dataset (supports two names)
# --------------------------------
csv_paths = ["Dataset .csv", "Dataset.csv"]
df = None
for p in csv_paths:
    if os.path.exists(p):
        try:
            df = pd.read_csv(p, encoding="latin-1")
            print(f"Loaded dataset: {p}")
            break
        except Exception as e:
            print(f"Found {p} but failed to read due to: {e}")
if df is None:
    raise FileNotFoundError("Dataset file not found. Place 'Dataset .csv' or 'Dataset.csv' beside this script.")

print("Shape:", df.shape)
print("Columns:", list(df.columns))

Loaded dataset: Dataset .csv
Shape: (9551, 21)
Columns: ['ï»¿Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address', 'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two', 'Currency', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Price range', 'Aggregate rating', 'Rating color', 'Rating text', 'Votes']


In [2]:
# --------------------------------
# Basic cleaning & standard fields
# --------------------------------
# Ensure expected columns exist
required_cols = [
    'Latitude', 'Longitude', 'City', 'Locality', 'Cuisines',
    'Aggregate rating', 'Price range', 'Votes', 'Restaurant Name'
]
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Enforce numeric types where useful
for col in ['Latitude', 'Longitude', 'Aggregate rating', 'Price range', 'Votes']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Keep valid coordinates
df = df[(df['Latitude'].between(-90, 90)) & (df['Longitude'].between(-180, 180))].copy()

# Treat rating 0 as missing for averages
df['Rating_for_avg'] = df['Aggregate rating'].replace(0, np.nan)

# Fill missing Cuisines with 'Unknown'
df['Cuisines'] = df['Cuisines'].fillna('Unknown')

# Drop obvious duplicates by Restaurant Name + City + Locality + coordinates
df = df.drop_duplicates(subset=['Restaurant Name', 'City', 'Locality', 'Latitude', 'Longitude'])

print("After cleaning, shape:", df.shape)

After cleaning, shape: (9551, 22)


In [3]:
# --------------------------------
# Interactive Maps (Folium)
# --------------------------------
center = [df['Latitude'].mean(), df['Longitude'].mean()]

if FOLIUM_AVAILABLE:
    # MarkerCluster (Top 1000 by rating then votes) — shareable size
    df_top = df.sort_values(['Aggregate rating', 'Votes'], ascending=[False, False]).head(1000)
    m_cluster = folium.Map(location=center, zoom_start=3)
    cluster = MarkerCluster().add_to(m_cluster)
    for _, r in df_top.iterrows():
        popup = folium.Popup(
            f"<b>{r.get('Restaurant Name','')}</b><br>"
            f"City: {r.get('City','')}<br>"
            f"Locality: {r.get('Locality','')}<br>"
            f"Cuisines: {r.get('Cuisines','')}<br>"
            f"Rating: {r.get('Aggregate rating', np.nan)} | Votes: {r.get('Votes', np.nan)}",
            max_width=350
        )
        folium.Marker([r['Latitude'], r['Longitude']], tooltip=r.get('Restaurant Name',''), popup=popup).add_to(cluster)
    m_cluster.save("map_cluster_top1000.html")
    print("Saved: map_cluster_top1000.html")

    # Full dataset cluster (bigger file – open locally)
    m_full = folium.Map(location=center, zoom_start=3)
    cluster_full = MarkerCluster().add_to(m_full)
    for _, r in df.iterrows():
        popup = folium.Popup(
            f"<b>{r.get('Restaurant Name','')}</b><br>"
            f"City: {r.get('City','')}<br>"
            f"Locality: {r.get('Locality','')}<br>"
            f"Cuisines: {r.get('Cuisines','')}<br>"
            f"Rating: {r.get('Aggregate rating', np.nan)} | Votes: {r.get('Votes', np.nan)}",
            max_width=350
        )
        folium.Marker([r['Latitude'], r['Longitude']], tooltip=r.get('Restaurant Name',''), popup=popup).add_to(cluster_full)
    m_full.save("map_cluster_full.html")
    print("Saved: map_cluster_full.html (may be large)")

    # Heatmap (all points)
    m_heat = folium.Map(location=center, zoom_start=3)
    heat_data = df[['Latitude','Longitude']].dropna().values.tolist()
    HeatMap(heat_data, radius=8, blur=6).add_to(m_heat)
    m_heat.save("map_heat.html")
    print("Saved: map_heat.html")

    # Heatmap (top 500 by rating & votes)
    df_top500 = df.sort_values(['Aggregate rating','Votes'], ascending=[False, False]).head(500)
    m_heat_top = folium.Map(location=center, zoom_start=4)
    HeatMap(df_top500[['Latitude','Longitude']].dropna().values.tolist(), radius=8, blur=6).add_to(m_heat_top)
    m_heat_top.save("map_heat_top500.html")
    print("Saved: map_heat_top500.html")

Saved: map_cluster_top1000.html
Saved: map_cluster_full.html (may be large)
Saved: map_heat.html
Saved: map_heat_top500.html


In [4]:
# --------------------------------
# City-level summaries & charts
# --------------------------------
city_counts = df['City'].value_counts().rename('Restaurants')
city_avg_rating = df.groupby('City')['Rating_for_avg'].mean().rename('Avg_Rating')
city_avg_price = df.groupby('City')['Price range'].mean().rename('Avg_PriceRange')

city_summary = pd.concat([city_counts, city_avg_rating, city_avg_price], axis=1).sort_values('Restaurants', ascending=False)
city_summary.to_csv("city_summary.csv", index=True)
print("Saved: city_summary.csv")

# Charts with matplotlib (no seaborn, no explicit colors)
plt.figure(figsize=(10,5))
city_counts.head(10).plot(kind='bar')
plt.title("Top 10 Cities by Number of Restaurants")
plt.xlabel("City"); plt.ylabel("Restaurants"); plt.xticks(rotation=45, ha='right')
plt.tight_layout(); plt.savefig("top_cities_by_count.png"); plt.close()
print("Saved: top_cities_by_count.png")

# Avoid small-sample bias: only consider cities with >= 20 restaurants
eligible = city_summary[city_summary['Restaurants'] >= 20]['Avg_Rating'].dropna().sort_values(ascending=False).head(10)
plt.figure(figsize=(10,5))
eligible.plot(kind='bar')
plt.title("Top 10 Cities by Avg Rating (>= 20 restaurants)")
plt.xlabel("City"); plt.ylabel("Average Rating"); plt.xticks(rotation=45, ha='right')
plt.tight_layout(); plt.savefig("top_cities_by_rating_threshold.png"); plt.close()
print("Saved: top_cities_by_rating_threshold.png")

Saved: city_summary.csv
Saved: top_cities_by_count.png
Saved: top_cities_by_rating_threshold.png


In [5]:
# --------------------------------
# Locality-level summaries
# --------------------------------
locality_grp = df.groupby(['City','Locality']).agg(
    Restaurants=('Restaurant Name','count'),
    Avg_Rating=('Rating_for_avg','mean'),
    Avg_PriceRange=('Price range','mean')
).reset_index().sort_values(['City','Restaurants'], ascending=[True, False])
locality_grp.to_csv("locality_summary.csv", index=False)
print("Saved: locality_summary.csv")

# Top localities per city (by restaurant count) - top 10 each
top_localities = (locality_grp.sort_values(['City','Restaurants'], ascending=[True, False])
                             .groupby('City').head(10))
top_localities.to_csv("top_localities_per_city.csv", index=False)
print("Saved: top_localities_per_city.csv")

Saved: locality_summary.csv
Saved: top_localities_per_city.csv


In [6]:
# --------------------------------
# Cuisine insights
# --------------------------------
c = df[['City','Cuisines']].copy()
c['Cuisines'] = c['Cuisines'].fillna('Unknown').astype(str).str.split(', ')
c = c.explode('Cuisines')

top_cuisines_overall = c['Cuisines'].value_counts()
top_cuisines_overall.to_csv("top_cuisines_overall.csv", header=['Count'])
print("Saved: top_cuisines_overall.csv")

top3_by_city = (c.groupby(['City','Cuisines']).size().reset_index(name='Count')
                  .sort_values(['City','Count'], ascending=[True, False])
                  .groupby('City').head(3))
top3_by_city.to_csv("top3_cuisines_by_city.csv", index=False)
print("Saved: top3_cuisines_by_city.csv")

# --------------------------------
# Extra: Simple correlation check (price vs rating at city level)
# --------------------------------
corr_df = city_summary.dropna(subset=['Avg_Rating','Avg_PriceRange'])
corr_val = corr_df['Avg_Rating'].corr(corr_df['Avg_PriceRange'])
print(f"Correlation between Avg Price Range and Avg Rating across cities: {corr_val:.3f}")

print("\nAll done ✅")
print("Now open the HTML maps in your browser and attach the CSVs/PNGs in your report.")

Saved: top_cuisines_overall.csv
Saved: top3_cuisines_by_city.csv
Correlation between Avg Price Range and Avg Rating across cities: 0.396

All done ✅
Now open the HTML maps in your browser and attach the CSVs/PNGs in your report.
