In [1]:
import pandas as pd
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017/")
db = client["yelp_dataset"]

businesses_collection = db["businesses"]
reviews_collection = db["reviews"]
users_collection = db["users"]

# make sure to add run the code to do this in setup.ipynb before running this
business_file = "sampled_businesses.csv"
review_file = "filtered_reviews.csv"
user_file = "filtered_users.csv"

# Load and insert data into MongoDB
def load_csv_to_mongo(file_path, collection):
    data = pd.read_csv(file_path)
    records = data.to_dict(orient="records")  # Convert DataFrame to list of dicts
    collection.insert_many(records)
    print(f"Inserted {len(records)} records into {collection.name} collection.")

# Load data into collections
load_csv_to_mongo(business_file, businesses_collection)
load_csv_to_mongo(review_file, reviews_collection)
load_csv_to_mongo(user_file, users_collection)

Inserted 10000 records into businesses collection.
Inserted 445853 records into reviews collection.
Inserted 295709 records into users collection.


In [2]:
print("Number of businesses:", businesses_collection.count_documents({}))
print("Number of reviews:", reviews_collection.count_documents({}))
print("Number of users:", users_collection.count_documents({}))

Number of businesses: 110000
Number of reviews: 4904383
Number of users: 3252799


In [3]:
# sample easy query
five_star_businesses = businesses_collection.find({"stars": 5})

pipeline = [
    {"$sort": {"review_count": -1}},
    {"$limit": 10}
]
top_users = users_collection.aggregate(pipeline)
for user in top_users:
    print(user)

{'_id': ObjectId('675265316a8ff6c2bc191dd5'), 'user_id': 'Hi10sGSZNxQH3NLyWSZ1oA', 'name': 'Fox', 'review_count': 17473, 'avg_stars': 3.77}
{'_id': ObjectId('6755385c9b0ba8da4550c27b'), 'user_id': 'Hi10sGSZNxQH3NLyWSZ1oA', 'name': 'Fox', 'review_count': 17473, 'avg_stars': 3.77}
{'_id': ObjectId('67553c599b0ba8da4567b212'), 'user_id': 'Hi10sGSZNxQH3NLyWSZ1oA', 'name': 'Fox', 'review_count': 17473, 'avg_stars': 3.77}
{'_id': ObjectId('67524b8fccf0cdec7362e6e9'), 'user_id': 'Hi10sGSZNxQH3NLyWSZ1oA', 'name': 'Fox', 'review_count': 17473, 'avg_stars': 3.77}
{'_id': ObjectId('67524c5dccf0cdec736e5eb4'), 'user_id': 'Hi10sGSZNxQH3NLyWSZ1oA', 'name': 'Fox', 'review_count': 17473, 'avg_stars': 3.77}
{'_id': ObjectId('67554372accaec1aacd2a056'), 'user_id': 'Hi10sGSZNxQH3NLyWSZ1oA', 'name': 'Fox', 'review_count': 17473, 'avg_stars': 3.77}
{'_id': ObjectId('67553cda4d07558db1144c2d'), 'user_id': 'Hi10sGSZNxQH3NLyWSZ1oA', 'name': 'Fox', 'review_count': 17473, 'avg_stars': 3.77}
{'_id': ObjectId('67

In [4]:
def get_top_cities_by_rating():
    """
    Builds a pipeline to aggregate top 10 cities by average rating for restaurants with at least 100 reviews.
    
    Returns:
        list: MongoDB aggregation pipeline.
    """
    return [
        {
            "$match": { 
                "categories": { "$regex": "Restaurant", "$options": "i" }
            }
        },
        {
            "$group": {
                "_id": "$city",
                "average_rating": { "$avg": "$stars" },
                "total_reviews": { "$sum": "$review_count" }
            }
        },
        {
            "$match": {
                "total_reviews": { "$gte": 100 }
            }
        },
        {
            "$sort": { "average_rating": -1 }
        },
        {
            "$limit": 10  # Limit to top 10 cities
        }
    ]

from bson.json_util import dumps
pipeline = get_top_cities_by_rating()
results = businesses_collection.aggregate(pipeline)

print("Top 10 Cities by Average Restaurant Ratings:")
for result in results:
    print(f"City: {result['_id']}, Average Rating: {result['average_rating']}, Total Reviews: {result['total_reviews']}")

explain_result = db.command(
        {
        "explain": {
            "aggregate": "businesses",
            "pipeline": pipeline,
            "cursor": {}
        },
        "verbosity": "executionStats"
    }
)

print("\nPerformance Analysis:")
print(dumps(explain_result, indent=4))

Top 10 Cities by Average Restaurant Ratings:
City: San Antonio, Average Rating: 5.0, Total Reviews: 154
City: Harrison Township, Average Rating: 5.0, Total Reviews: 176
City: Camby, Average Rating: 4.5, Total Reviews: 132
City: Glendora, Average Rating: 4.5, Total Reviews: 5434
City: Gladwyne, Average Rating: 4.5, Total Reviews: 187
City: Redington Shores, Average Rating: 4.5, Total Reviews: 495
City: Perkasie, Average Rating: 4.5, Total Reviews: 2167
City: Glenmoore, Average Rating: 4.5, Total Reviews: 110
City: Mount Laurel Township, Average Rating: 4.5, Total Reviews: 2156
City: Pass-a-Grille Beach, Average Rating: 4.5, Total Reviews: 1672

Performance Analysis:
{
    "explainVersion": "1",
    "stages": [
        {
            "$cursor": {
                "queryPlanner": {
                    "namespace": "yelp_dataset.businesses",
                    "indexFilterSet": false,
                    "parsedQuery": {
                        "categories": {
                            "$