In [2]:
import pandas as pd
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017/")
db = client["yelp_dataset"]

businesses_collection = db["businesses"]
reviews_collection = db["reviews"]
users_collection = db["users"]

# make sure to add run the code to do this in setup.ipynb before running this
business_file = "sampled_businesses.csv"
review_file = "filtered_reviews.csv"
user_file = "filtered_users.csv"

# Load and insert data into MongoDB
def load_csv_to_mongo(file_path, collection):
    data = pd.read_csv(file_path)
    records = data.to_dict(orient="records")  # Convert DataFrame to list of dicts
    collection.insert_many(records)
    print(f"Inserted {len(records)} records into {collection.name} collection.")

# Load data into collections
load_csv_to_mongo(business_file, businesses_collection)
load_csv_to_mongo(review_file, reviews_collection)
load_csv_to_mongo(user_file, users_collection)

Inserted 10000 records into businesses collection.
Inserted 445853 records into reviews collection.
Inserted 295709 records into users collection.


In [4]:
print("Number of businesses:", businesses_collection.count_documents({}))
print("Number of reviews:", reviews_collection.count_documents({}))
print("Number of users:", users_collection.count_documents({}))

Number of businesses: 130000
Number of reviews: 5796089
Number of users: 3844217


In [5]:
# Query 1
import random
def get_top_cities_by_rating():
    return [
        {
            "$match": { 
                "categories": { "$regex": "Restaurant", "$options": "i" }
            }
        },
        {
            "$group": {
                "_id": "$city",
                "average_rating": { "$avg": "$stars" },
                "total_reviews": { "$sum": "$review_count" }
            }
        },
        {
            "$match": {
                "total_reviews": { "$gte": 100 }
            }
        },
        {
            "$sort": { "average_rating": -1 }
        },
        {
            "$limit": 10  # Limit to top 10 cities
        }
    ]

from bson.json_util import dumps
def get_sampled_results(collection, pipeline, sample_size=5):

    results = list(collection.aggregate(pipeline))
    
    if len(results) > sample_size:
        sampled_results = random.sample(results, sample_size)
    else:
        sampled_results = results 
    
    return sampled_results

pipeline = get_top_cities_by_rating()
sampled_results = get_sampled_results(businesses_collection, pipeline)

print("Sampled Results (Top Cities by Average Restaurant Ratings):")
for result in sampled_results:
    print(f"City: {result['_id']}, Average Rating: {result['average_rating']:.2f}, Total Reviews: {result['total_reviews']}")

explain_result = db.command(
    {
        "explain": {
            "aggregate": "businesses",
            "pipeline": pipeline,
            "cursor": {}
        },
        "verbosity": "executionStats"
    }
)

print("\nPerformance Analysis:")
print(dumps(explain_result, indent=4))

Sampled Results (Top Cities by Average Restaurant Ratings):
City: San Antonio, Average Rating: 5.00, Total Reviews: 182
City: Camby, Average Rating: 4.50, Total Reviews: 156
City: Dresher, Average Rating: 4.50, Total Reviews: 1443
City: Twn N Cntry, Average Rating: 4.50, Total Reviews: 234
City: Belle Chasse, Average Rating: 4.50, Total Reviews: 143

Performance Analysis:
{
    "explainVersion": "1",
    "stages": [
        {
            "$cursor": {
                "queryPlanner": {
                    "namespace": "yelp_dataset.businesses",
                    "indexFilterSet": false,
                    "parsedQuery": {
                        "categories": {
                            "$regex": "Restaurant",
                            "$options": "i"
                        }
                    },
                    "queryHash": "B447A271",
                    "planCacheKey": "B447A271",
                    "maxIndexedOrSolutionsReached": false,
                    "maxIndexedA

In [6]:
from bson.json_util import dumps
import random

# Query 2
def get_top_category_by_city():

    return [
        {
            "$unwind": "$categories"
        },
        {
            "$group": {
                "_id": {
                    "city": "$city",
                    "category": "$categories"
                },
                "business_count": {"$sum": 1}
            }
        },
        {
            "$sort": {
                "_id.city": 1,
                "business_count": -1
            }
        },
        {
            "$group": {
                "_id": "$_id.city",
                "top_category": {"$first": "$_id.category"},
                "business_count": {"$first": "$business_count"}
            }
        },
        {
            "$sort": {"_id": 1}
        }
    ]


pipeline = get_top_category_by_city()
sampled_results = get_sampled_results(businesses_collection, pipeline)

print("Sampled Results (Top Category by City):")
for result in sampled_results:
    print(f"City: {result['_id']}, Top Category: {result['top_category']}, Business Count: {result['business_count']}")

explain_result = db.command(
    {
        "explain": {
            "aggregate": "businesses",
            "pipeline": pipeline,
            "cursor": {}
        },
        "verbosity": "executionStats"
    }
)

print("\nPerformance Analysis:")
print(dumps(explain_result, indent=4))

Sampled Results (Top Category by City):
City: Kuna, Top Category: Grocery, Food, Business Count: 13
City: Castleton, Top Category: Ophthalmologists, Shopping, Doctors, Optometrists, Health & Medical, Eyewear & Opticians, Business Count: 13
City: Willow Grove, Top Category: Photographers, Event Planning & Services, Video/Film Production, Professional Services, Business Count: 13
City: Millstadt, Top Category: Food, Coffee & Tea, Business Count: 13
City: Meridian, Top Category: Motorcycle Dealers, Automotive, Auto Parts & Supplies, Shopping, Motorcycle Gear, Motorcycle Repair, Tires, Business Count: 13

Performance Analysis:
{
    "explainVersion": "1",
    "stages": [
        {
            "$cursor": {
                "queryPlanner": {
                    "namespace": "yelp_dataset.businesses",
                    "indexFilterSet": false,
                    "parsedQuery": {},
                    "queryHash": "1D37A11F",
                    "planCacheKey": "1D37A11F",
                  