In [2]:
import pandas as pd

In [4]:
df = pd.read_json('./yelp_dataset/yelp_academic_dataset_business.json', lines=True)

In [6]:
df['location'] = df.apply(
    lambda row: {
        "type": "Point",
        "coordinates": [row['longitude'], row['latitude']]
    },
    axis=1
)

In [8]:
df

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,location
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",,"{'type': 'Point', 'coordinates': [-119.7111968..."
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ...","{'type': 'Point', 'coordinates': [-90.335695, ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ...","{'type': 'Point', 'coordinates': [-110.880452,..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...","{'type': 'Point', 'coordinates': [-75.1555641,..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2...","{'type': 'Point', 'coordinates': [-75.4716585,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3388 Gateway Blvd,Edmonton,AB,T6J 5H2,53.468419,-113.492054,3.0,13,1,"{'ByAppointmentOnly': 'False', 'RestaurantsPri...","Nail Salons, Beauty & Spas","{'Monday': '10:0-19:30', 'Tuesday': '10:0-19:3...","{'type': 'Point', 'coordinates': [-113.4920537..."
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,2813 Bransford Ave,Nashville,TN,37204,36.115118,-86.766925,4.0,5,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Pets, Nurseries & Gardening, Pet Stores, Hobby...","{'Monday': '9:30-17:30', 'Tuesday': '9:30-17:3...","{'type': 'Point', 'coordinates': [-86.766925, ..."
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,"6020 E 82nd St, Ste 46",Indianapolis,IN,46250,39.908707,-86.065088,3.5,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Shopping, Jewelry, Piercing, Toy Stores, Beaut...",,"{'type': 'Point', 'coordinates': [-86.065088, ..."
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,Edwardsville,IL,62025,38.782351,-89.950558,4.0,24,1,"{'BusinessParking': '{'garage': False, 'street...","Fitness/Exercise Equipment, Eyewear & Optician...","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ...","{'type': 'Point', 'coordinates': [-89.9505584,..."


In [4]:
import pandas as pd
from pymongo import MongoClient
import sys

# --- 1. Load your DataFrame ---
try:
    df = pd.read_json(
        './yelp_dataset/yelp_academic_dataset_business.json', 
        lines=True
    )
    print(f"Successfully loaded {len(df)} total businesses.")
except Exception as e:
    print(f"Error reading JSON file: {e}")
    sys.exit()

# --- 2. Prepare Data for MongoDB ---
print("Preparing data for insertion...")

# Drop businesses that have missing lat/long or state
df = df.dropna(subset=['latitude', 'longitude', 'state'])

# Create the 'location' field in GeoJSON format
# IMPORTANT: Coordinates are [longitude, latitude]
df['location'] = df.apply(
    lambda row: {
        "type": "Point",
        "coordinates": [row['longitude'], row['latitude']]
    },
    axis=1
)
print(f"Data prepared. {len(df)} valid businesses to insert.")


# --- 3. Connect to the MongoDB Router ---
# This is the ONLY connection string you need.
# It connects to the 'mongo-router' container's exposed port.
CONNECTION_STRING = "mongodb://localhost:27017/"

try:
    client = MongoClient(CONNECTION_STRING)
    client.admin.command('ping')
    print("MongoDB router connection successful.")
except Exception as e:
    print(f"Error connecting to MongoDB router: {e}")
    sys.exit()

# Define your database and collection
db = client['test']
collection = db['businesses']

# --- 4. Insert the Data ---
# Convert the DataFrame to a list of dictionaries
records_to_insert = df.to_dict('records')

if not records_to_insert:
    print("No records to insert.")
    sys.exit()

print(f"Inserting {len(records_to_insert)} records into 'yelp_data.businesses'...")

try:
    # Clear the collection for a fresh run (optional)
    collection.delete_many({})
    
    # Insert all records
    result = collection.insert_many(records_to_insert)
    print(f"  SUCCESS: Inserted {len(result.inserted_ids)} records.")
    print("  MongoDB is automatically partitioning this data across your 5 shards!")
    
except Exception as e:
    print(f"  FAILED to insert records: {e}")

finally:
    client.close()
    print("Connection closed.")

Successfully loaded 150346 total businesses.
Preparing data for insertion...
Data prepared. 150346 valid businesses to insert.
MongoDB router connection successful.
Inserting 150346 records into 'yelp_data.businesses'...
  SUCCESS: Inserted 150346 records.
  MongoDB is automatically partitioning this data across your 5 shards!
Connection closed.


In [None]:
df_rev = pd.read_json('./yelp_dataset/yelp_academic_dataset_review.json', lines=True)

In [12]:
import pandas as pd
from pymongo import MongoClient
import sys

# --- 1. Create the Business Lookup Map ---
print("Loading business data to create lookup map...")
try:
    df_biz = pd.read_json(
        './yelp_dataset/yelp_academic_dataset_business.json', 
        lines=True
    )
    
    # Drop businesses with no location data
    df_biz = df_biz.dropna(subset=['latitude', 'longitude', 'state'])
    
    # Create the GeoJSON 'location' field
    df_biz['location'] = df_biz.apply(
        lambda row: {
            "type": "Point",
            "coordinates": [row['longitude'], row['latitude']]
        },
        axis=1
    )
    
    # Create the map: 'business_id' -> {'state': 'AZ', 'location': {...}}
    biz_map = df_biz.set_index('business_id')[['state', 'location']].to_dict('index')
    
    print(f"Business map created with {len(biz_map)} entries.")

except Exception as e:
    print(f"Error loading business JSON: {e}")
    sys.exit()

# --- 2. Connect to MongoDB Router ---
CONNECTION_STRING = "mongodb://localhost:27017/"
try:
    client = MongoClient(CONNECTION_STRING)
    client.admin.command('ping')
    db = client['yelp_data']
    reviews_collection = db['reviews']
    
    # Clear old data for a fresh run
    reviews_collection.delete_many({})
    print("Connected to MongoDB and cleared old reviews.")
except Exception as e:
    print(f"Error connecting to MongoDB: {e}")
    sys.exit()

# --- 3. Stream, Enrich, and Insert Reviews ---
review_file_path = './yelp_dataset/yelp_academic_dataset_review.json'
chunk_size = 50000  # Process 50,000 reviews at a time
total_inserted = 0

print(f"Starting review insertion from {review_file_path}...")

try:
    with pd.read_json(review_file_path, lines=True, chunksize=chunk_size) as reader:
        for chunk_df in reader:
            # --- This is the ENRICHMENT step ---
            
            # 1. Map 'state' from the biz_map
            chunk_df['state'] = chunk_df['business_id'].map(
                lambda x: biz_map.get(x, {}).get('state')
            )
            
            # 2. Map 'location' from the biz_map
            chunk_df['location'] = chunk_df['business_id'].map(
                lambda x: biz_map.get(x, {}).get('location')
            )
            
            # 3. Drop reviews for businesses we couldn't find (e.g., no state)
            chunk_df = chunk_df.dropna(subset=['state'])
            
            if chunk_df.empty:
                print("  Skipped chunk (no matching businesses).")
                continue
                
            # 4. Convert to dict and insert
            records_to_insert = chunk_df.to_dict('records')
            reviews_collection.insert_many(records_to_insert)
            
            total_inserted += len(records_to_insert)
            print(f"  Inserted {len(records_to_insert)} records. Total: {total_inserted}")

except Exception as e:
    print(f"Error reading or inserting reviews: {e}")
finally:
    client.close()
    print(f"\nInsertion complete. Total {total_inserted} reviews inserted.")

Loading business data to create lookup map...
Business map created with 150346 entries.
Connected to MongoDB and cleared old reviews.
Starting review insertion from ./yelp_dataset/yelp_academic_dataset_review.json...
  Inserted 50000 records. Total: 50000
  Inserted 50000 records. Total: 100000
  Inserted 50000 records. Total: 150000
  Inserted 50000 records. Total: 200000
  Inserted 50000 records. Total: 250000
  Inserted 50000 records. Total: 300000
  Inserted 50000 records. Total: 350000
  Inserted 50000 records. Total: 400000
  Inserted 50000 records. Total: 450000
  Inserted 50000 records. Total: 500000
  Inserted 50000 records. Total: 550000
  Inserted 50000 records. Total: 600000
  Inserted 50000 records. Total: 650000

Insertion complete. Total 650000 reviews inserted.


KeyboardInterrupt: 

In [None]:
db.reviews.aggregate([
  {
    $vectorSearch: {
      "index": "vector_index_reviews",
      "path": "embedding",
      "queryVector": query_vector,
      "numCandidates": 150, // How many to check
      "limit": 10           // How many to return
    }
  },
  {
    // Add a projection to show the score and text
    $project: {
      score: { $meta: "vectorSearchScore" },
      text: 1,
      state: 1
    }
  }
])