In [8]:
import os
import sys
import json
import pandas as pd
from collections import Counter

# Directory that holds the project's helper modules (e.g. `preprocessing`).
# The default is the original author's location; set the YELP_SCRIPTS_DIR
# environment variable to point at a different checkout without editing
# the notebook.
src_path = os.environ.get(
    "YELP_SCRIPTS_DIR",
    r"D:\SEM 4\CS516\Yelp Fairness Review\scripts",
)
# Guard against appending the same entry again when the cell is re-run.
if src_path not in sys.path:
    sys.path.append(src_path)

from preprocessing import (
    
      add_chain_flags,
    sample_reviews_per_city, add_category_flags, clean_reviews
)

In [9]:
# Quick look at the business file: parse only the first `sample_row_limit`
# records (one JSON object per line) instead of the whole dataset.
sample_row_limit = 10000
businesses = []
with open('../Yelp-JSON/yelp_dataset/yelp_academic_dataset_business.json', encoding='utf-8') as f:
    for i, line in enumerate(f):
        # `i` is zero-based: `i >= limit` stops after exactly `limit` rows
        # (`i > limit` would load one extra row).
        if i >= sample_row_limit:
            break
        businesses.append(json.loads(line))
raw_business_df = pd.DataFrame(businesses)
print("Unique cities in sample:", raw_business_df['city'].unique())

Unique cities in sample: ['Santa Barbara' 'Affton' 'Tucson' 'Philadelphia' 'Green Lane'
 'Ashland City' 'Brentwood' 'St. Petersburg' 'Nashville' "Land O' Lakes"
 'Tampa Bay' 'Indianapolis' 'Clearwater' 'Largo' 'New Orleans' 'Kenner'
 'Edmonton' 'Reno' 'Newtown' 'White House' 'Boise' 'Paoli' 'Ardmore'
 'Exton' 'Wilmington' 'Edwardsville' 'Sparks' 'Alton' 'Cherry Hill'
 'Bala Cynwyd' 'Springfield' 'Belleville' 'Carmel' 'Tampa'
 'Kennett Square' 'Plymouth Meeting' 'Harvey' 'West Chester' 'Meridian'
 'Hudson' 'Fernley' 'Williamstown' 'Pinellas Park' 'Glenolden'
 'Wesley Chapel' 'Fishers' 'Burlington' 'Troy' 'Camden' 'Plainfield'
 'Bensalem' 'Maplewood' 'Saint Louis' 'Fairview Heights' 'Oro Valley'
 'Treasure Island' 'Southampton' 'Chalfont' 'Willow Grove' 'Voorhees'
 'Tarpon Springs' 'Blue Bell' 'Metairie' 'Woodbury' 'Brownsburg'
 'Norristown' 'Land O Lakes' 'Greenwood' 'Saint Petersburg' 'Brookhaven'
 'Haverford' 'Glenside' 'Moorestown' 'Madison' 'Ewing' 'Levittown'
 'Gloucester Township'

In [10]:
# Show every distinct `categories` string found in the sampled businesses.
unique_categories = raw_business_df['categories'].unique()
print("Unique categories in sample:", unique_categories)


Unique categories in sample: ['Doctors, Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists'
 'Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services'
 'Department Stores, Shopping, Fashion, Home & Garden, Electronics, Furniture Stores'
 ... 'Bars, Nightlife, Airport Lounges'
 'Professional Services, Local Services, Printing Services, Graphic Design'
 'Burgers, Ice Cream & Frozen Yogurt, Restaurants, Food, Fast Food']


In [11]:
# Count the records in the business file (one per line) without parsing them.
# Summing over the lines yields 0 for an empty file and never depends on a
# loop variable that might be unbound or left over from an earlier cell.
with open('../Yelp-JSON/yelp_dataset/yelp_academic_dataset_business.json', encoding='utf-8') as f:
    total_businesses = sum(1 for _ in f)
print(f"Total businesses: {total_businesses}")


Total businesses: 150346


In [12]:
# Look for city names containing any of these (lower-cased) substrings and
# count how many businesses each matching city has.
target_substrings = ['las vegas', 'toronto', 'phoenix', 'charlotte', 'edinburgh', 'pittsburgh']
city_counter = Counter()

with open('../Yelp-JSON/yelp_dataset/yelp_academic_dataset_business.json', encoding='utf-8') as f:
    for line in f:
        business = json.loads(line)
        # Normalise whitespace and case so the substring test is case-insensitive.
        city_lower = business['city'].strip().lower()
        # `any` counts each business at most once, even if its city name
        # happens to contain more than one of the target substrings.
        if any(t in city_lower for t in target_substrings):
            city_counter[city_lower] += 1

print("Cities with matching substrings and their counts:")
for k, v in city_counter.items():
    print(f"{k}: {v}")


Cities with matching substrings and their counts:
phoenixville: 366
charlotte: 1


In [13]:
# Tally businesses per whitespace-trimmed city name across the full file.
with open('../Yelp-JSON/yelp_dataset/yelp_academic_dataset_business.json', encoding='utf-8') as f:
    city_counter = Counter(json.loads(line)['city'].strip() for line in f)

# Report the ten cities with the most businesses.
top_cities = city_counter.most_common(10)
print("Top cities in dataset by number of businesses:")
for city_name, n_businesses in top_cities:
    print(f"{city_name}: {n_businesses}")

Top cities in dataset by number of businesses:
Philadelphia: 14570
Tucson: 9252
Tampa: 9051
Indianapolis: 7543
Nashville: 6974
New Orleans: 6209
Reno: 5937
Edmonton: 5054
Saint Louis: 4828
Santa Barbara: 3834


In [20]:
# Cities to keep and the brand names used to recognise major hotel chains.
selected_cities = ['Philadelphia', 'Tampa', 'Indianapolis', 'Nashville', 'New Orleans','Reno']
major_chains = ['Marriott', 'Hilton', 'Hyatt', 'Sheraton', 'Westin', 'DoubleTree', 'Holiday Inn']

# Collect every business in a selected city whose name or category string
# contains one of the brand names (case-sensitive substring match).
# NOTE(review): nothing here restricts matches to a hotel category, so a
# non-hotel business whose name contains a brand would also be kept - confirm
# this is intended.
chain_hotels = []
with open('../Yelp-JSON/yelp_dataset/yelp_academic_dataset_business.json', encoding='utf-8') as f:
    for line in f:
        business = json.loads(line)
        city = business['city'].strip()
        # The name is searched first, then the stringified categories field.
        searchable_fields = (business['name'], str(business.get('categories', '')))
        if city not in selected_cities:
            continue
        if any(chain in field for field in searchable_fields for chain in major_chains):
            chain_hotels.append(business)

sample_names = [hotel['name'] for hotel in chain_hotels[:10]]
print(f"Total major chain hotels in selected cities: {len(chain_hotels)}")
print("Sample hotels found:", sample_names)

Total major chain hotels in selected cities: 247
Sample hotels found: ['Courtyard by Marriott Nashville Downtown', 'Courtyard by Marriott Reno', 'Holiday Inn Nashville-Vanderbilt', 'DoubleTree Suites by Hilton Hotel Nashville Airport', 'Sheraton Indianapolis Hotel at Keystone Crossing', 'DoubleTree by Hilton Hotel Philadelphia Center City', 'Embassy Suites by Hilton Tampa Downtown Convention Center', 'Grand Hyatt Tampa Bay', 'Hilton Garden Inn Nashville Vanderbilt', 'Hyatt House Indianapolis/Downtown']


In [21]:
# Tabulate the matched businesses and keep their ids in a set so the review
# scan can test membership cheaply.
chain_hotels_df = pd.DataFrame(chain_hotels)
chain_hotel_ids = set(chain_hotels_df['business_id'].tolist())

In [22]:
# Stream the review file line by line and keep only the reviews whose
# business_id belongs to one of the selected chain hotels.
with open('../Yelp-JSON/yelp_dataset/yelp_academic_dataset_review.json', encoding='utf-8') as f:
    parsed_reviews = (json.loads(line) for line in f)
    hotel_reviews = [
        review
        for review in parsed_reviews
        if review['business_id'] in chain_hotel_ids
    ]

print(f"Total reviews for major chain hotels: {len(hotel_reviews)}")

Total reviews for major chain hotels: 23810


In [23]:
# Build one frame of hotels and one of reviews from the collected records.
chain_hotels_df = pd.DataFrame(chain_hotels)
hotel_reviews_df = pd.DataFrame(hotel_reviews)

# Attach hotel attributes to every review (default inner join on business_id).
# Columns present in both frames get `_review` / `_hotel` suffixes.
df = pd.merge(
    hotel_reviews_df,
    chain_hotels_df,
    on='business_id',
    suffixes=('_review', '_hotel'),
)
print("Merged shape:", df.shape)

Merged shape: (23810, 22)


In [24]:
# Step 1: run the project's preprocessing helpers in sequence
# (clean_reviews -> add_chain_flags -> add_category_flags).
df_clean = (
    df
    .pipe(clean_reviews)
    .pipe(add_chain_flags)
    .pipe(add_category_flags)
)

# Step 2: sample with n_per_city=5000 and a fixed random_state so re-runs
# request the same sample.
df_sampled = sample_reviews_per_city(
    df_clean,
    city_col='city',
    n_per_city=5000,
    random_state=42,
)

# Step 3: write the result to CSV without the index column.
df_sampled.to_csv('../Yelp-JSON/yelp_dataset/new_cities_cleaned.csv', index=False)
print(f"Final dataset shape: {df_sampled.shape}")

Final dataset shape: (22121, 35)
