In [None]:
import os
import sys

# Directory added to the import path so that the `from preprocessing import ...`
# statement below can resolve (presumably preprocessing.py lives here — confirm).
# The location is machine-specific, so it can be overridden with the
# YELP_SCRIPTS_DIR environment variable; when that variable is unset the
# original hard-coded path is used, so existing setups behave exactly as before.
src_path = os.environ.get(
    "YELP_SCRIPTS_DIR",
    r"D:\SEM 4\CS516\Yelp Fairness Review\scripts",
)
# Avoid piling up duplicate entries when this cell is re-run in the same kernel.
if src_path not in sys.path:
    sys.path.append(src_path)

from preprocessing import (
    load_businesses, flag_major_chains,
    load_reviews_for_businesses, merge_reviews_businesses,
    sample_reviews_per_city, add_category_flags, clean_reviews
)

In [3]:

# --- Parameters -------------------------------------------------------------
# Input files: Yelp academic dataset, one JSON object per line.
business_path = r'../Yelp-JSON/yelp_dataset/yelp_academic_dataset_business.json'
review_path = r'../Yelp-JSON/yelp_dataset/yelp_academic_dataset_review.json'

# Cities whose businesses are passed to load_businesses.
# NOTE(review): the per-city count cell further down this notebook prints 0
# businesses for five of these cities and 1 for Charlotte, so this selection
# does not appear to match the dataset file on disk — confirm before relying
# on the prepared CSV.
target_cities = [
    'Las Vegas',
    'Toronto',
    'Phoenix',
    'Charlotte',
    'Edinburgh',
    'Pittsburgh',
]

# Keywords passed to load_businesses to select hospitality businesses.
hospitality_keywords = [
    'Hotel',
    'Resort',
    'Motel',
    'Inn',
    'Lodge',
    'Hostel',
    'Spa',
    'Restaurant',
]

# Brand names passed to load_businesses and flag_major_chains.
major_chains = [
    'Marriott',
    'Hilton',
    'Hyatt',
    'IHG',
    'Holiday Inn',
    'Sheraton',
    'Ritz',
    'Westin',
    'DoubleTree',
]


In [4]:
# Businesses: load with the city / keyword / chain parameters, then pass the
# result straight through flag_major_chains.
business_df = flag_major_chains(
    load_businesses(business_path, target_cities, hospitality_keywords, major_chains),
    major_chains,
)

# Reviews: only those whose business_id belongs to one of the loaded businesses.
valid_business_ids = {*business_df['business_id']}
review_df = load_reviews_for_businesses(review_path, valid_business_ids)

In [5]:
# Join each review with its business record.
df = merge_reviews_businesses(review_df, business_df)

# Pipeline, innermost call first:
#   1. sample up to 5000 reviews per city (fixed seed) to avoid city imbalance
#   2. clean the sampled reviews
#   3. add the category flag columns
df_sampled = add_category_flags(
    clean_reviews(
        sample_reviews_per_city(df, city_col='city', n_per_city=5000, random_state=42)
    )
)

# Persist the prepared data next to the raw dataset files.
df_sampled.to_csv(
    '../Yelp-JSON/yelp_dataset/prepared_hospitality_reviews.csv',
    index=False,
)

In [11]:
import json
import pandas as pd

# Quick look at the raw business file: parse only the first 10k JSON lines
# into a DataFrame and list the distinct city names that occur in them.
businesses = []
with open('../Yelp-JSON/yelp_dataset/yelp_academic_dataset_business.json', encoding='utf-8') as f:
    for i, line in enumerate(f):
        # `i` is zero-based, so stopping at i >= 10000 keeps exactly 10,000
        # rows; the earlier `i > 10000` check let 10,001 rows through.
        if i >= 10000:  # Just load first 10k rows for a quick look
            break
        businesses.append(json.loads(line))
raw_business_df = pd.DataFrame(businesses)
print("Unique cities in sample:", raw_business_df['city'].unique())

Unique cities in sample: ['Santa Barbara' 'Affton' 'Tucson' 'Philadelphia' 'Green Lane'
 'Ashland City' 'Brentwood' 'St. Petersburg' 'Nashville' "Land O' Lakes"
 'Tampa Bay' 'Indianapolis' 'Clearwater' 'Largo' 'New Orleans' 'Kenner'
 'Edmonton' 'Reno' 'Newtown' 'White House' 'Boise' 'Paoli' 'Ardmore'
 'Exton' 'Wilmington' 'Edwardsville' 'Sparks' 'Alton' 'Cherry Hill'
 'Bala Cynwyd' 'Springfield' 'Belleville' 'Carmel' 'Tampa'
 'Kennett Square' 'Plymouth Meeting' 'Harvey' 'West Chester' 'Meridian'
 'Hudson' 'Fernley' 'Williamstown' 'Pinellas Park' 'Glenolden'
 'Wesley Chapel' 'Fishers' 'Burlington' 'Troy' 'Camden' 'Plainfield'
 'Bensalem' 'Maplewood' 'Saint Louis' 'Fairview Heights' 'Oro Valley'
 'Treasure Island' 'Southampton' 'Chalfont' 'Willow Grove' 'Voorhees'
 'Tarpon Springs' 'Blue Bell' 'Metairie' 'Woodbury' 'Brownsburg'
 'Norristown' 'Land O Lakes' 'Greenwood' 'Saint Petersburg' 'Brookhaven'
 'Haverford' 'Glenside' 'Moorestown' 'Madison' 'Ewing' 'Levittown'
 'Gloucester Township'

In [12]:
# Show the distinct raw `categories` values in the sampled business frame.
# No splitting is done here: as the recorded output shows, each value is a
# whole comma-separated category string, so every distinct combination is
# listed as its own entry.
print("Unique categories in sample:", raw_business_df['categories'].unique())


Unique categories in sample: ['Doctors, Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists'
 'Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services'
 'Department Stores, Shopping, Fashion, Home & Garden, Electronics, Furniture Stores'
 ... 'Bars, Nightlife, Airport Lounges'
 'Professional Services, Local Services, Printing Services, Graphic Design'
 'Burgers, Ice Cream & Frozen Yogurt, Restaurants, Food, Fast Food']


In [14]:
import pandas as pd
import json
from collections import Counter

# Tally businesses per exact `city` string over the entire business file
# (one JSON object per line). Matching is exact: no case folding or trimming.
with open('../Yelp-JSON/yelp_dataset/yelp_academic_dataset_business.json', encoding='utf-8') as f:
    city_counter = Counter(json.loads(line)['city'] for line in f)

# Report how many businesses each target city has; a Counter yields 0 for
# city names that never occurred.
target_cities = ['Las Vegas', 'Toronto', 'Phoenix', 'Charlotte', 'Edinburgh', 'Pittsburgh']
for city in target_cities:
    print(f"{city}: {city_counter[city]}")


Las Vegas: 0
Toronto: 0
Phoenix: 0
Charlotte: 1
Edinburgh: 0
Pittsburgh: 0


In [15]:
# Case- and whitespace-insensitive retry of the city filter.
# NOTE: this rebinds `target_cities` to the lowercase names for the rest of
# the kernel session.
target_cities = list(map(
    str.lower,
    ['Las Vegas', 'Toronto', 'Phoenix', 'Charlotte', 'Edinburgh', 'Pittsburgh'],
))

# Keep every business whose trimmed, lowercased city equals one of the targets.
with open('../Yelp-JSON/yelp_dataset/yelp_academic_dataset_business.json', encoding='utf-8') as f:
    businesses = [
        record
        for record in map(json.loads, f)
        if record['city'].strip().lower() in target_cities
    ]
print(f"Businesses found in target cities: {len(businesses)}")


Businesses found in target cities: 1


In [17]:
# Count the records in the business file (one JSON object per line).
# Summing over the lines avoids relying on a loop variable after the loop:
# with an empty file that variable would never be assigned, so the old code
# either raised NameError or silently reused a stale `i` left in the kernel by
# an earlier cell and printed a wrong total. An empty file now reports 0.
with open('../Yelp-JSON/yelp_dataset/yelp_academic_dataset_business.json', encoding='utf-8') as f:
    total_businesses = sum(1 for _ in f)
print(f"Total businesses: {total_businesses}")


Total businesses: 150346


In [18]:
# Look for near-miss spellings: count businesses per (trimmed, lowercased)
# city name that contains any of the target names as a substring.
target_substrings = ['las vegas', 'toronto', 'phoenix', 'charlotte', 'edinburgh', 'pittsburgh']
# NOTE: rebinds `city_counter` (a Counter in the earlier per-city tally cell)
# to a plain dict keyed by the lowercased city name.
city_counter = {}

with open('../Yelp-JSON/yelp_dataset/yelp_academic_dataset_business.json', encoding='utf-8') as f:
    for line in f:
        business = json.loads(line)
        city_lower = business['city'].strip().lower()
        # any() counts each business at most once; the previous inner loop
        # incremented once per matching substring, so a city name containing
        # two target names would have been double-counted.
        if any(t in city_lower for t in target_substrings):
            city_counter[city_lower] = city_counter.get(city_lower, 0) + 1

print("Cities with matching substrings and their counts:")
for k, v in city_counter.items():
    print(f"{k}: {v}")


Cities with matching substrings and their counts:
phoenixville: 366
charlotte: 1
