In [None]:
import numpy as np

# Use keyword heuristics to define latent variables (instead of LDA)
import json
import re

import pandas as pd

PRICE_KEYWORDS = {"expensive", "pricey", "affordable", "overpriced", "prices"}
FOOD_KEYWORDS = {"food", "dish", "meal", "taste", "flavor", "portion", "menu", "delicious"}
SERVICE_KEYWORDS = {"waiter", "waitress", "staff", "rude", "slow", "friendly", "host", "service"}
CLEANLINESS_KEYWORDS = {"clean", "dirty", "bathroom", "restroom", "table", "floor", "smell"}

def contains_keywords(text, keyword_set, min_hits=2):
    """Return True if ``text`` mentions at least ``min_hits`` distinct keywords.

    Matching is case-insensitive and anchored at the start of a word.
    Inflected forms still count ("dishes" matches "dish", "waiters" matches
    "waiter"), but a keyword buried inside an unrelated word does not
    ("vegetable" does not match "table", "ghost" does not match "host").
    The trade-off is that prefixed forms such as "unfriendly" are not
    counted for "friendly".

    Args:
        text: Review text to scan.
        keyword_set: Iterable of keyword strings to look for.
        min_hits: Minimum number of distinct keywords that must appear.
            Repeating one keyword several times still counts as one hit.

    Returns:
        True when enough distinct keywords are present, otherwise False.
    """
    text_lower = text.lower()
    hits = sum(
        1
        for kw in keyword_set
        # (?<!\w) requires the keyword to begin a word; a plain substring
        # test would also fire in the middle of longer, unrelated words.
        if re.search(r"(?<!\w)" + re.escape(kw.lower()), text_lower)
    )
    return hits >= min_hits

def extract_features(review_json):
    """Turn one parsed review into a flat feature row for a pandas DataFrame.

    Each topic column holds "yes" or "no" depending on whether
    ``contains_keywords`` fires for that topic's keyword set on the review
    text; ``star_rating`` is the review's ``stars`` value as a float.
    """
    body = review_json["text"]

    # (column name, keyword set) pairs, in the order the columns should appear.
    topic_columns = (
        ("price_mentioned", PRICE_KEYWORDS),
        ("food_mentioned", FOOD_KEYWORDS),
        ("service_mentioned", SERVICE_KEYWORDS),
        ("cleanliness_mentioned", CLEANLINESS_KEYWORDS),
    )

    row = {}
    for column, keywords in topic_columns:
        if contains_keywords(body, keywords):
            row[column] = "yes"
        else:
            row[column] = "no"

    row["star_rating"] = float(review_json["stars"])
    return row

def get_business_id(business_name, business_json_path="data/yelp_academic_dataset_business.json"):
    """Return the ``business_id`` of the first business whose name matches.

    The file is read as JSON lines (one JSON object per line) and names are
    compared case-insensitively. Blank lines are skipped.

    NOTE: the scan stops at the first match, so if several records share the
    same name only the first one in the file is returned.

    Args:
        business_name: Name to look for.
        business_json_path: Path to the JSON-lines business file.

    Returns:
        The ``business_id`` value of the first matching record.

    Raises:
        ValueError: If no record has a matching name.
    """
    # Lowercase the query once instead of on every line of the file.
    wanted = business_name.lower()
    with open(business_json_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # A blank or trailing empty line would make json.loads raise.
                continue
            biz = json.loads(line)
            if biz["name"].lower() == wanted:
                return biz["business_id"]
    raise ValueError(f"Business '{business_name}' not found in dataset.")

def load_reviews_df(business_name, reviews_path="sample_reviews.json", business_path="data/yelp_academic_dataset_business.json"):
    """Build a feature DataFrame from every review of one business.

    The business name is resolved to an id with ``get_business_id``. The
    reviews file is then read as JSON lines, and each review whose
    ``business_id`` matches is converted with ``extract_features``. Blank
    lines in the reviews file are skipped.

    Args:
        business_name: Name of the business to collect reviews for.
        reviews_path: Path to the JSON-lines reviews file.
        business_path: Path to the JSON-lines business file.

    Returns:
        A pandas DataFrame with one row per matching review. If no review
        matches, the DataFrame is empty and has no columns.

    Raises:
        ValueError: Propagated from ``get_business_id`` when the name is
            not found.
    """
    business_id = get_business_id(business_name, business_path)

    rows = []
    with open(reviews_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # A blank or trailing empty line would make json.loads raise.
                continue
            review = json.loads(line)
            if review["business_id"] == business_id:
                rows.append(extract_features(review))

    return pd.DataFrame(rows)

def display_sample(reviews_path="sample_reviews.json", business_path="data/yelp_academic_dataset_business.json"):
    """Print the name and location of every business that has a review in the sample.

    Both files are read as JSON lines; blank lines are skipped. One line is
    printed per matching business record, sorted by (name, city, state).

    Args:
        reviews_path: Path to the JSON-lines reviews file to sample from.
        business_path: Path to the JSON-lines business file used to resolve
            ids to names and locations.
    """
    # Only the ids are needed, so collect them straight into a set rather
    # than holding every parsed review in memory.
    with open(reviews_path, "r", encoding="utf-8") as f:
        sample_biz_ids = {
            json.loads(line)["business_id"] for line in f if line.strip()
        }

    found_names = []
    with open(business_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            biz = json.loads(line)
            if biz["business_id"] in sample_biz_ids:
                found_names.append((biz["name"], biz["city"], biz["state"]))

    for name, city, state in sorted(found_names):
        print(f"- {name} — {city}, {state}")


# display_sample()  # uncomment to list the businesses covered by the sample file
# Build the dataframe used for model training from one business's reviews.
df = load_reviews_df(
    "In-N-Out Burger",
    reviews_path="data/yelp_academic_dataset_review.json",
)
print(df.head())
print(f"Parsed {len(df)} reviews for this business.")

  price_mentioned food_mentioned service_mentioned cleanliness_mentioned  \
0              no             no                no                    no   
1              no             no                no                    no   
2              no            yes                no                    no   
3              no             no               yes                   yes   
4              no             no                no                    no   

   star_rating  
0          5.0  
1          5.0  
2          5.0  
3          5.0  
4          4.0  
Parsed 230 reviews for this business.


In [None]:
# Use a Bayes-net model to determine the variable with the highest delta (the one that makes the biggest difference to the review)