In [None]:
from pathlib import Path
import pandas as pd
from collections import Counter

# Paths
DATA_PATH = Path('data/yelp_reviews.csv')  # input CSV read by the loading cell
OUT_DIR = Path('outputs')                  # directory created here for the result files
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Fail fast with a real exception rather than ``assert``: assert statements are
# removed when Python runs with -O, which would silently disable this check.
if not DATA_PATH.exists():
    raise FileNotFoundError('Put your Yelp CSV at data/yelp_reviews.csv')
print('CSV:', DATA_PATH)

In [None]:
import re, math

def _tok(x: str):
    return re.findall(r"[a-zA-Z][a-zA-Z']+", str(x).lower())

# Sentiment lexicons used by sentiment_score: a token found in POS adds one
# point, a token found in NEG subtracts one, anything else is neutral.
POS = {*"good great excellent amazing awesome friendly fast quick tasty delicious fresh clean spotless helpful courteous love lovely perfect outstanding best recommend fantastic".split()}
NEG = {*"bad poor terrible awful rude slow bland cold overcooked undercooked dirty messy greasy smelly wait long disappointed worst never".split()}

def load_yelp_csv(path: str) -> pd.DataFrame:
    """Read a reviews CSV and normalize it to ``restaurant`` / ``review`` columns.

    Column names are stripped and lowercased. The restaurant column is the
    first match among ``restaurant, name, business, business_name,
    restaurant_name`` (falling back to the first column); the review column is
    the first match among ``review, text, body, comment, reviews`` (falling
    back to the last column).

    Args:
        path: Location of the CSV file, passed to ``pandas.read_csv``.

    Returns:
        A DataFrame with string columns ``restaurant`` and ``review``. Values
        are stripped, runs of whitespace inside a review are collapsed to a
        single space, rows with a missing or empty value in either column are
        removed, and exact duplicate rows are dropped. The original row labels
        are kept (the index is not reset).
    """
    df = pd.read_csv(path)
    df.columns = [c.strip().lower() for c in df.columns]
    name_col = next((c for c in ["restaurant","name","business","business_name","restaurant_name"] if c in df.columns), df.columns[0])
    rev_col  = next((c for c in ["review","text","body","comment","reviews"] if c in df.columns), df.columns[-1])
    df = df[[name_col, rev_col]].rename(columns={name_col:"restaurant", rev_col:"review"})

    # Record missing cells before the string conversion below: astype(str)
    # turns NaN into the literal text "nan", which the != "" filter cannot see.
    missing = df["restaurant"].isna() | df["review"].isna()

    df["restaurant"] = df["restaurant"].astype(str).str.strip()
    # r"\s+" matches whitespace runs (the previous r"\\s+" only matched a
    # literal backslash followed by "s", so nothing was ever collapsed).
    df["review"] = df["review"].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()

    keep = ~missing & (df["restaurant"] != "") & (df["review"] != "")
    df = df[keep].drop_duplicates()
    return df

def build_synonyms(custom=None):
    """Return the attribute -> synonym-list mapping used by the pipeline.

    The defaults cover four attributes: ``service``, ``food``,
    ``cleanliness`` and ``location``.

    Args:
        custom: Optional mapping of attribute name to a replacement list.
            An entry is applied only when its key is one of the default
            attributes and its value is a non-empty ``list``; every other
            entry is ignored. An applied list replaces the default list
            wholesale (it is stored as-is, not copied or merged).

    Returns:
        A new dict mapping each attribute name to its list of terms.
    """
    attribute_terms = {
        "service": [
            "service", "server", "staff", "waiter", "waitress", "host",
            "friendly", "rude", "attentive", "quick", "slow", "speed",
            "helpful", "courteous",
        ],
        "food": [
            "food", "taste", "flavor", "delicious", "tasty", "bland",
            "overcooked", "undercooked", "fresh", "portion", "menu", "dish",
            "burger", "pizza", "taco", "fries", "salad", "sauce",
        ],
        "cleanliness": [
            "clean", "dirty", "messy", "sanitary", "hygiene", "spotless",
            "sticky", "tidy", "restroom", "bathroom", "trash", "smell",
            "odor", "greasy",
        ],
        "location": [
            "location", "parking", "lot", "easy", "nearby", "close",
            "downtown", "drive-thru", "drive through", "drive thru", "line",
            "wait", "busy", "crowded", "find", "distance",
        ],
    }
    # Only known attributes with a non-empty list value may override defaults.
    overrides = {
        attr: terms
        for attr, terms in (custom or {}).items()
        if attr in attribute_terms and isinstance(terms, list) and terms
    }
    attribute_terms.update(overrides)
    return attribute_terms

def frequency_after_merge(df, synonyms):
    """Count how often each synonym occurs in the reviews, grouped by attribute.

    Every review in ``df["review"]`` is tokenized with ``_tok`` and each token
    that appears in a synonym list is counted under that list's attribute.
    When a word is listed under several attributes, the attribute that comes
    last in ``synonyms`` wins. Because tokens are single alphabetic words,
    multi-word or hyphenated synonym entries can never match.

    Args:
        df: DataFrame with a ``review`` column.
        synonyms: Mapping of attribute name to a list of terms.

    Returns:
        A DataFrame with columns ``attribute``, ``word``, ``count`` and
        ``attribute_total`` (sum of counts within the attribute), sorted by
        attribute ascending then count descending. When nothing matches, an
        empty frame with those same columns is returned.
    """
    from collections import Counter

    columns = ["attribute", "word", "count", "attribute_total"]
    vocab_to_attr = {w: a for a, ws in synonyms.items() for w in ws}
    buckets = {k: Counter() for k in synonyms}
    for txt in df["review"]:
        for w in _tok(txt):
            a = vocab_to_attr.get(w)
            # Compare against None so a falsy attribute key is still counted.
            if a is not None:
                buckets[a][w] += 1

    rows = []
    for a, cnt in buckets.items():
        tot = sum(cnt.values())
        for w, n in cnt.most_common():
            rows.append({"attribute": a, "word": w, "count": int(n), "attribute_total": int(tot)})

    # Passing the column list keeps the schema when ``rows`` is empty; without
    # it the frame has no columns and the sort below raises KeyError.
    out = pd.DataFrame(rows, columns=columns).astype({"count": "int64", "attribute_total": "int64"})
    return out.sort_values(["attribute","count"], ascending=[True, False], ignore_index=True)

def select_top_by_cosine(df, synonyms, top_n=200):
    """Return the ``top_n`` reviews most similar to the synonym vocabulary.

    All synonym terms are joined into one query document. Each review is
    scored by cosine similarity to that query, using scikit-learn TF-IDF
    vectors when available. If scikit-learn cannot be imported, or it rejects
    the input with ``ValueError`` (for example an empty vocabulary), the score
    is computed from raw token counts produced by ``_tok`` instead. Any other
    error is propagated rather than silently hidden.

    Args:
        df: DataFrame with a ``review`` column; values are coerced to ``str``.
        synonyms: Mapping of attribute name to a list of terms.
        top_n: Maximum number of rows to return.

    Returns:
        A copy of ``df`` with an added ``cosine_similarity`` column, sorted by
        that column descending, truncated to ``top_n`` rows, with a fresh
        0..n-1 index. An empty ``df`` yields an empty result with the column
        present.
    """
    combined = " ".join(" ".join(ws) for ws in synonyms.values())
    reviews = df["review"].astype(str).tolist()

    if not reviews:
        # Nothing to score; handled explicitly instead of relying on a
        # swallowed library error.
        sims = []
    else:
        try:
            from sklearn.feature_extraction.text import TfidfVectorizer
            from sklearn.metrics.pairwise import cosine_similarity
            vec = TfidfVectorizer(lowercase=True, token_pattern=r"[a-zA-Z][a-zA-Z']+")
            X = vec.fit_transform([combined] + reviews)
            sims = cosine_similarity(X[0:1], X[1:]).flatten()
        except (ImportError, ValueError):
            from collections import Counter

            query = Counter(_tok(combined))
            # The query vector never changes, so its norm is computed once.
            # ``or 1`` avoids dividing by zero for an all-zero vector.
            query_norm = (sum(c * c for c in query.values()) or 1) ** 0.5

            def cos(doc):
                doc_norm = (sum(c * c for c in doc.values()) or 1) ** 0.5
                # Only words present in the review can contribute to the dot
                # product; Counter returns 0 for words missing from the query.
                dot = sum(query[w] * c for w, c in doc.items())
                return dot / (query_norm * doc_norm)

            sims = [cos(Counter(_tok(t))) for t in reviews]

    out = df.copy()
    out["cosine_similarity"] = sims
    return out.sort_values("cosine_similarity", ascending=False).head(top_n).reset_index(drop=True)

def sentiment_score(text):
    """Score ``text`` as (positive hits - negative hits) / token count.

    Tokens come from ``_tok``. A token in ``POS`` counts +1, otherwise a token
    in ``NEG`` counts -1, and every other token counts 0. Text with no tokens
    scores ``0.0``.
    """
    tokens = _tok(text)
    if len(tokens) == 0:
        return 0.0

    balance = 0
    for token in tokens:
        if token in POS:
            balance += 1
        elif token in NEG:
            balance -= 1
    return balance / len(tokens)

def recommend_top3(top_df):
    """Return the three restaurants with the highest mean review sentiment.

    Each row of ``top_df`` is scored with ``sentiment_score`` on its
    ``review`` text, scores are averaged per ``restaurant``, and the result is
    sorted by that average in descending order.

    Returns:
        A DataFrame of at most three rows with columns ``restaurant`` and
        ``avg_sentiment`` and a fresh 0..n-1 index. ``top_df`` is not modified.
    """
    scored = top_df.assign(sentiment=top_df["review"].apply(sentiment_score))
    per_restaurant = scored.groupby("restaurant", as_index=False)["sentiment"].mean()
    ranked = (
        per_restaurant
        .rename(columns={"sentiment": "avg_sentiment"})
        .sort_values("avg_sentiment", ascending=False)
    )
    return ranked.head(3).reset_index(drop=True)

In [None]:
# Load the reviews through load_yelp_csv (which yields "restaurant" and "review"
# columns), report the row count and number of distinct restaurants, then
# preview the first three rows.
df = load_yelp_csv(str(DATA_PATH))
print('Rows:', len(df), '| Unique restaurants:', df['restaurant'].nunique())
df.head(3)

In [None]:
# Build the default attribute -> synonym mapping, count how often each synonym
# occurs across the loaded reviews, and show the first ten rows of the table.
syn = build_synonyms()
freq = frequency_after_merge(df, syn)
freq.head(10)

In [None]:
# Keep the TOP_N reviews with the highest cosine similarity to the combined
# synonym vocabulary, then preview the ten best-scoring rows.
TOP_N = 200
selected = select_top_by_cosine(df, syn, top_n=TOP_N)
selected[['restaurant','cosine_similarity']].head(10)

In [None]:
# Score every selected review with sentiment_score; `selected` itself is left
# untouched because assign() returns a new frame. Preview the first ten rows.
selected_sent = selected.assign(sentiment=selected['review'].apply(sentiment_score))
selected_sent[['restaurant','sentiment']].head(10)

In [None]:
# Average the sentiment of the selected reviews per restaurant and keep the
# three restaurants with the highest average.
top3 = recommend_top3(selected)
top3

In [None]:
# Write every result table into OUT_DIR (the directory created in the paths
# cell) so the destination is defined in exactly one place. File names are
# unchanged.
freq.to_csv(OUT_DIR / 'frequency_table.csv', index=False)
selected.to_csv(OUT_DIR / 'top200_reviews.csv', index=False)
selected_sent.to_csv(OUT_DIR / 'sentiment_top200.csv', index=False)
top3.to_csv(OUT_DIR / 'recommendations_top3.csv', index=False)
print(f'Saved to {OUT_DIR}/')

## Create files for the Streamlit app

In [None]:
from pathlib import Path

# NOTE(review): absolute, environment-specific location of the project files
# that get copied below; adjust it when running the notebook elsewhere.
PROJECT_SRC_ROOT = Path('/mnt/data/yelp_hw_project')

p_src = Path('src'); p_app = Path('app')
p_src.mkdir(exist_ok=True)
p_app.mkdir(exist_ok=True)

# read_text / write_text open and close the files themselves, so no handle is
# left dangling, and the explicit encoding avoids depending on the locale.
# Write pipeline file
pipeline_code = (PROJECT_SRC_ROOT / 'src' / 'yelp_pipeline.py').read_text(encoding='utf-8')
(p_src / 'yelp_pipeline.py').write_text(pipeline_code, encoding='utf-8')
# Write app.py
app_code = (PROJECT_SRC_ROOT / 'app' / 'app.py').read_text(encoding='utf-8')
(p_app / 'app.py').write_text(app_code, encoding='utf-8')
print('Wrote src/yelp_pipeline.py and app/app.py')