In [2]:
import os

# Location of the Google Local restaurants dataset and the local cache path.
# NOTE(review): FILE_ID is a leftover placeholder; nothing in this cell reads it.
FILE_ID = "YOUR_FILE_ID"
URL = "https://mcauleylab.ucsd.edu/public_datasets/gdrive/googlelocal_restaurants/filter_all_t.json"
OUT = "filter_all_t.json"

if not os.path.exists(OUT):
    import urllib.request  # local import: only needed on the download path

    print("Downloading dataset...")
    # Download under a temporary name and rename only on success, so an
    # interrupted transfer never leaves a truncated OUT that the existence
    # check above would later mistake for a complete dataset.
    # TLS certificates are verified (the previous wget call disabled that).
    partial_path = OUT + ".part"
    try:
        urllib.request.urlretrieve(URL, partial_path)
        os.replace(partial_path, OUT)
    finally:
        # Only present here if the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)
else:
    print(f"{OUT} already exists, skip downloading.")

filter_all_t.json already exists, skip downloading.


In [3]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from surprise import Dataset, Reader, SVD
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

In [4]:
path = "filter_all_t.json"

# Decode explicitly as UTF-8 (the same file is read as UTF-8 further down);
# relying on the platform default can fail or garble non-ASCII review text.
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

# NOTE(review): `df` is not read by any later cell visible here — confirm it is
# still needed before keeping this normalisation of the whole payload.
df = pd.json_normalize(data)

# The payload is keyed by split name; each value is a list of review records.
train_raw = data["train"]
test_raw  = data["test"]
val_raw   = data["val"]

train_df = pd.DataFrame(train_raw)
test_df  = pd.DataFrame(test_raw)
val_df   = pd.DataFrame(val_raw)
train_df.head()


Unnamed: 0,business_id,user_id,rating,review_text,pics,history_reviews
0,60567465d335d0abfb415b26,101074926318992653684,4,The tang of the tomato sauce is outstanding. A...,"[AF1QipM-2IRmvitARbcJr7deWfe5hyVBg_ArPMQSYvq0,...",[[101074926318992653684_6056272797d555cc6fb0d1...
1,6050fa9f5b4ccec8d5cae994,117065749986299237881,5,Chicken and waffles were really good!,[AF1QipMpfxIZUT_aymQ3qPGO-QgGYzxbtLZGmHufAp2s],[[117065749986299237881_605206f8d8c08f462b93e8...
2,604be10877e81aaed3cc9a1e,106700937793048450809,4,The appetizer of colossal shrimp was very good...,"[AF1QipMNnqM5X9sSyZ9pXRZ1jvrURHN9bZhGdzuEXoP8,...",[[106700937793048450809_6044300b27f39b7b5d1dbf...
3,60411e017cd8bf130362365a,101643045857250355161,5,The fish tacos here omg! The salad was great ...,"[AF1QipM-a6AGGp4Hgk5RD0gY5sDRp5kEfB1hZLvlRkft,...",[[101643045857250355161_604fbdd099686c10168c91...
4,604139dd7cd8bf1303624208,109802745326785766951,4,"Ribs are great, as are the mac and cheese, fri...",[AF1QipNVys4yq-5w_3EsDdHpSc9ZNb7Nl30Mfb6Y0Gup],[[109802745326785766951_60524fa9f09a4ffff042f9...


In [5]:
def extract_num_pics(entry):
    """Count the pictures attached to a review record.

    Parameters
    ----------
    entry : dict
        Review record; its optional ``"pics"`` value is inspected.

    Returns
    -------
    int
        Length of ``entry["pics"]`` when that value is a list, otherwise 0
        (missing key, ``None``, or any non-list value).
    """
    attached = entry.get("pics", [])
    return len(attached) if isinstance(attached, list) else 0


def convert_split(entries):
    """Group the review records of one split by user.

    Parameters
    ----------
    entries : iterable of dict
        Records with ``user_id``, ``business_id``, ``rating`` and
        ``review_text`` keys (``pics`` is optional).

    Returns
    -------
    collections.defaultdict
        Maps each user id to a list of dicts holding ``business_id``,
        ``rating``, ``review_text`` and ``num_pics``, in input order.
    """
    by_user = defaultdict(list)

    for record in entries:
        owner = record["user_id"]
        reviews = by_user[owner]
        reviews.append(
            {
                "business_id": record["business_id"],
                "rating": record["rating"],
                "review_text": record["review_text"],
                "num_pics": extract_num_pics(record),
            }
        )

    return by_user


def main():
    """Regroup every split of ``filter_all_t.json`` by user and save it.

    Reads the raw dataset, converts the ``train``, ``val`` and ``test`` splits
    with ``convert_split`` and writes each result to ``<split>.json`` in the
    working directory, printing a short summary per split.
    """
    with open("filter_all_t.json", "r", encoding="utf-8") as source:
        splits = json.load(source)

    for split_name in ("train", "val", "test"):
        records = splits[split_name]
        per_user = convert_split(records)

        target = f"{split_name}.json"
        # ensure_ascii=False keeps non-ASCII review text readable in the file.
        with open(target, "w", encoding="utf-8") as sink:
            json.dump(per_user, sink, indent=2, ensure_ascii=False)

        print(f"{split_name}: {len(records)} entries → {len(per_user)} users")
        print(f"Saved {target}")


# Run the conversion when this cell/script is executed directly.
if __name__ == "__main__":
    main()

train: 87013 entries → 29596 users
Saved train.json
val: 10860 entries → 3700 users
Saved val.json
test: 11015 entries → 3700 users
Saved test.json


In [28]:
def load_grouped_json(path):
    """Flatten a per-user grouped JSON file into one row per review.

    Parameters
    ----------
    path : str
        File holding a JSON object that maps each user id to a list of review
        dicts with ``business_id``, ``rating``, ``review_text`` and
        ``num_pics`` keys.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``business_id``, ``rating``, ``review_text`` and
        ``num_pics``. The columns are present even when the file contains no
        reviews.

    Raises
    ------
    KeyError
        If a review dict lacks one of the expected keys.
    """
    columns = ["user_id", "business_id", "rating", "review_text", "num_pics"]

    # The grouped files are written as UTF-8 with ensure_ascii=False, so decode
    # them the same way instead of relying on the platform default encoding.
    with open(path, "r", encoding="utf-8") as f:
        grouped = json.load(f)

    rows = [
        {
            "user_id": user,
            "business_id": entry["business_id"],
            "rating": entry["rating"],
            "review_text": entry["review_text"],
            "num_pics": entry["num_pics"],
        }
        for user, entries in grouped.items()
        for entry in entries
    ]

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)



# Rebuild the three split frames from the per-user grouped JSON files,
# loading them in train / val / test order.
train_df, val_df, test_df = map(
    load_grouped_json, ("train.json", "val.json", "test.json")
)

print(*(frame.shape for frame in (train_df, val_df, test_df)))
train_df.head()

(87013, 5) (10860, 5) (11015, 5)


Unnamed: 0,user_id,business_id,rating,review_text,num_pics
0,101074926318992653684,60567465d335d0abfb415b26,4,The tang of the tomato sauce is outstanding. A...,4
1,101074926318992653684,6056272797d555cc6fb0d147,5,"The pizza here is the real deal, perfect in ev...",2
2,101074926318992653684,604a65c2c6dc737bce7e5a3d,5,"Omg the tomato sauce is everything, in the mea...",6
3,101074926318992653684,60433b8d8be5d4454df9cc51,4,"First time around last year, we stuck to eggs ...",2
4,101074926318992653684,6055ebe4f69c7b117806fdaa,3,"Food was lukewarm, including fried calamari.",1


In [29]:
# Stack all three splits into one frame with a fresh 0..n-1 index.
full_df = pd.concat([train_df, val_df, test_df], ignore_index=True)

# Missing reviews become empty strings: a bare astype(str) would turn None/NaN
# into the literal words "None"/"nan", which would then be joined into the
# per-item and per-user text corpora as if they were review content.
full_df["review_text"] = full_df["review_text"].fillna("").astype(str)
# Non-numeric picture counts are coerced to NaN and then counted as 0.
full_df["num_pics"] = pd.to_numeric(full_df["num_pics"], errors="coerce").fillna(0)


In [32]:
# One document per business: all of its review texts joined by spaces.
# NOTE(review): full_df also contains the val and test splits, so these text
# features are fitted on held-out reviews — confirm that is intended.
item_text_corpus = full_df.groupby("business_id")["review_text"].apply(" ".join)

tfidf_item = TfidfVectorizer(stop_words="english", max_features=3000)
X_item = tfidf_item.fit_transform(item_text_corpus.values)

# Fix the seed: for a matrix of this size PCA may select a randomized SVD
# solver, which would otherwise give a different embedding on every run.
pca_item = PCA(n_components=50, random_state=42)
item_text_emb = pca_item.fit_transform(X_item.toarray())

# Re-attach business ids so the embedding can be aligned with other item features.
item_text_emb = pd.DataFrame(item_text_emb, index=item_text_corpus.index)
print("Item text embedding:", item_text_emb.shape)

Item text embedding: (30831, 50)


In [34]:
# One document per user: all of their review texts joined by spaces.
# NOTE(review): full_df also contains the val and test splits, so these text
# features are fitted on held-out reviews — confirm that is intended.
user_text_corpus = full_df.groupby("user_id")["review_text"].apply(" ".join)

tfidf_user = TfidfVectorizer(stop_words="english", max_features=3000)
X_user = tfidf_user.fit_transform(user_text_corpus.values)

# Fix the seed: for a matrix of this size PCA may select a randomized SVD
# solver, which would otherwise give a different embedding on every run.
pca_user = PCA(n_components=50, random_state=42)
user_text_emb = pca_user.fit_transform(X_user.toarray())

# Re-attach user ids so the embedding can be aligned with other user features.
user_text_emb = pd.DataFrame(user_text_emb, index=user_text_corpus.index)
print("User text embedding:", user_text_emb.shape)


User text embedding: (36996, 50)


In [35]:
# Average number of pictures per review for every business.
item_num_pics = full_df.groupby("business_id")["num_pics"].mean()

# Standardise the single feature; the scaler expects a 2-D column.
scaler = StandardScaler()
pics_column = item_num_pics.to_numpy().reshape(-1, 1)
item_num_pics_scaled = scaler.fit_transform(pics_column)

# Wrap the scaled values in a frame indexed by business id.
item_num_pics_emb = pd.DataFrame(
    item_num_pics_scaled,
    columns=["num_pics_feature"],
    index=item_num_pics.index,
)

print("Item num_pics feature:", item_num_pics_emb.shape)


Item num_pics feature: (30831, 1)


In [36]:
from surprise import Dataset, Reader, SVD

# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(
    train_df[["user_id", "business_id", "rating"]],
    reader
)
# Use every training interaction; validation and test are scored separately.
trainset = surprise_data.build_full_trainset()

svd = SVD(
    n_factors=50,
    lr_all=0.005,
    reg_all=0.02,
    n_epochs=30,
    # Seed the factor initialisation so the learned factors and the reported
    # metrics are reproducible across kernel restarts.
    random_state=42,
)
svd.fit(trainset)
print("SVD training done.")


SVD training done.


In [37]:
# User factors from the fitted SVD, one row per raw user id.
user_latent = pd.DataFrame(
    {trainset.to_raw_uid(inner): svd.pu[inner] for inner in trainset.all_users()}
).T

# Item factors from the fitted SVD, one row per raw business id.
item_latent = pd.DataFrame(
    {trainset.to_raw_iid(inner): svd.qi[inner] for inner in trainset.all_items()}
).T

print("User latent:", user_latent.shape)
print("Item latent:", item_latent.shape)


User latent: (29596, 50)
Item latent: (27896, 50)


In [46]:
# Combine SVD factors and text embeddings per user. Both inputs use integer
# column labels, so prefix them (as the item-side frame does) to avoid
# duplicate labels in the result. The inputs themselves are left untouched,
# so re-running this cell gives the same frame.
user_hybrid = pd.concat(
    [user_latent.add_prefix("latent_"), user_text_emb.add_prefix("text_")],
    axis=1,
)
# Users missing from one of the two sources get zeros for that part.
user_hybrid = user_hybrid.fillna(0)
# Constant zero column in the position where item vectors carry their
# picture feature, so user and item vectors have the same length.
user_hybrid["dummy_pic"] = 0.0
print("User hybrid:", user_hybrid.shape)

User hybrid: (36996, 101)


In [47]:
def _with_prefix(frame, prefix):
    """Return ``frame`` with ``prefix`` added to every column lacking it.

    Unlike ``DataFrame.add_prefix`` this is a no-op on columns that already
    start with ``prefix``, so re-running the cell does not stack prefixes.
    """
    return frame.rename(
        columns=lambda col: col if str(col).startswith(prefix) else f"{prefix}{col}"
    )


# Label each feature family so the concatenated columns stay unique.
item_latent = _with_prefix(item_latent, "latent_")
item_text_emb = _with_prefix(item_text_emb, "text_")
item_num_pics_emb = _with_prefix(item_num_pics_emb, "pic_")
item_hybrid = pd.concat([
    item_latent,
    item_text_emb,
    item_num_pics_emb
], axis=1)

# Items missing from one of the sources get zeros for that part.
item_hybrid = item_hybrid.fillna(0)

print("Item hybrid:", item_hybrid.shape)


Item hybrid: (30831, 101)


In [48]:
def predict_rating(user, item):
    """Score a (user, item) pair with the hybrid feature vectors.

    Parameters
    ----------
    user : hashable
        Label looked up in ``user_hybrid.index``.
    item : hashable
        Label looked up in ``item_hybrid.index``.

    Returns
    -------
    float or None
        Dot product of the two hybrid rows, or ``None`` when either label is
        absent from its frame.
    """
    known_pair = user in user_hybrid.index and item in item_hybrid.index
    if not known_pair:
        return None

    user_vec = user_hybrid.loc[user].values
    item_vec = item_hybrid.loc[item].values
    return float(np.dot(user_vec, item_vec))


In [53]:
def predict_rating_svd(user, item):
    """Return the fitted SVD model's estimated rating for a (user, item) pair.

    Parameters
    ----------
    user, item
        Raw ids passed straight to ``svd.predict``.

    Returns
    -------
    float or None
        The ``est`` field of the prediction, or ``None`` if the call raised
        an ordinary exception.
    """
    try:
        return svd.predict(user, item).est
    # Catch Exception rather than everything: a bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit, making a long evaluation loop
    # impossible to interrupt.
    except Exception:
        return None


def eval_rmse_svd(df):
    """Root-mean-squared error of ``predict_rating_svd`` over ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``user_id``, ``business_id`` and ``rating`` columns.

    Returns
    -------
    float or None
        RMSE over the rows for which a prediction was produced; ``None`` when
        no row yielded a prediction.
    """
    estimates = []
    targets = []
    for user, business, rating in zip(df.user_id, df.business_id, df.rating):
        estimate = predict_rating_svd(user, business)
        if estimate is None:
            continue
        estimates.append(estimate)
        targets.append(rating)

    if not estimates:
        return None

    errors = np.array(estimates) - np.array(targets)
    return np.sqrt((errors ** 2).mean())


# Report SVD rating-prediction error on the held-out splits.
for label, frame in (("SVD Validation RMSE:", val_df), ("SVD Test RMSE:", test_df)):
    print(label, eval_rmse_svd(frame))

SVD Validation RMSE: 0.8477666427398878
SVD Test RMSE: 0.8268605543821556


In [57]:
# Array views of the latent-factor frames, with ids kept in matching row order.
USER_IDS = np.array(user_latent.index)
USER_LATENT_MATRIX = user_latent.to_numpy()

ITEM_IDS = np.array(item_latent.index)
ITEM_LATENT_MATRIX = item_latent.to_numpy()

# user -> set of businesses that user interacted with in the training split
user_seen_items = {
    uid: set(businesses)
    for uid, businesses in train_df.groupby("user_id")["business_id"]
}

def recommend_topk_cf(user_id, k=10, filter_seen=False):
    """Rank items for a user by latent-factor dot product.

    Parameters
    ----------
    user_id : hashable
        Label looked up in ``user_latent.index``.
    k : int, default 10
        Number of items to return.
    filter_seen : bool, default False
        When true, items in ``user_seen_items[user_id]`` are excluded.

    Returns
    -------
    list of (item_id, score)
        At most ``k`` pairs sorted by descending score; an empty list when
        the user has no latent vector.
    """
    if user_id not in user_latent.index:
        return []

    # One score per row of ITEM_LATENT_MATRIX.
    scores = ITEM_LATENT_MATRIX @ user_latent.loc[user_id].values
    candidates = ITEM_IDS

    if filter_seen:
        already_seen = user_seen_items.get(user_id, set())
        keep = np.isin(ITEM_IDS, list(already_seen), invert=True)
        scores = scores[keep]
        candidates = ITEM_IDS[keep]

    # Pick the top-k: a partial partition avoids fully sorting every score.
    if len(scores) > k:
        picked = np.argpartition(-scores, k)[:k]
    else:
        picked = np.argsort(-scores)

    return sorted(
        [(candidates[pos], scores[pos]) for pos in picked],
        key=lambda pair: pair[1],
        reverse=True,
    )

def recall_at_k_cf(df, k=10, recommend_fn=None):
    """Mean Recall@k over the users in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Held-out interactions with ``user_id`` and ``business_id`` columns.
    k : int, default 10
        Length of the recommendation list requested per user.
    recommend_fn : callable, optional
        ``recommend_fn(user_id, k, filter_seen=False)`` returning a list of
        ``(item_id, score)`` pairs. Defaults to ``recommend_topk_cf``.

    Returns
    -------
    float or None
        Average over users of ``|recommended ∩ held-out| / |held-out|``;
        ``None`` when ``df`` has no users. A user for whom the recommender
        returns an empty list contributes a recall of 0.
    """
    if recommend_fn is None:
        recommend_fn = recommend_topk_cf

    recalls = []

    # A single grouped pass replaces re-filtering the whole frame once per
    # user; sort=False keeps users in first-appearance order.
    for user, businesses in df.groupby("user_id", sort=False)["business_id"]:
        true_items = set(businesses)
        if not true_items:
            continue

        recs = recommend_fn(user, k, filter_seen=False)
        rec_items = {item for item, _ in recs}

        recalls.append(len(rec_items & true_items) / len(true_items))

    return np.mean(recalls) if recalls else None


def ndcg_at_k_cf(df, k=10, recommend_fn=None):
    """Mean NDCG@k (binary relevance) over the users in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Held-out interactions with ``user_id`` and ``business_id`` columns.
    k : int, default 10
        Length of the recommendation list requested per user.
    recommend_fn : callable, optional
        ``recommend_fn(user_id, k, filter_seen=False)`` returning a list of
        ``(item_id, score)`` pairs in rank order. Defaults to
        ``recommend_topk_cf``.

    Returns
    -------
    float or None
        Average per-user NDCG in [0, 1]; ``None`` when ``df`` has no users.
    """
    if recommend_fn is None:
        recommend_fn = recommend_topk_cf

    ndcgs = []

    # A single grouped pass replaces re-filtering the whole frame once per
    # user; sort=False keeps users in first-appearance order.
    for user, businesses in df.groupby("user_id", sort=False)["business_id"]:
        true_items = set(businesses)
        if not true_items:
            continue

        recs = recommend_fn(user, k, filter_seen=False)
        rec_items = [item for item, _ in recs]

        dcg = sum(
            1.0 / np.log2(rank + 2)
            for rank, item in enumerate(rec_items)
            if item in true_items
        )

        # Ideal DCG places every relevant item (at most k of them) at the top.
        # A constant 1.0 is only correct for a single relevant item and would
        # let the ratio exceed 1 for users with several held-out items.
        ideal_hits = min(k, len(true_items))
        idcg = sum(1.0 / np.log2(rank + 2) for rank in range(ideal_hits))

        ndcgs.append(dcg / idcg if idcg > 0 else 0.0)

    return np.mean(ndcgs) if ndcgs else None

# Report the ranking metrics at k=10 on the validation and test splits.
for label, metric, frame in (
    ("CF Recall@10 (val):", recall_at_k_cf, val_df),
    ("CF NDCG@10 (val):", ndcg_at_k_cf, val_df),
    ("CF Recall@10 (test):", recall_at_k_cf, test_df),
    ("CF NDCG@10 (test):", ndcg_at_k_cf, test_df),
):
    print(label, metric(frame, 10))



CF Recall@10 (val): 0.0
CF NDCG@10 (val): 0.0
CF Recall@10 (test): 0.0
CF NDCG@10 (test): 0.0


In [60]:
# --- Item coverage: which businesses appear in each split ---
train_items = set(train_df["business_id"])
val_items = set(val_df["business_id"])
test_items = set(test_df["business_id"])

# Businesses in val/test that never occur in the training split.
val_unseen = val_items.difference(train_items)
test_unseen = test_items.difference(train_items)

for label, items in (
    ("Total train items:", train_items),
    ("Total val items:", val_items),
    ("Total test items:", test_items),
):
    print(label, len(items))

print("Val unseen rate:", len(val_unseen) / len(val_items))
print("Test unseen rate:", len(test_unseen) / len(test_items))

# --- Interactions per user in each split ---
train_user_counts = train_df.groupby("user_id").size()
val_user_counts   = val_df.groupby("user_id").size()
test_user_counts  = test_df.groupby("user_id").size()

for label, counts in (
    ("Train interactions per user:\n", train_user_counts),
    ("Val interactions per user:\n", val_user_counts),
    ("Test interactions per user:\n", test_user_counts),
):
    print(label, counts.describe())

# Histogram of how many test interactions each test user has.
test_per_user = test_df.groupby("user_id").size()
print(test_per_user.value_counts().sort_index())

# --- Item popularity in the training split ---
item_popularity = train_df.groupby("business_id").size()
print(item_popularity.describe())

print("Items with only 1 interaction:", item_popularity.eq(1).sum())
print("Items with only ≤2 interactions:", item_popularity.le(2).sum())
print("Total items:", len(item_popularity))

print("Train/Val item overlap:", len(train_items.intersection(val_items)) / len(val_items))
print("Train/Test item overlap:", len(train_items.intersection(test_items)) / len(test_items))

# --- Test items that have no row in item_latent ---
missing_latent_items = [
    business for business in test_items if business not in item_latent.index
]
print("Test items without latent vector:", len(missing_latent_items))
print("Rate:", len(missing_latent_items) / len(test_items))

# --- Spot check: train vs test items for the first test user ---
sample_user = test_df["user_id"].iloc[0]
u_train_items = set(train_df.loc[train_df["user_id"] == sample_user, "business_id"])
u_test_items  = set(test_df.loc[test_df["user_id"] == sample_user, "business_id"])

print("Train items:", u_train_items)
print("Test items:", u_test_items)
print("Overlap:", u_train_items & u_test_items)

# --- Distinct businesses per user across all splits ---
unique_per_user = full_df.groupby("user_id")["business_id"].nunique()
print(unique_per_user.describe())


Total train items: 27896
Total val items: 7835
Total test items: 7880
Val unseen rate: 0.19821314613911933
Test unseen rate: 0.19771573604060913
Train interactions per user:
 count    29596.000000
mean         2.940026
std          2.071721
min          2.000000
25%          2.000000
50%          2.000000
75%          3.000000
max         46.000000
dtype: float64
Val interactions per user:
 count    3700.000000
mean        2.935135
std         1.910998
min         2.000000
25%         2.000000
50%         2.000000
75%         3.000000
max        22.000000
dtype: float64
Test interactions per user:
 count    3700.000000
mean        2.977027
std         2.057635
min         2.000000
25%         2.000000
50%         2.000000
75%         3.000000
max        31.000000
dtype: float64
2     2257
3      690
4      311
5      177
6       84
7       60
8       27
9       30
10      15
11       8
12       9
13       6
14       3
15      10
16       1
17       1
18       1
19       3
20       1
21