In [None]:
import os

# Change this to your file ID
FILE_ID = "YOUR_FILE_ID"
URL = f"https://cseweb.ucsd.edu/~jmcauley/datasets.html#google_restaurants"
OUT = "filter_all_t.json"

if not os.path.exists(OUT):
    print("Downloading dataset...")
    !wget --no-check-certificate "$URL" -O "$OUT"
else:
    print(f"{OUT} already exists, skip downloading.")

In [None]:
import os
import json
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score
)
from sklearn.ensemble import RandomForestClassifier

# Use seaborn's "whitegrid" style for the figures drawn in later cells.
sns.set(style="whitegrid")

In [None]:
# Read as UTF-8 explicitly: review text contains non-ASCII characters and the
# platform default encoding is not UTF-8 everywhere.
with open(OUT, "r", encoding="utf-8") as f:
    data = json.load(f)

# json_normalize yields one row per record only when it is given a list of
# records. If the payload is a dict whose values are all lists, flatten those
# lists into a single list of records instead of getting one unusable row.
# NOTE(review): assumes such a dict maps group/split names to record lists --
# confirm against the actual file layout.
if isinstance(data, dict) and data and all(isinstance(v, list) for v in data.values()):
    records = [record for group in data.values() for record in group]
else:
    records = data

df = pd.json_normalize(records, max_level=2)
print("Data loaded:", df.shape)
df.head()

In [None]:
# Fill missing reviews with "" before casting: astype(str) alone turns them into
# the literal strings "nan"/"None", which would later be tokenised as real words.
df["review_text"] = df["review_text"].fillna("").astype(str)

# Basic text cleaning
def clean_text(t):
    """Normalise a review string for bag-of-words features.

    Lowercases `t`, replaces every character that is not a-z, 0-9 or
    whitespace with a space, then collapses whitespace runs to single
    spaces and trims both ends.
    """
    lowered = t.lower()
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    # split() with no argument drops leading/trailing whitespace and treats any
    # whitespace run as one separator, so re-joining with " " collapses and trims.
    return " ".join(alnum_only.split())

# Normalised text used by every downstream feature.
df["clean_text"] = df["review_text"].map(clean_text)

# Binary target: 1 when the rating is 4 or higher, otherwise 0.
df["label_good"] = df["rating"].ge(4).astype(int)

# Simple size features measured on the cleaned text:
# character count and whitespace-separated token count.
df["text_len"] = df["clean_text"].str.len()
df["word_count"] = df["clean_text"].str.split().str.len()

df[["rating", "label_good", "text_len", "word_count"]].describe()



In [None]:
# How many reviews fall on each rating value.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=df["rating"], ax=ax)
ax.set_title("Rating Distribution")
plt.show()

# Histograms (50 bins, with KDE overlay) of the two size features.
for column, title in (
    ("text_len", "Text Length Distribution"),
    ("word_count", "Word Count Distribution"),
):
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[column], bins=50, kde=True, ax=ax)
    ax.set_title(title)
    plt.show()

In [None]:
# TF-IDF over unigrams and bigrams, English stop words removed, capped at the
# 8000 highest-ranked terms.
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=8000,
    ngram_range=(1, 2),
)
y = df["label_good"]

# Split the raw text BEFORE fitting the vectorizer. Fitting on the full corpus
# lets the vocabulary and IDF weights see the test reviews, which leaks test
# information into training and inflates the reported scores. The row
# partition is the same as before (same sample count, test_size and seed).
text_train, text_test, y_train, y_test = train_test_split(
    df["clean_text"], y, test_size=0.2, random_state=42
)

# Learn vocabulary/IDF from the training reviews only, then reuse them as-is.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus matrix kept under its original name for any cell that still
# reads `X`; it uses the train-fitted vocabulary.
X = vectorizer.transform(df["clean_text"])

print("Train:", X_train.shape, "Test:", X_test.shape)

In [None]:
# Fit the linear baseline (up to 300 solver iterations) and score the held-out set.
log_reg = LogisticRegression(max_iter=300).fit(X_train, y_train)
pred_lr = log_reg.predict(X_test)

# Headline metrics, reused by the model-comparison table.
acc_lr, f1_lr = accuracy_score(y_test, pred_lr), f1_score(y_test, pred_lr)

print("\n=== Logistic Regression Performance ===")
print(classification_report(y_test, pred_lr))
print(f"Accuracy: {acc_lr:.4f}, F1: {f1_lr:.4f}")



In [None]:
# Random forest of 200 trees.
# random_state pins the bootstrap/feature sampling so the reported metrics are
# reproducible across runs (the train/test split is already seeded with 42).
# n_jobs=-1 builds the trees on all cores; it does not change the result.
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

print("\n=== Random Forest Performance ===")
print(classification_report(y_test, pred_rf))

# Headline metrics, reused by the model-comparison table.
acc_rf = accuracy_score(y_test, pred_rf)
f1_rf = f1_score(y_test, pred_rf)
print(f"Accuracy: {acc_rf:.4f}, F1: {f1_rf:.4f}")

In [None]:
# Side-by-side comparison: one row per model, one column per metric.
summary = pd.DataFrame(
    [
        ("Logistic Regression", acc_lr, f1_lr),
        ("Random Forest", acc_rf, f1_rf),
    ],
    columns=["Model", "Accuracy", "F1 Score"],
)
summary

In [None]:
def plot_confusion(cm, title):
    """Show `cm` as an annotated blue heatmap.

    Cells are annotated with integer formatting; the x axis is labelled
    "Predicted", the y axis "True", and `title` is used as the figure title.
    """
    _, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set(title=title, xlabel="Predicted", ylabel="True")
    plt.show()

# One confusion-matrix heatmap per model, logistic regression first.
for predictions, heading in (
    (pred_lr, "LR Confusion Matrix"),
    (pred_rf, "RF Confusion Matrix"),
):
    plot_confusion(confusion_matrix(y_test, predictions), heading)


In [None]:
# Pair each TF-IDF term with its logistic-regression coefficient.
feature_names = vectorizer.get_feature_names_out()
coefs = log_reg.coef_[0]

# argsort is ascending: the first 15 indices are the most negative weights,
# the last 15 (reversed) are the most positive, largest first.
ranked = np.argsort(coefs)
top_neg_idx = ranked[:15]
top_pos_idx = ranked[-15:][::-1]

for heading, indices in (
    ("\nTop positive tokens:", top_pos_idx),
    ("\nTop negative tokens:", top_neg_idx),
):
    print(heading)
    for i in indices:
        print(f"{feature_names[i]:20s}  {coefs[i]:.4f}")

