In [None]:
# CELL 1 — Mount & load dataset, clean text, save review_index.csv
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import pandas as pd
import re
from pathlib import Path

# Project locations on the mounted Drive; embedding artefacts go under OUT_DIR.
ROOT = Path("/content/drive/MyDrive/afml_project")
OUT_DIR = ROOT / "bert"
CSV_PATH = ROOT / "TRAIN_model_ready_final_mode.csv"   # change if needed
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Explicit dtypes: identifiers and free text stay strings, rating is numeric,
# and datetime is read as text first so it can be parsed leniently below.
dtype_map = dict(
    review_id="string",
    user_id="string",
    text="string",
    gmap_id="string",
    rating="float64",
    datetime="string",
)

df = pd.read_csv(CSV_PATH, dtype=dtype_map, low_memory=False)

# review_id: keep the existing column (as string) when it holds at least one
# value; otherwise fall back to the row position rendered as a string.
df["review_id"] = (
    df["review_id"].astype("string")
    if "review_id" in df.columns and not df["review_id"].isna().all()
    else df.index.astype(str)
)

# datetime: unparseable values become NaT; if the column is absent it is
# created and filled with NaT so later code can always select it.
df["datetime"] = (
    pd.to_datetime(df["datetime"], errors="coerce")
    if "datetime" in df.columns
    else pd.NaT
)

# minimal text cleaning
def clean_text(s):
    """Return ``s`` as a trimmed, single-spaced string; missing values become "".

    Parameters
    ----------
    s : object
        A review text, or a missing-value marker such as ``None``,
        ``float("nan")`` or ``pd.NA``. Non-string values are converted
        with ``str()``.

    Returns
    -------
    str
        The text with leading/trailing whitespace removed and every run of
        whitespace collapsed to one space, or ``""`` for a missing value.
    """
    # Columns read with the pandas "string" dtype use pd.NA as their missing
    # marker. pd.NA is neither None nor a float, so without this check it
    # would be stringified to the literal "<NA>" and survive the
    # "non-empty text" filter applied after cleaning.
    if s is None or s is pd.NA:
        return ""
    # Any other scalar missing marker (float NaN, numpy NaN, NaT). The str
    # guard skips the isna call for the common case of real text.
    if not isinstance(s, str) and pd.api.types.is_scalar(s) and pd.isna(s):
        return ""
    t = str(s).strip()
    t = re.sub(r"\s+", " ", t)
    return t

# Normalise every review's text, then keep only the rows that still have
# at least one character; the index is renumbered from 0 afterwards.
df["text"] = df["text"].apply(clean_text)
df = df.loc[df["text"].str.len().gt(0)].reset_index(drop=True)

# Persist the per-review metadata (whichever of these columns exist), in the
# same row order as df.
cols = [
    c
    for c in ("review_id", "user_id", "gmap_id", "rating", "datetime")
    if c in df.columns
]
df_index = df.loc[:, cols].copy()
df_index.to_csv(OUT_DIR / "review_index.csv", index=False)

print("Loaded rows:", len(df))
print("Saved review_index.csv to:", OUT_DIR / "review_index.csv")

In [None]:
# CELL 2 — Compute SBERT embeddings (all-MiniLM-L6-v2) and save to embeddings.npy
# If you already have model loaded in session, it's OK to run just this cell.
from sentence_transformers import SentenceTransformer
import numpy as np
from pathlib import Path

OUT_DIR = Path("/content/drive/MyDrive/afml_project/bert")

# load model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
model.max_seq_length = 256

# review_index.csv defines which reviews are embedded and in what order:
# row i of embeddings.npy must describe row i of review_index.csv.
import pandas as pd
df_index = pd.read_csv(OUT_DIR / "review_index.csv", dtype=str)
# The index file carries no text column, so the original CSV is reloaded to
# fetch the text (keeps cells independent).
ROOT = Path("/content/drive/MyDrive/afml_project")
CSV_PATH = ROOT / "TRAIN_model_ready_final_mode.csv"
df_full = pd.read_csv(CSV_PATH, dtype=str, low_memory=False)

# Cell 1 falls back to the row position as review_id when the CSV has no
# usable review_id column; apply the same fallback here so the keys match
# (and so the lookup below does not fail with a KeyError).
if "review_id" not in df_full.columns or df_full["review_id"].isna().all():
    df_full["review_id"] = df_full.index.astype(str)
df_full["review_id"] = df_full["review_id"].astype(str)
df_index["review_id"] = df_index["review_id"].astype(str)

# Build a one-row-per-id text lookup. Merging against the raw frame would
# multiply rows whenever an id repeats (including missing ids, which
# astype(str) turns into the same "nan" key), and the embeddings would no
# longer line up with review_index.csv.
text_lookup = df_full[["review_id", "text"]].drop_duplicates(subset="review_id", keep="first")
n_repeated = len(df_full) - len(text_lookup)
if n_repeated:
    # For repeated ids the first occurrence's text is used, which may not be
    # the exact row's text; surface it so the source data can be checked.
    print(f"Warning: {n_repeated} rows in the source CSV share a review_id with an earlier row; "
          "using the first occurrence's text for those ids.")

# A left merge keeps df_index's row order; validate= makes pandas raise if
# the lookup side is not unique per key.
merged = df_index.merge(text_lookup, on="review_id", how="left", validate="many_to_one")
assert len(merged) == len(df_index), "text lookup changed the number of rows; embeddings would be misaligned"
texts = merged["text"].fillna("").tolist()

N = len(texts)
# Smaller batches for larger corpora.
batch_size = 512 if N <= 50000 else (256 if N <= 200000 else 128)
print(f"Encoding {N} reviews with batch_size={batch_size} ...")

embeddings = model.encode(
    texts,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)

np.save(OUT_DIR / "embeddings.npy", embeddings)
print("Saved embeddings.npy with shape:", embeddings.shape)

In [None]:
# CELL 3 — Build clusters by gmap_id (deterministic: one cluster per business)
import pandas as pd
from pathlib import Path

OUT_DIR = Path("/content/drive/MyDrive/afml_project/bert")

# review_index is the 0-based row position within review_index.csv.
index_df = pd.read_csv(OUT_DIR / "review_index.csv", dtype=str).reset_index().rename(columns={"index":"review_index"})

# Normalize gmap_id (treat blanks/NaN as missing)
if "gmap_id" in index_df.columns:
    index_df["gmap_id"] = index_df["gmap_id"].replace({"": None}).where(pd.notna(index_df["gmap_id"]), None)
    unique_gmaps = index_df["gmap_id"].dropna().unique().tolist()
    gmap_to_label = {g: i for i, g in enumerate(unique_gmaps)}
    cluster_sizes = index_df.groupby("gmap_id", dropna=False).size().to_dict()

    # Vectorised labelling instead of a per-row iterrows() loop.
    # factorize numbers the distinct ids in order of first appearance (the
    # same numbering as gmap_to_label) and gives missing ids the code -1.
    label_codes, _ = pd.factorize(index_df["gmap_id"])
    label_series = pd.Series(label_codes, index=index_df.index)
    has_gmap = label_series >= 0

    cluster_labels = label_series.astype(int).tolist()
    # Membership is deterministic: 1.0 when the review has a business id,
    # 0.0 for the unassigned (-1) rows.
    cluster_probs = has_gmap.astype(float).tolist()
    # Size of each review's cluster; unassigned rows each count as size 1.
    cluster_size_list = (
        label_series.map(label_series.value_counts())
        .where(has_gmap, 1)
        .astype(int)
        .tolist()
    )
else:
    # no gmap_id column: mark all as outliers
    cluster_labels = [-1] * len(index_df)
    cluster_probs = [0.0] * len(index_df)
    cluster_size_list = [1] * len(index_df)

clusters_by_gmap = pd.DataFrame({
    "review_index": index_df["review_index"].astype(int),
    "cluster_label": cluster_labels,
    "cluster_prob": cluster_probs,
    "cluster_size": cluster_size_list
})

clusters_by_gmap.to_csv(OUT_DIR / "clusters_by_gmap.csv", index=False)
print("Saved clusters_by_gmap.csv to:", OUT_DIR / "clusters_by_gmap.csv")
print("Sample:")
print(clusters_by_gmap.head(8))