In [7]:
import os
import sys
from pathlib import Path

# Root of the Trust-Graph checkout. Defaults to the original local path; set
# the TRUST_GRAPH_ROOT environment variable to run from another location
# without editing this cell.
PROJECT_ROOT = Path(
    os.environ.get(
        "TRUST_GRAPH_ROOT",
        "/Users/sajanshergill/Machine-Learning-Projects/Trust-Graph",
    )
)

# Make the project's `src` package importable. The membership check keeps the
# cell idempotent: re-running it does not stack duplicate sys.path entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))


In [8]:
import importlib
import os
import sys
from pathlib import Path

# Root of the Trust-Graph checkout. Defaults to the original local path; set
# the TRUST_GRAPH_ROOT environment variable to run from another location
# without editing this cell.
PROJECT_ROOT = Path(
    os.environ.get(
        "TRUST_GRAPH_ROOT",
        "/Users/sajanshergill/Machine-Learning-Projects/Trust-Graph",
    )
)

# Guarded so that re-running the cell does not add the same entry to sys.path
# again. The project imports below rely on this entry being present.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.config import Config
from src.data_loader import load_yelp_minimal

# Run configuration. data_dir is derived from PROJECT_ROOT so the dataset
# location follows the checkout instead of repeating the absolute path.
cfg = Config(
    data_dir=PROJECT_ROOT / "data" / "yelp",
    max_reviews=50_000,
    seed=42,
    burst_window_days=14,
)

# Load the three Yelp tables, passing the review cap and seed from cfg.
# keep_text=False presumably skips the review text column -- confirm against
# src.data_loader.
reviews, users, businesses = load_yelp_minimal(
    data_dir=cfg.data_dir,
    review_file="yelp_academic_dataset_review.json",
    user_file="yelp_academic_dataset_user.json",
    business_file="yelp_academic_dataset_business.json",
    max_reviews=cfg.max_reviews,
    seed=cfg.seed,
    keep_text=False,
)

print("Reviews loaded:", reviews.shape)


Reading yelp_academic_dataset_review.json: 6990280it [00:40, 170924.16it/s]
Reading yelp_academic_dataset_user.json: 1987897it [00:18, 104765.54it/s]
Reading yelp_academic_dataset_business.json: 150346it [00:02, 60421.42it/s]


Reviews loaded: (50000, 5)


In [13]:
import pandas as pd


def business_burst_score(
    reviews: pd.DataFrame,
    window_days: int = 14,
    min_reviews: int = 1,
) -> pd.DataFrame:
    """
    Score how concentrated each business's reviews are in the latest window.

    BurstScore = (reviews in last window / window_days)
                 --------------------------------------
                 (overall avg reviews per day)

    The "last window" is anchored to the most recent review date in the whole
    input (not per business) and includes its start boundary. The overall
    average is a business's total review count divided by the number of days
    between its first and last review, inclusive (so never less than 1).

    Parameters
    ----------
    reviews : pd.DataFrame
        Must contain ``business_id`` and ``date`` columns. Rows whose date
        cannot be parsed, or whose business_id/date is missing, are ignored.
        The input frame is not modified.
    window_days : int, default 14
        Length of the recent window in days. Must be positive.
    min_reviews : int, default 1
        Businesses with fewer usable reviews than this are left out of the
        result. The default keeps every business; raise it to suppress the
        very large scores produced by businesses with only a few reviews
        spread over a long period.

    Returns
    -------
    pd.DataFrame
        Columns ``business_id`` and ``burst_score``, sorted by score in
        descending order. Businesses with no review in the window score 0.
        Ties keep ascending ``business_id`` order.

    Raises
    ------
    ValueError
        If ``window_days`` is not positive or ``min_reviews`` is below 1.
    """
    if window_days <= 0:
        raise ValueError(f"window_days must be positive, got {window_days!r}")
    if min_reviews < 1:
        raise ValueError(f"min_reviews must be at least 1, got {min_reviews!r}")

    # Column selection and assign both return new frames, so the caller's
    # DataFrame is never mutated.
    events = (
        reviews[["business_id", "date"]]
        .assign(date=lambda frame: pd.to_datetime(frame["date"], errors="coerce"))
        .dropna(subset=["business_id", "date"])
    )

    if events.empty:
        # Nothing to score: return the normal schema with zero rows.
        return pd.DataFrame(
            {
                "business_id": pd.Series(dtype="object"),
                "burst_score": pd.Series(dtype="float64"),
            }
        )

    window_start = events["date"].max() - pd.Timedelta(days=window_days)

    # One grouped pass instead of a separate groupby per statistic.
    per_business = (
        events.assign(in_window=events["date"] >= window_start)
        .groupby("business_id")
        .agg(
            total_reviews=("date", "count"),
            window_reviews=("in_window", "sum"),
            first_review=("date", "min"),
            last_review=("date", "max"),
        )
    )
    per_business = per_business[per_business["total_reviews"] >= min_reviews]

    # +1 makes the span inclusive, so a single-day history counts as 1 day
    # and the overall rate is always strictly positive (no division by zero).
    active_days = (
        per_business["last_review"] - per_business["first_review"]
    ).dt.days + 1
    overall_rate = per_business["total_reviews"] / active_days
    window_rate = per_business["window_reviews"] / window_days

    burst_score = (window_rate / overall_rate).rename("burst_score")

    return (
        burst_score
        .reset_index()
        # mergesort is stable, so equal scores come out in a deterministic order.
        .sort_values("burst_score", ascending=False, kind="mergesort")
    )


In [19]:
import importlib

import src.signals

# Re-execute src/signals.py so edits made on disk since the first import are
# picked up by the running kernel. Names previously bound with
# `from src.signals import ...` must be imported again afterwards.
importlib.reload(src.signals)


<module 'src.signals' from '/Users/sajanshergill/Machine-Learning-Projects/Trust-Graph/src/signals.py'>

In [20]:
# Note: this import rebinds `business_burst_score`, replacing any function of
# the same name defined earlier in the notebook.
from src.signals import business_burst_score, business_rating_skew, business_reviewer_overlap

# Use the configured window rather than repeating the literal, so the burst
# signal always agrees with cfg.
burst_df = business_burst_score(reviews, window_days=cfg.burst_window_days)
skew_df = business_rating_skew(reviews)
overlap_df = business_reviewer_overlap(reviews, heavy_reviewer_threshold=10)

# Preview the first rows of each signal table.
burst_df.head(), skew_df.head(), overlap_df.head()


(                  business_id  total_reviews  window_reviews  active_days  \
 10370  LNvsrgojDTFz1IJXjS0-ng              2             1.0         3856   
 6053   C3RFTpNj8_4Yz9gtmGeH5Q              2             1.0         3480   
 10161  KwADSZGoeiqnlR4vUpNhig              3             1.0         4034   
 8112   GVtJD93hEh7pQD29bmyTQA              2             1.0         2505   
 5041   9u90_FSluaf9qwbw5WF6TA              2             1.0         2487   
 
        window_rate  overall_rate  burst_score  
 10370     0.071429      0.000519   137.714286  
 6053      0.071429      0.000575   124.285714  
 10161     0.071429      0.000744    96.047619  
 8112      0.071429      0.000798    89.464286  
 5041      0.071429      0.000804    88.821429  ,
                   business_id  pct_5star  n_reviews  global_pct_5star  \
 29644  zzjFdJwXuxBOGe9JeY_EMw        1.0          1           0.46318   
 17894  acT5Sg26TJrs7E0yvORY-w        1.0          1           0.46318   
 17927  agcC9