## Step 4: Topic Normalization & Deduplication

This step consolidates semantically similar topics into canonical
categories to avoid trend fragmentation.


In [14]:
# Keyword rules for local topic extraction: each canonical topic label maps to
# the lowercase phrases that signal it. Insertion order is significant because
# extract_topic_local returns the first topic whose keyword is found.
TOPIC_RULES = {
    "delivery partner rude": [
        "rude",
        "impolite",
        "bad behavior",
        "misbehaved",
    ],
    "delivery delay": [
        "late",
        "delay",
        "delayed",
    ],
    "food quality issue": [
        "stale",
        "cold",
        "bad food",
        "spoiled",
        "worst food",
    ],
    "customer support issue": [
        "customer service",
        "support",
        "help",
        "cannot help",
    ],
    "refund issue": [
        "refund",
        "money not returned",
        "refund not received",
    ],
    "instamart availability": [
        "instamart",
        "out of stock",
        "closed",
    ],
    "app issue": [
        "app crash",
        "not working",
        "bug",
        "map",
        "tracking",
    ],
}


In [16]:
# Words that mark a review as positive when no issue keyword matched.
_POSITIVE_WORDS = ("good", "nice", "best", "great", "awesome", "excellent", "super")


def extract_topic_local(review, rules=None):
    """Assign a topic label to one review using keyword rules.

    Parameters
    ----------
    review : str
        Raw review text. Non-string values (for example the NaN that pandas
        produces for an empty CSV cell, or None) are labelled
        "general feedback" instead of raising.
    rules : dict[str, list[str]], optional
        Mapping of topic label -> lowercase keywords. Defaults to the
        notebook-level TOPIC_RULES.

    Returns
    -------
    str
        The first topic in ``rules`` (dict order) with a keyword contained in
        the lowercased review; otherwise "positive feedback" if a positive
        word is present; otherwise "general feedback".
    """
    # Missing or non-text reviews have nothing to match against.
    if not isinstance(review, str):
        return "general feedback"

    # Compare against None explicitly so that an empty rule set is honoured.
    if rules is None:
        rules = TOPIC_RULES

    text = review.lower()

    # NOTE: matching is by plain substring, so short keywords can fire inside
    # longer words (e.g. "late" in "chocolate").
    for topic, keywords in rules.items():
        if any(kw in text for kw in keywords):
            return topic

    # A review that *is* a positive word also *contains* it, so a single
    # substring check covers both cases.
    if any(word in text for word in _POSITIVE_WORDS):
        return "positive feedback"

    return "general feedback"


In [18]:
import pandas as pd

# Load the raw reviews and reduce each review_date value to a plain date.
df = pd.read_csv("../data/swiggy_reviews_raw.csv")
df["review_date"] = pd.to_datetime(df["review_date"]).dt.date

# Bucket the review texts by day: {date: [review_text, ...]}.
daily_batches = {}
for batch_date, day_frame in df.groupby("review_date"):
    daily_batches[batch_date] = day_frame["review_text"].tolist()

# Flatten the batches back into one record per review, attaching the
# keyword-based topic guess to each.
all_rows = [
    {
        "review_date": batch_date,
        "review_text": text,
        "topic_candidate": extract_topic_local(text),
    }
    for batch_date, texts in daily_batches.items()
    for text in texts
]

understanding_df = pd.DataFrame(all_rows)
understanding_df.head()


Unnamed: 0,review_date,review_text,topic_candidate
0,2025-12-23,Excellent service 💯,positive feedback
1,2025-12-23,super,positive feedback
2,2025-12-23,worst,general feedback
3,2025-12-23,❤️,general feedback
4,2025-12-23,good,positive feedback


In [20]:
# Closed vocabulary of topic labels allowed downstream. Anything outside this
# set is folded into "general feedback" by normalize_topic.
CANONICAL_TOPICS = {
    "delivery partner rude",
    "delivery delay",
    "food quality issue",
    "customer support issue",
    "refund issue",
    "instamart availability",
    "app issue",
    "positive feedback",
    "general feedback"
}

def normalize_topic(topic):
    """Map a candidate topic label onto the canonical vocabulary.

    The candidate is lowercased and its whitespace is trimmed/collapsed before
    the lookup, so variants such as "Delivery Delay" or " refund  issue "
    resolve to their canonical label instead of being discarded.

    Parameters
    ----------
    topic : str
        Candidate label. Non-string values (None, NaN, lists, ...) are treated
        as unknown.

    Returns
    -------
    str
        The matching member of CANONICAL_TOPICS, or "general feedback" when
        the candidate is not recognised.
    """
    # Non-strings cannot be canonical labels; returning early also avoids a
    # TypeError from testing an unhashable value for set membership.
    if not isinstance(topic, str):
        return "general feedback"

    # split()/join trims the ends and collapses internal runs of whitespace.
    cleaned = " ".join(topic.lower().split())
    if cleaned in CANONICAL_TOPICS:
        return cleaned
    return "general feedback"

# Resolve every candidate label to its canonical form, element by element,
# and store the result alongside the original candidate.
understanding_df["canonical_topic"] = understanding_df["topic_candidate"].map(normalize_topic)
understanding_df.head()


Unnamed: 0,review_date,review_text,topic_candidate,canonical_topic
0,2025-12-23,Excellent service 💯,positive feedback,positive feedback
1,2025-12-23,super,positive feedback,positive feedback
2,2025-12-23,worst,general feedback,general feedback
3,2025-12-23,❤️,general feedback,general feedback
4,2025-12-23,good,positive feedback,positive feedback
