In [1]:
import pandas as pd

data_path = "swiggy.csv"
df = pd.read_csv(data_path)

# Step 1: Parse review_date once (the sort below relies on real datetimes,
# not strings), then order the reviews newest-first.
df['review_date'] = pd.to_datetime(df['review_date'])
df = df.sort_values(by='review_date', ascending=False)

# Step 2: Extract only the date part (datetime.date objects) so reviews can
# later be grouped per calendar day.
df['date'] = df['review_date'].dt.date


In [2]:
# Keep only the columns the trend analysis uses.
df = df.loc[:, ['date', 'review_description', 'rating']]

In [3]:
df.head()

Unnamed: 0,date,review_description,rating
59992,2023-07-24,Very fantastic aap,5
3313,2023-07-24,Wonderful discounts & cheap and best and user ...,5
3703,2023-07-24,This delivery app makes easy for us to order i...,5
5508,2023-07-24,Only half order sent never order from this Dom...,1
62135,2023-07-24,"Fedup of message: ""Your order is picked up wit...",1


In [4]:
# end_date = pd.to_datetime('2022-07-31').date()
# start_date = end_date - pd.Timedelta(days=30)

# df_filtered = df[(df['date'] >= start_date) & (df['date'] <= end_date)]


In [5]:
# df_filtered.head()

In [6]:
import warnings
from tqdm import tqdm, TqdmWarning

# Silence noisy library warnings so cell outputs stay readable.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning, module='tqdm')
# tqdm's own notices (e.g. "IProgress not found") are TqdmWarning, which does
# not derive from UserWarning, so the filter above never matches them.
warnings.filterwarnings("ignore", category=TqdmWarning)

# Smoke test: confirm that tqdm can render a progress bar in this environment.
for i in tqdm(range(100)):
    pass



100%|██████████| 100/100 [00:00<00:00, 190736.88it/s]


In [7]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from collections import defaultdict, deque
import numpy as np

# Sentence-embedding model shared by the whole notebook; the agent class
# reads it through the module-level name `model` (via `model.encode`).
# Alternative checkpoint that was tried and left disabled:
# model = SentenceTransformer('all-MiniLM-L6-v2')
model = SentenceTransformer("all-mpnet-base-v2")

class TopicLimiterReviewTrendAgent:
    """Assign review texts to topics and keep per-day counts for each topic.

    Every topic is represented by a centroid: the mean embedding of all
    phrases attached to it (seed phrases plus every review assigned later).
    A review joins the most similar topic when the cosine similarity reaches
    ``similarity_threshold``; otherwise it starts a new ``"new_topic N: ..."``
    topic. Before a new topic is created while ``max_topics`` is reached, the
    two most similar discovered topics are merged. Seed topics are never
    merged, so the limit can still be exceeded when fewer than two discovered
    topics exist.

    Embeddings come from the module-level ``model`` object (``model.encode``),
    which must be defined before the class is instantiated.

    Attributes:
        topics: Topic names in creation order.
        topic_phrases: Topic name -> list of phrases/reviews attached to it.
        topic_centroids: Topic name -> mean embedding of its phrases.
        topic_history: Topic name -> {date: number of reviews on that date}.
        dates_queue: The most recent ``window_days`` distinct batch dates;
            counts for any other date are discarded after each batch.
        topic_counter: Number of discovered topics created so far.
    """

    def __init__(self, seed_topics, similarity_threshold=0.75, window_days=31, max_topics=20):
        """Build the seed topics and their centroids.

        Args:
            seed_topics: Mapping of topic name -> list of example phrases.
            similarity_threshold: Minimum cosine similarity for assigning a
                review to an existing topic.
            window_days: Number of batch dates retained in the history, and
                number of day columns in the trend report.
            max_topics: Topic count at which a merge is attempted before a
                new topic is added.

        Raises:
            ValueError: If a seed topic has no phrases (its centroid would
                otherwise be NaN and break later similarity computations).
        """
        self.similarity_threshold = similarity_threshold
        self.window_days = window_days
        self.max_topics = max_topics
        self.topic_phrases = {}
        self.topic_centroids = {}
        self.topic_counter = 0
        for topic, phrases in seed_topics.items():
            if len(phrases) == 0:
                raise ValueError(f"Seed topic {topic!r} has no phrases")
            # Copy so later appends do not mutate the caller's lists.
            self.topic_phrases[topic] = phrases.copy()
            embeddings = model.encode(phrases)
            self.topic_centroids[topic] = np.mean(embeddings, axis=0)
        self.topics = list(seed_topics.keys())
        self.topic_history = defaultdict(lambda: defaultdict(int))
        self.dates_queue = deque(maxlen=self.window_days)

    def assign_topic(self, review_embedding):
        """Return ``(topic, similarity)`` for the closest topic centroid.

        ``topic`` is ``None`` when the best similarity is below the threshold.
        When no topics exist at all, ``(None, 0.0)`` is returned instead of
        failing while stacking an empty centroid list.
        """
        if not self.topics:
            return None, 0.0
        centroids = np.vstack([self.topic_centroids[t] for t in self.topics])
        sims = cosine_similarity([review_embedding], centroids)[0]
        best_idx = int(np.argmax(sims))
        best_sim = sims[best_idx]
        if best_sim >= self.similarity_threshold:
            return self.topics[best_idx], best_sim
        return None, best_sim

    def merge_closest_topics(self):
        """Merge the two most similar discovered topics into one.

        Only topics whose name starts with ``"new_topic"`` are candidates, so
        seed topics are left untouched. The second topic of the closest pair
        is folded into the first: phrases, per-date counts and centroid.
        Does nothing when fewer than two candidates exist.
        """
        candidates = [t for t in self.topics if t.startswith("new_topic")]
        if len(candidates) < 2:
            return  # Nothing to merge
        centroids = np.vstack([self.topic_centroids[t] for t in candidates])
        sims = cosine_similarity(centroids)
        # Mask self-similarity with -inf: a finite filler such as -1 could tie
        # with a real similarity and select a topic paired with itself.
        np.fill_diagonal(sims, -np.inf)
        idx1, idx2 = np.unravel_index(np.argmax(sims), sims.shape)
        keep, drop = candidates[idx1], candidates[idx2]

        # The mean of the combined phrase set equals the size-weighted mean
        # of the two centroids, so nothing has to be re-encoded.
        n_keep = len(self.topic_phrases[keep])
        n_drop = len(self.topic_phrases[drop])
        self.topic_centroids[keep] = (
            n_keep * self.topic_centroids[keep] + n_drop * self.topic_centroids[drop]
        ) / (n_keep + n_drop)
        self.topic_phrases[keep].extend(self.topic_phrases[drop])
        for date, count in self.topic_history[drop].items():
            self.topic_history[keep][date] += count

        del self.topic_phrases[drop]
        del self.topic_centroids[drop]
        del self.topic_history[drop]
        self.topics.remove(drop)

    def process_daily_batch(self, date, reviews):
        """Assign one day's reviews to topics and update the daily counts.

        Args:
            date: Key under which the counts are stored; the trend report
                looks counts up by ``datetime.date``.
            reviews: Iterable of review texts. Entries that are not strings
                (for example NaN from a pandas column) or that are blank are
                skipped.
        """
        # Register each date only once: a repeated date would otherwise take
        # up a second slot and push an older date out of the window early.
        if date not in self.dates_queue:
            self.dates_queue.append(date)

        for review in reviews:
            if not isinstance(review, str) or not review.strip():
                continue
            emb = model.encode(review)
            assigned_topic, _ = self.assign_topic(emb)
            if assigned_topic is not None:
                self._add_phrase(assigned_topic, review, emb)
                self.topic_history[assigned_topic][date] += 1
            else:
                # Create new topic only if under limit, otherwise merge first
                if len(self.topics) >= self.max_topics:
                    self.merge_closest_topics()
                self.topic_counter += 1
                snippet = " ".join(review.split()[:4])
                new_topic = f"new_topic {self.topic_counter}: {snippet}"
                self.topics.append(new_topic)
                self.topic_phrases[new_topic] = [review]
                self.topic_centroids[new_topic] = emb
                self.topic_history[new_topic][date] = 1

        self._purge_history()

    def update_centroid(self, topic):
        """Recompute a topic's centroid from scratch by re-encoding all its phrases.

        Kept for callers that edit ``topic_phrases`` directly; the batch
        processing path maintains centroids incrementally instead.
        """
        embeddings = model.encode(self.topic_phrases[topic])
        self.topic_centroids[topic] = np.mean(embeddings, axis=0)

    def get_trend_report(self, target_date):
        """Return a topic-by-day table of review counts ending on ``target_date``.

        Args:
            target_date: Anything ``pd.to_datetime`` accepts; only its date
                part is used.

        Returns:
            DataFrame indexed by ``Topic`` with one column per day (as
            strings), covering the ``window_days`` days up to and including
            ``target_date``, oldest first. Missing counts are reported as 0.
        """
        target_date = pd.to_datetime(target_date).date()
        dates_window = [target_date - pd.Timedelta(days=i) for i in reversed(range(self.window_days))]
        data = []
        for topic in self.topics:
            # .get avoids inserting empty entries into the defaultdict.
            per_day = self.topic_history.get(topic, {})
            data.append([topic] + [per_day.get(d, 0) for d in dates_window])
        columns = ['Topic'] + [str(d) for d in dates_window]
        return pd.DataFrame(data, columns=columns).set_index('Topic')

    def _add_phrase(self, topic, review, embedding):
        """Attach ``review`` to ``topic`` and fold its embedding into the centroid."""
        phrases = self.topic_phrases[topic]
        phrases.append(review)
        centroid = self.topic_centroids[topic]
        # Running mean: new_mean = old_mean + (x - old_mean) / n. Builds a new
        # array so an embedding stored as a centroid is never mutated.
        self.topic_centroids[topic] = centroid + (embedding - centroid) / len(phrases)

    def _purge_history(self):
        """Drop per-date counts for dates that are no longer in ``dates_queue``."""
        live_dates = set(self.dates_queue)
        for per_day in self.topic_history.values():
            for stale in [d for d in per_day if d not in live_dates]:
                del per_day[stale]


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Seed taxonomy: topic label -> example phrases used to build that topic's
# initial centroid.
seed_topics = {
    "Delivery issue": [
        "Delivery delayed",
        "Order late",
        "Not delivered",
        "Order cancelled",
    ],
    "Food stale": [
        "Stale food",
        "Food not fresh",
        "Spoiled food",
    ],
    "Delivery partner rude": [
        "Delivery guy was rude",
        "Delivery partner misbehaved",
        "Delivery person was impolite",
    ],
    "Maps not working properly": [
        "Maps issue",
        "Map not loading",
        "GPS problem",
    ],
    "Instamart should be open all night": [
        "Instamart should be open all night",
        "Keep Instamart open always",
    ],
    "Bring back 10 minute bolt delivery": [
        "Bring back 10 minute bolt delivery",
        "Miss 10 min orders",
    ],
    "Payment issue": [
        "Payment failed",
        "Couldn't pay",
        "Payment not accepted",
    ],
    "Customer support unresponsive": [
        "Customer support not responding",
        "No help from support",
    ],
}

# Threshold 0.7 is passed explicitly here instead of the class default of 0.75.
agent = TopicLimiterReviewTrendAgent(seed_topics=seed_topics, similarity_threshold=0.7)

In [9]:
import datetime

In [10]:
# Date coverage of the full review set.
start_date, end_date = df['date'].min(), df['date'].max()

print(f"Start Date: {start_date}")
print(f"End Date: {end_date}")

Start Date: 2018-09-12
End Date: 2023-07-24


In [28]:
# Inclusive 31-day window: end_date and the 30 days before it.
end_date = pd.Timestamp('2022-08-31').date()
start_date = end_date - pd.Timedelta(days=30)

df_filtered = df.loc[df['date'].between(start_date, end_date)]

In [29]:
# Date coverage actually present in the filtered window.
start_date, end_date = df_filtered['date'].min(), df_filtered['date'].max()

print(f"Start Date: {start_date}")
print(f"End Date: {end_date}")

Start Date: 2022-08-01
End Date: 2022-08-31


In [30]:
# Independent (deep) copy so later edits cannot touch df_filtered.
df_2 = df_filtered.copy(deep=True)

In [31]:
df_2.shape

(2216, 3)

In [18]:
# df3 is a second name for the same DataFrame object as df_2 (no copy is made).
# NOTE(review): the recorded shapes of df_2 and df3 differ, which suggests the
# cells were executed out of order — confirm with Restart & Run All.
df3=df_2

In [19]:
df3.shape

(215, 3)

In [20]:
# Map each date to the list of review texts posted on it (keys in ascending
# order, as produced by groupby).
daily_data = {
    day: list(texts)
    for day, texts in df3.groupby('date')['review_description']
}

In [21]:
pd.set_option("display.max_columns",None)

In [22]:
daily_data.keys()

dict_keys([datetime.date(2022, 7, 29), datetime.date(2022, 7, 30), datetime.date(2022, 7, 31)])

In [23]:
print(daily_data[datetime.date(2022, 7, 29)])

["It's quite annoying when it refreshes to the start while you're scrolling. If the store has closed, it can say so when checked, doesn't need me to be thrown back to top, to have to scroll all the way down to continue.", 'This is my very first purchase from Swiggy and I am absolutely loving it, Mr.Amardeep from the chat team did a great job explaining me about the procedure that I asked for, kudos to him & the Chat team😇😇 and Mr.Shiva, the delivery executive, he was being very patient and supportive,Happy to have got such services, Thanks Swiggy🥰🥰', "third class service by swiggy.. my order cancelled with out my consent. The cashback i have received for amazon pay is also gone as swiggy cancelled the order and the worst part is i can't apply coupon discount again for amazon pay because it is used already (whether the order delivered or not it is counted)", 'They definitely have an issue for delivery of low value orders as in most cases they would cancel the order post pickup', 'helpfu

In [24]:
# Process data, one day per batch. daily_data is keyed by date in ascending
# order, so the agent's rolling window advances chronologically.
# NOTE: the agent accumulates state; re-running this cell without re-creating
# `agent` counts every review again.
for date, reviews in daily_data.items():
    agent.process_daily_batch(pd.to_datetime(date).date(), reviews)

# Get trend report for the rolling window T-30 to T, where T is the latest
# processed day. Deriving T from the data keeps the report aligned with
# whatever date filter was applied upstream; a hard-coded date silently yields
# an all-zero table once the filter window moves. Raises ValueError if
# daily_data is empty.
trend_report_df = agent.get_trend_report(max(daily_data))

In [25]:
pd.set_option("display.max_rows",None)

In [27]:
trend_report_df

Unnamed: 0_level_0,2022-07-01,2022-07-02,2022-07-03,2022-07-04,2022-07-05,2022-07-06,2022-07-07,2022-07-08,2022-07-09,2022-07-10,2022-07-11,2022-07-12,2022-07-13,2022-07-14,2022-07-15,2022-07-16,2022-07-17,2022-07-18,2022-07-19,2022-07-20,2022-07-21,2022-07-22,2022-07-23,2022-07-24,2022-07-25,2022-07-26,2022-07-27,2022-07-28,2022-07-29,2022-07-30,2022-07-31
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
Delivery issue,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
Food stale,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Delivery partner rude,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Maps not working properly,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Instamart should be open all night,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Bring back 10 minute bolt delivery,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Payment issue,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Customer support unresponsive,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
new_topic 1: It's quite annoying when,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,47,86,69
new_topic 15: I ordered 2 wraps,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [98]:
trend_report_df.to_csv("Output.csv")