In [8]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("snap/amazon-fine-food-reviews")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'amazon-fine-food-reviews' dataset.
Path to dataset files: /kaggle/input/amazon-fine-food-reviews


In [7]:
import pandas as pd

df = pd.read_csv('/kaggle/input/amazon-fine-food-reviews/Reviews.csv')
print(df.shape)
print(df.columns.tolist())
df.head()

(568454, 10)
['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text']


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [9]:
# Keep only relevant columns and sample 10k to keep things fast
df = df[['Score', 'Summary', 'Text']].dropna()
df = df.sample(10000, random_state=42).reset_index(drop=True)

# Combine summary + text into one field
df['full_review'] = df['Summary'] + ". " + df['Text']

print(df.shape)
df['full_review'].iloc[0]

(10000, 4)


'I like these!. These are actually very tasty.  Pure potatoes with a great texture and no nasty filler "stuff."  No bacon, no cheese...just tasty potatoes.  They cook well in either the oven or microwave.  I add a touch of either salt & pepper or fajita seasoning to spice it up.  I rated 4 out of 5 stars because they could be a bit bigger portion.  However, this item is a fairly good value for the money.'

In [10]:
import re

def clean_text(text):
    text = text.lower()                          # lowercase
    text = re.sub(r'<.*?>', '', text)            # remove HTML tags
    text = re.sub(r'[^a-z0-9\s]', '', text)     # remove punctuation/special chars
    text = re.sub(r'\s+', ' ', text).strip()     # remove extra spaces
    return text

df['cleaned_review'] = df['full_review'].apply(clean_text)

# Compare before and after
print("BEFORE:", df['full_review'].iloc[0])
print("\nAFTER:", df['cleaned_review'].iloc[0])

BEFORE: I like these!. These are actually very tasty.  Pure potatoes with a great texture and no nasty filler "stuff."  No bacon, no cheese...just tasty potatoes.  They cook well in either the oven or microwave.  I add a touch of either salt & pepper or fajita seasoning to spice it up.  I rated 4 out of 5 stars because they could be a bit bigger portion.  However, this item is a fairly good value for the money.

AFTER: i like these these are actually very tasty pure potatoes with a great texture and no nasty filler stuff no bacon no cheesejust tasty potatoes they cook well in either the oven or microwave i add a touch of either salt pepper or fajita seasoning to spice it up i rated 4 out of 5 stars because they could be a bit bigger portion however this item is a fairly good value for the money


In [11]:
!pip install sentence-transformers -q

from sentence_transformers import SentenceTransformer

# Load a small but powerful free model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert reviews to embeddings (this will take 2-3 mins)
print("Embedding reviews...")
embeddings = model.encode(df['cleaned_review'].tolist(), show_progress_bar=True)

print("Shape:", embeddings.shape)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Embedding reviews...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Shape: (10000, 384)


In [12]:
!pip install bertopic -q

from bertopic import BERTopic

topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(df['cleaned_review'].tolist(), embeddings)

# See what topics were found
topic_model.get_topic_info()

2026-02-24 23:02:49,158 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-24 23:03:35,747 - BERTopic - Dimensionality - Completed ✓
2026-02-24 23:03:35,750 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-24 23:03:45,997 - BERTopic - Cluster - Completed ✓
2026-02-24 23:03:46,007 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-02-24 23:03:46,837 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2438,-1_the_and_to_it,"[the, and, to, it, these, in, of, they, for, are]",[would be okay if they didnt get hard after 5 ...
1,0,1158,0_coffee_cup_roast_kcups,"[coffee, cup, roast, kcups, strong, the, this,...",[great coffee great price this coffee is great...
2,1,410,1_cat_cats_food_she,"[cat, cats, food, she, her, he, eat, my, to, c...",[filler food is empty leaves your cat always n...
3,2,353,2_tea_teas_this_it,"[tea, teas, this, it, is, of, bags, mint, flav...",[i always come back to this tea i have become ...
4,3,320,3_dog_food_dogs_her,"[dog, food, dogs, her, she, he, his, my, to, and]",[wellness core puppy dry food i spent hours re...
...,...,...,...,...,...
119,118,11,118_velveeta_broccoli_goetta_cheese,"[velveeta, broccoli, goetta, cheese, dinner, p...",[not as good as other velveeta skillet product...
120,119,11,119_rooibos_tea_flowers_red,"[rooibos, tea, flowers, red, teas, hibiscus, h...",[tastes like real tea i am a tea lover who has...
121,120,10,120_cookie_oatmeal_cookies_soft,"[cookie, oatmeal, cookies, soft, quaker, loved...",[a cookies i am very impressed with the quaker...
122,121,10,121_licorice_tea_committed_evening,"[licorice, tea, committed, evening, advised, a...",[tried itloved itnow my favorite herbal tea ba...


In [13]:
# See topics clearly, skipping the outlier topic -1
topic_model.get_topic_info().iloc[1:15]

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
1,0,1158,0_coffee_cup_roast_kcups,"[coffee, cup, roast, kcups, strong, the, this,...",[great coffee great price this coffee is great...
2,1,410,1_cat_cats_food_she,"[cat, cats, food, she, her, he, eat, my, to, c...",[filler food is empty leaves your cat always n...
3,2,353,2_tea_teas_this_it,"[tea, teas, this, it, is, of, bags, mint, flav...",[i always come back to this tea i have become ...
4,3,320,3_dog_food_dogs_her,"[dog, food, dogs, her, she, he, his, my, to, and]",[wellness core puppy dry food i spent hours re...
5,4,285,4_chips_potato_chip_these,"[chips, potato, chip, these, are, kettle, bag,...",[great chips i love these chips i am a potato ...
6,5,220,5_chocolate_cocoa_hot_dark,"[chocolate, cocoa, hot, dark, chocolates, milk...",[great dark i am a dark chocolate person and i...
7,6,204,6_dogs_dog_bones_teeth,"[dogs, dog, bones, teeth, chew, chews, these, ...",[great treat for your best friend i am sure my...
8,7,160,7_bars_bar_protein_are,"[bars, bar, protein, are, these, snack, they, ...",[kind bar these bars are delicious ive always ...
9,8,157,8_bread_flour_mix_gluten,"[bread, flour, mix, gluten, pancakes, baking, ...",[makes great bread makes great whole wheat bre...
10,9,145,9_treats_dogs_dog_treat,"[treats, dogs, dog, treat, them, these, loves,...",[great product for training your dog great pri...


In [14]:
df['topic'] = topics
df['topic_name'] = df['topic'].map(
    topic_model.get_topic_info().set_index('Topic')['Name']
)

df[['cleaned_review', 'topic', 'topic_name']].head(10)

Unnamed: 0,cleaned_review,topic,topic_name
0,i like these these are actually very tasty pur...,-1,-1_the_and_to_it
1,good but subjectively not 5 star i realize tha...,-1,-1_the_and_to_it
2,lipton cup a soup spring vegetable4 oz this is...,26,26_soup_soups_tomato_chicken
3,suited to its purpose if not quite its goal if...,62,62_margarita_crystal_mojito_mix
4,tastes artificial i was willing to give this a...,-1,-1_the_and_to_it
5,dog loves chicken fillets cant keep enough of ...,9,9_treats_dogs_dog_treat
6,chipotle tabasco sauce how do i love thee i lo...,11,11_sauce_hot_heat_chili
7,dbp323 recommend it to all lovely natural swee...,46,46_honey_raw_unfiltered_manuka
8,heavenly hot dogs a totally wonderful product ...,63,63_mustard_horseradish_jar_dijon
9,spicy thai chips totally orgasmic these chips ...,4,4_chips_potato_chip_these


In [15]:
from transformers import BartForConditionalGeneration, BartTokenizer

model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
bart_model = BartForConditionalGeneration.from_pretrained(model_name)

def summarize_topic(topic_id, n_reviews=20):
    reviews = df[df['topic'] == topic_id]['cleaned_review'].head(n_reviews).tolist()
    combined = " ".join(reviews)[:3000]
    inputs = tokenizer(combined, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = bart_model.generate(inputs["input_ids"], max_length=100, min_length=30)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Test on coffee topic
print("Topic: Coffee & K-cups")
print(summarize_topic(0))

Loading weights:   0%|          | 0/511 [00:00<?, ?it/s]

Topic: Coffee & K-cups


KeyboardInterrupt: 

In [16]:
!pip install datasets sentence-transformers bertopic -q

import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import re

# Load data
import kagglehub
path = kagglehub.dataset_download("snap/amazon-fine-food-reviews")
df = pd.read_csv(f'{path}/Reviews.csv')

# Prep
df = df[['Score', 'Summary', 'Text']].dropna()
df = df.sample(10000, random_state=42).reset_index(drop=True)
df['full_review'] = df['Summary'] + ". " + df['Text']

# Clean
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['cleaned_review'] = df['full_review'].apply(clean_text)

# Embed
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['cleaned_review'].tolist(), show_progress_bar=True)

# Topic model
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(df['cleaned_review'].tolist(), embeddings)
df['topic'] = topics
df['topic_name'] = df['topic'].map(topic_model.get_topic_info().set_index('Topic')['Name'])

print("Done! Ready for summarization.")

Using Colab cache for faster access to the 'amazon-fine-food-reviews' dataset.


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

2026-02-24 23:13:49,115 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-24 23:14:08,475 - BERTopic - Dimensionality - Completed ✓
2026-02-24 23:14:08,477 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-24 23:14:19,298 - BERTopic - Cluster - Completed ✓
2026-02-24 23:14:19,308 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-02-24 23:14:20,445 - BERTopic - Representation - Completed ✓


Done! Ready for summarization.


In [17]:
from transformers import BartForConditionalGeneration, BartTokenizer

model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
bart_model = BartForConditionalGeneration.from_pretrained(model_name)

def summarize_topic(topic_id, n_reviews=20):
    reviews = df[df['topic'] == topic_id]['cleaned_review'].head(n_reviews).tolist()
    combined = " ".join(reviews)[:3000]
    inputs = tokenizer(combined, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = bart_model.generate(inputs["input_ids"], max_length=100, min_length=30)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Test on coffee topic
print("Topic: Coffee & K-cups")
print(summarize_topic(0))

Loading weights:   0%|          | 0/511 [00:00<?, ?it/s]

Topic: Coffee & K-cups
pretty sweet my picky cat loves the grass i put them next to a south facing window in the apartment and they have been growing well even during the winter the grains that do grow are pretty hearty just dont expect more than 60 of the seed to grow. If you cat is very inquisitive i suggest placing them in a location that is hard to reach i found out the hard way thank god i found this food.


In [18]:
# Summarize top 10 topics (skip -1 outliers)
top_topics = topic_model.get_topic_info().iloc[1:11]['Topic'].tolist()

results = []
for topic_id in top_topics:
    topic_name = topic_model.get_topic_info().set_index('Topic').loc[topic_id, 'Name']
    print(f"Summarizing: {topic_name}...")
    summary = summarize_topic(topic_id)
    results.append({'topic_id': topic_id, 'topic_name': topic_name, 'summary': summary})

# Save to dataframe and CSV
summary_df = pd.DataFrame(results)
summary_df.to_csv('topic_summaries.csv', index=False)
print("\nDone! All summaries saved.")
print(summary_df[['topic_name', 'summary']])

Summarizing: 0_cat_cats_food_she...
Summarizing: 1_dog_food_dogs_her...
Summarizing: 2_chips_potato_chip_these...
Summarizing: 3_tea_teas_this_it...
Summarizing: 4_chocolate_cocoa_hot_dark...
Summarizing: 5_bars_bar_protein_are...
Summarizing: 6_bread_flour_mix_gluten...
Summarizing: 7_cereal_cereals_flakes_bran...
Summarizing: 8_cookies_cookie_these_chocolate...
Summarizing: 9_sauce_hot_heat_spicy...

Done! All summaries saved.
                         topic_name  \
0               0_cat_cats_food_she   
1               1_dog_food_dogs_her   
2         2_chips_potato_chip_these   
3                3_tea_teas_this_it   
4        4_chocolate_cocoa_hot_dark   
5            5_bars_bar_protein_are   
6          6_bread_flour_mix_gluten   
7      7_cereal_cereals_flakes_bran   
8  8_cookies_cookie_these_chocolate   
9            9_sauce_hot_heat_spicy   

                                             summary  
0  pretty sweet my picky cat loves the grass i pu...  
1  too rich for my dog gave

In [19]:
# Save everything
df.to_csv('reviews_with_topics.csv', index=False)
summary_df.to_csv('topic_summaries.csv', index=False)

print("Saved!")
print("Download these two files from the Colab files panel on the left.")

Saved!
Download these two files from the Colab files panel on the left.


In [20]:
# Save the topic model
topic_model.save("bertopic_model")

print("Model saved!")



Model saved!
