Load All the library

In [1]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


 Download NLTK datasets

In [3]:
# Download NLTK datasets
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

Load the clean data after removing noise for model 1 and model2

In [4]:
df=pd.read_csv("/content/final_dataset.csv")

In [5]:
print(df.head())

                     id                                               name  \
0  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
1  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
2  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
3  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
4  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   

    brand                                         categories  \
0  Amazon  Electronics,iPad & Tablets,All Tablets,Fire Ta...   
1  Amazon  Electronics,iPad & Tablets,All Tablets,Fire Ta...   
2  Amazon  Electronics,iPad & Tablets,All Tablets,Fire Ta...   
3  Amazon  Electronics,iPad & Tablets,All Tablets,Fire Ta...   
4  Amazon  Electronics,iPad & Tablets,All Tablets,Fire Ta...   

  primaryCategories reviews.doRecommend  reviews.numHelpful  star_rating  \
0               NaN                True                 0.0          5

In [6]:
df.columns

Index(['id', 'name', 'brand', 'categories', 'primaryCategories',
       'reviews.doRecommend', 'reviews.numHelpful', 'star_rating',
       'review_text', 'review_title', 'sentiment_m1', 'meta_category',
       'review_id'],
      dtype='object')

This below function give the top 3 products beased on category and number of reviews

In [7]:
def get_top3_products_by_meta(meta_name):
    # Ensure rating is numeric
    df["star_rating"] = pd.to_numeric(df["star_rating"], errors="coerce")

    # Filter rows by meta_category_name
    df_filtered = df[df["meta_category"] == meta_name]

    if df_filtered.empty:
        return f"No products found for meta category: {meta_name}"

    # Group by product and compute required fields
    product_stats = (
        df_filtered.groupby(["id", "name"], as_index=False)
        .agg(
            avg_rating=("star_rating", "count"),
            reviews_text=("review_text", list),  # collect all review texts
            meta_category=("meta_category", "first")
        )
    )

    # Sort by rating and select top 3
    top3 = product_stats.sort_values("avg_rating", ascending=False).head(3)

    return top3


In [8]:
print(get_top3_products_by_meta("Tablets_Computers"))

                      id                                               name  \
11  AVphgVaX1cnluZ0-DR74  Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes...   
26  AVqkIhwDv8e3D1O-lebb  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
18  AVqVGWLKnnc1JgDc3jF1  Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16...   

    avg_rating                                       reviews_text  \
11        9570  [Good basic tablet for checking email , web br...   
26        2577  [This product so far has not disappointed. My ...   
18        1482  [I purchased two of these tablet a pink and bl...   

        meta_category  
11  Tablets_Computers  
26  Tablets_Computers  
18  Tablets_Computers  


In [9]:
!pip install emoji --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m604.2/608.4 kB[0m [31m25.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h

Preprocess the reviews

In [10]:

import re
import emoji

def preprocess_review(text):
    # 1. Remove first-person phrases
    first_person_patterns = [
        r"\bI\b", r"\bwe\b", r"\bmy\b", r"\bour\b", r"\bme\b",
        r"\bmine\b", r"\bus\b"
    ]
    for pattern in first_person_patterns:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)

    # 2. Remove URLs and HTML tags
    text = re.sub(r"http\S+|www\S+", "", text)  # URLs
    text = re.sub(r"<.*?>", "", text)           # HTML tags

    # 3. Remove emojis
    text = emoji.replace_emoji(text, replace="")  # remove all emojis

    # 4. Remove ad phrases or repetitive boilerplate
    ad_phrases = [
        "Black Friday", "best buy", "craigslist", "Amazon", "sale", "free shipping",
        "click here", "buy now"
    ]
    for phrase in ad_phrases:
        text = re.sub(re.escape(phrase), "", text, flags=re.IGNORECASE)

    # 5. Remove extra whitespace, newlines, special characters
    text = re.sub(r"\s+", " ", text)        # multiple spaces → single space
    text = re.sub(r"[^A-Za-z0-9.,!?;:'\"()\s]", "", text)  # keep only text/punctuation
    text = text.strip()

    return text


In [11]:
# df = df[df["reviews.text"].str.split().str.len() > 10]
# df = df[df["reviews.text"].str.split().str.len() < 1024]
# df = df.drop_duplicates(subset="reviews.text")


In [None]:
# from sentence_transformers import SentenceTransformer, util
# from transformers import pipeline
# import numpy as np
# import nltk

# nltk.download('punkt')
# from nltk.tokenize import sent_tokenize

# # -------------------------------
# # Step 1: Extractive Summarization
# # -------------------------------
# prompt_text = (
#     "Add product name and Summarize the following review in 3 sentences. "
#     "Write in an objective tone and do not use first-person words like 'I' or 'we'. "
#     "Focus on the main points and overall sentiment."
# )

# text = " ".join(df['reviews.text'].dropna().astype(str))
# print(text)

# # Split into sentences
# sentences = sent_tokenize(text)



# # Get sentence embeddings
# embedder = SentenceTransformer('distilbert-base-nli-mean-tokens')
# sentence_embeddings = embedder.encode(sentences, convert_to_tensor=True)

# # Compute similarity with the "document centroid"
# # document_embedding = np.mean(sentence_embeddings.cpu().numpy(), axis=0)
# document_embedding = sentence_embeddings.mean(dim=0, keepdim=True)

# cosine_scores = util.cos_sim(sentence_embeddings, document_embedding)

# # Pick top-N most representative sentences
# top_n = 10
# top_sentence_indices = np.argsort(-cosine_scores.squeeze().cpu().numpy())[:top_n]
# selected_sentences = [sentences[i] for i in sorted(top_sentence_indices)]

# extractive_summary = " ".join(selected_sentences)
# print("Extractive Summary:\n", extractive_summary)

# # -------------------------------
# # Step 2: Abstractive Summarization
# # -------------------------------
# base_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
# model = PeftModel.from_pretrained(base_model, "lora-bart-cnn-adapter")
# summarizer = pipeline("summarization", model=model)

# final_summary = summarizer(extractive_summary, max_length=120, min_length=40, do_sample=False)
# print("\nFinal Abstractive Summary:\n", final_summary[0]['summary_text'])


loading "facebook/bart-large-cnn" model and tokenizer

In [None]:
# from transformers import BartTokenizer, BartForConditionalGeneration

# # Load tokenizer & model
# tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
# model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print("Model and tokenizer loaded successfully!")


created data set for optimization

In [None]:
# import pandas as pd
# from datasets import Dataset

# data = {
#     "reviews.text": [
#         "The phone is great but the battery drains quickly.",
#         "Amazing camera quality and display. Totally worth it!",
#         "Software crashes often. Not recommended."
#     ],
#     "summary": [
#         "Good phone, poor battery.",
#         "Great camera and display.",
#         "Buggy software, not recommended."
#     ]
# }

# df_set = pd.DataFrame(data)
# dataset = Dataset.from_pandas(df_set)


Tokenize the data

In [None]:
# def preprocess_function(examples):
#     model_inputs = tokenizer(
#         examples["reviews.text"],
#         max_length=512,
#         truncation=True
#     )
#     labels = tokenizer(
#         examples["summary"],
#         max_length=64,
#         truncation=True
#     )
#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs

# tokenized_dataset = dataset.map(preprocess_function, batched=True)


Traing arguments for optimzation

In [None]:
# from transformers import Trainer, TrainingArguments

# training_args = TrainingArguments(
#     output_dir="./review_summarizer",
#     # evaluation_strategy="no",      # change to "epoch" for larger data
#     learning_rate=2e-5,
#     per_device_train_batch_size=2,
#     num_train_epochs=1,
#     weight_decay=0.01,
#     save_total_limit=1,
#     logging_dir="./logs",
# )


train the pretrained model

In [None]:
# from transformers import DataCollatorForSeq2Seq
# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset,
#     data_collator=data_collator,  # ✅ this fixes the ValueError
# )

# trainer.train()


In [21]:
# import torch
# torch.cuda.is_available()
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.to(device)

Save the model

In [25]:
# !zip -r fine_tuned_bart_review_summarizer.zip fine_tuned_bart_review_summarizer


In [26]:
!pip install transformers peft accelerate datasets




train the same model using peft because fine tune model is too big

In [27]:
# from transformers import BartTokenizer, BartForConditionalGeneration
# from peft import LoraConfig, get_peft_model

# model_name = "facebook/bart-large-cnn"

# tokenizer = BartTokenizer.from_pretrained(model_name)
# model = BartForConditionalGeneration.from_pretrained(model_name)

# # Configure LoRA
# lora_config = LoraConfig(
#     r=8,                # rank of LoRA matrices
#     lora_alpha=16,      # scaling factor
#     target_modules=["q_proj", "v_proj"],  # typical for transformer layers
#     lora_dropout=0.1,
#     bias="none",
#     task_type="SEQ_2_SEQ_LM"  # because BART is encoder-decoder
# )

# # Wrap the model
# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()

In [28]:
# from datasets import load_dataset

# dataset = load_dataset("cnn_dailymail", "3.0.0")
# train_text = dataset["train"][0]["article"]
# train_summary = dataset["train"][0]["highlights"]


In [29]:
# print(train_summary)

In [30]:
# model.save_pretrained("lora-bart-cnn-adapter")


In the summerize_review function using 2 models

        one is for extract summary using "distilbert-base-nli-mean-tokens" model
        
        onther one is "facebook/bart-large-cnn" LoRA fine tune model for abstract summary

In [44]:
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, BartTokenizer, BartForConditionalGeneration
import numpy as np
import nltk
from peft import PeftModel

nltk.download('punkt')
from nltk.tokenize import sent_tokenize
def summerize_review(reviews):
      # -------------------------------
      # Step 1: Extractive Summarization
      # -------------------------------
      prompt_text = (
          "Add product name and Summarize the following review in 3 sentences. "
          "Write in an objective tone and do not use first-person words like 'I' or 'we'. "
          "Focus on the main points and overall sentiment."
      )

      text = "".join(reviews)


      # Split into sentences
      sentences = sent_tokenize(preprocess_review(text))



      # Get sentence embeddings
      embedder = SentenceTransformer('distilbert-base-nli-mean-tokens')
      sentence_embeddings = embedder.encode(sentences, convert_to_tensor=True)

      # Compute similarity with the "document centroid"
      # document_embedding = np.mean(sentence_embeddings.cpu().numpy(), axis=0)
      document_embedding = sentence_embeddings.mean(dim=0, keepdim=True)

      cosine_scores = util.cos_sim(sentence_embeddings, document_embedding)

      # Pick top-N most representative sentences
      top_n = 20
      top_sentence_indices = np.argsort(-cosine_scores.squeeze().cpu().numpy())[:top_n]
      selected_sentences = [sentences[i] for i in sorted(top_sentence_indices)]

      extractive_summary = " ".join(selected_sentences)
      print("Extractive Summary:\n", extractive_summary)

      # -------------------------------
      # Step 2: Abstractive Summarization
      # -------------------------------
      tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
      base_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
      model = PeftModel.from_pretrained(base_model, "lora-bart-cnn-adapter")
      summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

      final_summary = summarizer(extractive_summary, max_length=120, min_length=40, do_sample=False)
      return final_summary[0]['summary_text']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [37]:

print(get_top3_products_by_meta('Smart_Home_Devices'))

                      id                                               name  \
12  AVpfl8cLLJeJML43AE3S                 Echo (White),,,\r\nEcho (White),,,   
6   AVpfl8cLLJeJML43AE3S             Amazon Fire Tv,,,\r\nAmazon Fire Tv,,,   
18  AWFUWc8THh53nbDRF6YO  Amazon Echo Show Alexa-enabled Bluetooth Speak...   

    avg_rating                                       reviews_text  \
12        2613  [I really enjoy the Echo. I got an Echo Dot an...   
6         2278  [Echo is learning everyday and its uses daily,...   
18         596  [Great Gift for anyone. Very easy to setup. Co...   

         meta_category  
12  Smart_Home_Devices  
6   Smart_Home_Devices  
18  Smart_Home_Devices  


In [41]:
!unzip /content/lora-bart-cnn-adapter.zip


Archive:  /content/lora-bart-cnn-adapter.zip
   creating: lora-bart-cnn-adapter/
  inflating: lora-bart-cnn-adapter/README.md  
  inflating: lora-bart-cnn-adapter/adapter_model.safetensors  
  inflating: lora-bart-cnn-adapter/adapter_config.json  


Taking top 3 products and summarizing all the reviews of each product

In [45]:
top3_product=get_top3_products_by_meta('Tablets_Computers')


for idx, product in top3_product.iterrows():
    print(f"Product: {product['name']}")

    # reviews_text is already a list
    reviews_list = product['reviews_text']
    summary=summerize_review(reviews_list)

    print("summary of review:", summary)

    print("\n")

Product: Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes Special Offers, Magenta
Extractive Summary:
 It is a great buy.Nice tablet for the price. Great price too.Love this tablet easy to use. It is price well.great for the price, did research and found this  'm very please with the tablet haven't had any problem. highly recommend this tablet, it is very affordable! Quick delivery and service is great as alwaysThis is a great little Tablet for the price. Very easy to use with account.This was a great purchase do not regret buying it son enjoys playing with it alot!For the amount of money it is actually a good tablet.Love it! have used this for a while now and what can say .. just love it and worth buying ..Easy to use and simple tablet. Love it!This is a good tablet for what you pay for it. They love them.Nice tablet for the price. Good buy.Needed something to use when out and about and this was perfectThe tablet is great for the money. It is still a great buy and am satisfied with purc

Device set to use cuda:0


summary of review: The tablet is easy to use and has nice features. So far no issues, they seem to love it. The tablet is great for the money. It is still a great buy and am satisfied with purchase. With him is very easy to travel.


Product: All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi, 16 GB - Includes Special Offers, Magenta
Extractive Summary:
 Switched from another popular tablet brand and am happy with the choice made. wasn't really expecting much, however was pleasantly surprised.It feels solid, responsive, screen is good.For the price its a bargain! It is easy to use and simple to set up!Good tablet for the money. She loves it and the price can't be beat!This is a great tablet. Overall happy with it and the price was good for this type of tablet. could live without that, but overall, am pleased with this new tablet. Just the right size to.Liked this Kindle upgrade. It works great!Great little tablet for someone who is always on the go. am very pleased with purchase the tablet 

Device set to use cuda:0


summary of review: The tablet has a very clear and crisp screen. The display is great and the weight is not that big of a difference. The tablet is simple to set up and it's worth the price. Have bought a couple for gifts as well.


Product: Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16 GB, Green Kid-Proof Case
Extractive Summary:
 kids loved and very happy for this tablet. grand kids really love this tablet and it has a lot of app options.Seemed to be fine as Chtistmas gift for granddaughter. Thanks for making a very durable product with an incredible warranty!He loves being on it and all the freebies that came with it.This is a good entry tablet for toddlers. love it, and he does, too.It is super easy to set daily limits and goals for children on the device. Kids will definitely be entertained by it!Great tablet. Reasonably priced for a great tablet the kids love! like it so and so kids like it very much.The application is amazing. kids like it, and the application is amazing. Also 

Device set to use cuda:0


summary of review: This is a good entry tablet for toddlers. It is super easy to set daily limits and goals for children on the device. The application is amazing. Bought 2 and kids are very happy. Well made for kids in mind.




In [46]:

!pip install evaluate --quiet
!pip install rouge_score --quiet
!pip install bert_score --quiet
from evaluate import load

predictions = ["Screen is nice enough and price was right. Works really well with account to access multiple features with extreme ease. 've enjoyed the ease of use and the price paid was great. The price was great and the product is nice.",
               "This is a great, easy to use tablet for the price. Works really well with account to access multiple features with extreme ease. The price was good and it has plenty of acc. great tablet, simple to use, just love it."]
references = ["The tablet works well, is easy to use, and offers good value for its price.",
              "Affordable and easy-to-use tablet with great screen quality and convenient features. Works fast, offers ample storage, and is highly recommended for the price."]

# ROUGE
rouge = load("rouge")
rouge_results = rouge.compute(predictions=predictions, references=references)
print("ROUGE:", rouge_results)

# BERTScore
bertscore = load("bertscore")
bert_results = bertscore.compute(predictions=predictions, references=references, lang="en")
print("BERTScore:", bert_results)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h

Downloading builder script: 0.00B [00:00, ?B/s]

ROUGE: {'rouge1': np.float64(0.3398268398268398), 'rouge2': np.float64(0.09664351851851852), 'rougeL': np.float64(0.22835497835497834), 'rougeLsum': np.float64(0.22835497835497834)}


Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore: {'precision': [0.8687559366226196, 0.8922217488288879], 'recall': [0.8964695334434509, 0.9034714698791504], 'f1': [0.8823952078819275, 0.8978114128112793], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.57.1)'}
