In [3]:
# Preparing enviroment:
#!pip install -U transformers datasets accelerate peft bitsandbytes trl

Collecting transformers
  Downloading transformers-5.2.0-py3-none-any.whl.metadata (32 kB)
Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl
  Downloading trl-0.28.0-py3-none-any.whl.metadata (11 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.1 kB)
Downloading transformers-5.2.0-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m71.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.5.0-py3-none-any.whl (515 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl (60.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 MB

In [1]:
# Download resources:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig



In [3]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

#QLoRA setup (4bits)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,  # T4 supports fp16 well
  )

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Model in 4-bit on GPU
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

print("Loaded:", model_name)

Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 14.56 GiB of which 7.81 MiB is free. Including non-PyTorch memory, this process has 14.55 GiB memory in use. Of the allocated memory 14.35 GiB is allocated by PyTorch, and 94.16 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [3]:
# Quick smoke test:

prompt = "Write a 2-sentence summary of why product reviews are useful."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

out = model.generate(
    **inputs,
    max_new_tokens=60,
    do_sample=True,
    temperature=0.6,
    top_p=0.9
)

print(tokenizer.decode(out[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Write a 2-sentence summary of why product reviews are useful. Product reviews provide valuable insights and feedback for both consumers and businesses, allowing them to make informed decisions based on real experiences and opinions.

Here are some additional facts about product reviews:

1. According to a BrightLocal study, 92% of consumers read online reviews for local businesses


In [4]:
## Adding LoRA configuration
from peft import LoraConfig, get_peft_model

# LoRA config:
lora_config = LoraConfig(
    r=16,                # rank (higher = more capacity, more VRAM)
    lora_alpha=32,       # scaling
    lora_dropout=0.05,   # regularization
    bias="none",
    task_type="CAUSAL_LM",
    # Target the key linear layers in Mistral blocks
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
)

model = get_peft_model(model, lora_config)

# Show how many parameters will actually be trained (should be tiny %)
model.print_trainable_parameters()


trainable params: 41,943,040 || all params: 7,283,675,136 || trainable%: 0.5758


In [7]:
# Merging the Sentiment file with the clustered file:
import pandas as pd

data01 = pd.read_csv('/content/comb_data_with_bert_sentiment.csv')
data02 = pd.read_csv('/content/clustered_data.csv')

data01.info()
data02.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67992 entries, 0 to 67991
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              67992 non-null  object 
 1   name            61232 non-null  object 
 2   brand           67992 non-null  object 
 3   reviews.date    67953 non-null  object 
 4   reviews.rating  67959 non-null  float64
 5   reviews.text    67991 non-null  object 
 6   reviews.title   67973 non-null  object 
 7   bert_sentiment  67992 non-null  object 
 8   bert_score      67992 non-null  float64
dtypes: float64(2), object(7)
memory usage: 4.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67992 entries, 0 to 67991
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              67992 non-null  object 
 1   name            61232 non-null  object 
 2   brand           67992 non-null  object 
 3   review

In [8]:
data1 = data01[['id' , 'name', 'reviews.text', 'reviews.title', 'bert_sentiment']]
data2 = data02[['id' , 'category']]

data2 = data2.drop_duplicates(subset='id', keep='first')

# merge data
final_data = data1.merge(data2, on='id', how='left')

# Save to new CSV
final_data.to_csv('final_data.csv', index=False)

final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67992 entries, 0 to 67991
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              67992 non-null  object
 1   name            61232 non-null  object
 2   reviews.text    67991 non-null  object
 3   reviews.title   67973 non-null  object
 4   bert_sentiment  67992 non-null  object
 5   category        67992 non-null  object
dtypes: object(6)
memory usage: 3.1+ MB


In [6]:
# Load data (choose the proper load option)
#data = final_data
import pandas as pd
data = pd.read_csv('/content/final_data.csv')

# Categories from clustering
data["sentiment"] = data["bert_sentiment"]
data["categories"] = data["category"]

# Keeping the usable columns:
data = data.dropna(subset=["id", "name", "categories", "sentiment", "reviews.text"])
data = data.copy()

In [7]:
# Converting sentiment into a numeric score:
data = data.copy()

data["sentiment_clean"] = (
    data["sentiment"]
      .astype(str)
      .str.strip()
      .str.upper()
)

sentiment_map = {
    "NEGATIVE": 0,
    "NEUTRAL": 1,
    "POSITIVE": 2
}

data["sentiment_score"] = data["sentiment_clean"].map(sentiment_map)

In [8]:
# Sanitiy check:
data["sentiment_score"].value_counts()

Unnamed: 0_level_0,count
sentiment_score,Unnamed: 1_level_1
2,56392
1,2496
0,2343


In [9]:
import numpy as np

# Aggregate product stats within each category
prod_stats = (data.groupby(["categories", "name"], as_index=False)
                .agg(
                    avg_rating=("sentiment_score", "mean"),
                    n_reviews=("sentiment_score", "size")
                ))

# Weighted ranking formula
prod_stats["score"] = (
    prod_stats["avg_rating"] *
    np.log1p(prod_stats["n_reviews"]) # Balances quality and popularity
)

TOP_K = 3

top_products = (
    prod_stats.sort_values(["categories", "score"], ascending=[True, False])
              .groupby("categories")
              .head(TOP_K)
              .reset_index(drop=True)
)

top_products.head()

Unnamed: 0,categories,name,avg_rating,n_reviews,score
0,Audio Devices,"Echo (White),,,\r\nEcho (White),,,",1.939144,3270,15.693201
1,Audio Devices,"Amazon Fire Tv,,,\r\nAmazon Fire Tv,,,",1.936684,2527,15.174274
2,Audio Devices,Amazon - Amazon Tap Portable Bluetooth and Wi-...,1.95283,318,11.258439
3,Ebook Readers & Tablets,"Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes...",1.889568,10966,17.57798
4,Ebook Readers & Tablets,Amazon Kindle Paperwhite - eBook reader - 4 GB...,1.964106,3176,15.837945


In [10]:
# Visual check
for cat in sorted(top_products["categories"].unique()):
    print(f"\nCategory: {cat}")
    display(
        top_products[top_products["categories"] == cat][
            ["name", "avg_rating", "n_reviews", "score"]
        ]
    )


Category: Audio Devices


Unnamed: 0,name,avg_rating,n_reviews,score
0,"Echo (White),,,\r\nEcho (White),,,",1.939144,3270,15.693201
1,"Amazon Fire Tv,,,\r\nAmazon Fire Tv,,,",1.936684,2527,15.174274
2,Amazon - Amazon Tap Portable Bluetooth and Wi-...,1.95283,318,11.258439



Category: Ebook Readers & Tablets


Unnamed: 0,name,avg_rating,n_reviews,score
3,"Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes...",1.889568,10966,17.57798
4,Amazon Kindle Paperwhite - eBook reader - 4 GB...,1.964106,3176,15.837945
5,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",1.928927,2814,15.320921



Category: Power & Batteries


Unnamed: 0,name,avg_rating,n_reviews,score
6,AmazonBasics AAA Performance Alkaline Batterie...,1.766511,8343,15.950353
7,AmazonBasics AA Performance Alkaline Batteries...,1.773069,3728,14.581531


In [11]:
#Creating evidence packs (1 per category: top products and representative reviews)
import random

random.seed(42)

def sample_reviews_for_product(df, category, product_name, n_pos=2, n_neu=1, n_neg=1):
    sub = df[(df["categories"] == category) & (df["name"] == product_name)].copy()
    sub = sub.dropna(subset=["reviews.text", "sentiment_score"])

    pos = sub[sub["sentiment_score"] == 2]["reviews.text"].drop_duplicates()
    neu = sub[sub["sentiment_score"] == 1]["reviews.text"].drop_duplicates()
    neg = sub[sub["sentiment_score"] == 0]["reviews.text"].drop_duplicates()

     # sample safely (works even if there are fewer than requested)
    pos_s = pos.sample(min(n_pos, len(pos)), random_state=42).tolist() if len(pos) else []
    neu_s = neu.sample(min(n_neu, len(neu)), random_state=42).tolist() if len(neu) else []
    neg_s = neg.sample(min(n_neg, len(neg)), random_state=42).tolist() if len(neg) else []

    # keep short-ish snippets (helps model + token length)
    def clip(t, max_chars=160):
        t = " ".join(str(t).split())
        return t[:max_chars] + ("..." if len(t) > max_chars else "")

    samples = [("POS", clip(t)) for t in pos_s] + [("NEU", clip(t)) for t in neu_s] + [("NEG", clip(t)) for t in neg_s]
    return samples


In [12]:
# Building evidence packs(for each category):
def build_evidence_pack(category, top_products_cat, df):
    lines = []
    lines.append(f"Category: {category}")
    lines.append("You must only use the evidence below. Do not invent product features.\n")

    for _, row in top_products_cat.iterrows():
        name = row["name"]
        lines.append(f"Product: {name}")
        lines.append(f"- Avg sentiment (0-2): {row['avg_rating']:.3f}")
        lines.append(f"- Number of reviews: {int(row['n_reviews'])}")

        samples = sample_reviews_for_product(df, category, name)
        lines.append("Review snippets (label: text):")
        for lab, text in samples:
            lines.append(f"  - {lab}: {text}")
        lines.append("")  # blank line between products

    return "\n".join(lines)

# Build packs for all categories
evidence_packs = {}
for cat in top_products["categories"].unique():
    top_cat = top_products[top_products["categories"] == cat]
    evidence_packs[cat] = build_evidence_pack(cat, top_cat, data)

# Preview one
print(list(evidence_packs.values())[0][:1200])


Category: Audio Devices
You must only use the evidence below. Do not invent product features.

Product: Echo (White),,,
Echo (White),,,
- Avg sentiment (0-2): 1.939
- Number of reviews: 3270
Review snippets (label: text):
  - POS: Have the black and the white, identically wonderful.
  - POS: Still learning. I just bought a firestick w/Alexa capabilities. We will see....
  - NEU: I just brought 4 and NO I has had success pairing the device. Amazon doesn't have any tech assistance and the directions are for a remote but w-o a smart TV, I ...
  - NEG: This device is not as great as advertised. It does not do as many things as expected and you have to go to a lot of trouble to do otherwise

Product: Amazon Fire Tv,,,
Amazon Fire Tv,,,
- Avg sentiment (0-2): 1.937
- Number of reviews: 2527
Review snippets (label: text):
  - POS: I like Alexa a lot the speaker is amazing I use it for my home stereo. We also use it a lot in the kitchen for recipes!!!!
  - POS: Very user friendly product. 10

In [13]:
# Generating draft article sections:

import torch

def generate_article_section(evidence_text, max_new_tokens=500):
    prompt = f"""
You are a product review analyst.

Using ONLY the evidence below, write a recommendation article section.

Requirements:
- Recommend the best products
- For each product include:
  - Pros
  - Cons
  - Best for
  - Final verdict
- End with a short comparison.

Evidence:
{evidence_text}
""".strip()

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.4,
            top_p=0.9,
            repetition_penalty=1.1
        )

    text = tokenizer.decode(output[0], skip_special_tokens=True)
    return text


In [16]:
# Generating target for all categories_01:
article_targets = {}

for cat, evidence_text in evidence_packs.items():
    print(f"Generating article for category: {cat}")
    article_targets[cat] = generate_article_section(evidence_text)

print("Done.")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generating article for category: Audio Devices


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generating article for category: Ebook Readers & Tablets


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generating article for category: Power & Batteries
Done.


In [17]:
# Trimming Long generated articles:
def trim_text_to_tokens(text, max_tokens=700):
    ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    ids = ids[:max_tokens]
    return tokenizer.decode(ids, skip_special_tokens=True)

for cat in article_targets:
    article_targets[cat] = trim_text_to_tokens(
        article_targets[cat],
        max_tokens=700
    )

In [18]:
# Checking one result:
print(article_targets[list(article_targets.keys())[0]][:1500])

You are a product review analyst.

Using ONLY the evidence below, write a recommendation article section.

Requirements:
- Recommend the best products
- For each product include:
  - Pros
  - Cons
  - Best for
  - Final verdict
- End with a short comparison.

Evidence:
Category: Audio Devices
You must only use the evidence below. Do not invent product features.

Product: Echo (White),,,
Echo (White),,,
- Avg sentiment (0-2): 1.939
- Number of reviews: 3270
Review snippets (label: text):
  - POS: Have the black and the white, identically wonderful.
  - POS: Still learning. I just bought a firestick w/Alexa capabilities. We will see....
  - NEU: I just brought 4 and NO I has had success pairing the device. Amazon doesn't have any tech assistance and the directions are for a remote but w-o a smart TV, I ...
  - NEG: This device is not as great as advertised. It does not do as many things as expected and you have to go to a lot of trouble to do otherwise

Product: Amazon Fire Tv,,,
Amazo

In [19]:
# Buiding the qLoRA training dataset:

rows = []
for cat in evidence_packs.keys():
    rows.append({
        "category": cat,
        "prompt": evidence_packs[cat],
        "response": article_targets[cat],
    })

train_df = pd.DataFrame(rows)
train_df.head()

Unnamed: 0,category,prompt,response
0,Audio Devices,Category: Audio Devices\nYou must only use the...,You are a product review analyst.\n\nUsing ONL...
1,Ebook Readers & Tablets,Category: Ebook Readers & Tablets\nYou must on...,You are a product review analyst.\n\nUsing ONL...
2,Power & Batteries,Category: Power & Batteries\nYou must only use...,You are a product review analyst.\n\nUsing ONL...


In [20]:
# Convert to Hugging Face datest

from datasets import Dataset

ds = Dataset.from_pandas(train_df)

def format_for_mistral(ex):
    # This builds the single text string the model will learn from
    return {
        "text": f"<s>[INST]\n{ex['prompt']}\n[/INST]\n{ex['response']}</s>"
    }

ds = ds.map(format_for_mistral, remove_columns=ds.column_names)
ds[0]["text"][:1200]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

"<s>[INST]\nCategory: Audio Devices\nYou must only use the evidence below. Do not invent product features.\n\nProduct: Echo (White),,,\r\nEcho (White),,,\n- Avg sentiment (0-2): 1.939\n- Number of reviews: 3270\nReview snippets (label: text):\n  - POS: Have the black and the white, identically wonderful.\n  - POS: Still learning. I just bought a firestick w/Alexa capabilities. We will see....\n  - NEU: I just brought 4 and NO I has had success pairing the device. Amazon doesn't have any tech assistance and the directions are for a remote but w-o a smart TV, I ...\n  - NEG: This device is not as great as advertised. It does not do as many things as expected and you have to go to a lot of trouble to do otherwise\n\nProduct: Amazon Fire Tv,,,\r\nAmazon Fire Tv,,,\n- Avg sentiment (0-2): 1.937\n- Number of reviews: 2527\nReview snippets (label: text):\n  - POS: I like Alexa a lot the speaker is amazing I use it for my home stereo. We also use it a lot in the kitchen for recipes!!!!\n  - PO

In [21]:
# Validation split:

ds_split = ds.train_test_split(test_size=0.2, seed=42)
train_ds = ds_split["train"]
eval_ds  = ds_split["test"]

print("Train examples:", len(train_ds))
print("Eval examples:", len(eval_ds))

Train examples: 2
Eval examples: 1


In [22]:
# Tokenize (with a safe max length)
MAX_LEN = 2048

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_LEN,
        padding=False,
    )

train_tok = train_ds.map(tokenize, batched=True, remove_columns=["text"])
eval_tok  = eval_ds.map(tokenize, batched=True, remove_columns=["text"])

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [23]:
# quick check (making sure it´s not truncating too much):

lengths = [len(x) for x in train_tok["input_ids"]]
print("Max tokens:", max(lengths))
print("Avg tokens:", sum(lengths)/len(lengths))

Max tokens: 1372
Avg tokens: 1339.0


In [24]:
# Double checking:
MAX_LEN = 2048

lengths = [len(x) for x in train_tok["input_ids"]]
print("Max tokens:", max(lengths))
print("Avg tokens:", sum(lengths)/len(lengths))
print("Pct hitting MAX_LEN:", sum(l == MAX_LEN for l in lengths)/len(lengths))

Max tokens: 1372
Avg tokens: 1339.0
Pct hitting MAX_LEN: 0.0


In [25]:
# QLoRA training:
MAX_LEN = 2048

def tokenize_for_causal_lm(batch):
    out = tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_LEN,
        padding=False,
    )
    # For causal LM training, labels are the same as input_ids
    out["labels"] = out["input_ids"].copy()
    return out

train_tok = train_ds.map(tokenize_for_causal_lm, batched=True, remove_columns=["text"])
eval_tok  = eval_ds.map(tokenize_for_causal_lm, batched=True, remove_columns=["text"])


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [26]:
# Trimming target article lenght
lengths = [len(x) for x in train_tok["input_ids"]]
print("Max tokens:", max(lengths))
print("Avg tokens:", sum(lengths)/len(lengths))

Max tokens: 1372
Avg tokens: 1339.0


In [27]:
# Data collator por dynamic paddig:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [28]:
# Training arguments:
from transformers import TrainingArguments, Trainer

model.gradient_checkpointing_enable()
model.config.use_cache = False  # important with gradient checkpointing

training_args = TrainingArguments(
    output_dir="./qlora_article_model",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,   # effective batch size 8
    learning_rate=2e-4,
    num_train_epochs=10,

    logging_steps=1,
    save_strategy="steps",
    save_steps=20,

    fp16=True,
    optim="paged_adamw_8bit",        # works with bitsandbytes

    eval_strategy="steps",
    eval_steps=20,
)

In [29]:
# Train:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    data_collator=data_collator,
)

trainer.train()

trainer.save_model("./qlora_article_adapter")
tokenizer.save_pretrained("./qlora_article_adapter")

Step,Training Loss,Validation Loss


('./qlora_article_adapter/tokenizer_config.json',
 './qlora_article_adapter/chat_template.jinja',
 './qlora_article_adapter/tokenizer.json')

In [30]:
# Generating one category section from an evidence pack:
def generate_category_section(category_name, evidence_text, max_new_tokens=650):
    prompt = f"""
You are writing a consumer recommendation article.

Use ONLY the evidence below. Do not invent product features.

Write the section for category: {category_name}

For each product:
- Pros (bullets)
- Cons (bullets)
- Best for (one line)
- Verdict (short paragraph)

End with a brief comparison.

Evidence:
{evidence_text}
""".strip()

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.4,
            top_p=0.9,
            repetition_penalty=1.1
        )

    text = tokenizer.decode(out[0], skip_special_tokens=True)

    # Optional: remove the prompt echo if the model repeats it
    return text.split("Evidence:", 1)[-1].strip() if "Evidence:" in text else text

In [33]:
# Generating the full article:
sections = []

for cat in sorted(evidence_packs.keys()):
    print("Generating:", cat)
    section = generate_category_section(cat, evidence_packs[cat])
    sections.append(section)

article = "\n\n---\n\n".join(sections)

print(article[:2000])




Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generating: Audio Devices


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generating: Ebook Readers & Tablets


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generating: Power & Batteries
Category: Audio Devices
You must only use the evidence below. Do not invent product features.

Product: Echo (White),,,
Echo (White),,,
- Avg sentiment (0-2): 1.939
- Number of reviews: 3270
Review snippets (label: text):
  - POS: Have the black and the white, identically wonderful.
  - POS: Still learning. I just bought a firestick w/Alexa capabilities. We will see....
  - NEU: I just brought 4 and NO I has had success pairing the device. Amazon doesn't have any tech assistance and the directions are for a remote but w-o a smart TV, I ...
  - NEG: This device is not as great as advertised. It does not do as many things as expected and you have to go to a lot of trouble to do otherwise

Product: Amazon Fire Tv,,,
Amazon Fire Tv,,,
- Avg sentiment (0-2): 1.937
- Number of reviews: 2527
Review snippets (label: text):
  - POS: I like Alexa a lot the speaker is amazing I use it for my home stereo. We also use it a lot in the kitchen for recipes!!!!
  - POS: Ve

In [34]:
# Saving artice:
#As TXT file:
with open("generated_article.txt", "w", encoding="utf-8") as f:
    f.write(article)

print("Article saved!")



Article saved!


In [None]:
# Final comparison table form top_products:
import pandas as pd

def build_comparison_table(top_products_df):
    # Expecting columns: categories, name, avg_rating, n_reviews, score
    rows = []
    for cat in sorted(top_products_df["categories"].unique()):
        sub = top_products_df[top_products_df["categories"] == cat].copy()
        sub = sub.sort_values("score", ascending=False).head(3)

        for rank, (_, r) in enumerate(sub.iterrows(), start=1):
            rows.append({
                "Category": str(cat),
                "Rank": rank,
                "Product": r["name"],
                "Avg rating (0–2)": round(float(r["avg_rating"]), 3),
                "Reviews": int(r["n_reviews"]),
                "Weighted score": round(float(r["score"]), 3),
            })

    table_df = pd.DataFrame(rows)
    return table_df

comparison_df = build_comparison_table(top_products)
comparison_df.head()

Unnamed: 0,Category,Rank,Product,Avg rating (0–2),Reviews,Weighted score
0,Audio Devices,1,"Echo (White),,,\r\nEcho (White),,,",1.939,3270,15.693
1,Audio Devices,2,"Amazon Fire Tv,,,\r\nAmazon Fire Tv,,,",1.937,2527,15.174
2,Audio Devices,3,Amazon - Amazon Tap Portable Bluetooth and Wi-...,1.953,318,11.258
3,Ebook Readers & Tablets,1,"Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes...",1.89,10966,17.578
4,Ebook Readers & Tablets,2,Amazon Kindle Paperwhite - eBook reader - 4 GB...,1.964,3176,15.838


In [None]:
#Saving comparison as csv
comparison_df.to_csv("comparison_table.csv", index=False)

In [35]:
# ============================================================
# GENERADOR DE REVIEWS NUEVAS - Usando el modelo fine-tuneado
# Agregar al final del notebook, después de cargar el modelo
# ============================================================

import torch
import re

def generate_new_review(product_name, category, pos_reviews, neu_reviews, neg_reviews,
                        avg_sentiment, n_reviews, max_new_tokens=400):
    """
    Genera una review NUEVA en primera persona para un producto,
    basándose en las reseñas reales del dataset.
    """
    evidence = "\n".join([
        f"  - POS: {r}" for r in pos_reviews
    ] + [
        f"  - NEU: {r}" for r in neu_reviews
    ] + [
        f"  - NEG: {r}" for r in neg_reviews
    ])

    prompt = f"""You are a verified buyer writing an honest product review.\n\nUsing ONLY the evidence below, write a NEW first-person review.\n\nRules:\n- Write as if YOU personally used the product\n- 3 short paragraphs: overall impression, pros/cons, final verdict\n- Do NOT copy sentences from the evidence verbatim\n- Do NOT invent features not mentioned\n- Be balanced and natural\n\nProduct: {product_name}\nCategory: {category}\nSentiment score (0=negative, 2=positive): {avg_sentiment:.2f}\nTotal reviews in dataset: {n_reviews}\n\nEvidence from real reviews:\n{evidence}\n\nMy review:""".strip()

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.15,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    # New tokens only (no prompt repetition)
    gen_ids = output_ids[0][prompt_len:]
    review = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

    # Cleaning residual noise
    review = re.sub(r"\n{3,}", "\n\n", review)
    return review


In [39]:
# ============================================================
# GENERADOR DE REVIEWS NUEVAS - Usando el modelo fine-tuneado
# Agregar al final del notebook, después de cargar el modelo
# ============================================================

import torch
import re

def generate_new_review(product_name, category, pos_reviews, neu_reviews, neg_reviews,
                        avg_sentiment, n_reviews, max_new_tokens=400):

    evidence = "\n".join(
        [f"  - POS: {r}" for r in pos_reviews] +
        [f"  - NEU: {r}" for r in neu_reviews] +
        [f"  - NEG: {r}" for r in neg_reviews]
    )

    prompt = f"""[INST] You are a verified buyer. Write a short honest product review in plain English.

Product: {product_name}
Category: {category}

Evidence from real reviews:
{evidence}

Write 3 short paragraphs: (1) overall impression, (2) pros and cons, (3) verdict.
Do NOT use LaTeX, math, or code. Do NOT repeat "Question". Write plain English only. [/INST]
My review:"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    # Token IDs to stop generation immediately if hit
    bad_words = tokenizer(
        ["Question:", "\\label", "\\frac", "\\begin", "Q:", "0000"],
        add_special_tokens=False
    ).input_ids

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,              # greedy — more stable than sampling
            repetition_penalty=1.3,       # stronger penalty to avoid loops
            bad_words_ids=bad_words,      # hard-block degenerate tokens
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    gen_ids = output_ids[0][prompt_len:]
    review = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

    # Post-processing: cut at first degeneration sign
    for stop_signal in ["Question:", "\\label", "\\frac", "Q:", "00000"]:
        if stop_signal in review:
            review = review[:review.index(stop_signal)].strip()

    return review if len(review) > 50 else "[Generation failed — model too degraded for this product]"


def generate_reviews_for_top_products(top_products_df, data_df, top_k=3):
    """
    Recorre las categorías, toma los top_k productos y genera una review por cada uno.
    Devuelve un DataFrame con los resultados.
    """
    results = []

    for cat in sorted(top_products_df["categories"].unique()):
        top_cat = (top_products_df[top_products_df["categories"] == cat]
                   .sort_values("score", ascending=False)
                   .head(top_k))

        print(f"\n{'='*60}")
        print(f"📦 Categoría: {cat}")
        print(f"{'='*60}")

        for rank, (_, row) in enumerate(top_cat.iterrows(), start=1):
            product_name = str(row["name"]).replace("\r\n", " ").replace(",,,", "").strip()
            print(f"\n  [{rank}] {product_name}")

            # Obtener reseñas reales del dataset
            subset = data_df[
                (data_df["categories"] == cat) &
                (data_df["name"] == row["name"])
            ].dropna(subset=["reviews.text", "sentiment_score"])

            def sample(df, score, n=2):
                pool = df[df["sentiment_score"] == score]["reviews.text"].drop_duplicates()
                return pool.sample(min(n, len(pool)), random_state=42).tolist() if len(pool) else []

            pos = sample(subset, 2)
            neu = sample(subset, 1)
            neg = sample(subset, 0)

            # Clip a 160 chars para no saturar el prompt
            def clip(t, max_chars=160):
                t = " ".join(str(t).split())
                return t[:max_chars] + ("..." if len(t) > max_chars else "")

            pos_clipped = [clip(r) for r in pos]
            neu_clipped = [clip(r) for r in neu]
            neg_clipped = [clip(r) for r in neg]

            print("  Generando review...", end=" ", flush=True)
            review = generate_new_review(
                product_name=product_name,
                category=cat,
                pos_reviews=pos_clipped,
                neu_reviews=neu_clipped,
                neg_reviews=neg_clipped,
                avg_sentiment=float(row["avg_rating"]),
                n_reviews=int(row["n_reviews"])
            )
            print("✅")
            print(f"\n  --- REVIEW GENERADA ---\n{review}\n")

            results.append({
                "category":     cat,
                "rank":         rank,
                "product":      product_name,
                "avg_sentiment": round(float(row["avg_rating"]), 3),
                "n_reviews":    int(row["n_reviews"]),
                "score":        round(float(row["score"]), 3),
                "generated_review": review
            })

    return pd.DataFrame(results)


# ── EJECUTAR ────────────────────────────────────────────────
reviews_df = generate_reviews_for_top_products(top_products, data, top_k=3)

# Guardar CSV
reviews_df.to_csv("generated_reviews.csv", index=False)
print("\n✅ Guardado: generated_reviews.csv")
reviews_df[["category", "rank", "product", "generated_review"]]


📦 Categoría: Audio Devices

  [1] Echo (White) Echo (White)
  Generando review... ✅

  --- REVIEW GENERADA ---
[Generation failed — model too degraded for this product]


  [2] Amazon Fire Tv Amazon Fire Tv
  Generando review... ✅

  --- REVIEW GENERADA ---
I’ Questionable(x\left[j} \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \

KeyboardInterrupt: 

In [34]:
# Downloading the review
reviews_df.to_csv("generated_reviews.csv", index=False);
from google.colab.files import download; download("generated_reviews.csv")
print("Reviews saved!")

NameError: name 'reviews_df' is not defined