<a href="https://colab.research.google.com/github/sandeepdcoder/SandeepFDC/blob/main/Product_Review_using_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Summarizing reviews that semantically match a user query with FAISS

In this section, we'll demonstrate how to fetch the reviews that are semantically related to a user's query and summarize them.



# Setup

In [None]:
%pip install -q faiss-cpu

In [None]:
import os
from openai import OpenAI
from google.colab import userdata

# The API key is read from a Colab secret named API_KEY (see userdata.get below).
# In Colab, use the "🔑" icon on the left panel to add your key as a secret
# with that exact name.
# NOTE(review): earlier instructions here referred to OPENAI_API_KEY, but nothing
# in this cell reads that name — confirm which secret name is intended.
# Outside Colab, google.colab.userdata is presumably unavailable; the api_key
# argument would then have to come from somewhere else, e.g.
# os.environ['API_KEY'] — TODO confirm before running locally.

client = OpenAI(
    # Requests are sent to the OpenRouter endpoint given here.
    base_url="https://openrouter.ai/api/v1",
    api_key= userdata.get('API_KEY'),
)

# Default model identifier used by generate_response (alternative kept in the trailing comment).
model = "openai/gpt-4o-mini" #"gpt-4.1-mini-2025-04-14"

def generate_response(prompt, model=model, max_tokens=150):
    """
    Generate a chat completion for ``prompt`` with the module-level ``client``.

    Args:
        prompt: The user message sent to the model.
        model: Model identifier to request. Defaults to the value the
            module-level ``model`` variable had when this function was defined.
        max_tokens: The maximum number of tokens in the generated response.

    Returns:
        The stripped text of the first choice. This function does not raise:
        if the request fails, or the reply carries no text, it returns a
        string that starts with "Error generating response:" instead.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
            temperature=0.7,  # You can adjust the temperature
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Deliberately best-effort: callers receive an error string rather
        # than an exception, so a failed request does not abort the notebook.
        return f"Error generating response: {e}"

    if content is None:
        # A reply without text would otherwise surface as an opaque
        # "'NoneType' object has no attribute 'strip'" message.
        return "Error generating response: the model returned no text content"
    return content.strip()



# Generate 200 diverse product reviews
import random

def generate_product_reviews(count=200, products=None):
    """
    Generate synthetic product reviews with varied sentiments.

    Each review is built from a randomly chosen sentiment (roughly 40%
    positive, 35% negative, 25% neutral), a template for that sentiment, an
    emotional sentence, and two feature phrases drawn from two different
    feature categories. The module-level ``random`` generator is used, so
    seed it beforehand for reproducible output.

    Args:
        count: Number of reviews to generate. Zero or a negative number
            yields an empty list.
        products: Product names to write reviews about, as a single string
            or an iterable of strings, e.g.
            ``["vacuum cleaner", "laptop", "headphones"]``. Defaults to
            ``["smartphone"]``.

    Returns:
        A list of ``count`` review strings.

    Raises:
        ValueError: If ``products`` is given but contains no product names.
    """
    if products is None:
        products = ["smartphone"]
    elif isinstance(products, str):
        # A bare string would otherwise be split into single characters.
        products = [products]
    else:
        products = list(products)
    if not products:
        raise ValueError("products must contain at least one product name")

    positive_templates = [
        "This {product} is absolutely amazing! The {feature1} is outstanding and the {feature2} exceeded my expectations. {emotion} Highly recommended!",
        "I'm incredibly impressed with this {product}. The {feature1} works flawlessly and {feature2} is top-notch. {emotion} Worth every penny!",
        "Love this {product}! {feature1} is excellent, {feature2} is perfect, and the overall quality is superb. {emotion}",
        "Best {product} I've ever owned! The {feature1} is incredible and {feature2} makes it even better. {emotion} Five stars!",
        "This {product} has exceeded all my expectations. {feature1} is fantastic, {feature2} is great. {emotion} Couldn't be happier!",
    ]

    negative_templates = [
        "Very disappointed with this {product}. The {feature1} is terrible and {feature2} doesn't work properly. {emotion} Would not recommend.",
        "This {product} is a waste of money. {feature1} failed after a week and {feature2} is subpar. {emotion} Returning immediately.",
        "Absolutely frustrated with this {product}. The {feature1} is unreliable and {feature2} is poor quality. {emotion} Avoid this product!",
        "Terrible {product}. {feature1} doesn't meet expectations and {feature2} is disappointing. {emotion} Save your money.",
        "This {product} is a complete letdown. {feature1} broke quickly and {feature2} never worked right. {emotion} Worst purchase ever!",
    ]

    neutral_templates = [
        "This {product} is okay. The {feature1} is decent and {feature2} is average. {emotion} It works but nothing special.",
        "Mixed feelings about this {product}. {feature1} is good but {feature2} could be better. {emotion} Acceptable for the price.",
        "The {product} does what it's supposed to. {feature1} is fine, {feature2} is mediocre. {emotion} It's alright.",
        "Average {product}. {feature1} works as expected, {feature2} is neither great nor terrible. {emotion} Could be better.",
        "This {product} has pros and cons. {feature1} is satisfactory but {feature2} needs improvement. {emotion} Decent option.",
    ]

    features = {
        "battery life": ["battery life", "charging speed", "power management"],
        "performance": ["performance", "speed", "processing power"],
        "design": ["design", "build quality", "aesthetics"],
        "sound": ["sound quality", "audio clarity", "bass response"],
        "display": ["display quality", "screen brightness", "resolution"],
        "camera": ["camera quality", "photo clarity", "video recording"],
        "durability": ["durability", "build material", "longevity"],
        "price": ["value for money", "pricing", "cost-effectiveness"],
        "features": ["features", "functionality", "capabilities"],
        "connectivity": ["connectivity", "wireless connection", "Bluetooth"],
    }

    emotions_positive = [
        "I'm so happy with this purchase!",
        "Couldn't be more satisfied!",
        "This has changed my life!",
        "I'm telling everyone about this!",
        "Best decision ever!",
    ]

    emotions_negative = [
        "I'm extremely upset.",
        "So frustrated with this!",
        "I feel ripped off.",
        "This ruined my day.",
        "Never buying from this brand again!",
    ]

    emotions_neutral = [
        "It's what I expected.",
        "Nothing to write home about.",
        "Does the job.",
        "Could be worse.",
        "It's fine for now.",
    ]

    feature_names = list(features)
    reviews = []

    for _ in range(count):
        product = random.choice(products)

        # Distribute sentiments: 40% positive, 35% negative, 25% neutral
        sentiment_roll = random.random()
        if sentiment_roll < 0.4:
            templates, emotions = positive_templates, emotions_positive
        elif sentiment_roll < 0.75:
            templates, emotions = negative_templates, emotions_negative
        else:
            templates, emotions = neutral_templates, emotions_neutral
        template = random.choice(templates)
        emotion = random.choice(emotions)

        # Two distinct categories, so a review never repeats the same
        # feature phrase in both slots.
        first_category, second_category = random.sample(feature_names, 2)
        feature1 = random.choice(features[first_category])
        feature2 = random.choice(features[second_category])

        reviews.append(
            template.format(
                product=product,
                feature1=feature1,
                feature2=feature2,
                emotion=emotion,
            )
        )

    return reviews

# Generate the review corpus used by every later cell.
ALL_REVIEWS = generate_product_reviews(200)
print(f"Generated {len(ALL_REVIEWS)} product reviews")
print("\nSample reviews:")
# Slice instead of indexing so the preview cannot raise IndexError if the
# corpus is ever generated with fewer than five reviews.
for position, sample in enumerate(ALL_REVIEWS[:5], start=1):
    print(f"{position}. {sample}")



# Create Embedding

In [None]:
# Create embeddings for all reviews using Qwen 3 model

import numpy as np
import time
from sentence_transformers import SentenceTransformer

# Instantiate the embedding model once at module level; create_embeddings and
# search_similar_reviews both read this global.
EMBEDDING_MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def create_embeddings(reviews, batch_size=100):
    """
    Encode reviews into a float32 matrix with the module-level ``embedding_model``.

    Reviews are encoded in batches of ``batch_size``. If a whole batch fails,
    its reviews are retried one at a time; a review that still fails is
    represented by an all-zero vector so that row ``i`` of the result always
    corresponds to ``reviews[i]``.

    Args:
        reviews: Iterable of review strings.
        batch_size: Number of reviews passed to the model per call; must be
            at least 1.

    Returns:
        A ``numpy`` array of dtype float32 with one row per review. For empty
        input the array has zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    reviews = list(reviews)
    embeddings = []
    # Positions whose encoding failed; filled with zero vectors once the
    # real embedding dimension is known.
    failed_positions = []
    total_batches = (len(reviews) - 1) // batch_size + 1

    print(f"Creating embeddings for {len(reviews)} reviews...")

    batch_starts = range(0, len(reviews), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch = reviews[start:start + batch_size]
        print(f"Processing batch {batch_number}/{total_batches}...")

        try:
            batch_embeddings = embedding_model.encode(batch)
        except Exception as e:
            print(f"Error creating embeddings for batch {batch_number}: {e}")
            # Fallback: encode the reviews of the failed batch individually.
            for review in batch:
                try:
                    embeddings.append(embedding_model.encode(review))
                except Exception as e2:
                    print(f"Error creating embedding: {e2}")
                    failed_positions.append(len(embeddings))
                    embeddings.append(None)
        else:
            embeddings.extend(batch_embeddings)

    # Take the dimension from a real embedding when one exists; a hard-coded
    # size would produce a ragged array whenever it differs from the model's.
    dimension = next((len(vec) for vec in embeddings if vec is not None), None)
    if dimension is None:
        dimension = embedding_model.get_sentence_embedding_dimension() or 0

    if failed_positions:
        placeholder = np.zeros(dimension, dtype='float32')
        for position in failed_positions:
            embeddings[position] = placeholder

    # Convert to numpy array (an explicit shape keeps empty input 2-D).
    if embeddings:
        embeddings_array = np.array(embeddings, dtype='float32')
    else:
        embeddings_array = np.zeros((0, dimension), dtype='float32')
    print(f"Created {len(embeddings)} embeddings with dimension {embeddings_array.shape[1]}")

    return embeddings_array

# Embed the whole review corpus once; later cells reuse this matrix.
review_embeddings = create_embeddings(ALL_REVIEWS)
embeddings_shape = review_embeddings.shape
print(f"\nEmbeddings shape: {embeddings_shape}")

# Build Index

In [None]:
# Build FAISS index for similarity search
import faiss

def build_faiss_index(embeddings):
    """
    Build an exact (brute-force) FAISS inner-product index over ``embeddings``.

    ``faiss.IndexFlatIP`` ranks by inner product, so a search returns the
    highest-scoring vectors first. The scores equal cosine similarity only if
    the vectors are L2-normalised; this function does not normalise them.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dimension).

    Returns:
        The populated ``faiss.IndexFlatIP`` index.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional.
    """
    # FAISS expects C-contiguous float32 input.
    vectors = np.ascontiguousarray(embeddings, dtype='float32')
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n, dim), got shape {vectors.shape}"
        )
    dimension = vectors.shape[1]

    # Create FAISS index (inner product as the similarity measure)
    index = faiss.IndexFlatIP(dimension)

    # Add all embeddings to the index
    index.add(vectors)

    print(f"FAISS index created with {index.ntotal} vectors")
    print(f"Index dimension: {dimension}")

    return index

# Index every review embedding for similarity search.
faiss_index = build_faiss_index(review_embeddings)

# Lookup table from a review's position in ALL_REVIEWS to its text.
review_mapping = dict(enumerate(ALL_REVIEWS))
print(f"\nReview mapping created for {len(review_mapping)} reviews")


# Search Setup

In [None]:
# Implement similarity search function
def search_similar_reviews(query, top_k=5):
    """
    Search for the reviews most similar to a text query.

    The query is embedded with the module-level ``embedding_model`` and
    looked up in the module-level ``faiss_index``; hits are mapped back to
    text through ``review_mapping``.

    Args:
        query: Text query to search for
        top_k: Maximum number of similar reviews to return

    Returns:
        List of tuples (review_text, score, index), best match first.
        ``score`` is the value FAISS reports for the hit; it is printed under
        the label "Distance". The list is empty when ``top_k`` is below 1 or
        when the search fails (the error is printed, not raised).
    """
    print(f"Searching for: '{query}'")
    print(f"Finding top {top_k} similar reviews...\n")

    if top_k < 1:
        return []

    try:
        # Encode a one-element list so the result is a (1, dim) matrix:
        # encoding a bare string yields a 1-D vector, which FAISS search()
        # rejects because it expects a 2-D float32 array of queries.
        query_embedding = np.ascontiguousarray(
            embedding_model.encode([query]), dtype='float32'
        )

        # Search in FAISS index
        distances, indices = faiss_index.search(query_embedding, top_k)

        # Retrieve the reviews
        results = []
        for raw_idx, distance in zip(indices[0], distances[0]):
            idx = int(raw_idx)
            if idx < 0:
                # FAISS pads the result with -1 when the index holds fewer
                # than top_k vectors; there is no review to report.
                continue
            review_text = review_mapping[idx]
            results.append((review_text, float(distance), idx))
            print(f"{len(results)}. [Distance: {distance:.4f}] {review_text}")

        return results

    except Exception as e:
        # Best-effort by design: report the problem and return no results.
        print(f"Error during search: {e}")
        return []


# Function to summarize retrieved reviews
def summarize_reviews(query, top_k=5, prompting_strategy="zero-shot"):
    """
    Retrieve the reviews most similar to ``query`` and ask the LLM to summarize them.

    Args:
        query: Search query
        top_k: Number of similar reviews to retrieve
        prompting_strategy: One of "zero-shot", "few-shot" or
            "chain-of-thought"; any other value uses a plain generic prompt.

    Returns:
        The summary text, "No reviews found." when the search returned
        nothing, or None when generating the summary raised an exception.
    """
    # Retrieval step
    hits = search_similar_reviews(query, top_k)
    if not hits:
        return "No reviews found."

    # Number the retrieved review texts (first element of each hit), one per paragraph
    numbered = [f"Review {rank}: {hit[0]}" for rank, hit in enumerate(hits, start=1)]
    reviews_text = "\n\n".join(numbered)

    rule = '=' * 80
    print(f"\n{rule}")
    print(f"SUMMARIZING WITH {prompting_strategy.upper()} PROMPTING")
    print(f"{rule}\n")

    # One prompt per known strategy; unknown strategies use the generic prompt.
    prompts_by_strategy = {
        "zero-shot": f"""Summarize the following product reviews related to '{query}':

{reviews_text}

Provide a concise summary highlighting the main points, common themes, and overall sentiment.""",
        "few-shot": f"""Summarize the following product reviews:

Example 1:
Reviews about "battery issues":
Review 1: Battery dies too quickly
Review 2: Charging takes forever
Summary: Multiple users report significant battery problems including short battery life and slow charging.

Example 2:
Reviews about "great sound":
Review 1: Amazing audio quality!
Review 2: Crystal clear sound
Summary: Users consistently praise the excellent sound quality and audio clarity.

Now summarize these reviews related to '{query}':
{reviews_text}

Summary:""",
        "chain-of-thought": f"""Summarize the following product reviews related to '{query}':

{reviews_text}

Let's think step by step:
1. Identify the main topics mentioned across reviews
2. Determine the overall sentiment (positive, negative, or mixed)
3. Note any common patterns or recurring issues/praise
4. Combine into a clear, concise summary

Summary:""",
    }
    generic_prompt = f"Summarize these reviews:\n\n{reviews_text}"
    prompt = prompts_by_strategy.get(prompting_strategy, generic_prompt)

    # Generation step
    try:
        generated = generate_response(prompt, max_tokens=300)
        print(f"Summary ({prompting_strategy}):")
        print(generated)
        return generated
    except Exception as err:
        print(f"Error generating summary: {err}")
        return None


# Search Reviews

In [None]:
# Demo: run the retrieval + summarization pipeline on a few queries

separator = "=" * 80
print(separator)
print("DEMO: VECTOR SEARCH & SUMMARIZATION WITH DIFFERENT PROMPTING STRATEGIES")
print(separator)

# Test Query 1: battery life issues, zero-shot prompt
heading = "🔍 TEST 1: Battery Life Issues".center(80, "=")
print(f"\n\n{heading}\n")
summarize_reviews("battery life problems", top_k=5, prompting_strategy="zero-shot")


In [None]:
# Test Query 2: sound quality, few-shot prompt
heading = "🔍 TEST 2: Sound Quality (Few-Shot)".center(80, "=")
print(f"\n\n{heading}\n")
summarize_reviews("excellent sound quality", top_k=5, prompting_strategy="few-shot")


In [None]:
# Test Query 3: value for money, chain-of-thought prompt
heading = "🔍 TEST 3: Value for Money (Chain-of-Thought)".center(80, "=")
print(f"\n\n{heading}\n")
summarize_reviews("value for money", top_k=5, prompting_strategy="chain-of-thought")


In [None]:
# Test Query 4: durability concerns, zero-shot prompt
heading = "🔍 TEST 4: Durability Concerns".center(80, "=")
print(f"\n\n{heading}\n")
summarize_reviews("durability issues and build quality", top_k=5, prompting_strategy="zero-shot")


# How This Vector Search System Works

**1. Review Generation**: We generated 200 synthetic product reviews (all for a single product, a smartphone, in the current configuration) with varied sentiments and emotional tones.

**2. Embedding Creation**: Each review is converted into a vector embedding using the `Qwen/Qwen3-Embedding-0.6B` model, loaded through `sentence-transformers`.

**3. FAISS Index**: All embeddings are stored in a FAISS index which enables extremely fast similarity search.

**4. Semantic Search**: When a user queries, the system converts it to an embedding and finds the most similar reviews.

**5. Smart Summarization**: Retrieved reviews are summarized using different prompting strategies (Zero-Shot, Few-Shot, Chain-of-Thought).

**Key Benefits**: Fast semantic search, scalable to millions of reviews, flexible prompting strategies.