<a href="https://colab.research.google.com/github/sahidmiddaempinf/product-recommendation-rag/blob/main/Step_1_Product_Recommendation_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gemini As A Judge for RAG Evals

## The RAG

### 1. Load the datasets

In [16]:
import json
import pandas as pd

# Update the file path to your products.json file:
file_path = 'drive/MyDrive/products.json'

# Load the JSON file.
# The encoding is pinned to UTF-8: the product descriptions contain non-ASCII
# characters (e.g. typographic apostrophes), and without an explicit encoding
# open() falls back to the platform's locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    products_data = json.load(file)

# Convert the products list (stored under the top-level "products" key) to a
# DataFrame for easy manipulation.
products_df = pd.DataFrame(products_data['products'])
print("Loaded products:", products_df.shape[0])
products_df.head()

Loaded products: 30


Unnamed: 0,id,title,description,price,discountPercentage,rating,stock,brand,category,thumbnail,images
0,1,iPhone 9,An apple mobile which is nothing like apple,549,12.96,4.69,94,Apple,smartphones,https://i.dummyjson.com/data/products/1/thumbn...,[https://i.dummyjson.com/data/products/1/1.jpg...
1,2,iPhone X,"SIM-Free, Model A19211 6.5-inch Super Retina H...",899,17.94,4.44,34,Apple,smartphones,https://i.dummyjson.com/data/products/2/thumbn...,[https://i.dummyjson.com/data/products/2/1.jpg...
2,3,Samsung Universe 9,Samsung's new variant which goes beyond Galaxy...,1249,15.46,4.09,36,Samsung,smartphones,https://i.dummyjson.com/data/products/3/thumbn...,[https://i.dummyjson.com/data/products/3/1.jpg]
3,4,OPPOF19,OPPO F19 is officially announced on April 2021.,280,17.91,4.3,123,OPPO,smartphones,https://i.dummyjson.com/data/products/4/thumbn...,[https://i.dummyjson.com/data/products/4/1.jpg...
4,5,Huawei P30,Huawei’s re-badged P30 Pro New Edition was off...,499,10.58,4.09,32,Huawei,smartphones,https://i.dummyjson.com/data/products/5/thumbn...,[https://i.dummyjson.com/data/products/5/1.jpg...


In [13]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the cell that loads products.json reads 'drive/MyDrive/products.json',
# which presumably resolves under this mount point — so this cell has to be executed
# before that one on a fresh runtime, even though it appears after it in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Packages

In [None]:
%%capture
!pip install qdrant-client[fastembed] google-genai pandas tqdm


In [56]:
import pandas as pd
import json
import os
import time
from tqdm import tqdm
from google import genai
from google.genai import types
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
import uuid

from google.colab import userdata

# --- Qdrant -----------------------------------------------------------------
# Cluster endpoint; the API key is read from the Colab secret store (userdata)
# instead of being written into the notebook.
QDRANT_URL = "https://09b01a49-2166-4305-a92c-9a1066c978bf.eu-west-2-0.aws.cloud.qdrant.io"
QDRANT_KEY = userdata.get('PERSONAL_QDRANT_KEY')
qdrant_client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_KEY,
    port=None,
)

# --- Gemini -----------------------------------------------------------------
# Same approach: the key comes from the Colab secret store.
GEMINI_KEY = userdata.get('GEMINI_API_KEY')
gemini_client = genai.Client(api_key=GEMINI_KEY)

In [69]:
# We choose a collection name for products
collection_name = "products"

# Create document strings, metadata, and IDs for each product.
# The rows are materialised once with to_dict("records") rather than iterated with
# iterrows(): iterrows() builds a Series per row (slow) and does not preserve column
# dtypes across the row, whereas each record here is a plain dict keyed by column name.
product_records = products_df.to_dict(orient="records")

documents = []
metadatas = []
ids = []

for product in product_records:
    # 'title' and 'description' are required (a missing column raises KeyError);
    # the remaining fields are optional and render as None when the column is absent.
    doc = (
        f"Title: {product['title']}\n"
        f"Description: {product['description']}\n"
        f"Category: {product.get('category')}\n"
        f"Brand: {product.get('brand')}\n"
        f"Price: {product.get('price')}\n"
        f"Rating: {product.get('rating')}\n"
        f"Discount: {product.get('discountPercentage')}"
    )
    documents.append(doc)

    # Save the entire product record as metadata.
    metadatas.append(product)

    # Use the product ID, as an integer, as the document id.
    ids.append(int(product['id']))

print(f"Prepared {len(documents)} documents.")

Prepared 30 documents.


In [70]:
# Create the collection only if it does not exist yet.
# An explicit existence check is used instead of wrapping get_collection() in a broad
# `except Exception`: with the broad handler, any failure (bad API key, network error, ...)
# was interpreted as "collection missing" and followed by a create attempt, which hid
# the real error. Such failures now surface directly.
if qdrant_client.collection_exists(collection_name=collection_name):
    qdrant_collection = qdrant_client.get_collection(collection_name=collection_name)
    print("Collection already exists!")
else:
    print("Creating collection...")
    # Here, we choose 384 as the embedding dimension. Adjust it if your embedding model
    # returns vectors with a different size.
    # NOTE(review): the vector name "fast-bge-small-en" presumably has to match the name
    # the client's FastEmbed integration uses for its embedding model — confirm if the
    # embedding model is ever changed.
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config={
            "fast-bge-small-en": VectorParams(size=384, distance=Distance.COSINE),
        },
    )
    print("Collection created!")


Creating collection...
Collection created!


In [71]:
# Add documents into Qdrant.
# Note: This operation might take a few moments, depending on the number of products.
# `documents`, `metadatas` and `ids` are the parallel lists built in the preparation
# cell: the text for each product, its full record as metadata, and its integer product id.
# NOTE(review): `add` presumably embeds the documents through the client's FastEmbed
# integration (installed via the qdrant-client[fastembed] extra) before uploading —
# confirm against the qdrant-client documentation.
qdrant_client.add(
    collection_name=collection_name,
    documents=documents,
    metadata=metadatas,
    ids=ids
)

print("Documents added to Qdrant.")

Documents added to Qdrant.


In [97]:
# Retrieval smoke test: run one free-text query against the collection and print
# every hit that clears the score threshold.
test_query = "smartphone with good battery life"
search_results = qdrant_client.query(
    collection_name=collection_name,
    query_text=test_query,
    limit=10,
    score_threshold=0.5,
)

print("Filtered Search Results:")
for result in search_results:
    # Not every Qdrant client version exposes a 'score' attribute on a hit; when it is
    # absent, fall back to "N/A" (the threshold is then applied on the server side only).
    score = getattr(result, "score", "N/A")
    print("Document:\n", result.document)
    print("Metadata:", result.metadata)
    print(f"Similarity Score: {score}")
    print("-----------------------------------")

Filtered Search Results:
Document:
 Title: iPhone 9
Description: An apple mobile which is nothing like apple
Category: smartphones
Brand: Apple
Price: 549
Rating: 4.69
Discount: 12.96
Metadata: {'document': 'Title: iPhone 9\nDescription: An apple mobile which is nothing like apple\nCategory: smartphones\nBrand: Apple\nPrice: 549\nRating: 4.69\nDiscount: 12.96', 'id': 1, 'title': 'iPhone 9', 'description': 'An apple mobile which is nothing like apple', 'price': 549, 'discountPercentage': 12.96, 'rating': 4.69, 'stock': 94, 'brand': 'Apple', 'category': 'smartphones', 'thumbnail': 'https://i.dummyjson.com/data/products/1/thumbnail.jpg', 'images': ['https://i.dummyjson.com/data/products/1/1.jpg', 'https://i.dummyjson.com/data/products/1/2.jpg', 'https://i.dummyjson.com/data/products/1/3.jpg', 'https://i.dummyjson.com/data/products/1/4.jpg', 'https://i.dummyjson.com/data/products/1/thumbnail.jpg']}
Similarity Score: 0.8596763
-----------------------------------
Document:
 Title: Samsung Un

In [59]:
def getGeminiResponse(prompt, max_tokens=8192, response_type="text/plain", model="gemini-2.0-flash"):
    """Send a single-turn user prompt to Gemini and return the generated text.

    Args:
        prompt: Text sent as the only user message of the request.
        max_tokens: Upper bound on generated tokens (passed as ``max_output_tokens``).
        response_type: MIME type requested for the response (passed as
            ``response_mime_type``), e.g. ``"text/plain"``.
        model: Name of the Gemini model to call. Defaults to ``"gemini-2.0-flash"``,
            the model this function previously hard-coded, so existing callers are
            unaffected.

    Returns:
        The ``text`` attribute of the response returned by the google-genai client.
    """
    contents = [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=prompt)],
        ),
    ]
    # temperature=0 keeps sampling as repeatable as the API allows.
    generate_content_config = types.GenerateContentConfig(
        temperature=0,
        top_p=0.95,
        top_k=40,
        max_output_tokens=max_tokens,
        response_mime_type=response_type,
    )
    response = gemini_client.models.generate_content(
        model=model, contents=contents, config=generate_content_config
    )
    return response.text

# Smoke test: send a trivial prompt to check that the Gemini client is configured;
# as the last expression of the cell, the reply is shown as the cell output.
getGeminiResponse("What is 2+3?")

'2 + 3 = 5\n'

In [99]:
def getRagResponse(question):
    """Answer `question` with Gemini, using documents retrieved from Qdrant as context.

    The question is used as the query text against the `collection_name` collection;
    the retrieved documents are numbered from 1 and placed between the fixed
    instructions and the user's question to form a single prompt, which is sent
    through `getGeminiResponse`.

    Args:
        question: The user's question, as a string.

    Returns:
        Whatever `getGeminiResponse` returns for the assembled prompt.
    """
    instructions = """
      You are an intelligent assistant designed to provide accurate and informative answers based on retrieved documents.

      Your primary task is to:

      Understand the user's query.
      Retrieve relevant information from the provided context (documents).
      Synthesize the retrieved information into a coherent and accurate response.
      Answer in a human tone while elaborating the answer.

      documents:

      """

    hits = qdrant_client.query(collection_name=collection_name, query_text=question)

    # One numbered entry per retrieved document: "<n>: ", the document text, then a blank line.
    numbered_docs = "".join(
        str(position) + ": \n" + hit.document + "\n\n"
        for position, hit in enumerate(hits, start=1)
    )

    prompt = instructions + numbered_docs + "\n\n The user is asking: " + question
    return getGeminiResponse(prompt)

# Example questions for the RAG pipeline; exactly one assignment is active at a time —
# move the comment marker to try a different one.
# user_asked_query = "just list all the product categories, skincare there? count total"
# user_asked_query = "total products"
# user_asked_query = "most expensive product"
user_asked_query = "which is cheapest samsung phones i can buy?"

# Run retrieval + generation; as the last expression of the cell, the answer is displayed.
getRagResponse(user_asked_query)

'Based on the provided data, the cheapest Samsung phone available is the Samsung Universe 9, which is priced at $1249.\n'

### Imports