# The Second Agent - estimate the actual value of a product

## RAG (Retrieval Augmented Generation) based on a dataset of 400,000 scraped Amazon products

#### For our 2nd agent, we will be asking DeepSeek to estimate the price of one of our deals - and we will give it a hand.

It turns out that LLMs are really good at this! Out of the box, GPT-4o is off by an average of \$76.

But we can do even better: we'll provide it with some context, in the form of 5 similar products from our training dataset.

Again I'll be going quite quickly through this - the idea is for you to run this yourself.

In [None]:
# imports

import os
import re
import logging
from dotenv import load_dotenv
from huggingface_hub import login
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer
import chromadb
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from litellm import completion
from tqdm import tqdm
from IPython.display import display, Markdown
from evaluator import evaluate

In [None]:
# environment

# Load variables from the .env file; override=True lets values from .env replace
# ones that are already set in the process environment
load_dotenv(override=True)
# Path passed to chromadb.PersistentClient further down as the vector store location
DB = "products_vectorstore"

In [None]:
# Log in to HuggingFace
# If you don't have a HuggingFace account, you can set one up for free at www.huggingface.co
# And then add the HF_TOKEN to your .env file as explained in the project README

# os.environ[...] raises KeyError when HF_TOKEN is not set, so a missing token
# fails on this line rather than inside login()
hf_token = os.environ['HF_TOKEN']
login(token=hf_token, add_to_git_credential=False)

# For following along at home:

Please download the files train.pkl and test.pkl from this Google Drive folder:  
https://drive.google.com/drive/folders/1t0YnoCXCbo2g08uWIOR6TPKR2-6Egb_g?usp=sharing

And place them in the parent directory (the directory called agentic).

In [None]:
# Load the training data
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so only
# load a train.pkl that came from a source you trust.
# Presumably the class of the pickled items must also be importable here - confirm.

with open('../train.pkl', 'rb') as file:
    train = pickle.load(file)

In [None]:
# Sanity check: report how many items were loaded and show the first one
print(f"There are {len(train):,} training items scraped from Amazon, and the first one is {train[0]}")

# Now create a Chroma Datastore

Now we will use the free, open-source Vector database Chroma.  
We will create a Chroma datastore with 400,000 products from our training dataset.

In [None]:
# Chroma client whose data lives on disk under the DB path
client = chromadb.PersistentClient(path=DB)

# Introducing the SentenceTransformer Encoding LLM

The all-MiniLM model is a very useful model from HuggingFace that maps sentences & paragraphs to 384-dimensional vectors and is ideal for tasks like semantic search.

https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

It can run pretty quickly locally.

As an alternative, OpenAI provides a closed-source Embeddings model. Benefits of all-MiniLM compared to OpenAI embeddings:
1. It's free and fast!
2. We can run it locally, so the data never leaves our box - might be useful if you're building a personal RAG

In [None]:
# Embedding model used for both the stored products and the query items below
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Pass in a list of texts, get back a numpy array of vectors

# [0] picks the vector for the single sentence we passed in
vector = model.encode(["A room full of AI engineers"])[0]
print(vector.shape)
# NOTE: a function also named `vector` is defined later in this notebook and
# rebinds this name once that cell runs
vector

## With that background, let's populate our Chroma database

### By calculating vectors for 400,000 scraped products

In [None]:
# Check if the collection exists; if not, create it and fill it in batches of 1000
# NOTE: if population is interrupted part-way, the collection already exists on the
# next run and this cell will skip it - delete the collection to rebuild it fully.

collection_name = "products"
existing_collection_names = [collection.name for collection in client.list_collections()]

if collection_name not in existing_collection_names:
    collection = client.create_collection(collection_name)
    for i in tqdm(range(0, len(train), 1000)):
        batch = train[i: i+1000]
        documents = [item.text for item in batch]
        # Chroma is given plain Python floats rather than numpy float32 values
        vectors = model.encode(documents).astype(float).tolist()
        metadatas = [{"category": item.category, "price": item.price} for item in batch]
        # Size the ids from the batch itself: the final batch is shorter than 1000
        # whenever len(train) is not a multiple of 1000, and ids, documents,
        # embeddings and metadatas must all have the same length.
        ids = [f"doc_{j}" for j in range(i, i + len(batch))]
        collection.add(
            ids=ids,
            documents=documents,
            embeddings=vectors,
            metadatas=metadatas
        )
# Always re-fetch so `collection` is bound whether or not we just created it
collection = client.get_or_create_collection(collection_name)

# Let's visualize the vectorized data

In [None]:
# It is very fun turning this up to 400_000 and seeing the full dataset visualized,
# but it almost crashes my box every time so do that at your own risk!! 5_000 is safe!

# Used as the `limit` when fetching points from the collection for the plots below
MAXIMUM_DATAPOINTS = 5_000

In [None]:
# The two lists are matched by position: a product in CATEGORIES[i] is drawn in COLORS[i]
CATEGORIES = ['Appliances', 'Automotive', 'Cell_Phones_and_Accessories', 'Electronics','Musical_Instruments', 'Office_Products', 'Tools_and_Home_Improvement', 'Toys_and_Games']
COLORS = ['cyan', 'blue', 'brown', 'orange', 'yellow', 'green' , 'purple', 'red']

In [None]:
# Prework: pull a capped sample out of Chroma and derive one plot color per point
result = collection.get(
    limit=MAXIMUM_DATAPOINTS,
    include=['embeddings', 'documents', 'metadatas'],
)
vectors = np.array(result['embeddings'])
documents = result['documents']
categories = [entry['category'] for entry in result['metadatas']]
# Each category's position in CATEGORIES selects the color at the same position in COLORS
colors = [COLORS[CATEGORIES.index(category)] for category in categories]

In [None]:
# Let's try a 2D chart
# TSNE stands for t-distributed Stochastic Neighbor Embedding - it's a common technique for reducing dimensionality of data

# random_state is fixed so that re-running the cell uses the same seed
tsne = TSNE(n_components=2, random_state=42)
# One row per sampled product, n_components columns (x, y)
reduced_vectors = tsne.fit_transform(vectors)

In [None]:
# Create the 2D scatter plot: one marker per product, colored by category,
# with the category and the first 50 characters of the text shown on hover
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=4, color=colors, opacity=0.7),
    text=[f"Category: {c}<br>Text: {d[:50]}..." for c, d in zip(categories, documents)],
    hoverinfo='text'
)])

# `scene` only configures 3D subplots, so for a 2D chart the axis titles
# have to be set on the layout's own xaxis / yaxis.
fig.update_layout(
    title='2D Chroma Vectorstore Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=1200,
    height=800,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
# Let's try 3D!

# Same reduction as the 2D cell, but to three components; this rebinds
# `tsne` and `reduced_vectors`, replacing the 2D results
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

In [None]:
# Build the interactive 3D scatter plot of the t-SNE projection

# Hover label per point: its category plus the first 50 characters of its text
hover_text = [f"Category: {c}<br>Text: {d[:50]}..." for c, d in zip(categories, documents)]

points_3d = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker={"size": 2, "color": colors, "opacity": 0.7},
    text=hover_text,
    hoverinfo='text',
)
fig = go.Figure(data=[points_3d])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={"xaxis_title": 'x', "yaxis_title": 'y', "zaxis_title": 'z'},
    width=1200,
    height=800,
    margin={"r": 20, "b": 10, "l": 10, "t": 40},
)

fig.show()

In [None]:
# Load in the test pickle file
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so only
# load a test.pkl that came from a source you trust.

with open('../test.pkl', 'rb') as file:
    test = pickle.load(file)

In [None]:
def preprocess(item):
    """Ask an LLM to rewrite a product description as a 2-3 sentence summary.

    `item` is interpolated into the prompt as-is, so pass the description text.
    Returns the message content of the first choice in the completion response.
    """
    prompt = f"Reply with a 2-3 sentence summary of this product. This will be used to find similar products so it should be clear, concise, complete. Details:\n{item}"
    response = completion(
        model="groq/openai/gpt-oss-20b",
        messages=[{"role": "user", "content": prompt}],
    )
    first_choice = response.choices[0]
    return first_choice.message.content

In [None]:
# Show test[1]'s raw description next to its LLM-generated summary
print("ORIGINAL TEXT")
display(Markdown(test[1].text))
print("PREPROCESSED TEXT")
# This line makes a live LLM call through preprocess()
display(Markdown(preprocess(test[1].text)))

In [None]:
# Build the context we hand to the pricing model: similar products and their prices

def make_context(similars, prices):
    """Format similar products and their prices as a context preamble.

    `similars` and `prices` are paired positionally with zip, so any surplus
    entries in the longer one are ignored. Each price is rendered with two decimals.
    Returns the preamble as a single string.
    """
    header = "For context, here are some other items that might be similar to the item you need to estimate.\n\n"
    entries = [
        f"Potentially related product:\n{description}\nPrice is ${amount:.2f}\n\n"
        for description, amount in zip(similars, prices)
    ]
    return header + "".join(entries)

In [None]:
def messages_for(item, similars, prices):
    """Build the chat messages asking a model to price `item`.

    The user message is the similar-products context from make_context followed
    by the item's own test prompt. The trailing assistant message "Price is $"
    is sent as the start of the reply for the model to continue.
    """
    context = make_context(similars, prices)
    question = item.test_prompt()
    # Drop the rounding instruction and the trailing price cue from the stored
    # prompt; the cue is supplied by the assistant message below instead
    for fragment in (" to the nearest dollar", "\n\nPrice is $"):
        question = question.replace(fragment, "")
    user_prompt = context + "And now the question for you:\n\n" + question
    system_message = "You estimate prices of items. Reply only with the price, no explanation"
    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": "Price is $"},
    ]

In [None]:
def vector(item):
    """Embed `item` by encoding the LLM-generated summary of its text."""
    return model.encode(preprocess(item.text))

In [None]:
def find_similars(item):
    """Look up the 5 stored products closest to `item` in the Chroma collection.

    Returns a (documents, prices) pair of parallel lists.
    """
    query_vector = vector(item).astype(float).tolist()
    matches = collection.query(query_embeddings=query_vector, n_results=5)
    # Results are nested one list per query; a single query was sent, hence index 0
    top_documents = list(matches['documents'][0])
    top_prices = [metadata['price'] for metadata in matches['metadatas'][0]]
    return top_documents, top_prices

In [None]:
# Try the retrieval on test[1] and print the context block the model would receive
# (this rebinds the notebook-level `documents` used by the plotting cells)
documents, prices = find_similars(test[1])
print(make_context(documents, prices))

In [None]:
# Utility function that extracts a price from a model's text reply

def get_price(s):
    """Return the first number found in `s` as a float price.

    Dollar signs and thousands separators are removed before searching.
    Returns 0.0 when `s` is None or empty, or contains no number.
    """
    if not s:
        # A chat completion can come back with no text content (None), for
        # instance when the small token budget runs out before any text is
        # produced; treat that as "no price found" instead of crashing.
        return 0.0
    s = s.replace('$','').replace(',','')
    match = re.search(r"[-+]?\d*\.\d+|\d+", s)
    return float(match.group()) if match else 0.0

In [None]:
# Quick check of the parser: this should evaluate to 99.99
get_price("blah blah the price is $99.99 blah")

In [None]:
# Price estimator: gpt-4.1-mini, given similar products as context

def gpt_4_1_mini_rag(item):
    """Estimate the price of `item` with gpt-4.1-mini plus retrieved context.

    Retrieves similar products, sends the assembled messages to the model with
    an 8-token cap on the reply, and parses the reply with get_price.
    """
    similar_docs, similar_prices = find_similars(item)
    chat = messages_for(item, similar_docs, similar_prices)
    response = completion(model="gpt-4.1-mini", messages=chat, max_tokens=8)
    return get_price(response.choices[0].message.content)

In [None]:
# How much does the Fan Clutch in test[1] actually cost, on Amazon?
# (the value to compare the RAG estimates below against)

test[1].price

In [None]:
# Now let's call GPT-4.1-mini using RAG, passing in 5 similar items from our Chroma datastore

gpt_4_1_mini_rag(test[1])

In [None]:
# Same pipeline, different model: Gemini 2.5 Flash

def gemini_rag(item):
    """Estimate the price of `item` with Gemini 2.5 Flash plus retrieved context.

    Retrieves similar products, sends the assembled messages to the model with
    an 8-token cap on the reply, and parses the reply with get_price.
    """
    similar_docs, similar_prices = find_similars(item)
    chat = messages_for(item, similar_docs, similar_prices)
    response = completion(model="gemini/gemini-2.5-flash", messages=chat, max_tokens=8)
    return get_price(response.choices[0].message.content)

In [None]:
# Price the same test item with Gemini for comparison
gemini_rag(test[1])

In [None]:
# Run the project's evaluator on the GPT-4.1-mini RAG estimator over the test set
# NOTE: every estimate makes live LLM calls (one in preprocess, one for the price)
evaluate(gpt_4_1_mini_rag, test)

In [None]:
# Raise the root logger to INFO so INFO-level messages are not filtered out
# (presumably so the agent below can report its progress - confirm in frontier_agent)
root = logging.getLogger()
root.setLevel(logging.INFO)

In [None]:
# Imported here rather than at the top of the notebook; FrontierAgent is project code
from price_agents.frontier_agent import FrontierAgent

# The agent is handed the Chroma collection populated earlier in this notebook
agent = FrontierAgent(collection)
agent.price("Quadcast HyperX condenser mic, connects via usb-c to your computer for crystal clear audio")

In [None]:
# Price a second free-text product description with the same agent
agent.price("Shure MV7+ professional podcaster microphone with usb-c and XLR outputs")