# "THE PRICE IS RIGHT"
build a model that predicts how much something costs from a description, based on a scrape of Amazon data


A model that can estimate how much something costs, from its description.

## STEP 4: Neural Networks and LLMs

In [1]:
# imports

import os
from dotenv import load_dotenv
from huggingface_hub import login
from pricer.evaluator import evaluate
from litellm import completion
from pricer.items import Item
import numpy as np
from tqdm.notebook import tqdm
import csv
from sklearn.feature_extraction.text import HashingVectorizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import CosineAnnealingLR


In [2]:
LITE_MODE = True

load_dotenv(override=True)
hf_token = os.environ['HF_TOKEN']
login(hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [5]:
username = "<username>"
dataset = f"{username}/items_lite" if LITE_MODE else f"{username}/items_full"

train, val, test = Item.from_hub(dataset)

print(f"Loaded {len(train):,} training items, {len(val):,} validation items, {len(test):,} test items")

Loaded 20,000 training items, 1,000 validation items, 1,000 test items


Saving as Dataset. This could have been done in previous notebook winstead of uploading to hub.

In [8]:
from datasets import Dataset, DatasetDict

ds_local = DatasetDict({
    "train": Dataset.from_list([item.model_dump() for item in train]),
    "validation": Dataset.from_list([item.model_dump() for item in val]),
    "test": Dataset.from_list([item.model_dump() for item in test]),
})
ds_local.save_to_disk("data_local") 

Saving the dataset (0/1 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving as CSV Files

In [10]:
import pandas as pd
os.makedirs('data_local_csv', exist_ok=True)  
for name, items in [("train", train), ("val", val), ("test", test)]:
    df = pd.DataFrame([item.model_dump() for item in items])
    df.to_csv(f"data_local_csv/{name}.csv", index=False)

To Load Data

In [None]:
from datasets import Dataset, DatasetDict

ds_local = DatasetDict({
    "train": Dataset.from_list([item.model_dump() for item in train]),
    "validation": Dataset.from_list([item.model_dump() for item in val]),
    "test": Dataset.from_list([item.model_dump() for item in test]),
})
ds_local.save_to_disk("data_local")  # Creates folder with arrow files

# a vanilla Neural Network

In [11]:
# Prepare our documents and prices

y = np.array([float(item.price) for item in train])
documents = [item.summary for item in train]

# Creating a Bag of Words or Vocabulary Vector for all the words
- The Vectorizer will fit the documents to vocabulary and then that will be the input size to the neural network

In [12]:
# Use the HashingVectorizer for a Bag of Words model
# Using binary=True with the CountVectorizer makes "one-hot vectors"

np.random.seed(42)
vectorizer = HashingVectorizer(n_features=5000, stop_words='english', binary=True)
X = vectorizer.fit_transform(documents)

In [37]:
# Define the neural network - 8 layer neural network

class NeuralNetwork(nn.Module):
    def __init__(self, input_size):
        super(NeuralNetwork, self).__init__()
        self.layer1 = nn.Linear(input_size, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 64)
        self.layer4 = nn.Linear(64, 64)
        self.layer5 = nn.Linear(64, 64)
        self.layer6 = nn.Linear(64, 64)
        self.layer7 = nn.Linear(64, 64)
        self.layer8 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)  # or 0.3 — typical range 0.1–0.5

    def forward(self, x):
        output1 = self.dropout(self.relu(self.layer1(x)))
        output2 = self.dropout(self.relu(self.layer2(output1)))
        output3 = self.dropout(self.relu(self.layer3(output2)))
        output4 = self.dropout(self.relu(self.layer4(output3)))
        output5 = self.dropout(self.relu(self.layer5(output4)))
        output6 = self.dropout(self.relu(self.layer6(output5)))
        output7 = self.dropout(self.relu(self.layer7(output6)))
        output8 = self.layer8(output7)   # No dropout before output
        return output8

Dropout randomly turns off some neurons during training (sets their outputs to 0). It’s used to reduce overfitting.
How it works
During training:
Each neuron has probability p (e.g. 0.2) of being dropped.
Dropped neurons output 0 and don’t contribute to the next layer.
The remaining outputs are scaled (e.g. by 1/(1-p)) so the overall activation level stays similar.
Which neurons are dropped changes each mini-batch.
During evaluation/inference: Dropout is turned off. All neurons are used so predictions are stable.

In [None]:
# Convert data to PyTorch tensors
X_train_tensor = torch.FloatTensor(X.toarray())
y_train_tensor = torch.FloatTensor(y).unsqueeze(1)
 ## .unsqueeze(1) Helps to rotate the array and make it 2d but with one column and N rows, instead of N Columns 1 Row
 ## as we map the features in X rows with Y in index. Pytorch expects this way

In [None]:
X_train_tensor

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [26]:
y_train_tensor

tensor([[ 64.3000],
        [ 79.0000],
        [240.0000],
        ...,
        [ 50.5600],
        [ 35.0000],
        [189.9900]])

In [27]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_tensor, y_train_tensor, test_size=0.01, random_state=42)

# Create the loader, Create Pytorch Dataset and then use DataLoader that is a utility by pytorch to load data efficiently
# #  using batches while training
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [28]:
train_dataset

<torch.utils.data.dataset.TensorDataset at 0x16612884bc0>

In [None]:
X_train_tensor.shape ## 20,000 Samples or rows and we selected 5000 Parameters or features in HashVectorizing

torch.Size([20000, 5000])

In [38]:
# Initialize the model
input_size = X_train_tensor.shape[1] ## 1 index is Input/features size that is a Bag of words here
model = NeuralNetwork(input_size)

In [31]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Number of trainable parameters: {trainable_params:,}")

Number of trainable parameters: 669,249


### Training

In [None]:
# Define loss function and optimizer

loss_function = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# We will do 2 complete runs through the data

EPOCHS = 2

for epoch in range(EPOCHS):
    model.train()
    for batch_X, batch_y in tqdm(train_loader):
        optimizer.zero_grad()

        # The next 4 lines are the 4 stages of training: forward pass, loss calculation, backward pass, optimize
        
        outputs = model(batch_X)
        loss = loss_function(outputs, batch_y)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = loss_function(val_outputs, y_val)

    print(f'Epoch [{epoch+1}/{EPOCHS}], Train Loss: {loss.item():.3f}, Val Loss: {val_loss.item():.3f}')

  0%|          | 0/310 [00:00<?, ?it/s]

Epoch [1/5], Train Loss: 4615.657, Val Loss: 18490.104


  0%|          | 0/310 [00:00<?, ?it/s]

Epoch [2/5], Train Loss: 10089.146, Val Loss: 18530.018


  0%|          | 0/310 [00:00<?, ?it/s]

Epoch [3/5], Train Loss: 12340.702, Val Loss: 18411.512


  0%|          | 0/310 [00:00<?, ?it/s]

Epoch [4/5], Train Loss: 3192.434, Val Loss: 19065.061


  0%|          | 0/310 [00:00<?, ?it/s]

Epoch [5/5], Train Loss: 993.934, Val Loss: 17122.588


In [35]:
def neural_network(item):
    model.eval()
    with torch.no_grad():
        vector = vectorizer.transform([item.summary])
        vector = torch.FloatTensor(vector.toarray())
        result = model(vector)[0].item()
    return max(0, result)

In [36]:
evaluate(neural_network, test)

  0%|          | 0/200 [00:00<?, ?it/s]

[91m$128 [91m$148 [92m$14 [92m$3 [92m$36 [91m$123 [92m$9 [91m$105 [93m$48 [91m$219 [91m$468 [91m$293 [93m$50 [91m$551 [92m$15 [92m$3 [92m$18 [92m$2 [91m$132 [91m$138 [92m$37 [92m$36 [91m$92 [91m$286 [93m$100 [91m$294 [91m$342 [93m$58 [91m$97 [92m$33 [92m$39 [91m$122 [93m$78 [92m$21 [91m$90 [92m$98 [92m$32 [92m$20 [91m$111 [91m$134 [91m$152 [92m$32 [92m$39 [92m$10 [91m$90 [93m$62 [92m$36 [93m$71 [92m$4 [93m$45 [92m$2 [92m$40 [92m$13 [93m$46 [91m$128 [93m$78 [92m$31 [91m$121 [93m$46 [92m$32 [91m$119 [92m$38 [92m$37 [92m$12 [91m$491 [91m$126 [92m$17 [92m$26 [93m$71 [91m$262 [92m$10 [92m$30 [91m$165 [93m$79 [92m$13 [93m$40 [91m$214 [93m$48 [92m$30 [93m$42 [93m$74 [91m$103 [92m$34 [92m$35 [92m$11 [91m$117 [91m$89 [91m$138 [92m$25 [91m$142 [92m$28 [93m$43 [92m$5 [92m$8 [91m$88 [91m$101 [92m$12 [91m$110 [91m$117 [92m$101 [93m$44 [93m$43 [92m$24 [92m$29 [92m$5 [92m$18 [91m$98 [91m$268 [

#### This is so far the best since we have 5000 features to deciside and with 8 layers we have 600k + Parameters that we can change and Grad Decent, RElu etc etc helps to optimise this

## LLM Methods

In [42]:
def messages_for(item):
    message = f"Estimate the price of this product. Respond with the price, no explanation\n\n{item.summary}"
    return [{"role": "user", "content": message}]

In [39]:
print(test[0].summary)

Title: Excess V2 Distortion/Modulation Pedal  
Category: Music Pedals  
Brand: Old Blood Noise  
Description: A versatile pedal offering distortion and three modulation modes—delay, chorus, and harmonized fifths—with full control over signal routing and expression.  
Details: Features include separate gain, tone, and volume controls; time, depth, and volume per modulation; order switching, soft‑touch bypass, and expression jack for dynamic control.


In [40]:
# The function for gpt-4.1-nano

def gpt_4__1_nano(item):
    response = completion(model="openai/gpt-4.1-nano", messages=messages_for(item))
    return response.choices[0].message.content

In [43]:
gpt_4__1_nano(test[0])

'$250'

In [44]:
test[0].price

219.0

#### As Noted Above, the prediction is off by 31$ the loss is definitely better than our own model

In [47]:
len(test)

1000

In [53]:
evaluate(gpt_4__1_nano, test, 10)

  0%|          | 0/10 [00:00<?, ?it/s]

[92m$31 [92m$34 [92m$25 [92m$20 [92m$20 [93m$80 [92m$6 [91m$95 [92m$11 [91m$870 

### Trying the latest big model

In [54]:
# The function for gpt-5.1

def gpt_5__2(item):
    response = completion(model="gpt-5.2", messages=messages_for(item), reasoning_effort='high', seed=42)
    return response.choices[0].message.content

In [56]:
evaluate(gpt_5__2, test, 10)

  0%|          | 0/10 [00:00<?, ?it/s]

[92m$30 [92m$34 [92m$25 [92m$10 [92m$20 [91m$100 [93m$54 [92m$35 [92m$6 [92m$1 