In [45]:
from datasets.arrow_dataset import Dataset
import datasets

dataset = datasets.load_dataset("ed-donner/items_full")

In [46]:
dataset_train = dataset["train"]
dataset_val = dataset["validation"]
dataset_test = dataset["test"]

In [47]:
dataset_train

Dataset({
    features: ['title', 'category', 'price', 'full', 'weight', 'summary', 'prompt', 'id'],
    num_rows: 800000
})

In [48]:
X_train = dataset_train.to_pandas()["summary"]
X_test = dataset_test.to_pandas()["summary"]

y_train = dataset_train.to_pandas()["price"]
y_test = dataset_test.to_pandas()["price"]

y_train

0          64.30
1          79.00
2         240.00
3         449.00
4          79.99
           ...  
799995     65.29
799996    115.54
799997     38.99
799998     59.99
799999     29.99
Name: price, Length: 800000, dtype: float64

In [49]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer


# vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
vectorizer = HashingVectorizer(n_features=5000, stop_words='english', binary=True)


In [50]:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [28]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from torch.utils.data import Dataset

class PriceDataSet(Dataset):
    def __init__(self, X_sparse, y):
        self.X = X_sparse
        self.y = y
    
    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, idx):
        return torch.tensor(self.X[idx].toarray().flatten(), dtype=torch.float32).to(device), torch.tensor(self.y[idx], dtype=torch.float32).to(device)

In [29]:
from torch.utils.data import DataLoader

train_data = PriceDataSet(X_train, y_train)
test_data = PriceDataSet(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=500, shuffle=True)
test_loader = DataLoader(test_data, batch_size=500, shuffle=False)

In [30]:
import torch.nn as nn
class PriceNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 3000),
            nn.ReLU(),
            nn.Linear(3000,2000),
            nn.ReLU(),
            nn.Linear(2000, 2000),
            nn.ReLU(),
            nn.Linear(2000, 1),
            nn.ReLU()
        ).to(device)
    
    def forward(self, x):
        return self.net(x)

In [31]:
input_dim = X_train.shape[1]
model = PriceNN(input_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)
criterion = nn.MSELoss()
epochs = 15

for epoch in range(epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets.view(-1, 1)) 
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")


Epoch 1/15, Loss: 9421.5166015625
Epoch 2/15, Loss: 7786.18896484375
Epoch 3/15, Loss: 4054.748779296875
Epoch 4/15, Loss: 2759.704833984375
Epoch 5/15, Loss: 1941.8221435546875
Epoch 6/15, Loss: 1600.57763671875
Epoch 7/15, Loss: 1659.422119140625
Epoch 8/15, Loss: 1050.943603515625
Epoch 9/15, Loss: 982.0099487304688
Epoch 10/15, Loss: 817.6024169921875
Epoch 11/15, Loss: 580.3348388671875
Epoch 12/15, Loss: 516.7081298828125
Epoch 13/15, Loss: 613.0408325195312
Epoch 14/15, Loss: 541.5647583007812
Epoch 15/15, Loss: 398.8338317871094


In [32]:

print(model)

PriceNN(
  (net): Sequential(
    (0): Linear(in_features=5000, out_features=3000, bias=True)
    (1): ReLU()
    (2): Linear(in_features=3000, out_features=2000, bias=True)
    (3): ReLU()
    (4): Linear(in_features=2000, out_features=2000, bias=True)
    (5): ReLU()
    (6): Linear(in_features=2000, out_features=1, bias=True)
    (7): ReLU()
  )
)


In [33]:
def predict_price(model, X):
    model.eval()
    X_tensor = torch.tensor(X.toarray(), dtype=torch.float32).to(device)
    with torch.no_grad():
        y_pred = model(X_tensor)
    return y_pred

y_pred = predict_price(model, X_test).flatten().cpu().numpy()

In [34]:
import plotly.express as px

px.scatter(x=y_pred, y=y_test, title="Predicted vs Actual", labels={"x": "Predicted", "y": "Actual"})

In [35]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

print(mean_squared_error(y_pred, y_test))
print(mean_absolute_error(y_pred, y_test))

8279.20670158208
53.0623630828743


In [328]:
y_test.shape

(1000,)

In [38]:
torch.save(model.state_dict(), "C:\\Users\\srini\\my-python-projects\\llm_engineering_Jan_2026\\week6\\pricer\\nn_price_model\\nn_price_model.pth")

In [None]:
model = PriceNN(X_train.shape[1]).to(device)
model.load_state_dict(torch.load("C:\\Users\\srini\\my-python-projects\\llm_engineering_Jan_2026\\week6\\pricer\\nn_price_model\\nn_price_model.pth"))
model.eval()


PriceNN(
  (net): Sequential(
    (0): Linear(in_features=5000, out_features=3000, bias=True)
    (1): ReLU()
    (2): Linear(in_features=3000, out_features=2000, bias=True)
    (3): ReLU()
    (4): Linear(in_features=2000, out_features=2000, bias=True)
    (5): ReLU()
    (6): Linear(in_features=2000, out_features=1, bias=True)
    (7): ReLU()
  )
)

In [41]:
y_pred = predict_price(model, X_test).flatten().cpu().numpy()

In [42]:
import plotly.express as px

px.scatter(x=y_pred, y=y_test, title="Predicted vs Actual", labels={"x": "Predicted", "y": "Actual"})

In [43]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

print(mean_squared_error(y_pred, y_test))
print(mean_absolute_error(y_pred, y_test))

8279.20670158208
53.0623630828743


In [44]:
X_train.shape[1]

5000

In [52]:
import joblib
joblib.dump(vectorizer, r'C:\Users\srini\my-python-projects\llm_engineering_Jan_2026\week6\pricer\nn_price_model\hashing_vectorizer.joblib')


['C:\\Users\\srini\\my-python-projects\\llm_engineering_Jan_2026\\week6\\pricer\\nn_price_model\\hashing_vectorizer.joblib']