# The Product Pricer

A model that can estimate how much something costs, from its description.

## Data Curation Part 2

The dataset:  
https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023

And the folder with all the product datasets:  
https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/tree/main/raw/meta_categories

## Forming individual dataset

Creating individual datasets for each category so as finetuning becomes easier to train.

In [1]:
# imports

import os
import random
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict
from items import Item
from loaders import ItemLoader
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import numpy as np
import pickle

In [2]:
load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')
os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')
os.environ['WANDB_API_KEY'] = os.getenv('WANDB_API_KEY', 'your-key-if-not-using-env')

In [3]:
# Log in to HuggingFace

hf_token = os.environ['HF_TOKEN']
login(hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [4]:
%matplotlib inline

In [13]:
# dataset_names = [
#     "Appliances",
#     "Automotive",
#     "Baby_Products",
#     "Electronics",
#     "Home_and_Kitchen",
#     "Office_Products",
#     "Tools_and_Home_Improvement",
#     "Cell_Phones_and_Accessories",
#     "Toys_and_Games",
#     "Software",
#     "Musical_Instruments",
# ]
dataset_names = [
    "Appliances",
    "Automotive",
    "Baby_Products",
    "Electronics",
    "Office_Products",
    "Cell_Phones_and_Accessories",
    "Toys_and_Games",
    "Software",
    "Musical_Instruments",
]

In [6]:
def restructure_data(category, list_value):
    slots = defaultdict(list)
    for item in list_value:
        slots[round(item.price)].append(item)
    
    np.random.seed(42)
    random.seed(42)
    sample = []
    for i in range(1, 1000):
        slot = slots[i]
        if i>=240:
            sample.extend(slot)
        elif len(slot) <= 1200:
            sample.extend(slot)
        else:
            weights = np.array([1 if item.category=='Automotive' else 5 for item in slot])
            weights = weights / np.sum(weights)
            selected_indices = np.random.choice(len(slot), size=1200, replace=False, p=weights)
            selected = [slot[i] for i in selected_indices]
            sample.extend(selected)
    print(f"There are {len(sample):,} items in the {category} sample")
    return sample
    

In [7]:
def make_train_test_dataset(category, sample):
    random.seed(42)
    random.shuffle(sample)
    
    list_length = len(sample)
    ten_percent = list_length * 0.1
    test_length = int(list_length - ten_percent)

    train_set = sample[:test_length]
    test_set = sample[test_length:]
    print(f"Divided {category} into a training set of {len(train_set):,} items and test set of {len(test_set):,} items")
    return train_set, test_set

In [8]:
def save_dataset(category, train_set, test_set):
    train_prompts = [item.prompt for item in train_set]
    train_prices = [item.price for item in train_set]
    test_prompts = [item.test_prompt() for item in test_set]
    test_prices = [item.price for item in test_set]
    
    dataset_dict = {
        'train_dataset':{"text": train_prompts, "price": train_prices},
        'test_dataset':{"text": test_prompts, "price": test_prices}
    }
    
    dataset = Dataset.from_dict(dataset_dict)
    dataset.save_to_disk(f"datasets/{category}/train_set")
    
    # train_dataset = Dataset.from_dict({"text": train_prompts, "price": train_prices})
    # test_dataset = Dataset.from_dict({"text": test_prompts, "price": test_prices})
    
    # train_dataset.save_to_disk(f"datasets/{category}/train_set")
    # test_dataset.save_to_disk(f"datasets/{category}/test_set")
    # return dataset

In [9]:
dataset_dict = {}
category_item = defaultdict(list)

# # items = []
# for dataset_name in dataset_names:
#     loader = ItemLoader(dataset_name)
#     category_item[dataset_name].extend(loader.load())
#     # items.extend(loader.load())
# print(category_item.keys())

for category in dataset_names:
    loader = ItemLoader(category)
    result_data = loader.load()
    sample_set = restructure_data(category, result_data)
    train, test = make_train_test_dataset(category, sample_set)
    save_dataset(category, train, test)
    # dataset_dict[category] = data_dict    

Loading dataset Office_Products


100%|██████████| 711/711 [03:45<00:00,  3.15it/s]


Completed Office_Products with 240,394 datapoints in 3.9 mins
There are 88,278 items in the Office_Products sample
Divided Office_Products into a training set of 79,450 items and test set of 8,828 items


Saving the dataset (0/1 shards):   0%|          | 0/2 [00:00<?, ? examples/s]

Loading dataset Cell_Phones_and_Accessories


100%|██████████| 1289/1289 [06:55<00:00,  3.10it/s]


Completed Cell_Phones_and_Accessories with 238,869 datapoints in 7.1 mins
There are 49,098 items in the Cell_Phones_and_Accessories sample
Divided Cell_Phones_and_Accessories into a training set of 44,188 items and test set of 4,910 items


Saving the dataset (0/1 shards):   0%|          | 0/2 [00:00<?, ? examples/s]

Loading dataset Toys_and_Games


100%|██████████| 891/891 [04:46<00:00,  3.11it/s]


Completed Toys_and_Games with 340,479 datapoints in 4.9 mins
There are 100,062 items in the Toys_and_Games sample
Divided Toys_and_Games into a training set of 90,055 items and test set of 10,007 items


Saving the dataset (0/1 shards):   0%|          | 0/2 [00:00<?, ? examples/s]

Loading dataset Software


meta_Software.jsonl:   0%|          | 0.00/256M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

100%|██████████| 90/90 [00:36<00:00,  2.48it/s]


Completed Software with 14,635 datapoints in 1.0 mins
There are 8,628 items in the Software sample
Divided Software into a training set of 7,765 items and test set of 863 items


Saving the dataset (0/1 shards):   0%|          | 0/2 [00:00<?, ? examples/s]

Loading dataset Musical_Instruments


100%|██████████| 214/214 [01:15<00:00,  2.84it/s]


Completed Musical_Instruments with 66,829 datapoints in 1.3 mins
There are 60,715 items in the Musical_Instruments sample
Divided Musical_Instruments into a training set of 54,643 items and test set of 6,072 items


Saving the dataset (0/1 shards):   0%|          | 0/2 [00:00<?, ? examples/s]

In [14]:
dataset_dict = {}
for category in dataset_names:
    data_set = Dataset.load_from_disk(f"datasets/{category}/train_set")
    dataset_dict[category] = data_set
    
print(dataset_dict.keys())
    
dataset = DatasetDict(dataset_dict)
HF_USER = "shantanubharadwaj"
DATASET_NAME = f"{HF_USER}/amazon_pricer"
dataset.push_to_hub(DATASET_NAME, private=True)

with open('dataset.pkl', 'wb') as file:
    pickle.dump(dataset, file)

dict_keys(['Appliances', 'Automotive', 'Baby_Products', 'Electronics', 'Office_Products', 'Cell_Phones_and_Accessories', 'Toys_and_Games', 'Software', 'Musical_Instruments'])


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# dataset = DatasetDict(dataset_dict)

# HF_USER = "shantanubharadwaj"
# DATASET_NAME = f"{HF_USER}/test_pricer_combined"
# dataset.push_to_hub(DATASET_NAME, private=True)

# with open('dataset.pkl', 'wb') as file:
#     pickle.dump(dataset, file)

In [None]:
# with open('train.pkl', 'wb') as file:
#     pickle.dump(train, file)

# with open('test.pkl', 'wb') as file:
#     pickle.dump(test, file)

In [None]:
# def upload_dataset():
#     dataset_dict = {}
    
    
#     dataset = DatasetDict({
#         "train": train_dataset,
#         "test": test_dataset
#     })

In [43]:
!pwd

/Users/shantanudutta/Documents/Study/AI/practice_repo/llm_agents_engg/fine_tuning


In [30]:
sample_list = []
for i in range(1,1001):
    sample_list.append(i)

print(len(sample_list))

1000


In [39]:
list_length = len(sample_list)
ten_percent = list_length * 0.1
test_length = int(list_length - ten_percent)
print(test_length)
test_set = sample_list[:test_length]
train_set = sample_list[test_length:]

900


In [40]:
test_set

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [41]:
print(test_set[-10:])

[891, 892, 893, 894, 895, 896, 897, 898, 899, 900]


In [42]:
print(train_set[-10:])

[991, 992, 993, 994, 995, 996, 997, 998, 999, 1000]
