In [None]:
# %cd ..

In [None]:
# import os
# print(os.getcwd())

In [None]:
%load_ext autoreload
%autoreload 2

import os
import re
import math
import json
from tqdm import tqdm
import random
from dotenv import load_dotenv
from huggingface_hub import login
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import chromadb
from sklearn.manifold import TSNE
import plotly.graph_objects as go

import sys
sys.path.append(os.path.abspath('..'))
from utils.items import Item

# %matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Environment: read secrets from a local .env file (if any) and publish them
# through os.environ so downstream libraries can pick them up.

load_dotenv()
os.environ.update({
    'OPENAI_API_KEY': os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env'),
    'HF_TOKEN': os.getenv('HF_TOKEN', 'your-key-if-not-using-env'),
})
DB = "DB_products_vectorstore"

In [3]:
# Log in to HuggingFace with the token held in the HF_TOKEN environment variable.
# NOTE(review): add_to_git_credential=True presumably also hands the token to
# git's credential helper — confirm that is wanted on shared machines.

hf_token = os.environ['HF_TOKEN']
login(hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


## Loading the training data

In [6]:
# Load the pickled training items from ../Data/train.pkl (path is relative to this notebook).
# NOTE: pickle.load can execute arbitrary code — only load a file you produced or trust.

with open('../Data/train.pkl', 'rb') as file:
    train = pickle.load(file)

In [9]:
# Show the full prompt text of the first training item.
print(train[0].prompt)

How much does this cost to the nearest dollar?

WeatherTech Custom Fit FloorLiners for Hyundai Palisade - 1st & 2nd Row Grey
Fits select Hyundai Palisade Models. Please confirm fitment to your year, make, and model in Amazon ConfirmedFit prior to purchase. Coverage Area 1st & 2nd Row, Color Grey The FloorLiner is proudly designed, engineered, and manufactured with American made tooling in the USA. A patented High-Density Tri-Extruded (HDTE) material allows for a rigid core for strength while offering surface friction to the carpet, as well as a tactile feel to the surface! Advanced surfacing creates channels that carry fluids and debris to a lower reservoir with further channeling to help minimize fluid movement while driving! Images may be representative and may not reflect the actual vehicle or part. Manufacturer

Price is $218.00


In [10]:
# Number of items in the training set.
len(train)

400000

## Create a Chroma Datastore

In [11]:
# Connect to the persistent Chroma store kept under ../Data/chromaDB.
client = chromadb.PersistentClient(path="../Data/chromaDB")

In [12]:
# Name of the Chroma collection that stores the product embeddings.
collection_name = "Amazon_reviews_products"

# Use get_or_create_collection rather than create_collection so this cell can be
# re-run (e.g. after a kernel restart) without failing because the persisted
# collection already exists.
# To rebuild the vector store from scratch, delete the old collection first:
#     client.delete_collection(collection_name)
collection = client.get_or_create_collection(collection_name)

## Local Embedding: SentenceTransformer

all-MiniLM-L6-v2 is a model from HuggingFace that maps sentences and paragraphs to a 384-dimensional dense vector space, which makes it well suited to semantic search.

https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

Benefits: it's free, fast and runs locally.

In [13]:
# Load the all-MiniLM-L6-v2 sentence-embedding model by its HuggingFace id.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [23]:
def description(item):
    """Return the product text from ``item.prompt``.

    Every occurrence of the pricing question header is removed, and the
    result is cut off at the first "Price is $" footer (if there is one).
    """
    question_header = "How much does this cost to the nearest dollar?\n\n"
    price_footer = "\n\nPrice is $"

    body = item.prompt.replace(question_header, "")
    # partition keeps everything before the first footer, or the whole text when absent
    product_text, _, _ = body.partition(price_footer)
    return product_text

In [24]:
# Sanity check: the first item's description, without the question header or price line.
description(train[0])

'WeatherTech Custom Fit FloorLiners for Hyundai Palisade - 1st & 2nd Row Grey\nFits select Hyundai Palisade Models. Please confirm fitment to your year, make, and model in Amazon ConfirmedFit prior to purchase. Coverage Area 1st & 2nd Row, Color Grey The FloorLiner is proudly designed, engineered, and manufactured with American made tooling in the USA. A patented High-Density Tri-Extruded (HDTE) material allows for a rigid core for strength while offering surface friction to the carpet, as well as a tactile feel to the surface! Advanced surfacing creates channels that carry fluids and debris to a lower reservoir with further channeling to help minimize fluid movement while driving! Images may be representative and may not reflect the actual vehicle or part. Manufacturer'

In [25]:
# Embed the training items in batches and write them to the Chroma collection.
BATCH_SIZE = 1000  # items encoded and added per collection.add call

for start in tqdm(range(0, len(train), BATCH_SIZE)):
    batch = train[start: start + BATCH_SIZE]
    documents = [description(item) for item in batch]
    vectors = model.encode(documents).astype(float).tolist()
    metadatas = [{"category": item.category, "price": item.price} for item in batch]
    # Derive the ids from the actual batch length so that a final, shorter batch
    # (when len(train) is not a multiple of BATCH_SIZE) still yields exactly one
    # id per document instead of a length mismatch.
    ids = [f"doc_{j}" for j in range(start, start + len(batch))]
    collection.add(
        ids=ids,
        documents=documents,
        embeddings=vectors,
        metadatas=metadatas
    )

100%|██████████| 400/400 [2:29:37<00:00, 22.44s/it]  


## Embeddings Visualization

In [26]:
# When starting from a fresh kernel and skipping the (slow) embedding cells above,
# uncomment the next two lines to reconnect to the persisted store first:
# client = chromadb.PersistentClient(path="../Data/chromaDB")
# collection_name = "Amazon_reviews_products"
collection = client.get_or_create_collection(collection_name)

In [27]:
# Product category -> marker colour used in the scatter plots (insertion order matters).
CATEGORY_COLORS = {
    'Appliances': 'red',
    'Automotive': 'blue',
    'Cell_Phones_and_Accessories': 'brown',
    'Electronics': 'orange',
    'Musical_Instruments': 'yellow',
    'Office_Products': 'green',
    'Tools_and_Home_Improvement': 'purple',
    'Toys_and_Games': 'cyan',
}
CATEGORIES = list(CATEGORY_COLORS.keys())
COLORS = list(CATEGORY_COLORS.values())

In [32]:
# Upper bound on how many stored records are pulled back for plotting.
MAXIMUM_DATAPOINTS = 20_000

result = collection.get(
    limit=MAXIMUM_DATAPOINTS,
    include=['embeddings', 'documents', 'metadatas'],
)

documents = result['documents']
vectors = np.array(result['embeddings'])
categories = [metadata['category'] for metadata in result['metadatas']]
# Each point is coloured by the position of its category in CATEGORIES.
colors = [COLORS[CATEGORIES.index(category)] for category in categories]

## 2D Embeddings Visualization using TSNE

In [33]:
# Project the embedding vectors down to 2 components with t-SNE.
# random_state is fixed to 42 so repeated runs use the same seed.
tsne = TSNE(n_components=2, random_state=42, n_jobs=-1)
reduced_vectors = tsne.fit_transform(vectors)

In [34]:
# Create the 2D scatter plot: one marker per stored document, coloured by category.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=3, color=colors, opacity=0.7),
)])

# For a 2D (cartesian) figure the axis titles are set directly on the layout;
# `scene` only configures 3D traces, so titles placed there are never shown here.
fig.update_layout(
    title='2D Chroma Vectorstore Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=1200,
    height=800,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

## 3D Embeddings Visualization using TSNE

In [35]:
# Project the same embedding vectors down to 3 components with t-SNE.
# This rebinds `tsne` and `reduced_vectors`, replacing the 2D projection.
tsne = TSNE(n_components=3, random_state=42, n_jobs=-1)
reduced_vectors = tsne.fit_transform(vectors)

In [36]:
# Create the 3D scatter plot: one marker per stored document, coloured by category.
marker_style = dict(size=3, color=colors, opacity=0.7)
points_3d = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=marker_style,
)
fig = go.Figure(data=[points_3d])

# 3D axis titles are configured through the layout's `scene`.
axis_titles = dict(xaxis_title='x', yaxis_title='y', zaxis_title='z')
fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=axis_titles,
    width=1200,
    height=800,
    margin=dict(r=20, b=10, l=10, t=40),
)

fig.show()