In [None]:
import os
import re
import math
import json
import joblib
from tqdm import tqdm
import torch
import random
from dotenv import load_dotenv
from huggingface_hub import login,HfApi
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import chromadb
from xgboost import XGBRegressor
from sklearn.manifold import TSNE

In [None]:
# Pull settings from a local .env file (when present) into the environment.
load_dotenv()

# Re-export each credential; when a variable is unset, a placeholder string is
# stored instead so that later os.environ[...] lookups do not raise KeyError.
for _credential in ("OPENAI_API_KEY", "HF_TOKEN"):
    os.environ[_credential] = os.getenv(_credential, 'your-key-if-not-using-env')

# Location of the persistent Chroma database (relative path).
DB = "../chroma_db"

In [None]:
# Log in to HuggingFace

# Token comes from the environment; hf_token is reused later when the HfApi
# client is constructed for the model upload.
hf_token = os.environ['HF_TOKEN']
# add_to_git_credential=True asks huggingface_hub to also register the token
# with the local git credential helper.
login(hf_token, add_to_git_credential=True)

In [None]:
# Hugging Face account that owns both the dataset read below and the model repo
# written at the end of the notebook.
HF_USER = "srajal87"
# Full "<user>/<name>" identifier of the dataset passed to load_dataset().
DATASET_NAME = f"{HF_USER}/lite-data"

In [None]:
# Fetch the dataset from the Hub and keep a handle on each of its two splits.
dataset = load_dataset(DATASET_NAME)
train, test = dataset["train"], dataset["test"]

In [None]:
# Sanity check: show the raw "text" field of the first training example.
print(train[0]["text"])

In [None]:
# Sanity check: show the raw "text" field of the first test example.
print(test[0]["text"])

In [None]:
# Load embedding model
# Loads the "intfloat/e5-small-v2" checkpoint through sentence-transformers.
# The description() helper defined further down prefixes every text with
# "passage: " before it is encoded by this model.
model_embedding = SentenceTransformer("intfloat/e5-small-v2")

In [None]:
# Bare expression: the notebook renders the model's repr as this cell's output.
model_embedding

### Create a Chroma VectorStore
You can configure Chroma to save and load the database from your local machine, using the PersistentClient.

Data will be persisted automatically and loaded on start (if it exists).

Collections are where you'll store your embeddings, documents, and any additional metadata. Collections index your embeddings and documents, and enable efficient retrieval and filtering. You can create a collection with a name:

In [None]:
# Open (or create) the on-disk Chroma database located at the DB path.
client = chromadb.PersistentClient(path=DB)

In [None]:
# Start from an empty collection: drop any existing collection with this name,
# then create it afresh so re-running the notebook never hits duplicate ids.
collection_name = "price_items"

# Depending on the installed chromadb version, list_collections() yields either
# Collection objects or plain name strings. Comparing a str against Collection
# objects never matches, which would skip the delete and make
# create_collection() fail on a re-run, so reduce every entry to its name first.
# The isinstance check comes first because some releases return a str subclass
# that raises when attributes such as .name are accessed on it.
existing_collection_names = [
    entry if isinstance(entry, str) else entry.name
    for entry in client.list_collections()
]

if collection_name in existing_collection_names:
    client.delete_collection(collection_name)
    print(f"Deleted existing collection: {collection_name}")

collection = client.create_collection(collection_name)

In [None]:
# Build the text that gets embedded for one item (the price is stripped out)
def description(item):
    """Return ``item["text"]`` without the question prompt or the price suffix.

    Every occurrence of the fixed question prompt is removed, everything from
    the first "\\n\\nPrice is $" marker onwards is discarded, and the remainder
    is returned with a "passage: " prefix.

    Args:
        item: Mapping that has a "text" entry holding the full prompt string.

    Returns:
        The cleaned text as a string starting with "passage: ".
    """
    question = "How much does this cost to the nearest dollar?\n\n"
    price_marker = "\n\nPrice is $"

    body = item["text"].replace(question, "")
    # partition() keeps only what precedes the first marker; when the marker is
    # absent the whole string is kept.
    body, _, _ = body.partition(price_marker)
    return "passage: " + body


# Preview the formatted text for the first training item (shown as cell output).
description(train[0])

In [None]:
batch_size = 300  # how many items to embed and insert into Chroma per iteration
# Upper bound on items per forward pass inside encode(). Each encode() call
# below receives at most batch_size documents, so the effective encode batch is
# min(batch_size, encode_batch_size).
encode_batch_size = 1024

for i in tqdm(range(0, len(train), batch_size), desc="Processing batches"):
    end_idx = min(i + batch_size, len(train))

    # Fetch every row of this chunk exactly once, then derive both the document
    # text and the price metadata from the same row object.
    rows = [train[j] for j in range(i, end_idx)]
    documents = [description(row) for row in rows]
    metadatas = [{"price": row["price"]} for row in rows]
    # Ids are derived from the row's position in the training split.
    ids = [f"doc_{j}" for j in range(i, end_idx)]

    # Encode the whole chunk in one call; normalize_embeddings=True asks the
    # model for normalized vectors. tolist() converts the array to plain lists
    # for Chroma.
    vectors = model_embedding.encode(
        documents,
        batch_size=encode_batch_size,
        show_progress_bar=False,
        normalize_embeddings=True,
    ).tolist()

    # Insert into Chroma
    collection.add(
        ids=ids, documents=documents, embeddings=vectors, metadatas=metadatas
    )

print("✅ Embedding and storage to ChromaDB completed.")

## Embedding-Based Regression with XGBoost

In [None]:
# Step 1: Read everything back out of the Chroma collection
result = collection.get(include=["embeddings", "documents", "metadatas"])

# Stored texts, kept as returned by Chroma.
documents = result["documents"]
# Regression targets: the "price" entry of each item's metadata.
prices = list(map(lambda meta: meta["price"], result["metadatas"]))
# Feature matrix built from the stored embeddings.
vectors = np.array(result["embeddings"])

In [None]:
# Step 2: Train XGBoost model
# Regressor with 100 estimators that maps the embedding vectors to prices.
# random_state is pinned to 42 for repeatable runs; n_jobs=-1 and verbosity=0
# are passed through to XGBoost (all cores / silent, per its sklearn API).
xgb_model = XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1, verbosity=0)
# fit() is the last expression, so its return value is shown as the cell output.
xgb_model.fit(vectors, prices)

In [None]:
# Step 3: Write the trained regressor to <parent of cwd>/models for the Hub upload
MODEL_FILENAME = "xgboost_model.pkl"

# The project root is taken to be the parent of the current working directory.
ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
MODEL_DIR = os.path.join(ROOT, "models")
os.makedirs(MODEL_DIR, exist_ok=True)  # no error when the folder already exists

LOCAL_MODEL = os.path.join(MODEL_DIR, MODEL_FILENAME)
# Kept as the final expression so the notebook shows joblib.dump's return value.
joblib.dump(xgb_model, LOCAL_MODEL)

In [None]:
# Step 4: Publish the serialized model file to a private Hugging Face model repo
REPO_NAME = "smart-deal-finder-models"
REPO_ID = "/".join([HF_USER, REPO_NAME])
api = HfApi(token=hf_token)

# Both Hub calls address the same repository, so share those arguments.
repo_target = {"repo_id": REPO_ID, "repo_type": "model"}

# exist_ok=True: do not raise when the repo has already been created.
api.create_repo(private=True, exist_ok=True, **repo_target)

# Upload the saved model under its bare filename at the repo root. Kept as the
# final expression so the notebook shows the call's return value.
api.upload_file(
    path_or_fileobj=LOCAL_MODEL,
    path_in_repo=MODEL_FILENAME,
    **repo_target,
)