In [6]:
import os
from pathlib import Path

import chromadb
import pandas as pd
from sentence_transformers import SentenceTransformer

# --- Configuration: the two values a reader is most likely to change ---
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
CHROMA_DB_PATH = "./chroma_db"

# Load the embedding model
# 📌 We used the 'all-MiniLM-L6-v2' model from SentenceTransformers for the following reasons:
# - It is lightweight, fast, and optimized for semantic similarity tasks (perfect for use in vector databases).
# - It balances performance and speed, making it ideal for interactive querying.
# - It provides embeddings with 384 dimensions — efficient for both local use and small to medium datasets.
# - It's open-source and free to use.
# - It doesn't require internet/API access once downloaded (can be used offline).
# - Perfect for prototyping and mid-size production workloads.
#
# There are more powerful alternatives available based on our needs:
# 1. 'all-mpnet-base-v2' – Higher accuracy, larger size (~768 dims).
# 2. 'paraphrase-MiniLM-L12-v2' – Slightly better than L6-v2, still fast.
# 3. OpenAI embeddings – Very powerful but not free.
# 4. Cohere embeddings – Also strong and fast, but paid.
#
# For OpenAI's embedding model ('text-embedding-ada-002'):
# - Pricing: $0.0001 per 1K tokens (at the time of writing — re-check before relying on it)
# - Example: 100,000 records of 100 tokens each = 10M tokens = ~$1.00
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Initialize ChromaDB with on-disk persistence under CHROMA_DB_PATH
chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:

# The CSV files that the loading cell reads from the working directory.
REQUIRED_CSVS = ("customer.csv", "marketing_campaign.csv", "sales.csv")
missing_csvs = [name for name in REQUIRED_CSVS if not Path(name).exists()]

if missing_csvs:
    # Colab-only helper: imported lazily so the notebook still runs in other
    # environments when the CSVs are already in place.
    from google.colab import files

    print("Please upload:", ", ".join(missing_csvs))
    uploaded = files.upload()
else:
    # Nothing to upload. Keep `uploaded` defined so a full re-run does not
    # stop and wait for the interactive file picker.
    uploaded = {}

Saving customer.csv to customer.csv
Saving marketing_campaign.csv to marketing_campaign.csv
Saving sales.csv to sales.csv


In [9]:
# Read the three CSV files from the working directory into DataFrames
# (same order as before: sales, customer, campaign).
sales_df, customer_df, campaign_df = map(
    pd.read_csv,
    ("sales.csv", "customer.csv", "marketing_campaign.csv"),
)


In [10]:
# 1. Chunking the data: each function below turns one row into a natural-language text chunk
def chunk_sales_data(row):
    """Describe a single sales record as a short English passage.

    Args:
        row: Mapping-like record (e.g. one DataFrame row) providing the keys
            'customer_id', 'cartype', 'model', 'model_variant',
            'fuel_variant', 'city', 'region', 'sale_date' and 'sale_amount'.

    Returns:
        str: Two sentences naming the customer, the car, where and when it
        was bought, and the sale amount.
    """
    buyer = row['customer_id']
    body_style = row['cartype']
    vehicle = f"{row['model']} - {row['model_variant']}"
    fuel = row['fuel_variant']
    place = f"{row['city']}, {row['region']}"
    sold_on = row['sale_date']
    amount = row['sale_amount']

    purchase_sentence = (
        f"Customer {buyer} purchased a {body_style} car ({vehicle}) "
        f"powered by {fuel} fuel in {place} on {sold_on}."
    )
    amount_sentence = f"The sale amount was ₹{amount}."
    return " ".join([purchase_sentence, amount_sentence])


def chunk_customer_data(row):
    """Describe a single customer profile as a short English passage.

    Args:
        row: Mapping-like record (e.g. one DataFrame row) providing the keys
            'customer_id', 'age', 'gender', 'preferred_model',
            'preferred_model_variant', 'preferred_cartype',
            'preferred_fuel_variant' and 'purchase_type'.

    Returns:
        str: Three sentences covering who the customer is, what they prefer,
        and their purchase type.
    """
    identity_sentence = (
        f"Customer {row['customer_id']} is a {row['age']} year old {row['gender']}."
    )

    favourite = f"{row['preferred_model']} ({row['preferred_model_variant']})"
    preference_sentence = (
        f"They prefer the {favourite} model, "
        f"a {row['preferred_cartype']} car with {row['preferred_fuel_variant']} fuel type."
    )

    purchase_sentence = f"Their purchase type is {row['purchase_type']}."

    return " ".join([identity_sentence, preference_sentence, purchase_sentence])


def chunk_campaign_data(row):
    """Describe a single marketing campaign as a short English passage.

    Args:
        row: Mapping-like record (e.g. one DataFrame row) providing the keys
            'campaign_name', 'campaign_id', 'start_date', 'end_date',
            'target_audience', 'marketing_channel', 'target_cartype',
            'discount_percent', 'expected_sales' and 'actual_sales'.

    Returns:
        str: Three sentences covering the campaign's run and targeting, its
        offer and sales goal, and the sales it achieved.
    """
    label = f"'{row['campaign_name']}' (ID: {row['campaign_id']})"
    period = f"from {row['start_date']} to {row['end_date']}"
    run_sentence = (
        f"The campaign {label} ran {period}, "
        f"targeting {row['target_audience']} audiences through "
        f"{row['marketing_channel']} channel."
    )

    offer_sentence = (
        f"It focused on {row['target_cartype']} vehicles, "
        f"offered a {row['discount_percent']}% discount, "
        f"and aimed for {row['expected_sales']} sales."
    )

    outcome_sentence = f"It achieved {row['actual_sales']} sales."

    return " ".join([run_sentence, offer_sentence, outcome_sentence])



In [11]:
# 2. Convert every row to a text chunk and build the matching metadata records
def _rows_to_chunks(frame, formatter):
    """Apply `formatter` to each row of `frame` and return the results as a list."""
    return frame.apply(formatter, axis=1).tolist()


sales_chunks = _rows_to_chunks(sales_df, chunk_sales_data)
customer_chunks = _rows_to_chunks(customer_df, chunk_customer_data)
campaign_chunks = _rows_to_chunks(campaign_df, chunk_campaign_data)

# Sales metadata keeps only three columns and exposes 'cartype' as 'car_type';
# customer and campaign metadata carry every column of their row.
sales_metadata = (
    sales_df[["city", "cartype", "model"]]
    .rename(columns={"cartype": "car_type"})
    .to_dict(orient="records")
)
customer_metadata = customer_df.to_dict(orient="records")
campaign_metadata = campaign_df.to_dict(orient="records")


In [12]:
# 3. Embed each dataset and store it in its own ChromaDB collection.
#    For every dataset: get/create the collection, encode the chunks with
#    `embedding_model`, then write documents + embeddings + ids + metadata.
def store_in_chroma(collection_name, id_prefix, chunks, metadata):
    """Embed `chunks` and write them to the named ChromaDB collection.

    `upsert` is used instead of `add` so that re-running this cell against
    the persistent store rewrites the records that share an id rather than
    attempting to insert the same ids a second time.

    Args:
        collection_name: Name of the collection to get or create.
        id_prefix: Prefix for record ids; ids are "<id_prefix>_<position>".
        chunks: List of text chunks to embed and store as documents.
        metadata: List of metadata dicts, one per chunk, in the same order.

    Returns:
        tuple: (collection, embeddings) — the collection handle and the list
        of embedding vectors that was stored.
    """
    collection = chroma_client.get_or_create_collection(collection_name)
    embeddings = embedding_model.encode(chunks).tolist()
    collection.upsert(
        documents=chunks,
        embeddings=embeddings,
        ids=[f"{id_prefix}_{i}" for i in range(len(chunks))],
        metadatas=metadata,
    )
    return collection, embeddings


# Sales
sales_collection, sales_embeddings = store_in_chroma(
    "sales_data", "sale", sales_chunks, sales_metadata
)

# Customers
customer_collection, customer_embeddings = store_in_chroma(
    "customer_data", "cust", customer_chunks, customer_metadata
)

# Campaigns
campaign_collection, campaign_embeddings = store_in_chroma(
    "campaign_data", "camp", campaign_chunks, campaign_metadata
)

print("✅ All datasets embedded and stored in ChromaDB!")


✅ All datasets embedded and stored in ChromaDB!


In [15]:
# 7. Querying the data
# The stored vectors were produced by `embedding_model`, and the collection was
# created without an embedding function of its own. Embed the query with the
# same model and pass it as `query_embeddings`, instead of letting Chroma embed
# `query_texts` with whatever embedder the collection falls back to. This keeps
# queries and documents in the same vector space even if the model is swapped.
sales_query = "Find records of SUV car purchases made in Bangalore"
results = sales_collection.query(
    query_embeddings=embedding_model.encode([sales_query]).tolist(),
    n_results=5,
    # Metadata filter: only Bangalore sales of SUVs are eligible matches.
    where={"$and": [{"city": "Bangalore"}, {"car_type": "SUV"}]},
    include=["documents", "metadatas", "distances"],
)


In [17]:
# Results are nested one list per query; a single query was sent, so take [0].
hit_docs = results["documents"][0]
hit_metas = results["metadatas"][0]
hit_dists = results["distances"][0]

# The distance is unpacked to keep the three lists aligned but is not printed.
for doc, meta, _dist in zip(hit_docs, hit_metas, hit_dists):
    city, model = meta.get('city'), meta.get('model')
    print("📄", doc)
    print(f"📍 City: {city} | 🚗 Model: {model}")
    print("---")


📄 Customer CUST079 purchased a SUV car (Maruti Suzuki Brezza - Mid) powered by Diesel fuel in Bangalore, South on 2024-11-12. The sale amount was ₹4552301.01.
📍 City: Bangalore | 🚗 Model: Maruti Suzuki Brezza
---
📄 Customer CUST181 purchased a SUV car (Kia Seltos - Base) powered by Diesel fuel in Bangalore, North-West on 2024-01-26. The sale amount was ₹1316578.88.
📍 City: Bangalore | 🚗 Model: Kia Seltos
---
📄 Customer CUST181 purchased a SUV car (MG Hector - Mid) powered by Hybrid fuel in Bangalore, South-West on 2024-08-02. The sale amount was ₹2617390.21.
📍 City: Bangalore | 🚗 Model: MG Hector
---
📄 Customer CUST056 purchased a SUV car (Renault Duster - Base) powered by CNG fuel in Bangalore, South-West on 2023-10-22. The sale amount was ₹3522282.92.
📍 City: Bangalore | 🚗 Model: Renault Duster
---


In [18]:
# Embed the query with the same model that produced the stored customer
# vectors; the collection was created without its own embedding function, so
# `query_texts` would be embedded by Chroma's fallback embedder instead.
customer_query = "customers who prefer diesel SUV"
results = customer_collection.query(
    query_embeddings=embedding_model.encode([customer_query]).tolist(),
    n_results=5,
    # Metadata filter: only customers whose preference is a Diesel SUV.
    where={"$and": [{"preferred_cartype": "SUV"}, {"preferred_fuel_variant": "Diesel"}]},
    include=["documents", "metadatas", "distances"],
)

# One query was sent, so the hits live at index 0 of each result list.
# The distance is unpacked to keep the lists aligned but is not printed.
for doc, meta, _dist in zip(results["documents"][0], results["metadatas"][0], results["distances"][0]):
    print("🧑‍💼 Customer ID:", meta.get('customer_id'))
    print("🚙 Preferred Car:", meta.get('preferred_model'), "| 🔋 Fuel:", meta.get('preferred_fuel_variant'))
    print(f"📝 Doc: {doc}")


🧑‍💼 Customer ID: CUST061
🚙 Preferred Car: Maruti Suzuki Brezza | 🔋 Fuel: Diesel
📝 Doc: Customer CUST061 is a 54 year old Other. They prefer the Maruti Suzuki Brezza (Mid) model, a SUV car with Diesel fuel type. Their purchase type is existing.
🧑‍💼 Customer ID: CUST086
🚙 Preferred Car: Toyota Fortuner | 🔋 Fuel: Diesel
📝 Doc: Customer CUST086 is a 67 year old Male. They prefer the Toyota Fortuner (Top) model, a SUV car with Diesel fuel type. Their purchase type is existing.
🧑‍💼 Customer ID: CUST168
🚙 Preferred Car: Honda City | 🔋 Fuel: Diesel
📝 Doc: Customer CUST168 is a 31 year old Other. They prefer the Honda City (Mid) model, a SUV car with Diesel fuel type. Their purchase type is existing.
🧑‍💼 Customer ID: CUST027
🚙 Preferred Car: Hyundai Creta | 🔋 Fuel: Diesel
📝 Doc: Customer CUST027 is a 60 year old Other. They prefer the Hyundai Creta (Base) model, a SUV car with Diesel fuel type. Their purchase type is existing.
🧑‍💼 Customer ID: CUST133
🚙 Preferred Car: Hyundai Creta | 🔋 Fuel: Di

In [19]:
# Embed the query with the same model that produced the stored campaign
# vectors; the collection was created without its own embedding function, so
# `query_texts` would be embedded by Chroma's fallback embedder instead.
campaign_query = "campaigns for hatchback cars with high discount"
results = campaign_collection.query(
    query_embeddings=embedding_model.encode([campaign_query]).tolist(),
    n_results=5,
    # Metadata filter: only campaigns that target Hatchback vehicles.
    # NOTE: "high discount" is only matched semantically; no numeric filter
    # on discount_percent is applied here.
    where={"target_cartype": "Hatchback"},
    include=["documents", "metadatas", "distances"],
)

# One query was sent, so the hits live at index 0 of each result list.
# The distance is unpacked to keep the lists aligned but is not printed.
for doc, meta, _dist in zip(results["documents"][0], results["metadatas"][0], results["distances"][0]):
    print("📢 Campaign:", meta.get('campaign_name'))
    print("🎯 Target:", meta.get('target_cartype'), "| 💸 Discount:", meta.get('discount_percent'))
    print(f"📝 Doc: {doc}")


📢 Campaign: Festive Offer
🎯 Target: Hatchback | 💸 Discount: 24.66
📝 Doc: The campaign 'Festive Offer' (ID: CAMP048) ran from 2023-12-07 to 2025-02-04, targeting First-Time Buyers audiences through TV channel. It focused on Hatchback vehicles, offered a 24.66% discount, and aimed for 125 sales. It achieved 87 sales.
📢 Campaign: Electric Revolution Discount
🎯 Target: Hatchback | 💸 Discount: 27.07
📝 Doc: The campaign 'Electric Revolution Discount' (ID: CAMP031) ran from 2024-02-10 to 2024-07-13, targeting SUV Enthusiasts audiences through Billboards channel. It focused on Hatchback vehicles, offered a 27.07% discount, and aimed for 135 sales. It achieved 112 sales.
📢 Campaign: Electric Revolution Discount
🎯 Target: Hatchback | 💸 Discount: 23.63
📝 Doc: The campaign 'Electric Revolution Discount' (ID: CAMP049) ran from 2024-08-27 to 2024-10-26, targeting City Commuters audiences through Social Media channel. It focused on Hatchback vehicles, offered a 23.63% discount, and aimed for 330 sale

In [None]:
# All embedded data is stored **locally** in a folder named 'chroma_db'
# - This includes collections, embeddings, and metadata.
# - On disk this is a SQLite file (chroma.sqlite3) plus UUID-named folders of
#   binary *.bin files — presumably one vector index per collection (three
#   folders for the three collections created above; confirm for your version).
# - Easy to move or sync this to cloud storage (e.g. S3) if needed later.

# Print the storage folder as an indented tree.
# The loop variable is `filenames`, not `files`, so it does not overwrite the
# `files` module imported from google.colab in the upload cell.
for root, _dirs, filenames in os.walk("chroma_db"):
    # Depth below the top folder = number of path separators left after
    # stripping the top folder's name from the current path.
    level = root.replace("chroma_db", "").count(os.sep)
    indent = " " * 4 * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = " " * 4 * (level + 1)
    for f in filenames:
        print(f"{subindent}{f}")



chroma_db/
    chroma.sqlite3
    f47c22ef-9311-43af-9329-623c8ffe0588/
        link_lists.bin
        data_level0.bin
        length.bin
        header.bin
    39ac070e-b2ab-4a88-8510-b985bacc42b8/
        link_lists.bin
        data_level0.bin
        length.bin
        header.bin
    378c924e-1b30-4ef4-9463-384c74ab38b6/
        link_lists.bin
        data_level0.bin
        length.bin
        header.bin
