In [1]:
import pandas as pd
import random
from sentence_transformers import SentenceTransformer
import chromadb



  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# ----------------------------
# Step 1: Generate synthetic dataset
# ----------------------------
n_rows = 200
data = {
    "InvoiceNo": [f"INV{1000+i}" for i in range(n_rows)],
    "StockCode": [f"STK{random.randint(100, 999)}" for _ in range(n_rows)],
    "Description": [random.choice(["T-shirt", "Shoes", "Laptop", "Phone", "Book", "Pen", "Bag"]) for _ in range(n_rows)],
    "Quantity": [random.randint(1, 10) for _ in range(n_rows)],
    "InvoiceDate": pd.date_range(start="2023-01-01", periods=n_rows, freq="D").strftime("%Y-%m-%d").tolist(),
    "UnitPrice": [round(random.uniform(5, 500), 2) for _ in range(n_rows)],
    "CustomerID": [random.randint(10000, 20000) for _ in range(n_rows)],
    "Country": [random.choice(["India", "USA", "UK", "Germany", "Canada"]) for _ in range(n_rows)]
}

df = pd.DataFrame(data)
df.to_csv("synthetic_retail_data.csv", index=False)
print("✅ synthetic_retail_data.csv generated successfully!")



✅ synthetic_retail_data.csv generated successfully!


In [3]:
# ----------------------------
# Step 2: Convert rows into chunks
# ----------------------------
chunks = []
for idx, row in df.iterrows():
    chunk_text = f"InvoiceNo: {row['InvoiceNo']}, " \
                 f"StockCode: {row['StockCode']}, " \
                 f"Description: {row['Description']}, " \
                 f"Quantity: {row['Quantity']}, " \
                 f"InvoiceDate: {row['InvoiceDate']}, " \
                 f"UnitPrice: {row['UnitPrice']}, " \
                 f"CustomerID: {row['CustomerID']}, " \
                 f"Country: {row['Country']}"
    chunks.append(chunk_text)

print("Sample chunks:", chunks[:3])



Sample chunks: ['InvoiceNo: INV1000, StockCode: STK992, Description: T-shirt, Quantity: 10, InvoiceDate: 2023-01-01, UnitPrice: 46.86, CustomerID: 13990, Country: USA', 'InvoiceNo: INV1001, StockCode: STK765, Description: Pen, Quantity: 4, InvoiceDate: 2023-01-02, UnitPrice: 311.23, CustomerID: 17835, Country: Canada', 'InvoiceNo: INV1002, StockCode: STK894, Description: Pen, Quantity: 2, InvoiceDate: 2023-01-03, UnitPrice: 114.29, CustomerID: 19268, Country: USA']


In [4]:
# ----------------------------
# Step 3: Create embeddings
# ----------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(chunks)

print(f"Embeddings shape: {len(embeddings)} x {len(embeddings[0])}")










Embeddings shape: 200 x 384


In [5]:
# ----------------------------
# Step 4: Store in ChromaDB with metadata
client = chromadb.PersistentClient(path="chromadb_store")
collection = client.get_or_create_collection("retail_chunks")


In [6]:
# Optional: clear old data only if collection already has entries
if collection.count() > 0:
    all_ids = collection.get()["ids"]
    if all_ids:  # avoid crash if empty
        collection.delete(ids=all_ids)

# Insert data with metadata
for i, (text, emb) in enumerate(zip(chunks, embeddings)):
    row = df.iloc[i]
    collection.add(
        ids=[str(i)],
        documents=[text],
        embeddings=[emb],
        metadatas=[{
            "InvoiceNo": row["InvoiceNo"],
            "StockCode": row["StockCode"],
            "Description": row["Description"],
            "Quantity": int(row["Quantity"]),
            "InvoiceDate": row["InvoiceDate"],
            "UnitPrice": float(row["UnitPrice"]),
            "CustomerID": int(row["CustomerID"]),
            "Country": row["Country"]
        }]
    )

print("✅ Data stored in ChromaDB successfully!")
print("Stored embeddings:", collection.count())

✅ Data stored in ChromaDB successfully!
Stored embeddings: 200


In [7]:
# -----------------------------
# Step 6: Evaluate Retrieval with Recall@k
# -----------------------------

def recall_at_k(collection, model, query, ground_truth_ids, k=5):
    query_emb = model.encode([query])
    results = collection.query(query_embeddings=query_emb, n_results=k)
    retrieved_ids = results["ids"][0]

    relevant_retrieved = set(retrieved_ids) & set(ground_truth_ids)
    recall = len(relevant_retrieved) / len(ground_truth_ids) if ground_truth_ids else 0.0
    return recall, retrieved_ids

In [8]:
# Example evaluation queries
queries = ["Laptop", "Shoes", "Book"]
for q in queries:
    ground_truth_ids = [str(i) for i, desc in enumerate(df["Description"]) if q in desc]
    recall, retrieved = recall_at_k(collection, model, query=q, ground_truth_ids=ground_truth_ids, k=5)
    print(f"Query: {q}")
    print(f"  Recall@5 = {recall:.2f}")
    print(f"  Retrieved IDs: {retrieved}")
    print(f"  Ground Truth IDs (first 5): {ground_truth_ids[:5]}")
    print("-" * 40)

Query: Laptop
  Recall@5 = 0.17
  Retrieved IDs: ['108', '76', '132', '58', '158']
  Ground Truth IDs (first 5): ['11', '18', '19', '34', '35']
----------------------------------------
Query: Shoes
  Recall@5 = 0.14
  Retrieved IDs: ['137', '170', '134', '180', '127']
  Ground Truth IDs (first 5): ['38', '46', '53', '56', '63']
----------------------------------------
Query: Book
  Recall@5 = 0.17
  Retrieved IDs: ['6', '101', '29', '161', '177']
  Ground Truth IDs (first 5): ['6', '8', '10', '12', '16']
----------------------------------------


In [9]:
# Example query
results = collection.query(
    query_texts=["cheap book"],  # user query
    n_results=5                    # top 5 results
)

print("🔎 Query Results:")
for doc, dist in zip(results["documents"][0], results["distances"][0]):
    print(f"Match: {doc}\nDistance: {dist:.4f}\n")


C:\Users\DELL\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:20<00:00, 3.96MiB/s]


🔎 Query Results:
Match: InvoiceNo: INV1101, StockCode: STK667, Description: Book, Quantity: 7, InvoiceDate: 2023-04-12, UnitPrice: 272.87, CustomerID: 19516, Country: India
Distance: 1.5735

Match: InvoiceNo: INV1029, StockCode: STK275, Description: Book, Quantity: 6, InvoiceDate: 2023-01-30, UnitPrice: 486.86, CustomerID: 18583, Country: UK
Distance: 1.5749

Match: InvoiceNo: INV1006, StockCode: STK273, Description: Book, Quantity: 7, InvoiceDate: 2023-01-07, UnitPrice: 497.06, CustomerID: 14216, Country: USA
Distance: 1.5888

Match: InvoiceNo: INV1129, StockCode: STK937, Description: Book, Quantity: 4, InvoiceDate: 2023-05-10, UnitPrice: 282.27, CustomerID: 14666, Country: Canada
Distance: 1.6107

Match: InvoiceNo: INV1161, StockCode: STK372, Description: Book, Quantity: 5, InvoiceDate: 2023-06-11, UnitPrice: 489.06, CustomerID: 16875, Country: Canada
Distance: 1.6112



In [43]:
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,INV1000,STK939,Shoes,4,2023-01-01,177.13,15314,Germany
1,INV1001,STK307,Pen,10,2023-01-02,67.86,11443,Germany
2,INV1002,STK280,Pen,7,2023-01-03,427.54,12185,Germany
3,INV1003,STK354,Book,4,2023-01-04,489.48,14723,Germany
4,INV1004,STK928,Book,8,2023-01-05,179.14,15712,India
...,...,...,...,...,...,...,...,...
195,INV1195,STK776,Bag,5,2023-07-15,390.82,13737,Germany
196,INV1196,STK787,Shoes,1,2023-07-16,298.67,11518,USA
197,INV1197,STK826,Pen,3,2023-07-17,212.68,12003,Canada
198,INV1198,STK777,Shoes,2,2023-07-18,409.72,10639,India
