In [1]:
import os
import pandas as pd
import openai
import chromadb
from chromadb.config import Settings
from tqdm import tqdm

# Set your OpenAI API key (keep your key secure)
# Set the persistence directory via an environment variable.
os.environ["CHROMA_DB_PERSIST_DIRECTORY"] = ".chromadb/"
openai.api_key = "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"

# --- Define Paths for Two Parquet Files ---
file_paths = [
    "kpmg_india/kpmg_final_concatenated_insights_gzip.parquet",  # update with the actual path for the first file
    "pwc_india/pwc_final_concatenated_insights_gzip.parquet"  # update with the actual path for the second file
]

# --- Task 1: Load and Concatenate the Parquet Files ---
dfs = [pd.read_parquet(fp) for fp in file_paths]
df = pd.concat(dfs, ignore_index=True)
print(f"Loaded {len(df)} documents from {len(file_paths)} files.")

# --- Task 2: Define the Text Chunking Function ---
def chunk_text(text, chunk_size=8000, overlap=500):
    """
    Splits the input text into chunks of chunk_size characters,
    with each chunk overlapping the previous one by 'overlap' characters.
    """
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            break
        start = end - overlap  # move back by overlap for next chunk
    return chunks

# --- Task 3: Initialize Chroma DB Client ---
# # Instantiate the Chroma client without extra keyword arguments.
client = chromadb.Client()
collection_name = "pwc_kpmg_insights_collection"

# Try to retrieve the collection. If it doesn't exist, create a new one.
try:
    collection = client.get_collection(collection_name)
except Exception as e:
    print(f"Collection '{collection_name}' not found, creating a new one.")
    collection = client.create_collection(name=collection_name)


# --- Task 4: Process Each Document to Create and Store Embeddings ---
embedding_ids = []
embedding_vectors = []
metadatas = []
documents = []
embedding_count = 0  # ordinal counter for embeddings

# Iterate over each row (document) in the DataFrame
for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing documents"):
    # Get the text from "normalized_concatenated_text"
    text = row["normalized_concatenated_text"]
    
    # Retrieve metadata fields. If there is no "source" column, fallback to "Link".
    source = row.get("source", row.get("link", ""))
    title = row["title"]
    date = row["date"]
    
    # Chunk the text into segments of 8000 characters with 500 overlap
    chunks = chunk_text(text, chunk_size=8000, overlap=500)
    
    for i, chunk in enumerate(chunks):
        embedding_count += 1
        # Generate embedding using OpenAI's API
        response = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=chunk
        )
        embedding_vector = response["data"][0]["embedding"]
        
        # Create a unique embedding ID combining the document index and chunk index
        embedding_id = f"{index}_{i}"
        
        # Build metadata for this embedding
        metadata = {
            "source": source,
            "title": title,
            "date": date,
            "embedding_number": embedding_count
        }
        
        # Append results for later bulk insertion
        embedding_ids.append(embedding_id)
        embedding_vectors.append(embedding_vector)
        metadatas.append(metadata)
        documents.append(chunk)

# --- Task 5: Store the Embeddings in Chroma DB ---
collection.add(
    ids=embedding_ids,
    embeddings=embedding_vectors,
    metadatas=metadatas,
    documents=documents
)

print(f"Stored {len(embedding_ids)} embeddings in Chroma DB.")


Loaded 17 documents from 2 files.
Collection 'pwc_kpmg_insights_collection' not found, creating a new one.


Processing documents: 100%|██████████| 17/17 [00:57<00:00,  3.39s/it]


Stored 118 embeddings in Chroma DB.


In [5]:
import json
import numpy as np

# Get all items from the collection
results = collection.get(
    include=["embeddings", "metadatas", "documents"]
)

def convert_ndarray(obj):
    """Recursively convert NumPy ndarrays to lists."""
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, list):
        return [convert_ndarray(item) for item in obj]
    elif isinstance(obj, dict):
        return {key: convert_ndarray(val) for key, val in obj.items()}
    else:
        return obj

# Convert the results to be JSON serializable.
results_serializable = convert_ndarray(results)

# Now, export to JSON
with open("exported_collection.json", "w") as f:
    json.dump(results_serializable, f, indent=2)

print("Export complete to exported_collection.json")


Export complete to exported_collection.json
