In [4]:
import sys
from pathlib import Path
from dotenv import load_dotenv

# Set up directory paths.
# NOTE: Path().resolve() is the kernel's current working directory, so this
# assumes the notebook is run from the folder it lives in.
notebook_path = Path().resolve()
experiments_path = notebook_path.parent  # parent of the working directory; `utils` is imported from here
backend_path = experiments_path.parent / "backend"

# sys.path holds strings, so the membership test must use the string form.
# Comparing the Path object itself never matches, which would prepend a
# duplicate entry every time this cell is re-run.
if str(experiments_path) not in sys.path:
    sys.path.insert(0, str(experiments_path))

# Load environment variables from backend/.env if it exists; otherwise fall
# back to load_dotenv() without an explicit path.
# override=True lets values from the .env file replace variables that are
# already set in the process environment.
env_path = backend_path / ".env"
if env_path.exists():
    load_dotenv(env_path, override=True)
    print(f"[OK] Loaded environment variables from {env_path}")
else:
    load_dotenv(override=True)
    print("[WARNING] Backend .env not found, using default environment")


[OK] Loaded environment variables from C:\Users\smvan\repos\madetech-rag-assistant\backend\.env


In [5]:
# --- Local Utilities ---
from utils.handbook_loader import load_handbook_documents

# Read the handbook documents stored under <backend>/data/handbook
all_documents = load_handbook_documents(
    backend_path.joinpath("data", "handbook")
)

In [7]:
# --- Environment and API ---
import openai

# --- Tokenization ---
import tiktoken

# --- Numerical and Data Science ---
import numpy as np
from sklearn.manifold import TSNE

# --- Database ---
from chromadb import PersistentClient

# --- Visualization ---
import plotly.graph_objects as go

In [8]:
# How many characters in all the documents?
print(f"Found {len(all_documents)} files in the knowledge base")

# Concatenate every document's text (no separator) in a single pass.
# str.join builds the result once instead of growing a string with `+=`
# inside a loop.
entire_knowledge_base = "".join(doc.content for doc in all_documents)

print(f"Total characters in knowledge base: {len(entire_knowledge_base):,}")

Found 161 files in the knowledge base
Total characters in knowledge base: 637,427


In [9]:
# Token count for the combined knowledge base text.
# The tokenizer is the one tiktoken associates with MODEL.
MODEL = "gpt-4o-mini"

encoding = tiktoken.encoding_for_model(MODEL)
tokens = encoding.encode(entire_knowledge_base)

print(f"Total tokens in knowledge base: {len(tokens):,}")

Total tokens in knowledge base: 126,711


In [10]:
# Creation of Vector Database
DB_NAME = "experiments_db"  # relative path, resolved against the current working directory
collection_name = "docs"

chroma = PersistentClient(path=DB_NAME)

# Start from an empty collection so that re-running the notebook does not
# build on records left over from a previous run.
# Depending on the installed chromadb release, list_collections() yields
# either Collection objects or plain name strings, so accept both shapes.
if collection_name in [c if isinstance(c, str) else c.name for c in chroma.list_collections()]:
    chroma.delete_collection(collection_name)

collection = chroma.get_or_create_collection(name=collection_name)

In [11]:

# Collect the text of every loaded handbook document into a plain list
all_documents_content = [document.content for document in all_documents]

# Ask OpenAI's text-embedding-3-large model to embed the whole list in one request.
# The response carries a batch of embeddings, one per document.
embeddings = openai.embeddings.create(
    input=all_documents_content,
    model="text-embedding-3-large",
)

# Keep only the embedding vectors from the API response
embeddings_data = [item.embedding for item in embeddings.data]


In [12]:
# Build one id and one metadata record per document, in the same order as
# all_documents_content and embeddings_data.
ids = [document.id for document in all_documents]
metadatas = [
    {'category': document.category, 'title': document.title}
    for document in all_documents
]

# Store texts, vectors and metadata together in the collection
collection.add(
    embeddings=embeddings_data,
    documents=all_documents_content,
    metadatas=metadatas,
    ids=ids,
)

# Last expression of the cell: displays the number of records in the collection
collection.count()

161

In [13]:
# Prework: read everything back from the collection for visualization
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['category'] for metadata in metadatas]

# One color per document, looked up by category.
# A dict lookup replaces the previous parallel-list `.index()` search, which
# was hard to read and raised ValueError for any category not in the list.
# Categories without an entry fall back to gray.
CATEGORY_COLORS = {
    'Roles': 'blue',
    'Guides': 'green',
    'Company': 'red',
    'Communities Of Practice': 'orange',
    'Benefits': 'purple',
    'Team Norms': 'brown',
}
colors = [CATEGORY_COLORS.get(t, 'gray') for t in doc_types]

In [14]:
# Let's try 3D!
# Reduce the embedding vectors to three dimensions with t-SNE.
# random_state is fixed so the layout is repeatable between runs.
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Legend mapping (category to color)
category_color_map = {
    'Roles': 'blue',
    'Guides': 'green',
    'Company': 'red',
    'Communities Of Practice': 'orange',
    'Benefits': 'purple',
    'Team Norms': 'brown'
}

# Documents whose category has no entry in the map would otherwise be left
# out of the figure without any warning. Give each such category its own
# trace in a neutral fallback color (first-seen order, after the mapped ones).
unmapped_categories = [t for t in dict.fromkeys(doc_types) if t not in category_color_map]
trace_colors = {**category_color_map, **dict.fromkeys(unmapped_categories, 'gray')}

# Create one scatter trace per category for proper legend handling
scatter_traces = []
for category, color in trace_colors.items():
    # Row indices of the documents that belong to this category
    inds = [i for i, t in enumerate(doc_types) if t == category]
    if inds:  # skip categories with no documents so the legend has no empty entries
        scatter_traces.append(
            go.Scatter3d(
                x=reduced_vectors[inds, 0],
                y=reduced_vectors[inds, 1],
                z=reduced_vectors[inds, 2],
                mode='markers',
                marker=dict(size=5, color=color, opacity=0.8),
                name=category,
                # Hover text: category plus the first 100 characters of the document
                text=[f"Type: {doc_types[i]}<br>Text: {documents[i][:100]}..." for i in inds],
                hoverinfo='text'
            )
        )

fig = go.Figure(data=scatter_traces)

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    legend=dict(title="Category"),
    margin=dict(r=10, b=10, l=10, t=40)
)

fig.show()