In [119]:
import os
import glob
import tiktoken
import numpy as np
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [None]:
# Load variables from a local .env file into the process environment;
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

openai_api = os.getenv("OPENAI_API_KEY")

# Fail fast when the key is absent *or* blank: an empty `OPENAI_API_KEY=` entry
# yields "" rather than None and would otherwise slip past this guard.
if not openai_api or not openai_api.strip():
    raise ValueError("OpenAI api key is not found")

In [None]:
# Notebook-wide settings.
MODEL = "gpt-4.1-mini"  # model name used to select the tiktoken encoding
db_name = "vector_db"   # directory passed to Chroma as persist_directory

In [None]:
# Match Markdown files at any depth below knowledge-base/. The pattern needs the
# literal dot: "*md" would also match any file whose name merely ends in "md".
knowledge_base_path = "knowledge-base/**/*.md"

# Sort so the concatenation order does not depend on filesystem listing order.
all_files = sorted(glob.glob(knowledge_base_path, recursive=True))
print(f"total number of files in knowledge base -> {len(all_files)}")

# Read every file and terminate it with a blank line so adjacent documents do
# not run together. The separator must be real newlines ("\n\n"), not the
# literal characters "/n/n".
file_texts = []
for file in all_files:
    with open(file, "r", encoding="utf-8") as f:
        file_texts.append(f.read() + "\n\n")

# Join once instead of growing a string inside the loop.
entire_knowledge_base = "".join(file_texts)

print(f"total chars in the knowledge base is {len(entire_knowledge_base):,}")

In [None]:
# Encode the concatenated knowledge base with the tiktoken encoding registered
# for MODEL, then report how many tokens that produces.
encoding = tiktoken.encoding_for_model(model_name=MODEL)
tokens = encoding.encode(entire_knowledge_base)
token_count = len(tokens)

print(f"total tokens for {MODEL}: {token_count:,}")

In [None]:
folder_root = "knowledge-base/*"

# Keep only directories: the wildcard also matches plain files sitting directly
# under knowledge-base/, and each match is handed to DirectoryLoader as a folder.
# Sorting makes the document order (and so positional indexing into `documents`)
# independent of filesystem listing order.
knowledge_folders = sorted(
    path for path in glob.glob(folder_root) if os.path.isdir(path)
)

documents = []

for folder in knowledge_folders:
    # The folder name becomes the document category,
    # e.g. knowledge-base/products -> "products".
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs={"encoding": "utf-8"},
    )

    loaded_documents = loader.load()

    # Tag every document with its category; the visualization cells later read
    # "doc_type" back from the stored metadata.
    for document in loaded_documents:
        document.metadata["doc_type"] = doc_type
        documents.append(document)


print(f"total document in context is {len(documents)}")

In [None]:
# Display the third loaded document (index 2) as the cell output.
documents[2]

In [2]:
# Break the loaded documents into overlapping chunks:
# up to 1000 characters each, with 200 characters shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

print(f"divided into {len(chunks)} chunks...")
print(f"first chunk \n\n{chunks[0]}")


NameError: name 'RecursiveCharacterTextSplitter' is not defined

In [None]:
# Embedding model used to vectorize the chunks.
# Alternative kept for reference:
#embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# If a store directory already exists, drop its collection before rebuilding
# the store from `chunks`.
if os.path.exists(db_name):
    Chroma(
        persist_directory=db_name,
        embedding_function=embeddings,
    ).delete_collection()

vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
print(f"vector store created with {vector_store._collection.count()} documents")

In [115]:
# Inspect the stored vectors: how many are there, and how long is each one?

collection = vector_store._collection
count = collection.count()

# A single record is enough to read off the embedding length.
sample = collection.get(limit=1, include=["embeddings"])
sample_embedding = sample["embeddings"][0]
dimensions = len(sample_embedding)
print(f"there are {count:,} vectors with {dimensions:,} dimensions in vector store")

there are 413 vectors with 3,072 dimensions in vector store


In [116]:
# Pull every stored record (vector, chunk text and metadata) out of the collection.
result = collection.get(include=["embeddings", "documents", "metadatas"])
vectors = np.array(result["embeddings"])
# NOTE: this rebinds `documents` from the list built by the loading cell to the
# plain chunk texts returned by the store; the plotting cells read this name.
documents = result["documents"]
metadatas = result["metadatas"]

# Tolerate records with no metadata or no "doc_type" key instead of raising.
doc_types = [(metadata or {}).get("doc_type", "unknown") for metadata in metadatas]

# One marker color per document type. A dict lookup with a fallback replaces a
# list.index() lookup that raised ValueError for any type outside these four.
DOC_TYPE_COLORS = {
    "products": "blue",
    "employees": "green",
    "contracts": "red",
    "company": "orange",
}
colors = [DOC_TYPE_COLORS.get(t, "gray") for t in doc_types]

In [117]:
# High-dimensional vectors are hard to picture, so project them down to 2D with
# t-SNE (t-distributed stochastic neighbor embedding) before plotting.

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Hover label per point: document type plus the first 100 characters of the chunk.
hover_text = [
    f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)
]

# 2D scatter trace: one marker per stored vector, colored by document type.
scatter_2d = go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode="markers",
    marker={"size": 5, "color": colors, "opacity": 0.8},
    text=hover_text,
    hoverinfo="text",
)
fig = go.Figure(data=[scatter_2d])

# update_layout is the cell's last expression, so its return value is what the
# notebook renders.
fig.update_layout(
    title="2D Chroma Vector Store Visualization",
    xaxis_title="t-SNE Component 1",
    yaxis_title="t-SNE Component 2",
    width=800,
    height=600,
    margin={"r": 20, "b": 10, "l": 10, "t": 40},
)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [118]:
# Same visualization with a third t-SNE component.

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Hover label per point: document type plus the first 100 characters of the chunk.
hover_text = [
    f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)
]

# 3D scatter trace: one marker per stored vector, colored by document type.
scatter_3d = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode="markers",
    marker={"size": 5, "color": colors, "opacity": 0.8},
    text=hover_text,
    hoverinfo="text",
)
fig = go.Figure(data=[scatter_3d])

# update_layout is the cell's last expression, so its return value is what the
# notebook renders.
fig.update_layout(
    title="3D Chroma Vector Store Visualization",
    scene={"xaxis_title": "x", "yaxis_title": "y", "zaxis_title": "z"},
    width=900,
    height=700,
    margin={"r": 10, "b": 10, "l": 10, "t": 40},
)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed