In [None]:
import os
import glob
from google import genai
from google.genai import types
from dotenv import load_dotenv
import gradio as gr

In [None]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import time
from chromadb import Documents, EmbeddingFunction, Embeddings
import chromadb

In [None]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Look the API key up by name; the lookup yields None when the variable is unset.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
gemini = genai.Client(api_key=GEMINI_API_KEY)

In [None]:
# Gemini model identifiers used by this notebook.
EMBEDDING_MODEL = "models/gemini-embedding-exp-03-07"
MODEL = "gemini-2.0-flash"

# Directory in which the Chroma vector store is persisted.
db_name = "vector_db"

In [None]:
# Every sub-folder of the knowledge base is treated as one document category:
# the folder name is stored as the "doc_type" metadata of each document in it.
# Plain files matched by the glob are skipped because DirectoryLoader only
# accepts directories, and the folders are sorted so the resulting document
# order is the same on every machine and every run.
folders = sorted(
    path
    for path in glob.glob("./notebooks/knowledge-base-mini/*")
    if os.path.isdir(path)
)
text_loader_kwargs = {"encoding": "utf-8"}

# Rebuilt from scratch each time so re-running the cell does not duplicate entries.
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    folder_docs = loader.load()

    # Tag each document with its category before collecting it.
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)



In [None]:
# Break the loaded documents into smaller chunks (chunk size 1000, overlap 200).
text_splitter = CharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(documents)

In [None]:
# Distinct document categories present among the chunks.
doc_types = {chunk.metadata["doc_type"] for chunk in chunks}

# A set has no stable iteration order, so sort the names for display to get
# the same printed line on every run.
print(f"Document types found: {', '.join(sorted(doc_types))}")

# Displayed as the cell output: total number of chunks.
len(chunks)

In [None]:
# Gemini embedding model; this is the one used to build the vector store.
embeddings = GoogleGenerativeAIEmbeddings(
    model=EMBEDDING_MODEL,
    google_api_key=GEMINI_API_KEY,
)

# OpenAI embedding client with up to 3 retries and a timeout of 30
# (presumably seconds — confirm against the client documentation).
open_ai_embeddings = OpenAIEmbeddings(timeout=30, max_retries=3)

In [None]:
# When a persisted store from an earlier run is on disk, open it and drop its
# collection so the store created next starts out empty.
if os.path.exists(db_name):
    stale_store = Chroma(embedding_function=embeddings, persist_directory=db_name)
    stale_store.delete_collection()

In [None]:
# Embed at most the first five loaded documents and persist them in Chroma
# (slicing past the end of a list is safe, so no explicit length check is needed).
# NOTE(review): this embeds whole `documents`, not the `chunks` produced by the
# text splitter — confirm whether that is an intentional limit on embedding calls.
vector_store = Chroma.from_documents(
    documents=documents[:5],
    embedding=embeddings,
    persist_directory=db_name,
)

In [None]:
# Number of entries currently held in the vector store's underlying collection
# (accessed through the private `_collection` attribute).
vector_store._collection.count()

In [None]:
# Fetch a single stored embedding and report the length of the vector.
collection = vector_store._collection
first_record = collection.get(include=["embeddings"], limit=1)
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vector has {dimensions} dimensions.")

In [None]:
# Pull every stored vector together with its text and metadata.
result = collection.get(include=["embeddings", "documents", "metadatas"])
vectors = np.array(result["embeddings"])
res_documents = result["documents"]

# One doc_type per stored vector, in the same order as `vectors`.
doc_types_other = [metadata["doc_type"] for metadata in result["metadatas"]]

# Colour assigned to each known document category.
color_by_doc_type = {
    "products": "blue",
    "employees": "green",
    "contracts": "red",
    "company": "orange",
}

# Build exactly one colour per stored vector. The colours must come from the
# per-vector `doc_types_other` list (not the set of unique types) so that they
# line up with the plotted points; categories without a configured colour fall
# back to gray instead of raising.
colors = [color_by_doc_type.get(t, "gray") for t in doc_types_other]

In [None]:
# Project the embeddings down to two dimensions with t-SNE.
# scikit-learn requires perplexity to be strictly smaller than the number of
# samples, so the preferred value of 5 is capped for small collections.
n_samples = len(vectors)
if n_samples < 2:
    raise ValueError("Need at least 2 stored vectors to compute a t-SNE projection.")
tsne = TSNE(n_components=2, random_state=40, perplexity=min(5, n_samples - 1))
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(
    data=[
        go.Scatter(
            x=reduced_vectors[:, 0],
            y=reduced_vectors[:, 1],
            mode="markers",
            marker=dict(size=5, color=colors, opacity=0.8),
            # Hover text pairs each point with its own doc_type; the per-vector
            # list is used so every point gets a label that matches its text.
            text=[
                f"Type: {t}<br>Text: {d[:100]}..."
                for t, d in zip(doc_types_other, res_documents)
            ],
            hoverinfo="text",
        )
    ]
)

# Axis titles are set directly on the layout: `scene` only applies to 3D plots.
fig.update_layout(
    title="2D Chroma Vector Store Visualization",
    xaxis_title="x",
    yaxis_title="y",
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40),
)

fig.show()