In [None]:
import os
import glob
from google import genai
from dotenv import load_dotenv
import gradio as gr

In [None]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter

In [None]:
# Load environment variables from a .env file, when one is present.
load_dotenv()

# The API key is read from the environment so it is never written into the notebook.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
gemini = genai.Client(api_key=GEMINI_API_KEY)

In [None]:
# Gemini model identifier (presumably passed to the client in later cells — not used in the cells shown here).
MODEL = "gemini-2.0-flash"
# Name for the vector database (presumably its storage location in later cells — TODO confirm).
db_name = "vector_db"

In [None]:
# Load every Markdown file under the knowledge base; each sub-folder is one document type.
# Only real directories are kept: DirectoryLoader rejects a path that is a plain file, so a
# stray file sitting next to the sub-folders would otherwise abort the whole load.
# Sorting makes the document order independent of the filesystem's listing order.
folders = sorted(
    path for path in glob.glob("./notebooks/knowledge-base/*") if os.path.isdir(path)
)
text_loader_kwargs = {"encoding": "utf-8"}

documents = []
for folder in folders:
    # The sub-folder name doubles as the document-type label.
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()

    # Tag each document with its type so it can be grouped or filtered after chunking.
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [None]:
# Split the loaded documents into overlapping chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,    # target size of each chunk
    chunk_overlap=200,  # amount shared between neighbouring chunks
)
chunks = text_splitter.split_documents(documents)

In [None]:
# Collect the distinct document types present in the chunks.
doc_types = {chunk.metadata["doc_type"] for chunk in chunks}
# A set has no stable iteration order, so sort the names to get a reproducible summary line.
print(f"Document types found: {', '.join(sorted(doc_types))}")

In [None]:
# Spot check: print every chunk whose text mentions "CEO", each followed by a separator line.
for chunk in chunks:
    if "CEO" not in chunk.page_content:
        continue
    print(chunk, "_________", sep="\n")