In [None]:
import os
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

In [None]:
def create_chroma_vectorstore_from_folder(
    folder_path: str,
    collection_name: str = "document_collection",
    persist_directory: str = "./chroma_db",
    embedding_model: str = "text-embedding-3-small",
) -> "Chroma":
    """
    Create a Chroma vector store from the ``.txt`` files in a folder.

    Every ``.txt`` file directly inside ``folder_path`` is loaded as UTF-8,
    split into chunks of at most 1000 characters (no overlap), embedded with
    OpenAI embeddings and written to a persistent Chroma collection.

    Args:
        folder_path (str): Path to the folder containing text files
        collection_name (str, optional): Name of the Chroma collection. Defaults to "document_collection"
        persist_directory (str, optional): Directory where Chroma stores its data. Defaults to "./chroma_db"
        embedding_model (str, optional): OpenAI embedding model used to index the chunks.
            It has to be the same model that is later used to embed queries
            (``query_data`` uses "text-embedding-3-small"), otherwise similarity
            search compares vectors from different models.

    Returns:
        Chroma: A Chroma vector store with embedded documents

    Raises:
        ValueError: If ``folder_path`` is not an existing directory, or if no
            ``.txt`` file in it could be loaded.

    NOTE(review): calling this repeatedly with the same ``persist_directory``
    and ``collection_name`` presumably appends the chunks again rather than
    replacing them - confirm and clear the collection first if that matters.
    """
    # ``exists`` would also accept a regular file, which then blows up in
    # ``os.listdir`` with a different exception type; require a directory.
    if not os.path.isdir(folder_path):
        raise ValueError(f"Folder path {folder_path} does not exist or is not a directory.")

    documents = []
    # Sorted so that ingestion order does not depend on the file system.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            documents.extend(TextLoader(file_path, encoding='utf-8').load())
        except Exception as e:
            # Deliberate best effort: one unreadable file must not abort the
            # whole ingestion, so report it and keep going.
            print(f"Error loading {filename}: {e}")

    if not documents:
        raise ValueError("No text documents found in the specified folder.")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
    )
    splits = text_splitter.split_documents(documents)

    # Built only once there is something to embed, so invalid input is
    # reported before any OpenAI client (and its credentials) is required.
    embeddings = OpenAIEmbeddings(model=embedding_model)

    return Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        collection_name=collection_name,
        persist_directory=persist_directory,
    )

In [None]:
# Index the text files into the "spec" collection: that is the collection
# `query_data` opens from ./chroma_db, so indexing under the default name
# would leave the tool searching a collection this notebook never fills.
folder_path = "data"  # Path to the folder containing text files
create_chroma_vectorstore_from_folder(folder_path, collection_name="spec")

In [None]:
from langchain.agents import tool
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


# Retrieval-augmented lookup exposed as an agent tool.
#
# input_string: the question to answer.
# Returns: the model's answer as a plain string.
#
# The function opens the persisted "spec" collection in ./chroma_db, fetches
# the 4 most similar chunks for the question, and asks gpt-4o-mini to answer
# from them. The docstring is kept to a single sentence because the `tool`
# decorator uses it as the description the agent sees.
@tool
def query_data(input_string: str) -> str:
    """Use this tool to query the knowledge base to answer questions about courses."""
    chroma_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    model = ChatOpenAI(model="gpt-4o-mini", streaming=True)

    chroma_db = Chroma(
        persist_directory="./chroma_db",
        embedding_function=chroma_embeddings,
        collection_name="spec",
    )
    retriever = chroma_db.as_retriever(search_kwargs={'k': 4})

    template = """You are given a question and some extracted parts from several documentaion that can be used to answer the question.
    Give complete detailed answer.

    ==========
    Question: {question}
    =========
    {context}
    =========
    """
    prompt = ChatPromptTemplate.from_template(template)

    def _join_passages(docs) -> str:
        # The retriever yields Document objects; without this step the prompt
        # would contain their Python repr (metadata and all) instead of the
        # passage text itself.
        return "\n\n".join(doc.page_content for doc in docs)

    chain = (
        {"context": retriever | _join_passages, "question": RunnablePassthrough()}
        | prompt
        | model
        | StrOutputParser()
    )

    return chain.invoke(input_string)