In [18]:
import pandas as pd
import numpy as np
import os
import argparse

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain.prompts import ChatPromptTemplate

from dataclasses import dataclass
import os
import shutil

In [15]:
# Directory where the Chroma vector store is persisted; it is deleted and
# recreated each time the store is rebuilt.
CHROMA_PATH = "chroma"
# Directory that is scanned for the ``*.txt`` source documents to index.
DATA_PATH = "data"


def generate_data_store():
    """Rebuild the vector store end to end.

    Loads the source documents, splits them into chunks, and writes the
    chunks to the Chroma store. Returns nothing.
    """
    save_to_chroma(split_text(load_documents()))


def load_documents():
    """Load the ``*.txt`` files matched under ``DATA_PATH``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``.
    """
    return DirectoryLoader(DATA_PATH, glob="*.txt").load()


def split_text(documents: list[Document]):
    """Split documents into overlapping character chunks and print a short report.

    Args:
        documents: The documents to split.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Preview one chunk as a sanity check. The index is clamped so that a
    # corpus producing fewer than 11 chunks no longer raises IndexError, and
    # the preview is skipped entirely when there are no chunks at all.
    if chunks:
        sample = chunks[min(10, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)

    return chunks


def save_to_chroma(chunks: list[Document]):
    """Replace the Chroma store at ``CHROMA_PATH`` with one built from ``chunks``.

    Any existing store directory is removed first, then the chunks are
    embedded with ``OpenAIEmbeddings`` and persisted to disk.
    """
    # Start from a clean slate: drop whatever is already on disk.
    store_already_exists = os.path.exists(CHROMA_PATH)
    if store_already_exists:
        shutil.rmtree(CHROMA_PATH)

    # Build the new store from the chunks and flush it to disk.
    embeddings = OpenAIEmbeddings()
    vector_store = Chroma.from_documents(
        chunks, embeddings, persist_directory=CHROMA_PATH
    )
    vector_store.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")




In [16]:
# Rebuild the Chroma store from the files under DATA_PATH. Note that any
# existing store at CHROMA_PATH is deleted first.
generate_data_store()

Split 2 documents into 30 chunks.
Mary High School to join their basketball team in 1999. Overall, James scored 2,657 points, 892 rebounds and 523 assists during his four years there.
{'source': 'data\\test_doc2.txt', 'start_index': 637}
Saved 30 chunks to chroma.


  warn_deprecated(


In [21]:
# Prompt sent to the chat model; `{context}` receives the retrieved chunks
# and `{question}` the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

# Retrieval settings: how many chunks to fetch, and the minimum relevance
# score the best hit must reach for the context to be considered usable.
TOP_K = 3
MIN_RELEVANCE = 0.7

# The query is set inline; edit this value to ask a different question.
query_text = "Who is Lebron James?"

# Prepare the DB.
embedding_function = OpenAIEmbeddings()
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

# Search the DB.
results = db.similarity_search_with_relevance_scores(query_text, k=TOP_K)

# The check reads the first result's score, so it assumes results come back
# best-first (consistent with the scores seen in this notebook's output).
if len(results) == 0 or results[0][1] < MIN_RELEVANCE:
    # Fail loudly instead of only printing: otherwise a "Run All" carries on
    # and asks the model to answer from empty or irrelevant context.
    raise ValueError("Unable to find matching results.")
    


In [22]:
# Show the retrieved (Document, relevance score) pairs for inspection.
results

[(Document(page_content='Who Is LeBron James? LeBron James is an American basketball player with the Los Angeles Lakers. James first garnered national attention as the top high school basketball player in the country. With his unique combination of size, athleticism and court vision, he became a four-time NBA MVP. After', metadata={'source': 'data\\test_doc2.txt', 'start_index': 0}),
  0.8754595713487021),
 (Document(page_content='his unique combination of size, athleticism and court vision, he became a four-time NBA MVP. After leading the Miami Heat to titles in 2012 and 2013, James returned to Cleveland and helped the franchise claim its first championship in 2016.', metadata={'source': 'data\\test_doc2.txt', 'start_index': 198}),
  0.7912372140746119),
 (Document(page_content="School team also earned the top national ranking that year. James would soon emerge as one of the National Basketball Association's leading players.", metadata={'source': 'data\\test_doc2.txt', 'start_index': 

In [23]:
# Join the retrieved chunks into a single context string, with a visible
# separator between chunks so their boundaries survive in the prompt.
context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=query_text)
print(prompt)

# The API key is read from the environment; a missing variable raises KeyError.
model = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    api_key=os.environ["OPENAI_API_KEY"],
)

# `predict` is deprecated in LangChain; `invoke` is its replacement and
# returns a message object whose `.content` holds the answer text.
response_text = model.invoke(prompt).content

# Report the answer together with the source file of each retrieved chunk.
sources = [doc.metadata.get("source", None) for doc, _score in results]
formatted_response = f"Response: {response_text}\nSources: {sources}"
print(formatted_response)

Human: 
Answer the question based only on the following context:

Who Is LeBron James? LeBron James is an American basketball player with the Los Angeles Lakers. James first garnered national attention as the top high school basketball player in the country. With his unique combination of size, athleticism and court vision, he became a four-time NBA MVP. After

---

his unique combination of size, athleticism and court vision, he became a four-time NBA MVP. After leading the Miami Heat to titles in 2012 and 2013, James returned to Cleveland and helped the franchise claim its first championship in 2016.

---

School team also earned the top national ranking that year. James would soon emerge as one of the National Basketball Association's leading players.

---

Answer the question based on the above context: Who is Lebron James?



  warn_deprecated(
  warn_deprecated(


Response: LeBron James is an American basketball player who currently plays for the Los Angeles Lakers. He is known for his size, athleticism, and court vision, and has been a four-time NBA MVP. He has won NBA titles with the Miami Heat in 2012 and 2013, and with the Cleveland Cavaliers in 2016.
Sources: ['data\\test_doc2.txt', 'data\\test_doc2.txt', 'data\\test_doc2.txt']


In [24]:
# Show the model's answer on its own, without the "Sources" line.
response_text

'LeBron James is an American basketball player who currently plays for the Los Angeles Lakers. He is known for his size, athleticism, and court vision, and has been a four-time NBA MVP. He has won NBA titles with the Miami Heat in 2012 and 2013, and with the Cleveland Cavaliers in 2016.'