In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import pickle

In [2]:
# Load Data
# Read the source text file through the Unstructured loader; the trailing
# expression shows how many documents came back.
source_path = "files/state_of_the_union.txt"
loader = UnstructuredFileLoader(source_path)
raw_documents = loader.load()
len(raw_documents)

1

In [18]:
# Split text
# Cut the loaded documents into chunks of at most ~1000 characters with no
# overlap between neighbouring chunks; the trailing expression shows the
# resulting chunk count.
# Alternative splitter that was tried:
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
documents = text_splitter.split_documents(raw_documents)
len(documents)

41

In [19]:
import os

import openai
from dotenv import find_dotenv, load_dotenv

# Locate the nearest .env file and load its entries into the process environment.
_ = load_dotenv(find_dotenv())

# Configure the module-level openai settings from the environment. Indexing
# os.environ (rather than .get) raises KeyError right away if a variable is unset.
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.api_base = os.environ["OPENAI_API_BASE"]
openai.api_type = os.environ["OPENAI_API_TYPE"]
openai.api_version = "2023-06-01-preview"  # this version is required for annotations

# Model identifiers, also taken from the environment. `model` is later passed
# as the deployment name when the chat model is constructed.
model = os.environ["CHAT_MODEL_NAME"]
embedding_model = os.environ["EMBEDDING_MODEL_NAME"]

In [20]:
# Load Data to vectorstore
# Embed every chunk and index the resulting vectors in a FAISS store.
# `deployment` is set to the embedding deployment read from the environment
# (EMBEDDING_MODEL_NAME) so this step uses the configured deployment instead
# of silently relying on the library's default name.
# chunk_size=1 sends one text per embeddings request -- presumably to stay
# within the Azure endpoint's batch limit; TODO confirm it is still required.
embeddings = OpenAIEmbeddings(deployment=embedding_model, chunk_size=1)
vectorstore = FAISS.from_documents(documents, embeddings)

In [7]:
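# Save vectorstore (optional; currently disabled)
# NOTE: only unpickle files you created yourself. LangChain's FAISS store also
# appears to provide save_local()/load_local() for persistence -- verify
# against the installed version before switching.
# with open("vectorstore.pkl", "wb") as f:
#     pickle.dump(vectorstore, f)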

In [21]:
from langchain.prompts.prompt import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import RetrievalQA


from langchain.chat_models import AzureChatOpenAI
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

In [28]:
from langchain.memory import ConversationBufferMemory

# Azure-hosted chat model, configured from the openai settings and the
# deployment name loaded earlier in the notebook.
chat = AzureChatOpenAI(
    temperature=0,
    max_tokens=500,
    openai_api_base=openai.api_base,
    openai_api_version=openai.api_version,
    deployment_name=model,
    openai_api_key=openai.api_key,
    openai_api_type=openai.api_type,
)

# Baseline retriever: return the 3 most similar chunks per question.
# The note after the follow-up questions describes this k=3 setting as missing
# context, and a later cell rebuilds the chain with k=5 for comparison, so this
# cell must stay at k=3 for the recorded narrative to be reproducible.
retriever = vectorstore.as_retriever(search_type='similarity', search_kwargs={'k': 3})

# Conversation memory stored under the "chat_history" key so follow-up
# questions can refer back to earlier turns.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Conversational retrieval chain wiring together the chat model, the retriever
# and the memory defined above.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=chat,
    retriever=retriever,
    memory=memory,
    chain_type="stuff",
    verbose=False
)


In [23]:
# First question: asked with empty conversation memory.
query = "What did the president say about Ketanji Brown Jackson?"
result = qa_chain(dict(question=query))
# Show only the generated answer from the chain's result mapping.
result["answer"]

"The President nominated Circuit Court of Appeals Judge Ketanji Brown Jackson to serve on the United States Supreme Court. He described her as one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and a consensus builder. He also mentioned that she comes from a family of public school educators and police officers and has received broad support since her nomination."

In [24]:
# Follow-up question: "he"/"she" can only be resolved through the chat history
# kept in the chain's memory. Spelling corrected ("succeeded") so the question
# matches the one asked in the k=5 chat session and the two runs stay comparable.
query = "Did he mention who she succeeded?"
result = qa_chain({"question": query})
result["answer"]

"The President did not mention Ketanji Brown Jackson's predecessor."

In [25]:
# Third turn: challenge the previous answer to see whether the model revises it.
query = "Are you sure you are right?"
result = qa_chain(dict(question=query))
# Show only the generated answer from the chain's result mapping.
result["answer"]

"I am certain that the President did not mention Ketanji Brown Jackson's predecessor in the given context."

With k = 3, the retriever misses the context needed to answer some questions. Let's bump the number of retrieved chunks (k) to 5.

Alternatively, we could increase chunk_size.

In [30]:
from langchain.memory import ConversationBufferMemory

# Rebuild the retriever so each question pulls in the 5 most similar chunks.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

# Start from a fresh, empty conversation history for the new chain.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)

# Recreate the conversational retrieval chain on top of the new retriever and memory.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=chat,
    retriever=retriever,
    chain_type="stuff",
    memory=memory,
    verbose=False,
)

In [29]:
# Minimal interactive chat over the indexed document.
# Type a question and press Enter; type "quit" or "exit" (any casing,
# surrounding whitespace ignored) to stop.
print("Chat with your docs!")
while True:
    try:
        query = input().strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the kernel was interrupted: leave the loop
        # cleanly instead of surfacing a traceback.
        break
    if query.lower() in ("quit", "exit"):
        break
    if not query:
        # Nothing was typed; don't spend a model call on an empty question.
        continue
    result = qa_chain({"question": query})
    print("User:")
    print(query)
    print("AI:")
    print(result["answer"])

Chat with your docs!
User:
What did the president say about Ketanji Brown Jackson?
AI:
The President nominated Circuit Court of Appeals Judge Ketanji Brown Jackson to serve on the United States Supreme Court, and he praised her as one of our nation’s top legal minds who will continue Justice Breyer’s legacy of excellence. He also mentioned that she is a former top litigator in private practice, a former federal public defender, and a consensus builder who has received a broad range of support since her nomination.
User:
Did he mention who she succeeded?
AI:
The President mentioned Justice Stephen Breyer as the predecessor of Ketanji Brown Jackson.
