# QA using a Retriever
https://python.langchain.com/docs/use_cases/question_answering/how_to/vector_db_qa

In [2]:
###### chromadb is not compatible with latest pydantic and fastapi
# pydantic.errors.PydanticImportError: `BaseSettings` has been moved to the `pydantic-settings` package. See https://docs.pydantic.dev/2.3/migration/#basesettings-has-moved-to-pydantic-settings for more details.
# !pip install chromadb
# !pip install fastapi==0.99.1
# !pip install pydantic==1.10.0
# NOTE: you have to restart jupyter kernel to reload lib

!pip install tiktoken  # for OpenAIEmbeddings.

Collecting tiktoken
  Obtaining dependency information for tiktoken from https://files.pythonhosted.org/packages/f4/2e/0adf6e264b996e263b1c57cad6560ffd5492a69beb9fd779ed0463d486bc/tiktoken-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading tiktoken-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.5.1


## Example 1: Basic QA with OpenAI
https://python.langchain.com/docs/use_cases/question_answering/

1. Load: Specify a DocumentLoader to load in your unstructured data as Documents. A Document is a piece of text (the page_content) and associated metadata.
   There are many [Loaders](https://integrations.langchain.com/) to choose from, for example:  
   `documents = TextLoader(....).load()`  
2. Split: Split the Document into chunks for embedding and vector storage.  
   `texts = CharacterTextSplitter(...).split_documents(documents)`
3. Store: To be able to look up our document splits, we first need to store them where we can later look them up.  
   The most common way to do this is to embed the contents of each document then store the embedding and document in a vector store,  
   with the embedding being used to index the document.  
   `vectorstore = FAISS.from_documents(texts, embeddings)`

You can find the available integrations [here](https://integrations.langchain.com/):

- loader: such as TextLoader
- embeddings: such as OpenAIEmbeddings, HuggingFaceEmbeddings
- vectorstore: such as FAISS and Chroma (chromadb)



In [7]:
import os

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS

# 1. Load: read the source text into Document objects.
loader = TextLoader(os.path.abspath("./mydata/state_of_the_union.txt"), encoding="utf-8")
documents = loader.load()

# 2. Split: cut the documents into 1000-character chunks with no overlap.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# The OpenAI API key is kept in a local file so that it is never hard-coded in
# the notebook; surrounding whitespace/newlines are stripped.
with open('./mydata/openai_api_key.txt', 'r', encoding="utf-8") as file:
    openai_api_key = file.read().strip()

# 3. Store: embed every chunk and index it in a vector store.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
# docsearch = Chroma.from_documents(texts, embeddings) # Chromadb is bugged
vectorstore = FAISS.from_documents(texts, embeddings)

# 4. QA chain: answer questions with the OpenAI LLM, using the vector store as retriever.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(openai_api_key=openai_api_key),
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
)

In [8]:
# Ask the retrieval QA chain a question; the last expression is shown as the cell output.
query = "What did the president say about Ketanji Brown Jackson"
answer = qa.run(query)
answer

" The President said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, from a family of public school educators and police officers, a consensus builder, and has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans."