### QueryMyDoc.ipynb
Q&A over your own documents using LangChain, OpenAI models (gpt-3.5-turbo and text-davinci-003) and the Chroma vector DB
    
[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/rashlab/AI-Notes/blob/main/langchain/QueryMyDoc.ipynb)
    


In [None]:
!pip install chromadb
!pip install langchain

In [14]:
import getpass
import os

import requests
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Do not paste a real key into the notebook: it would be saved with the file.
# Reuse OPENAI_API_KEY when the environment already provides it; otherwise
# prompt for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [17]:
# Download the text file we will use (state_of_the_union.txt) once and cache it
# next to the notebook.
# NOTE: if a copy downloaded by an earlier version of this cell exists, delete it
# first; it may contain the GitHub HTML page instead of the speech text.
input_file = 'state_of_the_union.txt'
if not os.path.exists(input_file):
    # Raw-content URL: the github.com/.../blob/... address serves the HTML
    # viewer page, not the text file itself.
    data_url = 'https://raw.githubusercontent.com/rashlab/AI-Notes/main/langchain/state_of_the_union.txt'
    response = requests.get(data_url, timeout=30)
    # Fail before touching the disk so an error page is never cached as the
    # document (the existence check above would otherwise keep reusing it).
    response.raise_for_status()
    with open(input_file, 'w', encoding='utf-8') as f:
        f.write(response.text)

# Read the cached file back with the same encoding it was written with.
loader = TextLoader(input_file, encoding='utf-8')

documents = loader.load()

# Split the loaded documents into chunks (chunk_size=1000, no overlap).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# Embed the chunks with OpenAI embeddings and index them in a Chroma store.
# No persistence is configured, so the index is rebuilt on every run.
embeddings = OpenAIEmbeddings()
db = Chroma.from_documents(texts, embeddings)

# That's creating the index. Then, we expose this index in a retriever interface.
retriever = db.as_retriever()

# The LLM wrappers, the QA chains and `query` are created in the next cell;
# they were previously duplicated here with identical arguments.

Using embedded DuckDB without persistence: data will be transient


In [18]:

# Two OpenAI model wrappers, both created with temperature 0.0.
llm_35Turbo = OpenAI(temperature=0.0, model_name="gpt-3.5-turbo")
llm_davinci003 = OpenAI(temperature=0.0, model_name="text-davinci-003")

# Every chain below uses the "stuff" chain type on top of the same retriever.
chain_options = {"chain_type": "stuff", "retriever": retriever}

# Plain question-answering chains, one per model.
qa_35Turbo = RetrievalQA.from_chain_type(llm_35Turbo, **chain_options)
qa_davinci003 = RetrievalQA.from_chain_type(llm_davinci003, **chain_options)

# Question-answering chains that also report their sources, one per model.
qaWithSource_35Turbo = RetrievalQAWithSourcesChain.from_chain_type(llm_35Turbo, **chain_options)
qaWithSource_davinci003 = RetrievalQAWithSourcesChain.from_chain_type(llm_davinci003, **chain_options)

# The question that the following cells send to each chain.
query = "should we raise the corporate tax?"

In [20]:
# Send `query` to the RetrievalQA chain built on text-davinci-003; the value
# returned by `run` is displayed as the cell output.
qa_davinci003.run(query)

" Yes, I've proposed a 15% minimum tax rate for corporations."

In [19]:
# Send the same `query` to the RetrievalQA chain built on gpt-3.5-turbo so its
# answer can be compared with the text-davinci-003 one.
qa_35Turbo.run(query)

'The context suggests that the speaker, President Biden, believes that corporations and the wealthiest Americans should start paying their fair share of taxes, and has proposed a 15% minimum tax rate for corporations. However, it is not explicitly stated whether or not the corporate tax should be raised.'

In [22]:
# Call the sources-aware chain (text-davinci-003) with the question under the
# "question" key. With return_only_outputs=False the displayed dict contains
# the input question alongside the 'answer' and 'sources' entries.
qaWithSource_davinci003({"question": query}, return_only_outputs=False)

{'question': 'should we raise the corporate tax?',
 'answer': ' The president did not mention raising the corporate tax.\n',
 'sources': 'state_of_the_union.txt'}

In [23]:
# Call the sources-aware chain (gpt-3.5-turbo) with the question under the
# "question" key. With return_only_outputs=False the displayed dict contains
# the input question alongside the 'answer' and 'sources' entries.
qaWithSource_35Turbo({"question": query}, return_only_outputs=False)

{'question': 'should we raise the corporate tax?',
 'answer': 'The president proposes to raise the corporate tax to ensure that corporations and the wealthiest Americans start paying their fair share. He has proposed a 15% minimum tax rate for corporations and closing loopholes so the very wealthy don’t pay a lower tax rate than a teacher or a firefighter. However, he has also stated that under his plan, nobody earning less than $400,000 a year will pay an additional penny in new taxes. \n',
 'sources': 'state_of_the_union.txt'}