In [1]:
######################################
### QUERY PDFs WITH LANGCHAIN & OPENAI
######################################

"""
OBJECTIVE:
- Explore options information retrival from text-based documents
- Status: SUCCESS!
    - the LANGCHAIN and OPENAI's GPT model was able to answer correctly a wide number of questions from the document
    - it can be used to automate summarization of standardized documents
        - dates, names, entities, key topics, sentiments, summarizations,...
        - possibly, we can make assesement if a set of documents provide support or not for some case of interest
        
BASED ON:
1.) https://www.udemy.com/course/langchain-guide-next-gen-chatgpt-llms-apps-with-langchain/learn/lecture/38797056#overview
2.) https://towardsdatascience.com/4-ways-of-question-answering-in-langchain-188c6707cc5a

TESTS:
1.) LLM's HALLUCINATION... provide question without the input document... and you still get an answer!?!??!

2.) VECTOR SEARCH
- document needs to be EMBEDED and put into a VECTOR DATABASE (ElasticVectorSearch, Pinecone, Weaviate, FAISS)
- this allows to perform similarity search between embeddings of the docuemnt and that of the query.
- test shows ability of the GPT model to answer a wide range of questions
- it returns "no information available" if question asks about element not present in the document
- we can also ask model to SUMMARIZE and asses SENTIMENT of the document with the PROMPT!!!

3.) CHAIN-TYPES
- key issue: base approach works with all data which can breach the input-rate-limits (1000-characters). 
    - thus you need to batch the input text into smaller chunks manually or use chain-types that do it automatically
- available chain types: "staff", "map_reduce", "refine", "map-rerank"
    - "staff" by default uses the whole document; all other chunk the data automatically
        - detailes of each chain-type are BELOW
- map_reduce seems to be the stronges possibly because it does use the whole information directly... although in chunks
    - other approaches respondend not only with names of the authors of the paper but also with authors found in CITATIONS!?!!?
    
4.) RetrievalQA
- A way to address the issue of working with ALL data. 
- it does initial seach for the most relevant chunks of the data!!! 
- and only those chunks are sent to OpenAI!!!
- provided correct answer
- two parameters that control the quality of answers
    - search_type="mmr", "similarity"
    - search_kwargs={"k":2} # controls number of text segments to be exteacted and sent to OpenAI
        - "similarity" required K=5 to produce consistently the correct answer
        - "mmr" required K=2!!! - cheaper and faster to run!!!
    
5.) VectorstoreIndexCreator
- wrapper on the above (4)
- requires specific data loader which i had issues to install on my comp.
- TBC...

6.) ConversationalRetrievalChain
- Like RetrievalQA, but allows to provide in the query the history of the discussion
- it was able to distribuish between the author of the paper and the authors listed in CITATIONS!!!
- Unfortunatelly, the example data does not provide interesting test case thus:
- TBC...

"""

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 KB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp39-cp39-macosx_11_0_arm64.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m565.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4


In [None]:
#####################
### ENVIRONMENT SETUP
#####################

#!pip install langchain
#!pip install openai
#!pip install PyPDF2
#!pip install faiss-cpu # faiss-cpu, faiss-gpu
#!pip install tiktoken

# Get your API keys from openai, you will need to create an account. 
# Here is the link to get the keys: https://platform.openai.com/account/billing/overview

#import os
#os.environ["OPENAI_API_KEY"] = ''

In [9]:
#################
### LOAD PACKAGES
#################

from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS

from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI

from langchain.chains import LLMChain

from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

**Without data access**

In [104]:
###########################
### LLM's HALLUCINATIONs!!! -- even though no doc was provided the model still generated a response!!!
###########################

template = """ {question}"""

prompt_template = PromptTemplate(input_variables=["question"], 
                                 template=template)
llm = OpenAI(temperature=0.0)
chain = LLMChain(llm=llm, prompt=prompt_template)

chain.run("who are the authors of the article")

'\n\nThe authors of the article are Dr. David A. Sousa and Dr. Thomas R. Guskey.'

**With access to data**

In [98]:
####################
### DATA PREPARATION
####################

reader = PdfReader('2023_GPT4All_Technical_Report.pdf')
print(reader)

# read data from the file and put them into a variable called raw_text
raw_text = ''
for i, page in enumerate(reader.pages):
    text = page.extract_text()
    if text:
        raw_text += text
        
print("raw_text\n\n", raw_text)
print("raw_text[:100]\n\n", raw_text[:100])

# We need to split the text that we read into smaller chunks so that during information retreival we don't hit the token size limits. 

text_splitter = CharacterTextSplitter(separator = "\n",
                                      chunk_size = 1000,
                                      chunk_overlap  = 200,
                                      length_function = len)

texts = text_splitter.split_text(raw_text)

print(len(texts), texts[0], texts[1])

<PyPDF2._reader.PdfReader object at 0x1773e2be0>
raw_text

 GPT4All: Training an Assistant-style Chatbot with Large Scale Data
Distillation from GPT-3.5-Turbo
Yuvanesh Anand
yuvanesh@nomic.aiZach Nussbaum
zanussbaum@gmail.com
Brandon Duderstadt
brandon@nomic.aiBenjamin Schmidt
ben@nomic.aiAndriy Mulyar
andriy@nomic.ai
Abstract
This preliminary technical report describes the
development of GPT4All, a chatbot trained
over a massive curated corpus of assistant in-
teractions including word problems, story de-
scriptions, multi-turn dialogue, and code. We
openly release the collected data, data cura-
tion procedure, training code, and final model
weights to promote open research and repro-
ducibility. Additionally, we release quantized
4-bit versions of the model allowing virtually
anyone to run the model on CPU.
1 Data Collection and Curation
We collected roughly one million prompt-
response pairs using the GPT-3.5-Turbo OpenAI
API between March 20, 2023 and March 26th,
2023. To do this, 

In [99]:
###################################
### SET EMBEDDINGS & VECTOR DATABSE
###################################

embeddings = OpenAIEmbeddings()
docsearch = FAISS.from_texts(texts, embeddings)

In [100]:
#################
### VECTOR SEARCH
#################

chain = load_qa_chain(OpenAI(), chain_type="stuff")

query = "who are the authors of the article?"

docs = docsearch.similarity_search(query)

"""
for i in docs:
    print(i,"\n\n")
    
print(docs[0], type(docs[0]), type(docs))
"""

#chain.run(input_documents=docs, question=query) # use ALL chunks
chain.run(input_documents=[docs[3]], question=query) # use ONLY chunk with the information -- for some reason it was the last one>?!?!?!?

queries = ["What was the cost of training the GPT4all model?",
           "How was the model trained?",
           "what was the size of the training dataset?",
           "How is this different from other models?",
           "What is Google Bard?",
           "Summarize the document content",
           "What is the sentiment of the document?",
           "Based on this paper, why should people avoid using AI?"]

for query in queries:
    docs = docsearch.similarity_search(query)
    result = chain.run(input_documents=docs, question=query)
    print("Query:", query, "\n", "Result:", result, "\n\n")
    time.sleep(20) # due to the query rate limits in OpenAI -- makes clearer output

' The authors of the article are Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin Schmidt, and Andriy Mulyar.'

In [102]:
###############
### CHAIN-TYPES
###############

# USE ALTERNATIVE DATA LODER -- this one does data-chunking automatically!

#! pip install pypdf
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("2023_GPT4All_Technical_Report.pdf")
#loader = PdfReader("2023_GPT4All_Technical_Report.pdf") # EACH LOADER HAS DIFFERENT BEHAVIOUR AND SET OF ASSOCiATED FUCNTIONS
documents = loader.load() # IT LOADs AND GROUPS INTO CHUNKS!!!

# CHECK THE DOCs
for i in documents:
    print(i, "\n\n")

page_content='GPT4All: Training an Assistant-style Chatbot with Large Scale Data\nDistillation from GPT-3.5-Turbo\nYuvanesh Anand\nyuvanesh@nomic.aiZach Nussbaum\nzanussbaum@gmail.com\nBrandon Duderstadt\nbrandon@nomic.aiBenjamin Schmidt\nben@nomic.aiAndriy Mulyar\nandriy@nomic.ai\nAbstract\nThis preliminary technical report describes the\ndevelopment of GPT4All, a chatbot trained\nover a massive curated corpus of assistant in-\nteractions including word problems, story de-\nscriptions, multi-turn dialogue, and code. We\nopenly release the collected data, data cura-\ntion procedure, training code, and final model\nweights to promote open research and repro-\nducibility. Additionally, we release quantized\n4-bit versions of the model allowing virtually\nanyone to run the model on CPU.\n1 Data Collection and Curation\nWe collected roughly one million prompt-\nresponse pairs using the GPT-3.5-Turbo OpenAI\nAPI between March 20, 2023 and March 26th,\n2023. To do this, we first gathered a d

In [47]:
##############################
### CHAIN TYPES - DESCRIPTION:
##############################

"""
-- GENRAL ISSUE: all methods ultimately use all data... possibly chanked but ALL

chain_type="stuff" <<DEFAULT>>:
- uses ALL of the text from the documents in the prompt. 
- Might run into the MAX_LENGTH of the prompt error 
 
"map_reduce":
- It separates texts into batches (as an example, you can define batch size in llm=OpenAI(batch_size=5)),
- feeds each batch with the question to LLM separately - final answer based on the answers from each batch.

"refine":
It separates texts into batches, feeds the first batch to LLM, and feeds the answer and the second batch to LLM.
It refines the answer by going through all the batches.

"map-rerank":
It separates texts into batches, feeds each batch to LLM, returns a score of how fully it answers the question,
and comes up with the final answer based on the high-scored answers from each batch.
"""

'\nchain_type="stuff" <<DEFAULT>> \n- uses ALL of the text from the documents in the prompt. \n- Might run into the MAX_LENGTH of the prompt error \n \n"map_reduce: It separates texts into batches (as an example, you can define batch size in llm=OpenAI(batch_size=5)), feeds each batch with the question to LLM separately, and comes up with the final answer based on the answers from each batch.\nrefine : It separates texts into batches, feeds the first batch to LLM, and feeds the answer and the second batch to LLM. It refines the answer by going through all the batches.\nmap-rerank: It separates texts into batches, feeds each batch to LLM, returns a score of how fully it answers the question, and comes up with the final answer based on the high-scored answers from each batch.\n'

In [59]:
##############################
### CHAIN TYPES - COMPARISONs:
##############################

#chain_types = ["stuff", "map_reduce", "refine", "map-rerank"] # "stuff" -- LIMIT ERROR!!!
chain_types = ["map_reduce", "refine", "map-rerank"]

from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
import time

for chain_type in chain_types:
    try:
        print(chain_type)
        chain = load_qa_chain(llm=OpenAI(), chain_type=chain_type)
        query = "who are the authors of the article?"
        response = chain.run(input_documents = documents, question = query)
        print("response", chain_type, ":", response)
    except:
        print("SOME ERROR -- Possibly Rate Limit")
        pass
    time.sleep(60)      

# THIS LISTS ALL PEOPLE IN THE TEXT allso those in the citations!!!

map_reduce
response map_reduce :  The authors of the article are Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin Schmidt, and Andriy Mulyar.
refine
response refine : 

The authors of the article are Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin Schmidt, Andriy Mulyar, Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen, Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, Tatsunori B. Hashimoto, Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth ´ee Lacroix, Baptiste Rozi `ere, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample, Yizhong Wang, Yeganeh Kordi, Swaroop Mishra, Alisa Liu, Noah A. Smith, Daniel Khashabi, and Hannaneh Hajishirzi.
map-rerank
SOME ERROR -- Possibly Rate Limit


In [None]:
######################################
### RESULTS of CHAIN_TYPES comparison:
######################################

"""
response EXPECTED:
- "Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin Schmidt, and Andriy Mulyar"

response "staff":
- works with all data in the document thus runs into input-length-limit

response "map_reduce": -- listed also CITATIONS in the first run but not in the subsequent ones?!?!?!?!?!?
- The authors of the article are Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin Schmidt, and Andriy Mulyar.

response "refine": -- lists also CITATIONS!!!.. also in subsequent ones...
- The authors of the article are Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin Schmidt, Andriy Mulyar...
- , Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen, Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, Tatsunori B. Hashimoto, Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth ´ee Lacroix, Baptiste Rozi `ere, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample, Yizhong Wang, Yeganeh Kordi, Swaroop Mishra, Alisa Liu, Noah A. Smith, Daniel Khashabi, and Hannaneh Hajishirzi.

rsponse "map-rerank": -- lists also CITATIONS!!!.. also in subsequent ones...
- The authors of the article are Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin Schmidt, Andriy Mulyar
- , Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, Tatsunori B. Hashimoto, Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth ée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample, Yizhong Wang, Yeganeh Kordi, Swaroop Mishra, Alisa Liu, Noah A. Smith, Daniel Khashabi, and Hannaneh Hajishirzi.
"""


In [None]:
##########################
### For multiple documents
##########################

loaders = [....]
documents = []
for loader in loaders:
    documents.extend(loader.load())

In [75]:
###############
### RetrievalQA
###############

"""
- this addresses the issue of working with ALL data. 
- it does initial seach for the most relevant chunks of the data!!! 
- and only those chunks are sent to OpenAI!!!
"""
#!pip install chromadb

from langchain.chains import RetrievalQA
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# split the documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
# select which embeddings we want to use
embeddings = OpenAIEmbeddings()
# create the vectorestore to use as the index
#db = Chroma.from_documents(texts, embeddings)
db = FAISS.from_documents(texts, embeddings)

# expose this index in a retriever interface
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k":2}) #"mmr", "similarity"
# create a chain to answer questions 
qa = RetrievalQA.from_chain_type(llm=OpenAI(), 
                                 chain_type="stuff", 
                                 retriever=retriever, 
                                 return_source_documents=False) # True
query = "who are the authors of the article?"
result = qa({"query": query})
display(result)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-g0rg9a7HAzehrPHro1S6kMLk on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-g0rg9a7HAzehrPHro1S6kMLk on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/

{'query': 'who are the authors of the article?',
 'result': ' The authors of the article are Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin Schmidt, and Andriy Mulyar.'}

In [70]:
#######################
### RetrievalQA RESULTS
#######################

# "search_type="similarity", search_kwargs={"k":2} -- WEAK - shows only soem citations not the authors of the paper!!!

"""
{'query': 'who are the authors of the article?',
 'result': ' The authors of the article are Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, Tatsunori B. Hashimoto, Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth´ee Lacroix, Baptiste Rozi`ere, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample.'}
 """

# search_type="similarity", search_kwargs={"k":5} -- CORRECT RESULTS!!!

{'query': 'who are the authors of the article?',
 'result': ' The authors of the article are Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin Schmidt, and Andriy Mulyar.'}


# search_type="mmr", search_kwargs={"k":2} -- CORRECT RESULTS!!!

{'query': 'who are the authors of the article?',
 'result': ' The authors of the article are Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin Schmidt, and Andriy Mulyar.'}

In [91]:
#####################################
### Method 3: VectorstoreIndexCreator
#####################################

# REQUIRES CHROMADB, which i cannot install on my MAC!!!!!
#! pip install chromadb, pydantic-settings

from langchain.indexes import VectorstoreIndexCreator

index = VectorstoreIndexCreator().from_loaders([loader])
#index = VectorstoreIndexCreator().from_documents([docsearch])
query = "who are the authors of the article?"
index.query(llm=OpenAI(), chain_type="stuff", question=query)

ImportError: Could not import chromadb python package. Please install it with `pip install chromadb`.

In [97]:
################################
### ConversationalRetrievalChain
################################

from langchain.chains import ConversationalRetrievalChain

# split the documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
# select which embeddings we want to use
embeddings = OpenAIEmbeddings()
# create the vectorestore to use as the index
#db = Chroma.from_documents(texts, embeddings)
db = FAISS.from_documents(texts, embeddings)

# expose this index in a retriever interface
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k":2}) #"mmr", "similarity"
# create a chain to answer questions 
qa = ConversationalRetrievalChain.from_llm(OpenAI(), retriever)

chat_history = []
query = "who are the authors of the article?"
result = qa({"question": query, "chat_history": chat_history})
display(result)

chat_history = [(query, result["answer"])]
query = "who are the authors of the citations in the article?"
result = qa({"question": query, "chat_history": chat_history})
display(result)

{'question': 'who are the authors of the article?',
 'chat_history': [],
 'answer': ' The authors of the article are Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin Schmidt, and Andriy Mulyar.'}

{'question': 'who are the authors of the citations in the article?',
 'chat_history': [('who are the authors of the article?',
   ' The authors of the article are Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin Schmidt, and Andriy Mulyar.')],
 'answer': ' Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, Tatsunori B. Hashimoto, Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample, and Yizhong Wang, Yeganeh Kordi, Swaroop Mishra, Alisa Liu, Noah A. Smith, Daniel Khashabi, and Hannaneh Hajishirzi.'}

In [None]:
print(embeddings, 
embeddings.Config(), "\n\n",
embeddings.allowed_special, "\n\n",
embeddings.chunk_size, "\n\n",
embeddings.disallowed_special, "\n\n",
embeddings.embed_documents, "\n\n",
embeddings.embed_query, "\n\n",
embeddings.embedding_ctx_length, "\n\n",
embeddings.headers, "\n\n",
embeddings.model, "\n\n",
embeddings.model_kwargs, "\n\n",
embeddings.schema(), "\n\n",
embeddings.schema_json(), "\n\n",
embeddings.tiktoken_model_name, "\n\n",
embeddings.validate, "\n\n",
embeddings.validate_environment, "\n\n",
     )

print(dir(embeddings))

print(docsearch, docsearch.docstore, dir(docsearch))

print(dir(docsearch.docstore), docsearch.docstore.search("who is the author?"))