### Project: Question- Answering on Private Documents

In [38]:
import os
from dotenv import load_dotenv, find_dotenv

# loading env variables from .env file
load_dotenv(find_dotenv(), override=True)

True

#### Loading Documents

In [2]:
def load_documents(file):
    """
    Load documents from a file.
    :param file: File to load
    :return: List of documents
    """
    extension = file.split(".")[-1]
    if extension == "txt":
        from langchain.document_loaders import TextLoader
        print(f"Loading text document {file}")
        loader = TextLoader(file)
    elif extension == "json":
        from langchain.document_loaders import JSONLoader
        print(f"Loading json document {file}")
        loader = JSONLoader(file)
    elif extension == "csv":
        from langchain.document_loaders import CSVLoader
        print(f"Loading csv document {file}")
        loader = CSVLoader(file)
    elif extension in ["doc","docx"]:
        from langchain.document_loaders import Docx2txtLoader
        print(f"Loading docx document {file}")
        loader = Docx2txtLoader(file)
    elif extension == "pdf":
        from langchain.document_loaders import PyPDFLoader
        print(f"Loading pdf document {file}")
        loader = PyPDFLoader(file)
    else:
        print("Document type not supported")
        return None
    data = loader.load()
    return data

def load_wikipedia(query, lang="en",load_max_docs=2):
    """
    Load wikipedia articles from the wikipedia API
    """
    from langchain.document_loaders import WikipediaLoader
    loader = WikipediaLoader(query, lang, load_max_docs)
    return loader.load()

#### Chunking Data

In [4]:
def chunk_data(data, chunk_size=300):
    """
    Split data into chunks based on \n or .
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=20)
    chunks = text_splitter.split_documents(data)
    return chunks

#### Calculating Cost

In [14]:
def calculating_cost(texts, model = 'text-embedding-ada-002', token_cost = 0.0004):
    """
    Calculates the cost of encoding the texts using the specified model.
    """
    import tiktoken
    enc = tiktoken.encoding_for_model(model)
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    total_cost = f'Total cost: {total_tokens /1000 * token_cost:.6f}$'
    return total_tokens, total_cost

#### Embedding and Uploading to a Vector Database (Pinecone)

In [20]:
def insert_or_fetch_embedding(index_name, chunks, model = "text-embedding-ada-002"):
    """
    Inserts or fetches a vector store from Pinecone.
    """
    import pinecone
    import time
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings(model=model)
    
    pinecone.init(api_key=os.environ.get("PINECONE_API_KEY"), environment=os.environ.get("PINECONE_API_ENV"))

    if index_name in pinecone.list_indexes():
        print(f"Index {index_name} already exists, fetching...")
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
    else:
        print(f"Index {index_name} does not exist, creating...")
        pinecone.create_index(index_name, dimension=1536, metric="cosine")
        while index_name not in pinecone.list_indexes():
            time.sleep(10)
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
    return vector_store

#### Delete Pinecone Index

In [27]:
def delete_pinecone_index(index_name = "all"):
    """
    Delete a pinecone index.
    """
    import pinecone
    
    pinecone.init(api_key=os.environ.get("PINECONE_API_KEY"), environment=os.environ.get("PINECONE_API_ENV"))

    if index_name=="all":
        indexes = pinecone.list_indexes()
        for index in indexes:
            pinecone.delete_index(index)
        print("Deleted pinecone indexes: " + str(indexes))
    elif index_name is not None:
        pinecone.delete_index(index_name)
        print("Deleted pinecone index: " + index_name)
    else:
        print("There are no index present")

#### Asking Questions and Getting Answers

In [65]:
def ask_ques_and_get_answer(vector_store, ques):
    """
    This function takes in a vector store and a question and returns the answer.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
    retriever = vector_store.as_retriever(search_type='similarity', search_params={'k': 5})

    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    answer = chain.run(ques)
    
    return answer

def ask_with_memory(vector_store, ques, chat_history=[]):
    """
    This function takes in a vector store, a question, and a chat history and returns the answer with the chat history.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.5)
    retriever = vector_store.as_retriever(search_type='similarity', search_params={'k': 3})

    chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)
    result = chain({'question': ques, 'chat_history': chat_history})
    chat_history.append((ques, result['answer']))
    
    return result, chat_history

#### Running Code

##### Running code from document files

In [12]:
files_path = []
directory_path = "files"
# Reading all the files from files folder
for file in os.listdir(directory_path):
    files_path.append(os.path.join(directory_path, file))

print(f"There are {len(files_path)} files in the folder")
files_path

There are 8 files in the folder


['files/attention_is_all_you_need.pdf',
 'files/react.pdf',
 'files/CDOC-110hdoc50.pdf',
 'files/state_of_the_union.txt',
 'files/sj.txt',
 'files/churchill_speech.txt',
 'files/us_constitution.pdf',
 'files/the_great_gatsby.docx']

In [33]:
import pandas as pd
stats = []
doc_chunks = []
for doc in files_path:
    data = load_documents(doc)
    chunks = chunk_data(data)
    doc_chunks.extend(chunks)
    total_tokens, total_cost = calculating_cost(chunks)
    stats.append([doc, len(chunks), total_tokens, total_cost])

Loading pdf document files/attention_is_all_you_need.pdf
Loading pdf document files/react.pdf
Loading pdf document files/CDOC-110hdoc50.pdf
Loading text document files/state_of_the_union.txt
Loading text document files/sj.txt
Loading text document files/churchill_speech.txt
Loading pdf document files/us_constitution.pdf
Loading docx document files/the_great_gatsby.docx


In [17]:
file_stats = pd.DataFrame(stats, columns=['file', 'chunks', 'tokens', 'cost'])
file_stats

Unnamed: 0,file,chunks,tokens,cost
0,files/attention_is_all_you_need.pdf,156,10232,Total cost: 0.004093$
1,files/react.pdf,428,33676,Total cost: 0.013470$
2,files/CDOC-110hdoc50.pdf,1336,68539,Total cost: 0.027416$
3,files/state_of_the_union.txt,166,8089,Total cost: 0.003236$
4,files/sj.txt,53,2766,Total cost: 0.001106$
5,files/churchill_speech.txt,84,4621,Total cost: 0.001848$
6,files/us_constitution.pdf,177,17660,Total cost: 0.007064$
7,files/the_great_gatsby.docx,1104,69326,Total cost: 0.027730$


In [39]:
# deleing all pinecone existing index
delete_pinecone_index()

Deleted pinecone indexes: ['document-index']


In [40]:
# creating an index and inserting documents into it
index_name = "document-index"
vector_store = insert_or_fetch_embedding(index_name, doc_chunks)

Index document-index does not exist, creating...


In [41]:
# Single question query
ques = 'What is the whole document about?'
answer = ask_ques_and_get_answer(vector_store, ques)
print(answer)

The document is the Constitution of the United States of America.


In [43]:
# Ask any no of questions till we quit
import time
i = 1
print('Write Quit or Exit to quit.')
while True:
    q = input(f'Question #{i}: ')
    i = i + 1
    if q.lower() in ['quit', 'exit']:
        print('Quitting ... bye bye!')
        time.sleep(2)
        break
    
    answer = ask_ques_and_get_answer(vector_store, q)
    print(f'Question: {q}\nAnswer: {answer}')
    print(f'\n {"-" * 50} \n')

Write Quit or Exit to quit.
Question: What does US constitution states?
Answer: The United States Constitution states the purpose of forming a more perfect union, establishing justice, ensuring domestic tranquility, providing for the common defense, promoting the general welfare, and securing the blessings of liberty for ourselves and future generations. It also establishes the Constitution as the supreme law of the land, along with laws made in accordance with it and treaties of the United States. The Constitution binds judges in every state.

 -------------------------------------------------- 

Question: What are the different documents data you have?
Answer: Based on the provided context, it is not clear what specific documents or data are being referred to. The context mentions "acts, records, and proceedings," but it does not provide any specific information about the content or nature of these documents. Therefore, it is not possible to determine the different documents or data 

##### Downloading public content from Wikipedia and perform Q&A

In [44]:
# deleting all indexes
delete_pinecone_index()

Deleted pinecone indexes: ['document-index']


In [46]:
data = load_wikipedia('ChatGPT', 'en')
chunks = chunk_data(data)
index_name = 'chatgpt'
vector_store = insert_or_fetch_embedding(index_name, chunks)

Index chatgpt does not exist, creating...


In [54]:
query = "What is ChatGPT?"
answer = ask_ques_and_get_answer(vector_store, query)
answer

'ChatGPT, short for Chat Generative Pre-trained Transformer, is a chatbot developed by OpenAI. It is a language model-based chatbot that allows users to have conversations with it. It is built on the transformer architecture and has been fine-tuned specifically for conversational applications. ChatGPT was released as a research preview and is available for free, but there is also a paid version called "ChatGPT Plus" that offers additional features and priority access to newer updates.'

In [61]:
chat_history = []
query = "How many model ChatGPT is having? List down the names."
answer, chat_history = ask_with_memory(vector_store, query, chat_history)
answer

{'question': 'How many model ChatGPT is having? List down the names.',
 'chat_history': [('How many model ChatGPT is having? List down the names.',
   'ChatGPT is based on two GPT foundation models: GPT-3.5 and GPT-4. These models were fine-tuned specifically for conversational usage to create the chatbot product.')],
 'answer': 'ChatGPT is based on two GPT foundation models: GPT-3.5 and GPT-4. These models were fine-tuned specifically for conversational usage to create the chatbot product.'}

In [62]:
chat_history = []
query = "How many model ChatGPT is having? List down the names."
answer, chat_history = ask_with_memory(vector_store, query, chat_history)
answer

[('How many model ChatGPT is having? List down the names.',
  'ChatGPT is based on two GPT foundation models: GPT-3.5 and GPT-4. These models were fine-tuned specifically for conversational usage to create the chatbot product.')]

In [66]:
question = 'Give me the model names from the above answer'
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)

The model names mentioned in the previous answer are GPT-1, GPT-2, GPT-3, GPT-3.5, and GPT-4.
[('How many model ChatGPT is having? List down the names.', 'ChatGPT is based on two GPT foundation models: GPT-3.5 and GPT-4. These models were fine-tuned specifically for conversational usage to create the chatbot product.'), ('Multiply the model count by 2', 'The information provided does not mention the "model count" specifically. Therefore, I do not have the necessary context to determine the result of multiplying the model count by 2.'), ('Multiply the number count by 2', "I'm sorry, but I don't have enough information to answer your question."), ('Give me the model names from the above answer', 'The model names mentioned in the previous answer are GPT-1, GPT-2, GPT-3, GPT-3.5, and GPT-4.')]


#### Ask with Memory Loop


In [None]:
import time
i = 1

chat_history = []

print("Write Quit or Exit to quit")
while True:
    q = input(f"Question #{i}")
    i = i + 1
    if q.lower() in ["quit","exit"]:
        print("Quitting")
        time.sleep(2)
        break
    result, _ = ask_with_memory(vector_store, q, chat_history)
    print (result['answer'])
    print("----------------------------------------------------------------------")