### Importing libraries and defining API key 

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
import os
import openai
import sys
# Take the OpenAI API key from the environment rather than hard-coding it in
# the notebook. If the variable is not set, ask for it once with a hidden
# prompt so the key is never stored in the saved notebook.
if not os.environ.get('OPENAI_API_KEY'):
    from getpass import getpass
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')
sys.path.append('../..')
openai.api_key = os.environ['OPENAI_API_KEY']

### Loading the PDF files

In [None]:
# Load the four research PDFs that the chatbot will answer questions about.
# (Nothing is trained here: the files are only read into document objects.)
pdf_paths = [
    "Downloads/political bias.pdf",
    "Downloads/evidence based trustworthiness by dan roth.pdf",
    "Downloads/yi zhang data provenance.pdf",
    "Downloads/trustworthy social bias measurement.pdf",
]
loaders = [PyPDFLoader(path) for path in pdf_paths]

# Collect the documents of every file into one flat list.
docs = []
for loader in loaders:
    docs += loader.load()

### Splitting the text into chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [None]:
splits = text_splitter.split_documents(docs)
len(splits)

In [None]:
splits[0]

### Storing the data in a vector database

In [None]:
from langchain.llms import OpenAI
# Embedding model handed to the vector store to embed the text chunks.
embedding = OpenAIEmbeddings()

In [None]:
# Directory passed to Chroma as its persist location.
persist_directory = 'docs/chroma/'

In [None]:
# vectordb = Chroma.from_documents(
#     documents=splits,
#     embedding=embedding,
#     persist_directory=persist_directory
# )
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding
)

In [None]:
print(vectordb._collection.count())

In [None]:
question = "explain the bias measurement framework."
# question = "what did you learn about bias?"
# question = "what are the biases known to you?."

In [None]:
docs = vectordb.similarity_search(question,k=3)

In [None]:
docs[0].page_content

In [None]:
docs_mmr=vectordb.max_marginal_relevance_search(question,k=3, fetch_k=2)  # retrieving the documents based on mmr search. 

In [None]:
docs_mmr[0]

#### Although both searches may return the same top result here, there are cases where MMR gives better (more diverse) results than a plain similarity search

### Retrieval

In [None]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [None]:
# Descriptions of the metadata fields the self-query retriever may filter on.
# The long description is split across adjacent literals; the resulting string
# is unchanged.
source_field = AttributeInfo(
    name="source",
    description=(
        "The research paper the chunk is, should be one of "
        "'Downloads/political bias.pdf',"
        "'Downloads/evidence based trustworthiness by dan roth.pdf',"
        "'Downloads/yi zhang data provenance.pdf', "
        "`Downloads/trustworthy social bias measurement.pdf`"
    ),
    type="string",
)
page_field = AttributeInfo(
    name="page",
    description="The page from the research paper",
    type="integer",
)
metadata_field_info = [source_field, page_field]

In [None]:
# Short description of what the stored documents contain, given to the
# self-query retriever together with the metadata field descriptions.
document_content_description = "Research notes"
# gpt-3.5-turbo is a chat model, so it is driven through the chat wrapper
# (ChatOpenAI) rather than the completion-style `OpenAI` wrapper.
# temperature=0 keeps the generated queries as repeatable as possible.
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [None]:
# NOTE(review): "the fourth pdf" only helps if the model can map it to one of
# the `source` values listed in the metadata description — confirm.
question = "what did they say about embedding bias in the fourth pdf?"

In [None]:
# Rebinds `docs` to the documents returned by the self-query retriever.
docs = retriever.get_relevant_documents(question)

In [None]:
# NOTE(review): index 2 assumes at least three documents came back — confirm.
docs[2]

### Retrieval QA

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)

In [None]:
question = "what are some bias measurements?"

In [None]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

In [None]:
result = qa_chain({"query": question})

In [None]:
result["result"]

### Defining a prompt for the bot

In [None]:
from langchain.prompts import PromptTemplate

# Prompt for the QA chain: the retrieved text is substituted for {context}
# and the user's query for {question}.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

In [None]:
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [None]:
question = "what is divdist framework for bias measurement? Explain in detail."

In [None]:
result = qa_chain({"query": question})

In [None]:
result["result"]

In [None]:
result["source_documents"][0]

### Experimenting with different chain types for retrieval

In [None]:
qa_chain_mr = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type="map_reduce"
)

In [None]:
result = qa_chain_mr({"query": question})

In [None]:
result["result"]

In [None]:
qa_chain_mr = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type="refine"
)
result = qa_chain_mr({"query": question})
result["result"]

In [None]:
question = "Summarize the fourth pdf in detail."

In [None]:
qa_chain_mr = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type="refine"
)
result = qa_chain_mr({"query": question})
result["result"]

### Retrieval QA with a custom prompt

In [None]:
# Build prompt: the retrieved text is substituted for {context} and the
# user's query for {question}.
from langchain.prompts import PromptTemplate
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

# Run chain
from langchain.chains import RetrievalQA
# The paper title is spelled "social" (not "socia") so the query text matches
# the document it refers to and the same question asked in the memory section.
question = "Summarize the research paper 'trustworthy social bias measurement ' in detail."
qa_chain = RetrievalQA.from_chain_type(llm,
                                       retriever=vectordb.as_retriever(),
                                       return_source_documents=True,
                                       chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})


result = qa_chain({"query": question})
result["result"]

### Saving the previous chats using Memory

In [None]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [None]:
from langchain.chains import ConversationalRetrievalChain
retriever=vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory
)

In [None]:
# First turn; this chain takes its input under the "question" key.
question = "Summarize the research paper 'trustworthy social bias measurement ' in detail."
result = qa({"question": question})

In [None]:
# Answer to the first turn.
result['answer']

In [None]:
# Follow-up turn sent through the same chain, which was built with `memory`.
question = "does divdist framework overcome the shortcomings in previous measures?"
result = qa({"question": question})

In [None]:
# Answer to the follow-up turn.
result['answer']