In [1]:
import pandas as pd
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings

# This workflow is very similar to retrieval-augmented generation (RAG).
# Credit: adapted from https://github.com/GoogleCloudPlatform/generative-ai/blob/main/language/use-cases/document-qa/question_answering_documents_langchain.ipynb

In [2]:
# Embedding client (library-default settings); used later to build the vector index.
embeddings = OllamaEmbeddings()

# Ollama-served "llama2" model that answers the questions in every chain below.
llm = Ollama(model="llama2")

# Load the PDF

In [4]:
# Load the paper and split it into a list of documents.
pdf_path = "../data/allcott.pdf"
pdf_loader = PyPDFLoader(pdf_path)
pages = pdf_loader.load_and_split()

# Sanity check: show the raw text of the document at index 3.
preview_doc = pages[3]
print(preview_doc.page_content)

Our main GPS results show that the strong partisan differences in social distancing behavior
that emerged with the rise of COVID-19 are not merely an artifact of differences in public policies
or observed risks. Controlling for state-time ﬁxed effects to account for heterogenous policy re-
sponses by state governments only attenuates the partisan gap slightly. Including controls to proxy
for local policy, health, weather, and economic variables interacted ﬂexibly with time attenuates
the gap more substantially, but it remains statistically and economically signiﬁcant. After including
our full set of controls, we estimate that moving from the 10th to the 90th percentile of Republican
county vote share is associated with 11.5 and 15.2 percent increases in the number of POI visits
during the weeks of April 6 and May 11, when social distancing and partisan gaps are at their
respective peaks.
Our ﬁndings are robust to the inclusion or exclusion of control variables, excluding states with
ea

# Define your prompt

In [5]:
# The question asked of the document in every chain below.
question = "Could you give me information about the sample?"

# Template for the "stuff" chain: the model must answer strictly from
# {context} and use a fixed fallback phrase when the answer is absent.
prompt_template = """Answer the question as precise as possible using the provided context. If the answer is
                    not contained in the context, say "answer not available in context" \n\n
                    Context: \n {context}?\n
                    Question: \n {question} \n
                    Answer:
                  """

# Wrap the raw template so the chain can fill in its two placeholders.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

## Q&A without similarity search (i.e. without vector database)

In [6]:
# Number of leading documents joined into `context`, and number of documents
# passed to the stuff chain further below.
len_context = 7
len_input = 3

# Concatenate the text of the first `len_context` documents, one per line break.
context = "\n".join(str(p.page_content) for p in pages[:len_context])

# len() of a string counts characters, not words, so the label says so.
print("The total characters in the context: ", len(context))

The total words in the context:  17527


## Stuffing

In [7]:
# Question-answering chain of type "stuff", driven by the custom prompt
# (placeholders: context, question).
stuff_chain = load_qa_chain(
    llm,
    prompt=prompt,
    chain_type="stuff",
)

In [8]:
# Run the stuff chain on a window of `len_input` documents starting at index
# `len_context`. `Chain.invoke` replaces the direct call `stuff_chain({...})`,
# which is deprecated and emitted a deprecation warning.
# NOTE(review): this window starts right after the documents that were joined
# into `context`, so the two do not overlap; confirm that is intended.
stuff_answer = stuff_chain.invoke(
    {
        "input_documents": pages[len_context : len_context + len_input],
        "question": question,
    },
    return_only_outputs=True,  # keep only the chain outputs in the result
)

  warn_deprecated(


In [9]:
# Show the stuff chain's result (a dict with an 'output_text' entry) as the cell output.
stuff_answer

{'output_text': 'Answer not available in context. The question is asking for information about the sample used in the analysis, but the provided context does not provide enough information to answer the question. Please provide more context or clarify the question.'}

## Map reduce

In [10]:
# Prompt passed to the map-reduce chain as `question_prompt`: answer the
# question from the supplied {context}.
question_prompt_template = """
                    Answer the question as precise as possible using the provided context. \n\n
                    Context: \n {context} \n
                    Question: \n {question} \n
                    Answer:
                    """

question_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=question_prompt_template,
)

# Prompt passed to the map-reduce chain as `combine_prompt`: merge the
# intermediate {summaries} into one final answer to {question}.
# The fallback phrase is now wrapped in a balanced pair of double quotes; the
# original opened the quote but never closed it, leaving the instruction
# ambiguous about where the quoted phrase ends.
combine_prompt_template = """Given the extracted content and the question, create a final answer.
If the answer is not contained in the context, say "answer not available in context". \n\n
Summaries: \n {summaries}?\n
Question: \n {question} \n
Answer:
"""

combine_prompt = PromptTemplate(
    template=combine_prompt_template, input_variables=["summaries", "question"]
)

In [11]:
# Question-answering chain of type "map_reduce", configured with the two
# custom prompts. Intermediate steps are requested alongside the final answer.
map_reduce_chain = load_qa_chain(
    llm,
    chain_type="map_reduce",
    question_prompt=question_prompt,
    combine_prompt=combine_prompt,
    return_intermediate_steps=True,
)

In [12]:
# Run map-reduce Q&A over every loaded document. `Chain.invoke` replaces the
# deprecated direct call `map_reduce_chain({...})`.
map_reduce_outputs = map_reduce_chain.invoke(
    {"input_documents": pages, "question": question}
)

# Q&A with similarity search (see also code first_tests)

In [None]:
# Build a Chroma vector store from the documents using the embedding client,
# then expose it through the retriever interface.
vector_store = Chroma.from_documents(pages, embeddings)
vector_index = vector_store.as_retriever()


In [None]:
# Fetch the documents the retriever returns for the question.
# `invoke` is the current retriever entry point; `get_relevant_documents`
# is deprecated in recent LangChain releases.
docs = vector_index.invoke(question)

In [None]:
# Run the same map-reduce chain on the retrieved documents only, instead of
# on the whole PDF. `Chain.invoke` replaces the deprecated direct call on the
# chain object.
map_reduce_embeddings_outputs = map_reduce_chain.invoke(
    {"input_documents": docs, "question": question}
)

In [None]:
# Print the final combined answer from the retrieval-based run.
final_answer = map_reduce_embeddings_outputs["output_text"]
print(final_answer)