In [2]:
from langchain_community.document_loaders import PyPDFium2Loader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.llms import Ollama
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain


# Loading the PDF

In [3]:
# Location of the paper to analyse, relative to the notebook's directory.
pdf_path = "../data/allcott.pdf"
# PDF loader bound to that file; nothing is read until `.load()` is called.
loader = PyPDFium2Loader(pdf_path)

In [4]:
# Read the PDF into a list of LangChain documents.
# NOTE(review): presumably one Document per PDF page — confirm against the loader docs.
docs = loader.load()



# Storing the PDF content in a vector database for future retrieval

In [5]:
# Embedding model used to vectorise the text chunks. No arguments are passed,
# so the library's default Ollama embedding settings apply.
embeddings = OllamaEmbeddings()

In [6]:
# Splitter that cuts documents into smaller chunks before embedding. No
# arguments are passed, so the library's default chunk size and overlap apply.
text_splitter = RecursiveCharacterTextSplitter()


In [7]:
# Chunked version of the whole PDF (kept available for inspection).
documents = text_splitter.split_documents(docs)

# Only the first `number_pages` pages are indexed, to keep embedding fast.
# The limit is applied to the page-level `docs` *before* splitting: a single
# page can be split into several chunks, so slicing the chunk list instead
# would cover fewer pages than intended.
# NOTE(review): assumes the loader returns one Document per PDF page — confirm.
number_pages = 10
mini_documents = text_splitter.split_documents(docs[:number_pages])

In [8]:
# Embed the selected chunks and build an in-memory FAISS index over them.
vector = FAISS.from_documents(
    documents=mini_documents,
    embedding=embeddings,
)

# Prompt the LLM using a retrieval chain

In [15]:
# Name of the Ollama model that answers the questions.
llm_model_name = "llama2"
llm = Ollama(model=llm_model_name)

In [16]:
# Prompt text: `{context}` receives the retrieved chunks and `{input}` the
# user's question.
qa_template = """Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}"""

prompt = ChatPromptTemplate.from_template(qa_template)

# Chain that places the supplied documents into the prompt's `{context}` slot
# and sends the filled prompt to the LLM.
document_chain = create_stuff_documents_chain(llm, prompt)

In [17]:
# Expose the FAISS index as a retriever (default search settings).
retriever = vector.as_retriever()

# Full pipeline: retrieve chunks for the question, then hand them to the
# document chain to generate the answer.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

# Ask your questions

In [18]:
# Question sent to the retrieval chain under the "input" key.
question = "could you give me more details about the survey authors conducted?"
response = retrieval_chain.invoke({"input": question})

# The generated text is stored under the "answer" key of the result.
print(response["answer"])

Of course! The survey authors conducted a nationally representative survey of US adults to investigate partisan differences in beliefs and behaviors related to COVID-19. Here are some additional details about the survey:

1. Sample size: The survey was conducted among a sample of 2,000 US adults aged 18 or older.
2. Data collection method: The survey was administered online using Amazon's Mechanical Turk platform.
3. Survey duration: The survey took approximately 20-30 minutes to complete.
4. Data collection period: The survey was conducted between March 27th and April 10th, 2020.
5. Response rate: The response rate for the survey was approximately 70%.
6. Demographic characteristics: The sample was weighted to be representative of the US adult population based on demographic characteristics such as age, gender, race, and education level.
7. Questionnaire design: The survey included a mix of questions about beliefs and behaviors related to COVID-19, including:
	* Beliefs about the seve

In [None]:
# Next step: ask more precise questions and request the output in JSON so that the formatting is the same across all papers.

# Alternative: build a single string from all the text contents and apply an output parser (see only_with_string.ipynb; it does not work well yet).