In [2]:
from langchain_community.document_loaders import PyPDFium2Loader


In [3]:
loader = PyPDFium2Loader("../data/allcott.pdf")

In [4]:
docs = loader.load()
docs



[Document(page_content='NBER WORKING PAPER SERIES\r\nPOLARIZATION AND PUBLIC HEALTH:\r\nPARTISAN DIFFERENCES IN SOCIAL DISTANCING DURING THE CORONAVIRUS PANDEMIC\r\nHunt Allcott\r\nLevi Boxell\r\nJacob C. Conway\r\nMatthew Gentzkow\r\nMichael Thaler\r\nDavid Y. Yang\r\nWorking Paper 26946\r\nhttp://www.nber.org/papers/w26946\r\nNATIONAL BUREAU OF ECONOMIC RESEARCH\r\n1050 Massachusetts Avenue\r\nCambridge, MA 02138\r\nApril 2020\r\nWe thank Victoria Pu for research assistance. We thank SafeGraph for providing access to the \r\ndata and the Safe-Graph COVID-19 response community for helpful input. We thank Lubos \r\nPastor along with seminar participants at Stanford University, Harvard University, and the \r\nUniversity of Chicago for their comments and suggestions. We acknowledge funding from the \r\nStanford Institute for Economic Policy Research (SIEPR), the John S. and James L. Knight \r\nFoundation, the Sloan Foundation, the Institute for Humane Studies, and the National Science \r

In [5]:
from langchain_community.embeddings import OllamaEmbeddings

embeddings = OllamaEmbeddings()

In [6]:
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter


text_splitter = RecursiveCharacterTextSplitter()


In [7]:
documents = text_splitter.split_documents(docs)
type(documents)


list

In [11]:
raw_text = ""
number_pages = 1 # we restrict to 10 pages but this could completely change
for i in range(number_pages):
    raw_text += documents[i].page_content

In [12]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

In [13]:
representative_schema = ResponseSchema(name="representative_sample",
                             description="Is the sample representative of the population? \
                             Answer True if yes,\
                             False if not or unknown.")
number_respondents_schema = ResponseSchema(name="number_respondents",
                                      description="How many respondents did answer the survey? If this \
                                      information is not found,\
                                      output -1.")
design_schema = ResponseSchema(name="design",
                                    description="Extract any\
                                    sentences about the design of the survey, and output them as a \
                                    comma separated Python list.")

response_schemas = [representative_schema, 
                    number_respondents_schema,
                    design_schema]

In [14]:
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [16]:
format_instructions = output_parser.get_format_instructions()
format_instructions

'The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n\n```json\n{\n\t"representative_sample": string  // Is the sample representative of the population?                              Answer True if yes,                             False if not or unknown.\n\t"number_respondents": string  // How many respondents did answer the survey? If this                                       information is not found,                                      output -1.\n\t"design": string  // Extract any                                    sentences about the design of the survey, and output them as a                                     comma separated Python list.\n}\n```'

In [17]:
from langchain_core.prompts import ChatPromptTemplate

review_template_2 = """\
For the following text, extract the following information:

representative_sample: Is the sample representative of the population? Answer True if yes, False if not or unknown.

number_respondents: How many respondents did answer the survey? If this information is not found, output -1.

design: Extract any sentences about the design of the survey, and output them as a comma separated Python list.

text: {text}

{format_instructions}
"""

prompt = ChatPromptTemplate.from_template(template=review_template_2)

messages = prompt.format_messages(text=raw_text, 
                                format_instructions=format_instructions)

In [18]:
print(messages[0].content)

For the following text, extract the following information:

representative_sample: Is the sample representative of the population? Answer True if yes, False if not or unknown.

number_respondents: How many respondents did answer the survey? If this information is not found, output -1.

design: Extract any sentences about the design of the survey, and output them as a comma separated Python list.

text: NBER WORKING PAPER SERIES
POLARIZATION AND PUBLIC HEALTH:
PARTISAN DIFFERENCES IN SOCIAL DISTANCING DURING THE CORONAVIRUS PANDEMIC
Hunt Allcott
Levi Boxell
Jacob C. Conway
Matthew Gentzkow
Michael Thaler
David Y. Yang
Working Paper 26946
http://www.nber.org/papers/w26946
NATIONAL BUREAU OF ECONOMIC RESEARCH
1050 Massachusetts Avenue
Cambridge, MA 02138
April 2020
We thank Victoria Pu for research assistance. We thank SafeGraph for providing access to the 
data and the Safe-Graph COVID-19 response community for helpful input. We thank Lubos 
Pastor along with seminar participants at Stan

In [None]:
# from langchain.chat_models import ChatOpenAI
# chat = ChatOpenAI(temperature=0.0, model=llm_model)
# response = chat(messages)
# print(response.content)
# output_dict = output_parser.parse(response.content)
# output_dict

In [19]:
from langchain_community.llms import Ollama
llm = Ollama(model="llama2")

In [20]:
llm.invoke(messages)

'```json\n{\n  "representative_sample": true,\n  "number_respondents": 1000,\n  "design": ["The survey was conducted using an online panel of adults in the United States.","The sample size was 1000 respondents."]\n}\n```'