# RAG on an OpenAI Model

In [1]:
# Import libraries
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import GrobidParser

In [2]:
# Parse every PDF in the input folder with the Grobid parser and load the results
PDF_DIR = "/Users/Scott/Downloads/test input/"
grobid_parser = GrobidParser(segment_sentences=False)

loader = GenericLoader.from_filesystem(
    PDF_DIR,
    glob="*",
    suffixes=[".pdf"],
    parser=grobid_parser,
)

data = loader.load()

In [3]:
# Inspect the metadata attached to the second loaded document
second_doc = data[1]
second_doc.metadata

{'text': 'In this paper, we introduce Dynamically Rewired Message Passing (DRew), a novel framework for layer-dependent, multi-hop message passing that takes a principled approach to information flow, is robust to over-squashing, and can be applied to any MPNN for deep learning on graphs.',
 'para': '0',
 'bboxes': "[[{'page': '1', 'x': '307.44', 'y': '303.89', 'h': '234.00', 'w': '9.03'}, {'page': '1', 'x': '307.44', 'y': '315.85', 'h': '235.25', 'w': '9.03'}, {'page': '1', 'x': '307.44', 'y': '328.19', 'h': '234.00', 'w': '8.64'}, {'page': '1', 'x': '307.44', 'y': '340.15', 'h': '234.00', 'w': '8.64'}, {'page': '1', 'x': '307.44', 'y': '352.10', 'h': '202.11', 'w': '8.64'}]]",
 'pages': "('1', '1')",
 'section_title': 'Introduction',
 'section_number': '1.',
 'paper_title': 'DRew: Dynamically Rewired Message Passing with Delay',
 'file_path': '/Users/Scott/Downloads/test input/2305.08018v2.pdf'}

## Preparing the documents and vector database

In [4]:
# Import libraries
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.openai import OpenAIEmbeddings
import os
from langchain_chroma import Chroma

In [None]:
# Read the OpenAI API key from the environment so that no secret is stored in
# the notebook. Set OPENAI_API_KEY before starting the kernel, e.g.
#   export OPENAI_API_KEY=...
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    # Surface the problem here rather than deep inside a later embedding/LLM call.
    print("WARNING: OPENAI_API_KEY is not set; the OpenAI calls below are expected to fail.")

In [7]:
# Chunk the loaded documents (chunk_size=1000, chunk_overlap=100)
splitter = RecursiveCharacterTextSplitter(  # type: ignore
    chunk_size=1000,
    chunk_overlap=100,
)
docs = splitter.split_documents(data)

# Embed the chunks into a Chroma vector database persisted in the current working directory
embedding_function = OpenAIEmbeddings(openai_api_key=openai_api_key)
chroma_dir = os.getcwd()
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embedding_function,
    persist_directory=chroma_dir,
)

# Configure the vector store as a similarity retriever with k=3
similarity_search = {"search_type": "similarity", "search_kwargs": {"k": 3}}
retriever = vectorstore.as_retriever(**similarity_search)

  warn_deprecated(


## Building a retrieval prompt template

In [8]:
#Import Libraries
from langchain_core.prompts import ChatPromptTemplate

In [9]:
# Prompt text with {context} and {question} placeholders, filled in when the chain runs
message = (
    "\n"
    "Answer the following question using the context provided:\n"
    "\n"
    "Context:\n"
    "{context}\n"
    "\n"
    "Question:\n"
    "{question}\n"
    "\n"
    "Answer:\n"
)

# Wrap the text as a single human message in a chat prompt template
human_turn = ("human", message)
prompt_template = ChatPromptTemplate.from_messages([human_turn])

## Creating a RAG chain

In [10]:
#Import Libraries
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

In [13]:
from langchain_core.output_parsers import StrOutputParser


def format_docs(retrieved_docs):
    """Join the text of the retrieved documents into one context string.

    Only ``page_content`` is kept, so document metadata is not pasted into
    the prompt's ``{context}`` slot.

    Args:
        retrieved_docs: Iterable of documents returned by the retriever.

    Returns:
        The documents' ``page_content`` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


# Reuse the `vectorstore` built earlier in the notebook instead of calling
# Chroma.from_documents a second time: doing so with the same persist_directory
# re-embeds every chunk and adds the same chunks to the persisted collection again.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5}
)

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=openai_api_key)

# Chain to link retriever, prompt_template, and llm.
# The retrieved documents are piped through format_docs so the prompt receives
# plain text rather than the string form of a list of Document objects.
rag_chain = ({"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm)

# Invoking chain and printing response
response = rag_chain.invoke("How many hours of computation were performed on the Tesla V100-SXM2-32GB hardware?")
print(response.content)

46400 hours of computation were performed on the Tesla V100-SXM2-32GB hardware.


In [124]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the model. The adjacent string literals are concatenated,
# so each sentence needs its own trailing space or punctuation; {context} is the
# placeholder for the retrieved documents.
system_prompt = (
    "You are an assistant for question-answering tasks specifically about the provided PDF documents. "
    "Use the following pieces of retrieved context to answer "
    "the question. Do not use any external knowledge."
    "\n\n"
    "{context}"
)

# Chat prompt: system instructions followed by the user's question ({input})
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


# Build the document-combining QA chain from llm and prompt, then wrap it with the retriever
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# The result is indexed by key; the generated text is under "answer"
response = rag_chain.invoke({"input": "Create 10 scientific question and answer pairs  related to the PDFs"})
print(response["answer"])

1. **Question:** What was the carbon efficiency of the private infrastructure used in the experiments?
**Answer:** The carbon efficiency of the private infrastructure was 0.432 kgCO2 eq/kWh.

2. **Question:** How many hours of computation were performed on the Tesla V100-SXM2-32GB hardware?
**Answer:** A cumulative of 46400 hours of computation was performed on the Tesla V100-SXM2-32GB hardware.

3. **Question:** What was the total estimated emissions in kgCO2 eq for the experiments conducted?
**Answer:** The total estimated emissions for the experiments were 6013.44 kgCO2 eq.

4. **Question:** Why is there overlap between the pre-training and fine-tuning small molecule datasets?
**Answer:** There is overlap between the datasets because under a certain number of heavy atoms, nearly all possible molecules can be enumerated, and many datasets draw from this distribution.

5. **Question:** What are the differences between the molecules in ANI-1x and QM9 datasets?
**Answer:** While there i