# RAG for querying scientific papers
- Data: ScienceDirect review papers on "laser cladding erosion", 2014-2024
- OpenAI LLM

In [1]:
# Chat model setup: read the OpenAI API key from the environment and build the LLM client.
import os

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Make the variables from a local .env file available through the process environment.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

# Library defaults are used for the model name and sampling temperature.
Chat_model = ChatOpenAI(openai_api_key=api_key)

# Alternative configuration with an explicit model and temperature:
# Chat_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)


In [2]:
# Load every PDF in the sample folder and split the text into chunks for retrieval.
from pathlib import Path

from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Folder with the source PDFs, relative to the notebook's working directory.
# Built with pathlib instead of a '.\\samplepdf\\' literal so the cell also
# runs on macOS/Linux, where backslashes are not path separators.
pdf_folder = str(Path("samplepdf"))
loader = PyPDFDirectoryLoader(pdf_folder)

# Split recursively into chunks of at most 1000 characters with no overlap
# between consecutive chunks; chunk length is measured with len().
# (A plain CharacterTextSplitter with separator='\n' is a possible alternative.)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    length_function=len,
)

# Load and split the documents in one pass.
documents = loader.load_and_split(text_splitter)

# Fail here, with a clear message, rather than later when the empty list is
# handed to the vector store.
if not documents:
    raise ValueError(
        f"No PDF content was loaded from {Path(pdf_folder).resolve()}; "
        "check that the folder exists and contains readable PDF files."
    )



In [10]:
# Alternative approach, kept for reference only: build an index straight from the loader
# with VectorstoreIndexCreator and query it.
# Source: GitHub EnkrateiaLucca/summarizing_and_querying_multiple_pdfs_with_langchain
# NOTE: the code below is disabled by wrapping it in a bare string literal. Because that
# string is the last expression of the cell, the notebook echoes it as the cell output.
'''
from langchain.indexes import VectorstoreIndexCreator


index = VectorstoreIndexCreator().from_loaders([loader])

query = "What is the best material for improving erosion resistance?"

index.query(query)
'''


'\nfrom langchain.indexes import VectorstoreIndexCreator\n\n\nindex = VectorstoreIndexCreator().from_loaders([loader])\n\nquery = "What is the best material for improving erosion resistance?"\n\nindex.query(query)\n'

In [4]:
# Embedding and vector store: embed the chunks once, persist them with Chroma,
# and expose the store as a retriever.
import os

from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Directory (relative to the working directory) where Chroma keeps the index on disk.
persist_directory = 'db'

# No key is passed explicitly; the embeddings client is expected to read
# OPENAI_API_KEY from the environment.
embedding = OpenAIEmbeddings()

if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    # A persisted index already exists: reopen it instead of calling
    # from_documents again, which would re-embed every chunk and append
    # duplicates to the same collection on each re-run of this cell.
    # Delete the 'db' folder to force a rebuild after the PDFs change.
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding,
    )
else:
    # First run: embed all chunks and write the index to disk.
    vectorstore = Chroma.from_documents(
        documents,
        embedding,
        persist_directory=persist_directory,
    )
    vectorstore.persist()  # save the vectorstore to chromadb

retriever = vectorstore.as_retriever()


In [4]:
# Similarity search
#query = 'How to improve erosion resistance?'
#docs = vectorstore.similarity_search(query)

#len(docs)
 

In [None]:
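# Example of a longer, citation-style prompt, kept for reference only:
# this cell has not been executed and the text is not assigned to any variable.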

''' 
You are an expert researcher and writer, tasked with answering any question.
Generate a comprehensive and informative, yet concise answer of 250 words or less for the given question 
based solely on the provided search results (URL and content). 
You must only use information from the provided search results. Use an unbiased and journalistic tone. 
Combine search results together into a coherent answer. Do not repeat text. 
Cite search results using [${number}] notation. Only cite the most relevant results that answer the question accurately.
Place these citations at the end of the sentence or paragraph that reference them - do not put them all at the end. 
If different results refer to different entities within the same name, write separate answers for each entity. 
If you want to cite multiple results for the same sentence, format it as `[${number1}] [${number2}]`. 
However, you should NEVER do this with the same number - if you want to cite `number1` multiple times for a sentence, 
only do `[${number1}]` not `[${number1}] [${number1}]`\n\n
You should use bullet points in your answer for readability. 
Put citations where they apply rather than putting them all at the end.\n\n
If there is nothing in the context relevant to the question at hand, just say \"Hmm, I'm not sure.\" 
Don't try to make up an answer.\n\n
Anything between the following `context` html blocks is retrieved from a knowledge bank, 
not part of the conversation with the user.\n\n<context>\n    


'''

In [4]:
# Prompt: question-answering template with {question} and {context} placeholders.
from langchain.prompts import ChatPromptTemplate

# The template is assembled from explicit pieces so that the space and newline
# ending each line are visible in the source instead of relying on trailing
# whitespace inside a multi-line string.
template = (
    "You are an assistant for question-answering tasks. \n"
    "Use the following pieces of retrieved context to answer the question. \n"
    "If you don't know the answer, just say that you don't know. \n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:\n"
)

prompt = ChatPromptTemplate.from_template(template)

# Show the resulting prompt object, including the input variables it detected.
print(prompt)

input_variables=['context', 'question'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. \nUse the following pieces of retrieved context to answer the question. \nIf you don't know the answer, just say that you don't know. \nQuestion: {question} \nContext: {context} \nAnswer:\n"))]


In [5]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser


def format_docs(docs):
    """Join the text of the retrieved documents into a single context string.

    Args:
        docs: iterable of retrieved documents, each exposing a ``page_content``
            string attribute.

    Returns:
        The documents' ``page_content`` values separated by blank lines
        (an empty string when nothing was retrieved).
    """
    return "\n\n".join(doc.page_content for doc in docs)


# RAG chain: the input question is sent to the retriever and also passed
# through unchanged; both fill the prompt, which is answered by the chat model
# and parsed to a plain string.
# The retriever output is piped through format_docs so that {context} receives
# only the chunk text instead of the repr of a list of Document objects
# (which would also inject metadata such as file paths into the prompt).
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | Chat_model
    | StrOutputParser()
)

rag_chain.invoke('What alloy was mixed with 718 for in situ alloying?')

'The alloy mixed with 718 for in situ alloying was AlCoCrFeNi high entropy alloy.'