In [44]:
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_nomic.embeddings import NomicEmbeddings
import json
from langchain_core.messages import SystemMessage, HumanMessage
import os
from dotenv import load_dotenv

In [45]:
# Load .env before reading the key so this cell works on a fresh kernel
# (Restart & Run All); the original read OPENAI_API_KEY before any
# load_dotenv() call had run. Calling load_dotenv() more than once is harmless.
load_dotenv(dotenv_path=".env", verbose=True)

# Local alternative (Ollama), kept for reference:
# local_llm = 'llama3.2:3b-instruct-fp16'
# llm = ChatOllama(model=local_llm, temperature=0)
# llm_json_mode = ChatOllama(model=local_llm, temperature=0, format='json')

# General-purpose chat model used for answer generation.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0, api_key=os.environ["OPENAI_API_KEY"])

# Grader model: same LLM, but with OpenAI JSON mode requested so that the
# graders' json.loads(result.content) calls receive a JSON object. A plain alias
# of `llm` does not enforce JSON output. The prompts sent through this runnable
# must mention JSON, which both grader prompts do.
llm_json_mode = llm.bind(response_format={"type": "json_object"})

In [46]:
# Read key/value pairs from the local .env file into the process environment;
# the call's return value is shown as the cell output.
load_dotenv(".env", verbose=True)

True

In [47]:
# Tracing configuration exposed through environment variables
# (presumably picked up by LangChain's tracing integration — confirm).
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": "NLP_Project_rag",
    }
)

In [48]:
# Source pages for the RAG corpus.
urls = [
    "https://aws.amazon.com/agreement/",
    "https://aws.amazon.com/service-terms/",
]

# One load() result per URL, kept as a list of lists.
docs = [WebBaseLoader(url).load() for url in urls]

# Flattened list of documents; this is what the text splitter consumes.
docs_list = [page for pages in docs for page in pages]

# Print a short summary instead of dumping the full page text twice into the
# cell output, which bloats the notebook and hides the narrative.
print(f"Loaded {len(docs_list)} document(s) from {len(urls)} URL(s)")
for page in docs_list:
    print(f"- {page.metadata.get('source', '<unknown>')}: {len(page.page_content):,} characters")



In [49]:
# Build the splitter through its tiktoken-based constructor: chunks of 1000
# units with 200 units of overlap (presumably measured in tokens rather than
# characters — confirm against the LangChain docs).
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=200)

# Split every loaded page into chunks ready for embedding.
doc_splits = text_splitter.split_documents(docs_list)

In [50]:
# Embed every chunk with Nomic embeddings and index them in an in-memory
# scikit-learn vector store. The API key is read from the environment.
vectorstore = SKLearnVectorStore.from_documents(
    documents=doc_splits,
    embedding=NomicEmbeddings(
        model="nomic-embed-text-v1",
        nomic_api_key=os.environ["NOMIC_EMBED_API_KEY"],
    ),
)

# Return the top 3 chunks per query. The result count has to be passed through
# `search_kwargs`; a bare `k=3` keyword is not a retriever setting and leaves
# the store's default number of results in effect.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

In [51]:
# System prompt for the retrieval grader: it is sent as the SystemMessage and
# frames the model as a judge of document relevance.
doc_grader_instructions = """You are a grader assessing relevance of a retrieved document to a user question.

If the document contains keyword(s) or semantic meaning related to the question, grade it as relevant."""

# Human-turn template for the grader, filled with str.format using the
# `document` and `question` placeholders. It asks for a JSON object with a
# single key, binary_score.
# NOTE(review): the sentence starting "This carefully and objectively assess"
# looks like a typo (perhaps "Carefully and objectively assess"); left as-is
# because this text is sent to the model verbatim.
doc_grader_prompt = """Here is the retrieved document: \n\n {document} \n\n Here is the user question: \n\n {question}. 

This carefully and objectively assess whether the document contains at least some information that is relevant to the question.

Return JSON with single key, binary_score, that is 'yes' or 'no' score to indicate whether the document contains at least some information that is relevant to the question."""

# Sample question used to exercise the retriever and the relevance grader.
question = "What are AWS responsibilities described in AWS customer agreement?"

# Retrieve candidate chunks and pick the second hit as the one to grade.
docs = retriever.invoke(question)
doc_txt = docs[1].page_content

# Fill the grader template, then send system + human messages in one call.
doc_grader_prompt_formatted = doc_grader_prompt.format(question=question, document=doc_txt)
result = llm_json_mode.invoke(
    [
        SystemMessage(content=doc_grader_instructions),
        HumanMessage(content=doc_grader_prompt_formatted),
    ]
)

# Parse the model reply; the parsed dict is displayed as the cell output.
json.loads(result.content)

{'binary_score': 'yes'}

In [52]:
# Show the second retrieved chunk, the same index that was passed to the grader.
second_hit = docs[1]
print(second_hit)

page_content='1. AWS Responsibilities

1.1 General. You may access and use the Services in accordance with this Agreement. Service Level Agreements and Service Terms apply to certain Services.
1.2 Third-Party Content. Third-Party Content may be used by you at your election. Third-Party Content is governed by this Agreement and, if applicable, separate terms and conditions accompanying such Third-Party Content, which terms and conditions may include separate fees and charges. 


1.3 AWS Security. Without limiting Section 8 or your obligations under Section 2.2, we will implement reasonable and appropriate measures designed to help you secure Your Content against accidental or unlawful loss, access or disclosure.
1.4 Data Privacy. You may specify the AWS regions in which Your Content will be stored. You consent to the storage of Your Content in, and transfer of Your Content into, the AWS regions you select. We will not access or use Your Content except as necessary to maintain or provide

In [53]:

# Answer-generation template, filled with str.format using the `context` and
# `question` placeholders. It instructs the model to answer only from the
# supplied context, in at most three sentences.
# NOTE(review): "this questions" looks like a typo; left as-is because this
# text is sent to the model verbatim.
rag_prompt = """You are an assistant for question-answering tasks. 

Here is the context to use to answer the question:

{context} 

Think carefully about the above context. 

Now, review the user question:

{question}

Provide an answer to this questions using only the above context. 

Use three sentences maximum and keep the answer concise.

Answer:"""

def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values joined with a blank line between
        consecutive documents; an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    texts = [doc.page_content for doc in docs]
    return separator.join(texts)

# Retrieve supporting chunks for the question and merge them into one context string.
docs = retriever.invoke(question)
docs_txt = format_docs(docs)

# Fill the RAG template and ask the model for an answer in a single human turn.
rag_prompt_formatted = rag_prompt.format(question=question, context=docs_txt)
generation = llm.invoke(
    [
        HumanMessage(content=rag_prompt_formatted),
    ]
)

print(generation.content)

AWS responsibilities described in the AWS customer agreement include providing access to the services in accordance with the agreement, implementing security measures to help secure Your Content, and storing Your Content in the AWS regions selected by you. Additionally, AWS will give notice of any changes to the services, including discontinuing material functionalities, with at least 12 months' prior notice.


In [54]:
# System prompt for the hallucination grader: it is sent as the SystemMessage
# and frames the model as a teacher checking that a STUDENT ANSWER is grounded
# in the supplied FACTS, scoring it 'yes' or 'no'.
hallucination_grader_instructions = """

You are a teacher grading a quiz. 

You will be given FACTS and a STUDENT ANSWER. 

Here is the grade criteria to follow:

(1) Ensure the STUDENT ANSWER is grounded in the FACTS. 

(2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS.

Score:

A score of yes means that the student's answer meets all of the criteria. This is the highest (best) score. 

A score of no means that the student's answer does not meet all of the criteria. This is the lowest possible score you can give.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

# Human-turn template for the hallucination grader, filled with str.format
# using the `documents` and `generation` placeholders. It asks for a JSON
# object with the keys binary_score and explanation.
# NOTE(review): "two two keys" looks like a typo; left as-is because this text
# is sent to the model verbatim.
hallucination_grader_prompt = """FACTS: \n\n {documents} \n\n STUDENT ANSWER: {generation}. 

Return JSON with two two keys, binary_score is 'yes' or 'no' score to indicate whether the STUDENT ANSWER is grounded in the FACTS. And a key, explanation, that contains an explanation of the score."""

# Pair the retrieved context (FACTS) with the generated answer (STUDENT ANSWER).
hallucination_grader_prompt_formatted = hallucination_grader_prompt.format(
    generation=generation.content,
    documents=docs_txt,
)

# Ask the grader model for a verdict, sending system + human messages in one call.
result = llm_json_mode.invoke(
    [
        SystemMessage(content=hallucination_grader_instructions),
        HumanMessage(content=hallucination_grader_prompt_formatted),
    ]
)

# Parse the model reply; the parsed dict is displayed as the cell output.
json.loads(result.content)

{'binary_score': 'yes',
 'explanation': "The STUDENT ANSWER is grounded in the FACTS provided. The answer accurately reflects AWS responsibilities as outlined in the AWS Customer Agreement, including providing access to services, implementing security measures for Your Content, storing content in selected regions, and giving notice of changes to services with at least 12 months' prior notice."}