In [1]:
from mlc_llm.embeddings.embeddings import MLCEmbeddings

mlc_embeddings = MLCEmbeddings(
    "/Users/cfruan/Documents/mlc-llm-repos/mlc-llm-head/dist/snowflake-arctic-embed-m-q0f32-MLC",
    "/Users/cfruan/Documents/mlc-llm-repos/mlc-llm-head/dist/libs/snowflake-arctic-embed-m-q0f32-metal.so",
    device="metal:0",
    # debug_dir="/Users/cfruan/Documents/mlc-llm-repos/mlc-llm-head/debug",
)

[2024-05-02 10:42:52] INFO auto_device.py:79: [92mFound[0m device: metal:0
[2024-05-02 10:42:52] INFO chat_module.py:379: Using model folder: /Users/cfruan/Documents/mlc-llm-repos/mlc-llm-head/dist/snowflake-arctic-embed-m-q0f32-MLC
[2024-05-02 10:42:52] INFO chat_module.py:380: Using mlc chat config: /Users/cfruan/Documents/mlc-llm-repos/mlc-llm-head/dist/snowflake-arctic-embed-m-q0f32-MLC/mlc-chat-config.json


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import Chroma

from typing import List

import numpy as np


class ArcticEmbeddings(Embeddings):
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        parsed_texts = ["[CLS]" + text + "[SEP]" for text in texts]
        embed_tokens = mlc_embeddings.embed(parsed_texts).numpy()[:, 0]
        embed_tokens = embed_tokens / np.linalg.norm(embed_tokens, axis=1, keepdims=True)
        return embed_tokens.tolist()

    def embed_query(self, text: str) -> List[float]:
        parsed_text = (
            "[CLS] Represent this sentence for searching relevant passages: " + text + "[SEP]"
        )
        embed_tokens = mlc_embeddings.embed([parsed_text]).numpy()[:, 0]
        embed_tokens = embed_tokens / np.linalg.norm(embed_tokens, axis=1, keepdims=True)
        return embed_tokens.tolist()[0]

In [3]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [4]:
from mlc_llm import MLCEngine

# Create engine
local_llm = "HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC"
mlc_engine = MLCEngine(local_llm)

from typing import Any, List, Optional

from langchain_core.callbacks.manager import CallbackManagerForLLMRun
from langchain_core.language_models.llms import LLM


class MLCLlama3(LLM):
    max_tokens: int = 40

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        response = mlc_engine.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=local_llm,
            stream=False,
            max_tokens=self.max_tokens,
            temperature=0.7,
            top_p=0.95
        )
        return response.choices[0].message.content

    @property
    def _llm_type(self) -> str:
        """Get the type of language model used by this chat model. Used for logging purposes only."""
        return "mlc_llama3"

[2024-05-02 10:43:03] INFO auto_device.py:88: [91mNot found[0m device: cuda:0
[2024-05-02 10:43:04] INFO auto_device.py:88: [91mNot found[0m device: rocm:0
[2024-05-02 10:43:05] INFO auto_device.py:88: [91mNot found[0m device: vulkan:0
[2024-05-02 10:43:06] INFO auto_device.py:88: [91mNot found[0m device: opencl:0
[2024-05-02 10:43:06] INFO auto_device.py:35: Using device: [1mmetal:0[0m
  from .autonotebook import tqdm as notebook_tqdm
[2024-05-02 10:43:06] INFO chat_module.py:362: Downloading model from HuggingFace: HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC
[2024-05-02 10:43:06] INFO download.py:133: Weights already downloaded: [1m/Users/cfruan/.cache/mlc_llm/model_weights/mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC[0m
[2024-05-02 10:43:06] INFO jit.py:35: [1mMLC_JIT_POLICY[0m = ON. Can be one of: ON, OFF, REDO, READONLY
[2024-05-02 10:43:06] INFO jit.py:121: Using cached model lib: [1m/Users/cfruan/.cache/mlc_llm/model_lib/15474f53a4abf8ef6d1c828ee551bcf0.so[0m
[2024-05-

In [5]:
local_llama3 = MLCLlama3(max_tokens=100)
response = local_llama3("What is the meaning of life?")
print(response)

  warn_deprecated(


What a profound and timeless question! The meaning of life is a topic that has been debated and explored by philosophers, scientists, spiritual leaders, and many others for centuries. While there is no one definitive answer, I'd be happy to share some insights and perspectives that might be helpful.

For some, the meaning of life is deeply personal and subjective. It may be related to their values, beliefs, and experiences. For example, someone might find meaning in their relationships, their work, their creative pursuits


In [6]:
import json

with open("./ori_pqal.json", "r") as f:
    pubmedqa_dataset = json.load(f)

with open("./all_contexts.txt", "+wt") as f:
    for pmid in pubmedqa_dataset:
        contexts = pubmedqa_dataset[pmid]["CONTEXTS"]
        for context in contexts:
            f.write(context + "\n")

chroma_client = Chroma(
    "pubmedqa_rag",
    ArcticEmbeddings(),
    "/Users/cfruan/Documents/mlc-llm-repos/mlc-llm-head/rag",
)

with open("./all_contexts.txt", "r") as f:
    pubmedqa_doc = f.read()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
doc_splits = text_splitter.split_text(pubmedqa_doc)
print("len(doc_splits): ", len(doc_splits))

chunk_size = 20
for i in range(0, len(doc_splits), chunk_size):
    print("Adding documents {} to {}".format(i, i + chunk_size))
    chroma_client.add_texts(doc_splits[i : i + chunk_size])
retriever = chroma_client.as_retriever()

[2024-05-02 00:19:11] INFO posthog.py:20: Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


len(doc_splits):  3918
Adding documents 0 to 20
Adding documents 20 to 40
Adding documents 40 to 60
Adding documents 60 to 80
Adding documents 80 to 100
Adding documents 100 to 120
Adding documents 120 to 140
Adding documents 140 to 160
Adding documents 160 to 180
Adding documents 180 to 200
Adding documents 200 to 220
Adding documents 220 to 240
Adding documents 240 to 260
Adding documents 260 to 280
Adding documents 280 to 300
Adding documents 300 to 320
Adding documents 320 to 340
Adding documents 340 to 360
Adding documents 360 to 380
Adding documents 380 to 400
Adding documents 400 to 420
Adding documents 420 to 440
Adding documents 440 to 460
Adding documents 460 to 480
Adding documents 480 to 500
Adding documents 500 to 520
Adding documents 520 to 540
Adding documents 540 to 560
Adding documents 560 to 580
Adding documents 580 to 600
Adding documents 600 to 620
Adding documents 620 to 640
Adding documents 640 to 660
Adding documents 660 to 680
Adding documents 680 to 700
Adding 

In [9]:
### Retrieval Grader

from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a grader assessing relevance 
    of a retrieved document to a user question. If the document contains keywords related to the user question, 
    grade it as relevant. It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question. \n
    Provide the binary score as a JSON with a single key 'score' and no premable or explaination.
     <|eot_id|><|start_header_id|>user<|end_header_id|>
    Here is the retrieved document: \n\n {document} \n\n
    Here is the user question: {question} \n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["question", "document"],
)

retrieval_grader = prompt | local_llama3 | JsonOutputParser()
question = "A short stay or 23-hour ward in a general and academic children's hospital: are they effective?"
docs = retriever.invoke(question)
print(docs)
doc_txt = docs[1].page_content
print(retrieval_grader.invoke({"question": question, "document": doc_txt}))

[Document(page_content="We evaluated the usefulness of a short stay or 23-hour ward in a pediatric unit of a large teaching hospital, Westmead Hospital, and an academic Children's hospital, The New Children's Hospital, to determine if they are a useful addition to the emergency service."), Document(page_content="compared to 20.5 hours (New Children's Hospital). The users of the short stay ward were children of young age less than 2 years, with stay greater than 23 hours reported in only 1% of all admissions to the short stay ward. The rate of patient admission to an in-hospital bed was low, (4% [Westmead Hospital] compared to 6% [New Children's Hospital]), with the number of unscheduled visits within 72 hours of short stay ward discharge less than 1%. There were no adverse events reported at either"), Document(page_content="The short stay ward accounted for 10.3% (Westmead Hospital) and 14.7% (New Children's Hospital) of admissions, with 56% medical in nature, 30% surgical, and the rem

In [11]:
### Generate
import json

from langchain.prompts import PromptTemplate
from langchain import hub
from langchain_core.output_parsers import StrOutputParser

from typing import List

from langchain_core.output_parsers.transform import BaseTransformOutputParser


def find_first_occurrence(s):
    # List the substrings to search for
    substrings = ["yes", "no", "maybe"]

    # Variable to store the position of the first occurrence and the corresponding substring
    first_pos = float("inf")
    result = "maybe"

    # Check each substring in the input string
    for substring in substrings:
        pos = s.find(substring)
        if pos != -1 and pos < first_pos:
            first_pos = pos
            result = substring

    return result


class GenerationParser(BaseTransformOutputParser[str]):
    @property
    def _type(self) -> str:
        """Return the output parser type for serialization."""
        return "default"

    def parse(self, text: str) -> str:
        if not text.endswith("}"):
            text = text + "}"

        try:
            json_res = json.loads(text)
            if "answer" in json_res and 'explanation' in json_res:
                return text
            else:
                answer = find_first_occurrence(text)
                explanation = text
                return json.dumps({"answer": answer, "explanation": explanation})
        except:
            answer = find_first_occurrence(text)
            explanation = text
            return json.dumps({"answer": answer, "explanation": explanation})


# Prompt
prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question.
    Provide the answer as a complete JSON with two keys. The answer should not have preamble.
    The first key is 'answer' of value 'yes', 'no' or 'maybe'.
    The second key is 'explanation' that provides a brief explanation of the answer within 20 words.
    <|eot_id|><|start_header_id|>user<|end_header_id|>
    Question: {question} 
    Context: {context} 
    Answer: <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["question", "document"],
)

# Chain
rag_chain = prompt | local_llama3 | GenerationParser() | JsonOutputParser()

# Run
question = "A short stay or 23-hour ward in a general and academic children's hospital: are they effective?"
docs = retriever.invoke(question)
print("Retrieved docs:", docs)
generation = rag_chain.invoke({"context": docs, "question": question})
print(generation)

Retrieved docs: [Document(page_content="We evaluated the usefulness of a short stay or 23-hour ward in a pediatric unit of a large teaching hospital, Westmead Hospital, and an academic Children's hospital, The New Children's Hospital, to determine if they are a useful addition to the emergency service."), Document(page_content="compared to 20.5 hours (New Children's Hospital). The users of the short stay ward were children of young age less than 2 years, with stay greater than 23 hours reported in only 1% of all admissions to the short stay ward. The rate of patient admission to an in-hospital bed was low, (4% [Westmead Hospital] compared to 6% [New Children's Hospital]), with the number of unscheduled visits within 72 hours of short stay ward discharge less than 1%. There were no adverse events reported at either"), Document(page_content="The short stay ward accounted for 10.3% (Westmead Hospital) and 14.7% (New Children's Hospital) of admissions, with 56% medical in nature, 30% surgi

In [12]:
### Hallucination Grader

# Prompt
prompt = PromptTemplate(
    template=""" <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a grader assessing whether 
    an answer is grounded in / supported by a set of facts. Give a binary 'yes' or 'no' score to indicate 
    whether the answer is grounded in / supported by a set of facts. Provide the binary score as a JSON with a 
    single key 'score' and no preamble or explanation. <|eot_id|><|start_header_id|>user<|end_header_id|>
    Here are the facts:
    \n ------- \n
    {documents} 
    \n ------- \n
    Here is the answer: {generation}  <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["generation", "documents"],
)

hallucination_grader = prompt | local_llama3 | JsonOutputParser()
hallucination_grader.invoke(
    {"documents": docs, "generation": generation["explanation"]}
)

{'score': 'yes'}

In [13]:
### Answer Grader

# Prompt
prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a grader assessing whether an 
    answer is useful to resolve a question. Give a binary score 'yes' or 'no' to indicate whether the answer is 
    useful to resolve a question. Provide the binary score as a JSON with a single key 'score' and no preamble or explanation.
     <|eot_id|><|start_header_id|>user<|end_header_id|> Here is the answer:
    \n ------- \n
    {generation} 
    \n ------- \n
    Here is the question: {question} <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["generation", "question"],
)

answer_grader = prompt | local_llama3 | JsonOutputParser()
answer_grader.invoke({"question": question, "generation": generation})

{'score': 'yes'}

In [14]:
### Router

from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import JsonOutputParser

prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are an expert at routing a 
    user question to a vectorstore or web search. Use the vectorstore for questions on PubMed. 
    You do not need to be stringent with the keywords in the question related to these topics. 
    Otherwise, use web-search. Give a binary choice 'web_search' 
    or 'vectorstore' based on the question. Return the a JSON with a single key 'datasource' and 
    no premable or explaination. Question to route: {question} <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["question"],
)

question_router = prompt | local_llama3 | JsonOutputParser()
question = "A short stay or 23-hour ward in a general and academic children's hospital: are they effective?"
docs = retriever.get_relevant_documents(question)
print(docs[1].page_content)
doc_txt = docs[1].page_content
print(question_router.invoke({"question": question}))

  warn_deprecated(


compared to 20.5 hours (New Children's Hospital). The users of the short stay ward were children of young age less than 2 years, with stay greater than 23 hours reported in only 1% of all admissions to the short stay ward. The rate of patient admission to an in-hospital bed was low, (4% [Westmead Hospital] compared to 6% [New Children's Hospital]), with the number of unscheduled visits within 72 hours of short stay ward discharge less than 1%. There were no adverse events reported at either
{'datasource': 'vectorstore'}


In [15]:
### Search
import json
from langchain_community.tools import BraveSearch

web_search_tool = BraveSearch.from_api_key(
    api_key=os.getenv("BRAVE_API_KEY"), search_kwargs={"count": 3}
)
docs = web_search_tool.invoke(
    {"query": "Syncope during bathing in infants, a pediatric form of water-induced urticaria?"}
)
print(json.loads(docs)[0]["snippet"])

&quot;Aquagenic maladies&quot; could be <strong>a</strong> <strong>pediatric</strong> <strong>form</strong> <strong>of</strong> the aquagenic <strong>urticaria</strong>.


In [16]:
from typing_extensions import TypedDict
from typing import List


class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Attributes:
        question: question
        generation: LLM generation
        web_search: whether to add search
        documents: list of documents
    """

    question: str
    generation: str
    web_search: str
    documents: List[str]


from langchain.schema import Document

### Nodes


def retrieve(state):
    print("---RETRIEVE---")
    question = state["question"]

    # Retrieval
    documents = retriever.invoke(question)
    return {"documents": documents, "question": question}


def generate(state):
    print("---GENERATE---")
    question = state["question"]
    documents = state["documents"]

    # RAG generation
    generation = rag_chain.invoke({"context": documents, "question": question})
    return {"documents": documents, "question": question, "generation": generation}


def grade_documents(state):
    print("---CHECK DOCUMENT RELEVANCE TO QUESTION---")
    question = state["question"]
    documents = state["documents"]

    # Score each doc
    filtered_docs = []
    web_search = "No"
    for d in documents:
        score = retrieval_grader.invoke({"question": question, "document": d.page_content})
        grade = score["score"]
        # Document relevant
        if grade.lower() == "yes":
            print("---GRADE: DOCUMENT RELEVANT---")
            filtered_docs.append(d)
        # Document not relevant
        else:
            print("---GRADE: DOCUMENT NOT RELEVANT---")
            # We do not include the document in filtered_docs
            # We set a flag to indicate that we want to run web search
            web_search = "Yes"
            continue
    return {"documents": filtered_docs, "question": question, "web_search": web_search}


import json


def web_search(state):
    print("---WEB SEARCH---")
    question = state["question"]
    documents = state["documents"]

    # Web search
    docs = web_search_tool.invoke({"query": question})
    docs = json.loads(docs)
    web_results = "\n".join([d["snippet"] for d in docs])
    web_results = Document(page_content=web_results)
    print("Web results:", web_results)
    if documents is not None:
        documents.append(web_results)
    else:
        documents = [web_results]
    return {"documents": documents, "question": question}


### Conditional edge


def route_question(state):
    print("---ROUTE QUESTION---")
    question = state["question"]
    print(question)
    source = question_router.invoke({"question": question})
    print(source)
    print(source["datasource"])
    if source["datasource"] == "web_search":
        print("---ROUTE QUESTION TO WEB SEARCH---")
        return "websearch"
    elif source["datasource"] == "vectorstore":
        print("---ROUTE QUESTION TO RAG---")
        return "vectorstore"


def decide_to_generate(state):
    print("---ASSESS GRADED DOCUMENTS---")
    web_search = state["web_search"]

    if web_search == "Yes":
        print("---DECISION: ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION, INCLUDE WEB SEARCH---")
        return "websearch"
    else:
        # We have relevant documents, so generate answer
        print("---DECISION: GENERATE---")
        return "generate"


### Conditional edge
correct_cnt = 0

def grade_generation_v_documents_and_question(state):
    global correct_cnt
    print("---CHECK HALLUCINATIONS---")
    question = state["question"]
    documents = state["documents"]
    generation = state["generation"]

    if correct_cnt > 2:
        correct_cnt = 0
        return "useful"

    score = hallucination_grader.invoke(
        {"documents": documents, "generation": generation["explanation"]}
    )
    grade = score["score"]

    # Check hallucination
    if grade == "yes":
        print("---DECISION: GENERATION IS GROUNDED IN DOCUMENTS---")
        # Check question-answering
        print("---GRADE GENERATION vs QUESTION---")
        score = answer_grader.invoke({"question": question, "generation": generation})
        grade = score["score"]
        if grade == "yes":
            print("---DECISION: GENERATION ADDRESSES QUESTION---")
            correct_cnt = 0
            return "useful"
        else:
            print("---DECISION: GENERATION DOES NOT ADDRESS QUESTION---")
            correct_cnt += 1
            return "not useful"
    else:
        print("---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY---")
        correct_cnt += 1
        return "not supported"

In [18]:
from langgraph.graph import END, StateGraph

vanilla_workflow = StateGraph(GraphState)
vanilla_workflow.add_node("retrieve", retrieve)
vanilla_workflow.add_node("generate", generate)

vanilla_workflow.set_entry_point("retrieve")
vanilla_workflow.add_edge("retrieve", "generate")
vanilla_workflow.add_edge("generate", END)

vanilla_app = vanilla_workflow.compile()

# Test
inputs = {
    "question": "A short stay or 23-hour ward in a general and academic children's hospital: are they effective?"
}
out = vanilla_app.invoke(inputs)
answer = out["generation"]["answer"]
print(answer)

---RETRIEVE---
---GENERATE---
yes


In [21]:
def eval_rag(rag_app):
    correct = 0

    for i, pmid in enumerate(pubmedqa_dataset):
        if i == 499:
            break
        print(f"Question {i+1} / 500")
        inputs = {"question": pubmedqa_dataset[pmid]["QUESTION"]}
        out = rag_app.invoke(inputs)
        answer = out["generation"]["answer"]
        if answer == pubmedqa_dataset[pmid]["final_decision"]:
            correct += 1
        print(answer, pubmedqa_dataset[pmid]["final_decision"])
        print("Accuracy: ", correct / (i + 1))

In [169]:
eval_rag(vanilla_app)

Question 1 / 500
---RETRIEVE---
---GENERATE---
yes yes
Accuracy:  1.0
Question 2 / 500
---RETRIEVE---
---GENERATE---
yes no
Accuracy:  0.5
Question 3 / 500
---RETRIEVE---
---GENERATE---
yes yes
Accuracy:  0.6666666666666666
Question 4 / 500
---RETRIEVE---
---GENERATE---
no no
Accuracy:  0.75
Question 5 / 500
---RETRIEVE---
---GENERATE---
yes yes
Accuracy:  0.8
Question 6 / 500
---RETRIEVE---
---GENERATE---
yes yes
Accuracy:  0.8333333333333334
Question 7 / 500
---RETRIEVE---
---GENERATE---
yes maybe
Accuracy:  0.7142857142857143
Question 8 / 500
---RETRIEVE---
---GENERATE---
yes no
Accuracy:  0.625
Question 9 / 500
---RETRIEVE---
---GENERATE---
no no
Accuracy:  0.6666666666666666
Question 10 / 500
---RETRIEVE---
---GENERATE---
yes yes
Accuracy:  0.7
Question 11 / 500
---RETRIEVE---
---GENERATE---
yes yes
Accuracy:  0.7272727272727273
Question 12 / 500
---RETRIEVE---
---GENERATE---
no no
Accuracy:  0.75
Question 13 / 500
---RETRIEVE---
---GENERATE---
no yes
Accuracy:  0.6923076923076923

In [186]:
corrective_workflow = StateGraph(GraphState)
corrective_workflow.add_node("retrieve", retrieve)
corrective_workflow.add_node("generate", generate)
corrective_workflow.add_node("grade_documents", grade_documents) # grade documents
corrective_workflow.add_node("websearch", web_search) # grade documents


corrective_workflow.set_entry_point("retrieve")
corrective_workflow.add_edge("retrieve", "grade_documents")
corrective_workflow.add_conditional_edges(
    "grade_documents",
    decide_to_generate,
    {
        "websearch": "websearch",
        "generate": "generate",
    },
)
corrective_workflow.add_edge("websearch", "generate")
# corrective_workflow.add_conditional_edges(
#     "generate",
#     grade_generation_v_documents_and_question,
#     {
#         "not supported": "generate",
#         "useful": END,
#         "not useful": "websearch",
#     },
# )
corrective_workflow.add_edge("generate", END)

corrective_app = corrective_workflow.compile()

In [187]:
eval_rag(corrective_app)

Question 1 / 500
---RETRIEVE---
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: GENERATE---
---GENERATE---
yes yes
Accuracy:  1.0
Question 2 / 500
---RETRIEVE---
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: GENERATE---
---GENERATE---
yes no
Accuracy:  0.5
Question 3 / 500
---RETRIEVE---
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION, INCLUDE WEB SEARCH---
---WEB SEARCH---
Web results: page_content='&quot;Aquagenic maladies&quot; could be <strong>a</st

In [19]:
selfrag_workflow = StateGraph(GraphState)
selfrag_workflow.add_node("retrieve", retrieve)
selfrag_workflow.add_node("generate", generate)
selfrag_workflow.add_node("grade_documents", grade_documents) # grade documents
selfrag_workflow.add_node("websearch", web_search) # grade documents


selfrag_workflow.set_entry_point("retrieve")
selfrag_workflow.add_edge("retrieve", "grade_documents")
selfrag_workflow.add_conditional_edges(
    "grade_documents",
    decide_to_generate,
    {
        "websearch": "websearch",
        "generate": "generate",
    },
)
selfrag_workflow.add_edge("websearch", "generate")
selfrag_workflow.add_conditional_edges(
    "generate",
    grade_generation_v_documents_and_question,
    {
        "not supported": "generate",
        "useful": END,
        "not useful": "websearch",
    },
)

selfrag_app = selfrag_workflow.compile()

In [24]:
eval_rag(selfrag_app)

Question 1 / 500
---RETRIEVE---
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: GENERATE---
---GENERATE---
---CHECK HALLUCINATIONS---
---DECISION: GENERATION IS GROUNDED IN DOCUMENTS---
---GRADE GENERATION vs QUESTION---
---DECISION: GENERATION ADDRESSES QUESTION---
yes yes
Accuracy:  1.0
Question 2 / 500
---RETRIEVE---
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---ASSESS GRADED DOCUMENTS---
---DECISION: GENERATE---
---GENERATE---
---CHECK HALLUCINATIONS---
---DECISION: GENERATION IS GROUNDED IN DOCUMENTS---
---GRADE GENERATION vs QUESTION---
---DECISION: GENERATION ADDRESSES QUESTION---
yes no
Accuracy:  0.5
Question 3 / 500
---RETRIEVE---
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT REL