Setup + Launch Phoenix Client

In [1]:
import nest_asyncio

nest_asyncio.apply()

import os
from getpass import getpass

import pandas as pd
import phoenix as px
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, set_global_handler
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.llms.openai import OpenAI

# Supply the OpenAI API key without writing it into the notebook: reuse a key
# that is already exported in the environment, otherwise prompt for it with
# hidden input. Assigning "" here would overwrite a valid exported key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Launch phoenix observability UI locally
# The call prints the URL of the app and, as the last expression of the cell,
# its return value (the session object) is shown as the cell output.
px.launch_app()

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


<phoenix.session.session.ThreadSession at 0x1071aed30>

In [9]:
# Run this if you need to reset the UI at any point
# Optional cell: closes the running Phoenix app and launches a new one.
# NOTE(review): presumably traces collected so far are not carried over to the
# new session - confirm before relying on earlier traces.

px.close_app()
px.launch_app()

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


<phoenix.session.session.ThreadSession at 0x2c4d1c7c0>

In [3]:
# Set global handler from llamaindex
# Registers "arize_phoenix" as the LlamaIndex global handler so that the
# queries issued in later cells are expected to show up in the Phoenix UI.
set_global_handler("arize_phoenix")

Load file, begin chunking, sample query

In [4]:
# create function to load and chunk pdf given a file path

def load_and_chunk(path, chunk_size=512):
    """Load a single file and build a vector index over its chunks.

    Args:
        path: Path of the file to read (this notebook passes a PDF).
        chunk_size: Chunk size handed to ``SimpleNodeParser.from_defaults``.
            Defaults to 512, the naive baseline that was previously
            hard-coded here.

    Returns:
        The ``VectorStoreIndex`` built from the parsed nodes.
    """
    reader = SimpleDirectoryReader(input_files=[path])
    documents = reader.load_data()

    # Split the loaded documents into nodes of the requested chunk size.
    node_parser = SimpleNodeParser.from_defaults(chunk_size=chunk_size)
    nodes = node_parser.get_nodes_from_documents(documents)
    vector_index = VectorStoreIndex(nodes)

    return vector_index

In [5]:
# Location of the PDF to index. Set RAG_EVAL_PDF_PATH to point at your own copy;
# the fallback is the path this notebook was originally run against.
pdf_path = os.environ.get(
    "RAG_EVAL_PDF_PATH",
    '/Users/reymerekar/Desktop/arize_rag_eval/honors_thesis.pdf',
)
vector_index = load_and_chunk(pdf_path)
query_engine = vector_index.as_query_engine()

# Test query
response_vector = query_engine.query("What does the author say about wearables?")
print(response_vector)

# This now should also be logged in the UI

The author discusses the growing popularity of wearables such as Apple Watch and FitBit, highlighting how these devices are generating increasing amounts of data. This data transmission from wearables is part of the Internet of Medical Things (IoMT), which is essentially the healthcare-focused extension of the Internet of Things. Wearables equipped with Wi-Fi enable machine-to-machine interactions, allowing healthcare professionals to gather data for purposes like disease prediction, monitoring patient status, and aiding in drug development.


Generate Q+A pairs for testing and evaluation

In [None]:
# Let's construct a dataframe of just the documents that are in our index.
# `nodes` only exists as a local variable inside load_and_chunk, so referencing
# it here raises NameError on a fresh kernel. Read the chunks back from the
# index's docstore instead.
nodes = list(vector_index.docstore.docs.values())
document_chunks_df = pd.DataFrame({"text": [node.get_text() for node in nodes]})
document_chunks_df.head()

In [None]:
# generate questions per chunk

# Prompt used for question generation. The `{text}` placeholder matches the
# `text` column of the chunk dataframe, and the requested JSON keys
# (question_1..question_3) are what the downstream melt step turns into rows.
# A stray unmatched double quote after "provided." was removed from the prompt.
generate_questions_template = """\
Context information is below.

---------------------
{text}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Teacher/ Professor. Your task is to setup \
3 questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided.

Output the questions in JSON format with the keys question_1, question_2, question_3.
"""

In [None]:
import json

from phoenix.evals import OpenAIModel, llm_generate


def output_parser(response: str, index: int):
    """Decode one LLM response as JSON.

    Args:
        response: Raw text returned by the model.
        index: Position argument supplied by the caller; not used here.

    Returns:
        The decoded JSON value, or ``{"__error__": <message>}`` when the
        response is not valid JSON.
    """
    try:
        parsed = json.loads(response)
    except json.JSONDecodeError as decode_error:
        # Report the failure as data instead of raising.
        parsed = {"__error__": str(decode_error)}
    return parsed


# Run the question-generation prompt over every chunk. Responses that are not
# valid JSON come back from output_parser as `__error__` entries instead of
# aborting the run.
questions_df = llm_generate(
    dataframe=document_chunks_df,
    template=generate_questions_template,
    model=OpenAIModel(
        # Use the `model` keyword (rather than `model_name`) so this call is
        # consistent with the OpenAIModel(model=...) call used for the
        # relevance evaluator later in this notebook.
        model="gpt-3.5-turbo",
    ),
    output_parser=output_parser,
    concurrency=20,
)


questions_df.head()

In [None]:
# Construct a dataframe of the questions and the document chunks.
# Drop output_parser's `__error__` column first (if present) so that JSON
# parse-error messages are not melted into the "question" column as if they
# were generated questions.
generated_questions_df = questions_df.drop(columns=["__error__"], errors="ignore")
questions_with_document_chunk_df = pd.concat([generated_questions_df, document_chunks_df], axis=1)
questions_with_document_chunk_df = questions_with_document_chunk_df.melt(
    id_vars=["text"], value_name="question"
).drop("variable", axis=1)
# If the above step was interrupted, there might be questions missing. Let's run this to clean up the dataframe.
questions_with_document_chunk_df = questions_with_document_chunk_df[
    questions_with_document_chunk_df["question"].notnull()
]

# optionally export if desired; index=False keeps the row index out of the file
# so reading it back does not produce a stray "Unnamed: 0" column
questions_with_document_chunk_df.to_csv('questions_with_document_chunk.csv', index=False)

# Preview goes last so it is actually rendered as the cell output.
questions_with_document_chunk_df.head()

In [6]:
# Read in the csv file that contains the chunks along with their questions. These
# questions were generated using llm_generate from phoenix.evals (see the example
# cells above, which were not run in this session).
questions_with_document_chunk_df = pd.read_csv('questions_with_document_chunk.csv')

In [7]:
# Print the first 25 generated questions, one per line.
preview_questions = questions_with_document_chunk_df.head(25)["question"]
for question in preview_questions:
    print(question)

What is the title of the thesis written by Reyhan Merekar at CUNY Baruch College?
What is the main focus of Reyhan Merekar's research project as outlined in the context information?
Explain the significance of leveraging artificial intelligence in the context of healthcare as discussed in Chapter 2.
Who is the Faculty Advisor mentioned in the acknowledgements section and what role did they play in the project?
How did the study in the healthcare industry aim to use Machine Learning and analytic methods to predict one-year heart transplantation outcomes?
What is the definition of a heart transplant according to the National Heart, Lung, and Blood Institute?
How is the presence of technology, particularly data science, impacting healthcare, specifically in the field of heart transplantation?
Explain the concept of Machine Learning in the context of healthcare and how it differs from traditional methods of decision-making.
How can virtual visits and telehealth help caregivers in dealing w

In [10]:
# Send the first 10 questions (kept small for testing) through the LlamaIndex
# query engine and print each question with its answer.
# Each query should be recorded in the Phoenix UI.
for question in questions_with_document_chunk_df.head(10)["question"]:
    response_vector = query_engine.query(question)
    print(f"Question: {question}\nAnswer: {response_vector.response}\n")

Question: What is the title of the thesis written by Reyhan Merekar at CUNY Baruch College?
Answer: The title of the thesis written by Reyhan Merekar at CUNY Baruch College is "Emerging Technologies in Healthcare: Analysis of UNOS Data Through Machine Learning."

Question: What is the main focus of Reyhan Merekar's research project as outlined in the context information?
Answer: The main focus of Reyhan Merekar's research project is the analysis of UNOS data through machine learning in the context of emerging technologies in healthcare.

Question: Explain the significance of leveraging artificial intelligence in the context of healthcare as discussed in Chapter 2.
Answer: The significance of leveraging artificial intelligence in healthcare lies in its ability to contribute to the evolving landscape of medicine. By utilizing AI, particularly Machine Learning, healthcare providers can move towards a value-based model focused on delivering excellent population health. This shift emphasize

Begin Evaluating RAG Pipeline

In [11]:
from phoenix.session.evaluation import get_retrieved_documents

# create dataframe for retrieved documents right from Phoenix client
# In the output below, the frame is indexed by (context.span_id, document_position)
# and has the columns context.trace_id, input, reference and document_score.

retrieved_documents_df = get_retrieved_documents(px.Client())
retrieved_documents_df

Unnamed: 0_level_0,Unnamed: 1_level_0,context.trace_id,input,reference,document_score
context.span_id,document_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
60594b1e3a47279a,0,eb461852acb54040761ccfae646d78f5,How is AI being used in healthcare to enhance ...,Merekar 3 \n \nChapter 2: Backgr ound & Additi...,0.882567
60594b1e3a47279a,1,eb461852acb54040761ccfae646d78f5,How is AI being used in healthcare to enhance ...,Merekar 5 \n \n AI aims to mimic human cogniti...,0.874282
5a32119835c66df4,0,4a53a3d8149508df19cd0c36fec1a8d2,How can virtual visits and telehealth help car...,Merekar 4 \n \nVirtual Visits and Wearables \...,0.842277
5a32119835c66df4,1,4a53a3d8149508df19cd0c36fec1a8d2,How can virtual visits and telehealth help car...,Merekar 30 \n \nthe complexity and volume of h...,0.807607
85c9d2090727fd97,0,385813fdca634be3d0a05426255834b5,Explain the concept of Machine Learning in the...,Merekar 3 \n \nChapter 2: Backgr ound & Additi...,0.875977
85c9d2090727fd97,1,385813fdca634be3d0a05426255834b5,Explain the concept of Machine Learning in the...,Merekar 2 \n \nThe p resence of technology in ...,0.853519
ce7515c660a55072,0,974ce0eec69471c544b86c38db285687,"How is the presence of technology, particularl...",Merekar 2 \n \nThe p resence of technology in ...,0.902275
ce7515c660a55072,1,974ce0eec69471c544b86c38db285687,"How is the presence of technology, particularl...",Merekar ii \nAbstract \n The healthcare indu...,0.855199
9e585d5899402d9a,0,55c9011046d3c25180a96d196d343faa,What is the definition of a heart transplant a...,Merekar 1 \n \nChapter 1: Introduction \nHear...,0.860118
9e585d5899402d9a,1,55c9011046d3c25180a96d196d343faa,What is the definition of a heart transplant a...,"DOI.org (Crossref) , \ndoi:10.1016/j.athoracsu...",0.811721


Phoenix's LLM Evals are used to evaluate the relevance of the retrieved documents with regard to the query. The evals can also include explanations, in which the LLM is prompted to explain its reasoning. This can be useful for debugging and for figuring out potential corrective actions.

In [13]:
from phoenix.evals import (
    RelevanceEvaluator,
    run_evals,
    OpenAIModel
)

# LLM-as-a-judge relevance scoring

judge_model = OpenAIModel(model="gpt-4-turbo-preview")
relevance_evaluator = RelevanceEvaluator(judge_model)

# Only one evaluator is supplied, so the first entry of the result is the one
# we want (presumably run_evals returns one dataframe per evaluator - confirm).
evaluation_results = run_evals(
    dataframe=retrieved_documents_df,
    evaluators=[relevance_evaluator],
    provide_explanation=True,
    concurrency=20,
)
retrieved_documents_relevance_df = evaluation_results[0]

run_evals |██████████| 20/20 (100.0%) | ⏳ 00:21<00:00 |  1.07s/it


In [14]:
# Take a look at the dataframe with relevance scores
# (columns shown in the output below: label, score, explanation)
retrieved_documents_relevance_df

Unnamed: 0_level_0,Unnamed: 1_level_0,label,score,explanation
context.span_id,document_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
60594b1e3a47279a,0,relevant,1,The question asks how AI is being used in heal...
60594b1e3a47279a,1,relevant,1,The reference text directly addresses the ques...
5a32119835c66df4,0,relevant,1,The question asks how virtual visits and teleh...
5a32119835c66df4,1,unrelated,0,The question asks about how virtual visits and...
85c9d2090727fd97,0,relevant,1,The reference text provides a detailed explana...
85c9d2090727fd97,1,relevant,1,The question asks for an explanation of the co...
ce7515c660a55072,0,relevant,1,The question asks about the impact of technolo...
ce7515c660a55072,1,relevant,1,The question asks about the impact of technolo...
9e585d5899402d9a,0,relevant,1,The question asks for the definition of a hear...
9e585d5899402d9a,1,unrelated,0,The question asks for the definition of a hear...


We can now combine the documents with the relevance evaluations to compute retrieval metrics. These metrics will help us understand how well the RAG system is performing.

In [16]:
# Prefix the evaluation columns with "eval_" so they are easy to tell apart,
# then place them next to the retrieved documents column-wise.
relevance_columns_df = retrieved_documents_relevance_df.add_prefix("eval_")
documents_with_relevance_df = pd.concat(
    [retrieved_documents_df, relevance_columns_df],
    axis=1,
)

documents_with_relevance_df

Unnamed: 0_level_0,Unnamed: 1_level_0,context.trace_id,input,reference,document_score,eval_label,eval_score,eval_explanation
context.span_id,document_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
60594b1e3a47279a,0,eb461852acb54040761ccfae646d78f5,How is AI being used in healthcare to enhance ...,Merekar 3 \n \nChapter 2: Backgr ound & Additi...,0.882567,relevant,1,The question asks how AI is being used in heal...
60594b1e3a47279a,1,eb461852acb54040761ccfae646d78f5,How is AI being used in healthcare to enhance ...,Merekar 5 \n \n AI aims to mimic human cogniti...,0.874282,relevant,1,The reference text directly addresses the ques...
5a32119835c66df4,0,4a53a3d8149508df19cd0c36fec1a8d2,How can virtual visits and telehealth help car...,Merekar 4 \n \nVirtual Visits and Wearables \...,0.842277,relevant,1,The question asks how virtual visits and teleh...
5a32119835c66df4,1,4a53a3d8149508df19cd0c36fec1a8d2,How can virtual visits and telehealth help car...,Merekar 30 \n \nthe complexity and volume of h...,0.807607,unrelated,0,The question asks about how virtual visits and...
85c9d2090727fd97,0,385813fdca634be3d0a05426255834b5,Explain the concept of Machine Learning in the...,Merekar 3 \n \nChapter 2: Backgr ound & Additi...,0.875977,relevant,1,The reference text provides a detailed explana...
85c9d2090727fd97,1,385813fdca634be3d0a05426255834b5,Explain the concept of Machine Learning in the...,Merekar 2 \n \nThe p resence of technology in ...,0.853519,relevant,1,The question asks for an explanation of the co...
ce7515c660a55072,0,974ce0eec69471c544b86c38db285687,"How is the presence of technology, particularl...",Merekar 2 \n \nThe p resence of technology in ...,0.902275,relevant,1,The question asks about the impact of technolo...
ce7515c660a55072,1,974ce0eec69471c544b86c38db285687,"How is the presence of technology, particularl...",Merekar ii \nAbstract \n The healthcare indu...,0.855199,relevant,1,The question asks about the impact of technolo...
9e585d5899402d9a,0,55c9011046d3c25180a96d196d343faa,What is the definition of a heart transplant a...,Merekar 1 \n \nChapter 1: Introduction \nHear...,0.860118,relevant,1,The question asks for the definition of a hear...
9e585d5899402d9a,1,55c9011046d3c25180a96d196d343faa,What is the definition of a heart transplant a...,"DOI.org (Crossref) , \ndoi:10.1016/j.athoracsu...",0.811721,unrelated,0,The question asks for the definition of a hear...


Finally, export and aggregate results for reporting

In [17]:
# Count how many retrieved documents received each relevance label, and write
# the combined documents + evaluations frame to disk for reporting.

label_groups = documents_with_relevance_df.groupby('eval_label')
score_counts = label_groups.size()
documents_with_relevance_df.to_csv('documents_with_relevance.csv')

score_counts
    

eval_label
relevant     17
unrelated     3
dtype: int64

In [6]:
# Close connection to Phoenix client
# Run this last: it shuts down the Phoenix app launched earlier in the notebook.

px.close_app()