<a href="https://colab.research.google.com/github/sarunsmenon/llm/blob/main/LLM_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Libraries

In [5]:
! pip install -q deepeval python-dotenv openai langchain-openai langchain langchain_community faiss-cpu  uuid7 langgraph chromadb aiobotocore

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/78.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.9/78.9 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/13.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/13.3 MB[0m [31m158.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m12.4/13.3 MB[0m [31m190.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m13.3/13.3 MB[0m [31m185.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m101.9 MB/s[0m eta [36m0:00:00[0m
[?25h

# Load Libraries

In [6]:
from google.colab import userdata
import os
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import chromadb
import random

In [24]:
from deepeval.synthesizer import Synthesizer
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.dataset import EvaluationDataset
from deepeval.test_case import LLMTestCase

In [8]:
import requests
import chromadb
from bs4 import BeautifulSoup
import openai
import hashlib
import time
import re

# Load Variables

In [9]:
os.environ['OPENAI_API_KEY'] = userdata.get('open_ai_key')

In [10]:
llm_model = 'gpt-3.5-turbo-1106'
llm = ChatOpenAI(model=llm_model, temperature=0)

In [11]:
# Start crawling from the initial URL
ignore_lst = []
max_pg_lmt = 5000
fldr = '/content/drive/MyDrive/GenAI - Hack/mhdb'

# Create Chroma Db

In [None]:


def download_and_store_urls_with_openai(urls, openai_api_key, collection_name="web_content"):
    """
    Download content from URLs and store in ChromaDB using OpenAI embeddings

    Args:
        urls: List of URLs to download
        openai_api_key: Your OpenAI API key
        collection_name: Name of the ChromaDB collection
    """
    # Set OpenAI API key
    openai.api_key = openai_api_key

    # Initialize ChromaDB
    client = chromadb.Client()

    # Create or get collection - note we're not providing embeddings
    # ChromaDB will use the documents to create embeddings
    collection = client.get_or_create_collection(
        name=collection_name,
        metadata={"description": "Web content with OpenAI embeddings"}
    )

    # Process each URL
    for url in urls:
        try:
            print(f"Processing {url}")

            # Download content with timeout and headers
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            # Parse HTML
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract text content
            for script in soup(["script", "style", "header", "footer", "nav"]):
                script.extract()

            # Get text and clean it
            text = soup.get_text()
            text = re.sub(r'\s+', ' ', text).strip()

            # Break into chunks
            chunks = split_into_chunks(text, chunk_size=1000)

            # Process each chunk
            for i, chunk in enumerate(chunks):
                if not chunk.strip():
                    continue

                # Create a unique ID
                doc_id = hashlib.md5(f"{url}_{i}".encode()).hexdigest()

                # Generate OpenAI embedding
                embedding_response = openai.embeddings.create(
                    model="text-embedding-ada-002",
                    input=chunk
                )
                embedding = embedding_response.data[0].embedding

                # Store in ChromaDB
                collection.add(
                    ids=[doc_id],
                    documents=[chunk],
                    embeddings=[embedding],
                    metadatas=[{
                        "url": url,
                        "chunk_number": i,
                        "total_chunks": len(chunks),
                        "timestamp": time.time()
                    }]
                )

            print(f"Successfully stored {len(chunks)} chunks from {url}")

        except Exception as e:
            print(f"Error processing {url}: {str(e)}")
            continue

        # Be nice to servers
        time.sleep(1)

    print(f"Completed processing {len(urls)} URLs")
    return collection

def split_into_chunks(text, chunk_size=1000, overlap=100):
    """Split text into overlapping chunks of approximately chunk_size characters"""
    chunks = []
    if len(text) <= chunk_size:
        chunks.append(text)
    else:
        start = 0
        while start < len(text):
            end = start + chunk_size
            if end >= len(text):
                chunks.append(text[start:])
            else:
                # Try to find a good breaking point
                break_point = text.rfind(". ", start, end)
                if break_point != -1:
                    chunks.append(text[start:break_point+1])
                    start = break_point + 1 - overlap
                else:
                    chunks.append(text[start:end])
                    start = end - overlap
            start = max(start, 0)
    return chunks

# Example usage
urls = [
    "https://example.com",
    "https://en.wikipedia.org/wiki/Vector_database"
]

collection = download_and_store_urls_with_openai(
    urls,
    openai_api_key="your-openai-api-key"
)

# Load Contexts

## Load Chroma Collection

In [12]:
chroma_client = chromadb.PersistentClient(path=fldr)
collection = chroma_client.get_or_create_collection(name="semantic")

## Load Contexts

In [13]:
all_contexts = collection.get(include=["documents"])["documents"]

In [14]:
sample_context = random.sample(all_contexts, 10)

In [15]:
len(sample_context)

10

In [16]:
sample_context_lst = []
for context in sample_context:
  sample_context_lst.append([context])

# Synthesizer

In [27]:
synthesizer = Synthesizer(model=llm_model)
test_cases = synthesizer.generate_goldens_from_contexts(
    # Provide a list of context for synthetic data generation
    contexts=sample_context_lst,
    max_goldens_per_context = 2
)

✨ Generating up to 20 goldens using DeepEval (using gpt-3.5-turbo-1106, method=default): 100%|██████████| 20/20 [00:13<00:00,  1.49it/s]


In [28]:
dataframe = synthesizer.to_pandas()
print(dataframe)

                                                input actual_output  \
0   What is the method for reaching out to your tr...          None   
1   What support is available for managing increas...          None   
2   Provide a summary of the findings that highlig...          None   
3   How can updates for the Services Directory be ...          None   
4   Imagine a situation where you urgently needed ...          None   
5   What self-administered tool can be used to mon...          None   
6   Examine the content of the Trauma Informed Lea...          None   
7   How many mental health plans does the State ha...          None   
8   Which publication contains a study exploring e...          None   
9   How does stigma and abuse contribute to higher...          None   
10  What are the obstacles causing the fragmentati...          None   
11  Does the Department of Health oversee the mana...          None   
12  What is the objective of the Department of Hea...          None   
13  Wh

# Evaluation

In [19]:
synthesizer.synthetic_goldens[0].input

'Where can I submit updates for the Services Directory Aged Services, including metropolitan and rural maps, and contact details for general enquiries?'

In [20]:
synthesizer.synthetic_goldens[0]

Golden(input='Where can I submit updates for the Services Directory Aged Services, including metropolitan and rural maps, and contact details for general enquiries?', actual_output=None, expected_output='You can submit updates for the Services Directory Aged Services, including metropolitan and rural maps, by clicking on the provided link. For general enquiries, you can contact the Department of Health at 61 3 90960000.', context=['Click \r\n                    here to send us your update to the Services Directory Aged Services - Metropolitan Maps Central & Outer\r\n              East Dandenong Inner & North\r\n              West Inner\r\n              South East Inner\r\n              Urban East & Northcote Mid & South\r\n              West Middle South Northern & North\r\n              East Peninsula Aged Services -\r\n                Rural Maps Barwon Gippsland Glenelg (South Western) Goulburn & Southern Grampians Loddon Campaspe / Southern Mallee North East Hume Northern\r\n       

In [None]:
type(synthesizer.synthetic_goldens[0])

In [34]:

# Prepare evaluation metrics
metrics_lst = [
    AnswerRelevancyMetric(threshold=0.7),
    # FaithfulnessMetric(threshold=0.7)
]

# Create test cases from goldens
test_case = []
for golden in synthesizer.synthetic_goldens:
    res = llm.invoke(golden.input).content
    test_case = LLMTestCase(input=golden.input, actual_output=res, expected_output=golden.expected_output)
    print(test_case)
    test_cases.append(test_case)
    evaluate(test_cases=[test_case], metrics=metrics_lst)



# Evaluate end-to-end
# evaluate(test_cases=test_cases, metrics=[AnswerRelevancyMetric()])



LLMTestCase(input='What is the method for reaching out to your treating team for help?', actual_output="1. Contact your primary care physician or therapist: If you have a primary care physician or therapist, reach out to them first. They can provide guidance and support or refer you to a specialist if needed.\n\n2. Use the contact information provided by your treating team: If you have been given contact information for your treating team, such as a phone number or email address, use that to reach out for help.\n\n3. Schedule an appointment: If you have a regular appointment scheduled with your treating team, make sure to attend it and discuss your concerns during the appointment.\n\n4. Utilize telehealth options: Many healthcare providers offer telehealth services, allowing you to connect with your treating team remotely through video calls or phone consultations.\n\n5. Seek emergency help if needed: If you are in crisis or need immediate assistance, do not hesitate to call emergency 

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:09,  9.83s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 0.8235294117647058, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 0.82 because the response effectively addresses the method for reaching out to the treating team, but includes some irrelevant statements about emotional encouragement and the role of the treating team, which do not directly answer the question., error: None)

For test case:

  - input: What is the method for reaching out to your treating team for help?
  - actual output: 1. Contact your primary care physician or therapist: If you have a primary care physician or therapist, reach out to them first. They can provide guidance and support or refer you to a specialist if needed.

2. Use the contact information provided by your treating team: If you have been given contact information for your treating team, such as a phone number or email address, use that to reach out for help.

3. Schedule an appointment: If you have a regular appointment sch




LLMTestCase(input='What support is available for managing increased alcohol or substance use during the pandemic?', actual_output="1. Online support groups: Many organizations and support groups have moved their meetings and support sessions online, providing a safe space for individuals to connect with others who are experiencing similar challenges.\n\n2. Telehealth services: Many healthcare providers are offering telehealth services for individuals struggling with alcohol or substance use, providing access to therapy, counseling, and medication management from the comfort of their own homes.\n\n3. Hotlines and helplines: Various hotlines and helplines are available for individuals in need of immediate support or guidance, providing access to trained professionals who can offer assistance and resources.\n\n4. Virtual therapy and counseling: Many therapists and counselors are offering virtual sessions to support individuals in managing their alcohol or substance use, providing a safe a

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.69s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response perfectly addressed the question about support for managing increased alcohol or substance use during the pandemic, with no irrelevant information included. Great job on staying focused and relevant!, error: None)

For test case:

  - input: What support is available for managing increased alcohol or substance use during the pandemic?
  - actual output: 1. Online support groups: Many organizations and support groups have moved their meetings and support sessions online, providing a safe space for individuals to connect with others who are experiencing similar challenges.

2. Telehealth services: Many healthcare providers are offering telehealth services for individuals struggling with alcohol or substance use, providing access to therapy, counseling, and medication management from the comfort of their own homes.

3. Hotlines and 




LLMTestCase(input='Provide a summary of the findings that highlight how eating disorders impact diverse lesbian, gay, and bisexual populations.', actual_output='Research has shown that eating disorders disproportionately affect lesbian, gay, and bisexual (LGB) populations compared to their heterosexual counterparts. LGB individuals are more likely to experience body dissatisfaction, disordered eating behaviors, and higher rates of eating disorders. This is often attributed to the unique stressors and discrimination faced by LGB individuals, such as minority stress, internalized homophobia, and societal pressure to conform to narrow beauty standards. Additionally, LGB individuals may face barriers to accessing appropriate and affirming treatment for their eating disorders. It is important for healthcare providers to be aware of these disparities and provide culturally competent care for LGB individuals with eating disorders.', expected_output='The findings suggest that eating disorders 

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.97s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the output perfectly addresses the input by providing a comprehensive summary of how eating disorders affect diverse lesbian, gay, and bisexual populations. There are no irrelevant statements, ensuring the response is focused and relevant., error: None)

For test case:

  - input: Provide a summary of the findings that highlight how eating disorders impact diverse lesbian, gay, and bisexual populations.
  - actual output: Research has shown that eating disorders disproportionately affect lesbian, gay, and bisexual (LGB) populations compared to their heterosexual counterparts. LGB individuals are more likely to experience body dissatisfaction, disordered eating behaviors, and higher rates of eating disorders. This is often attributed to the unique stressors and discrimination faced by LGB individuals, such as minority stress, internalized homo




LLMTestCase(input='How can updates for the Services Directory be submitted to the Department of Health and Human Services, State Government of Victoria, Australia?', actual_output="Updates for the Services Directory can be submitted to the Department of Health and Human Services, State Government of Victoria, Australia through the following methods:\n\n1. Online submission: The Department of Health and Human Services may have an online portal or form where updates can be submitted. This could include changes to contact information, services offered, or any other relevant details.\n\n2. Email: You can send an email to the Department of Health and Human Services with the updated information. The email address for submitting updates may be available on their official website or through direct contact with the department.\n\n3. Phone: You can call the Department of Health and Human Services and speak to a representative about submitting updates for the Services Directory. They may provide 

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:08,  8.20s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response perfectly addresses the query without any irrelevant information, providing a clear and direct answer to the question about submitting updates to the Services Directory., error: None)

For test case:

  - input: How can updates for the Services Directory be submitted to the Department of Health and Human Services, State Government of Victoria, Australia?
  - actual output: Updates for the Services Directory can be submitted to the Department of Health and Human Services, State Government of Victoria, Australia through the following methods:

1. Online submission: The Department of Health and Human Services may have an online portal or form where updates can be submitted. This could include changes to contact information, services offered, or any other relevant details.

2. Email: You can send an email to the Department of Health 




LLMTestCase(input="Imagine a situation where you urgently needed to contact Barwon Health's triage service. How will you find their contact information?", actual_output='If I urgently needed to contact Barwon Health\'s triage service, I would first search for their contact information online. I would visit their official website and look for a "Contact Us" or "Emergency Services" section. If I couldn\'t find the information there, I would search for their phone number on a search engine or call directory assistance for their contact details. If I still couldn\'t find the information, I would consider reaching out to a local hospital or healthcare provider for assistance in getting in touch with Barwon Health\'s triage service.', expected_output="You can find Barwon Health's triage service contact information by looking under the contact section of their website or by calling 1300 094 187.", context=None, retrieval_context=None, additional_metadata=None, comments=None, tools_called=None

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.88s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response perfectly addresses the input by providing clear and relevant information on how to find Barwon Health's triage service contact details. There are no irrelevant statements, making it a precise and focused answer., error: None)

For test case:

  - input: Imagine a situation where you urgently needed to contact Barwon Health's triage service. How will you find their contact information?
  - actual output: If I urgently needed to contact Barwon Health's triage service, I would first search for their contact information online. I would visit their official website and look for a "Contact Us" or "Emergency Services" section. If I couldn't find the information there, I would search for their phone number on a search engine or call directory assistance for their contact details. If I still couldn't find the information, I would conside




LLMTestCase(input='What self-administered tool can be used to monitor and assess alcohol intake for potential related health risks?', actual_output='A self-administered tool that can be used to monitor and assess alcohol intake for potential related health risks is the Alcohol Use Disorders Identification Test (AUDIT). This is a simple and widely used screening tool that can help individuals assess their alcohol consumption and identify potential risks for alcohol-related health problems.', expected_output='You can use the AUDIT tool to monitor and assess alcohol intake for potential related health risks.', context=None, retrieval_context=None, additional_metadata=None, comments=None, tools_called=None, expected_tools=None, token_cost=None, completion_time=None, name=None)


Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:04,  4.38s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response perfectly addresses the question by providing a relevant self-administered tool for monitoring alcohol intake, with no irrelevant information included. Great job on staying focused and on point!, error: None)

For test case:

  - input: What self-administered tool can be used to monitor and assess alcohol intake for potential related health risks?
  - actual output: A self-administered tool that can be used to monitor and assess alcohol intake for potential related health risks is the Alcohol Use Disorders Identification Test (AUDIT). This is a simple and widely used screening tool that can help individuals assess their alcohol consumption and identify potential risks for alcohol-related health problems.
  - expected output: You can use the AUDIT tool to monitor and assess alcohol intake for potential related health risks.
  - co




LLMTestCase(input='Examine the content of the Trauma Informed Leadership course presented by Foundation House and MHV.', actual_output='The Trauma Informed Leadership course presented by Foundation House and MHV focuses on providing leaders with the knowledge and skills to create trauma-informed organizations. The course content includes understanding the impact of trauma on individuals and communities, recognizing trauma symptoms and responses, and implementing trauma-informed practices in leadership roles.\n\nThe course covers topics such as the neurobiology of trauma, the prevalence of trauma in different populations, and the importance of creating safe and supportive environments for those who have experienced trauma. It also addresses the role of leadership in promoting trauma-informed care, including the need for self-awareness, empathy, and effective communication.\n\nParticipants in the course will learn about the principles of trauma-informed care, including safety, trustworth

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:10, 10.09s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response perfectly aligns with the input, addressing the content of the Trauma Informed Leadership course without any irrelevant information. Great job on maintaining focus and clarity!, error: None)

For test case:

  - input: Examine the content of the Trauma Informed Leadership course presented by Foundation House and MHV.
  - actual output: The Trauma Informed Leadership course presented by Foundation House and MHV focuses on providing leaders with the knowledge and skills to create trauma-informed organizations. The course content includes understanding the impact of trauma on individuals and communities, recognizing trauma symptoms and responses, and implementing trauma-informed practices in leadership roles.

The course covers topics such as the neurobiology of trauma, the prevalence of trauma in different populations, and the impo




LLMTestCase(input='How many mental health plans does the State have, and what are the implications of this fragmented system?', actual_output='The number of mental health plans and their implications can vary by state. In general, a fragmented mental health system can lead to difficulties in accessing care, lack of coordination between different providers, and challenges in providing comprehensive and integrated care for individuals with mental health needs. This can result in gaps in services, inconsistent quality of care, and increased burden on individuals and families seeking mental health support. It can also lead to increased costs for the healthcare system as a whole. Efforts to integrate and coordinate mental health plans and services can help address these challenges and improve the overall quality and accessibility of mental health care.', expected_output='The State has seven different mental health plans: one from the Victorian Government and six from the Primary Health Netw

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:08,  8.85s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response perfectly addressed the input question without any irrelevant statements, providing a clear and concise explanation of the number of mental health plans and their implications. Great job on staying focused and relevant!, error: None)

For test case:

  - input: How many mental health plans does the State have, and what are the implications of this fragmented system?
  - actual output: The number of mental health plans and their implications can vary by state. In general, a fragmented mental health system can lead to difficulties in accessing care, lack of coordination between different providers, and challenges in providing comprehensive and integrated care for individuals with mental health needs. This can result in gaps in services, inconsistent quality of care, and increased burden on individuals and families seeking mental he




LLMTestCase(input='Which publication contains a study exploring eating disorders in various LGBTQ+ communities?', actual_output='The publication "Journal of Gay & Lesbian Mental Health" contains a study exploring eating disorders in various LGBTQ+ communities.', expected_output='The publication "Eating disorders in diverse lesbian, gay, and bisexual populations" explores eating disorders in various LGBTQ+ communities. This study was published in the International Journal of Eating Disorders in 2007.', context=None, retrieval_context=None, additional_metadata=None, comments=None, tools_called=None, expected_tools=None, token_cost=None, completion_time=None, name=None)


Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:03,  3.85s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response perfectly addresses the input without any irrelevant statements, showcasing a clear and direct answer to the question asked. Great job!, error: None)

For test case:

  - input: Which publication contains a study exploring eating disorders in various LGBTQ+ communities?
  - actual output: The publication "Journal of Gay & Lesbian Mental Health" contains a study exploring eating disorders in various LGBTQ+ communities.
  - expected output: The publication "Eating disorders in diverse lesbian, gay, and bisexual populations" explores eating disorders in various LGBTQ+ communities. This study was published in the International Journal of Eating Disorders in 2007.
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Answer Relevancy: 100.00% pass rate







LLMTestCase(input='How does stigma and abuse contribute to higher mental illness rates among LGBTIQA+ Australians?', actual_output='Stigma and abuse contribute to higher mental illness rates among LGBTIQA+ Australians in several ways. Firstly, the experience of stigma and discrimination can lead to chronic stress, anxiety, and depression, which are known risk factors for mental illness. LGBTIQA+ individuals may face discrimination in various aspects of their lives, including at work, in their communities, and within their families, which can have a significant impact on their mental health.\n\nAdditionally, the fear of stigma and discrimination can lead to social isolation and a lack of support, which can exacerbate mental health issues. LGBTIQA+ individuals may also internalize negative societal attitudes towards their sexual orientation or gender identity, leading to feelings of shame, self-hatred, and low self-esteem, all of which are associated with mental health problems.\n\nFurth

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:08,  8.27s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response perfectly addressed the question about the impact of stigma and abuse on mental illness rates among LGBTIQA+ Australians, with no irrelevant statements present. Great job on staying focused and relevant!, error: None)

For test case:

  - input: How does stigma and abuse contribute to higher mental illness rates among LGBTIQA+ Australians?
  - actual output: Stigma and abuse contribute to higher mental illness rates among LGBTIQA+ Australians in several ways. Firstly, the experience of stigma and discrimination can lead to chronic stress, anxiety, and depression, which are known risk factors for mental illness. LGBTIQA+ individuals may face discrimination in various aspects of their lives, including at work, in their communities, and within their families, which can have a significant impact on their mental health.

Additionally,




LLMTestCase(input='What are the obstacles causing the fragmentation of the current mental health system?', actual_output='1. Lack of coordination and communication between different mental health service providers and organizations.\n2. Insufficient funding and resources for mental health services, leading to limited access and long wait times for treatment.\n3. Stigma and discrimination surrounding mental illness, which can prevent individuals from seeking help and accessing appropriate care.\n4. Inadequate training and education for healthcare professionals in identifying and treating mental health issues.\n5. Fragmented insurance and payment systems, which can create barriers to accessing comprehensive and continuous mental health care.\n6. Limited integration of mental health services with other healthcare and social service systems, leading to disjointed and incomplete care for individuals with mental health needs.\n7. Inequities in access to mental health services based on factor

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:08,  8.78s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response perfectly addresses the question about the obstacles causing fragmentation in the mental health system without any irrelevant information. Great job on staying focused and relevant!, error: None)

For test case:

  - input: What are the obstacles causing the fragmentation of the current mental health system?
  - actual output: 1. Lack of coordination and communication between different mental health service providers and organizations.
2. Insufficient funding and resources for mental health services, leading to limited access and long wait times for treatment.
3. Stigma and discrimination surrounding mental illness, which can prevent individuals from seeking help and accessing appropriate care.
4. Inadequate training and education for healthcare professionals in identifying and treating mental health issues.
5. Fragmented insuran




LLMTestCase(input='Does the Department of Health oversee the management of this website?', actual_output="I'm sorry, I cannot provide real-time information about specific websites. It's best to check the website itself or contact the Department of Health directly for information about their oversight of specific websites.", expected_output='Yes, the Department of Health and Human Services, State Government of Victoria, Australia manages and authorizes the website.', context=None, retrieval_context=None, additional_metadata=None, comments=None, tools_called=None, expected_tools=None, token_cost=None, completion_time=None, name=None)


Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.29s/test case]



Metrics Summary

  - ❌ Answer Relevancy (score: 0.5, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 0.50 because while the response partially addresses the question, it includes irrelevant suggestions that do not directly confirm or deny the Department of Health's role in managing the website. This lack of direct relevance prevents a higher score., error: None)

For test case:

  - input: Does the Department of Health oversee the management of this website?
  - actual output: I'm sorry, I cannot provide real-time information about specific websites. It's best to check the website itself or contact the Department of Health directly for information about their oversight of specific websites.
  - expected output: Yes, the Department of Health and Human Services, State Government of Victoria, Australia manages and authorizes the website.
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Answer Relevancy: 0.00% pass rate







LLMTestCase(input="What is the objective of the Department of Health's funding for MHV Associates and mental health professionals?", actual_output="The objective of the Department of Health's funding for MHV Associates and mental health professionals is to support and improve mental health services and resources for individuals in need. This funding aims to enhance access to mental health care, provide training and support for mental health professionals, and promote mental wellness and recovery for those experiencing mental health challenges. Additionally, the funding may also be used to develop and implement programs and initiatives aimed at addressing mental health disparities and promoting overall mental well-being in communities.", expected_output="The objective of the Department of Health's funding is to provide Trauma Informed Leadership training for MHV Associates and those working in state-funded and community mental health services.", context=None, retrieval_context=None, add

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.46s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response perfectly addressed the objective of the Department of Health's funding for MHV Associates and mental health professionals without any irrelevant statements. Great job on staying focused and relevant!, error: None)

For test case:

  - input: What is the objective of the Department of Health's funding for MHV Associates and mental health professionals?
  - actual output: The objective of the Department of Health's funding for MHV Associates and mental health professionals is to support and improve mental health services and resources for individuals in need. This funding aims to enhance access to mental health care, provide training and support for mental health professionals, and promote mental wellness and recovery for those experiencing mental health challenges. Additionally, the funding may also be used to develop and impleme




LLMTestCase(input='What factors contribute to the increased mental illness and distress rates among LGBTIQA+ Australians compared to their peers?', actual_output='There are several factors that contribute to the increased mental illness and distress rates among LGBTIQA+ Australians compared to their peers, including:\n\n1. Discrimination and stigma: LGBTIQA+ individuals often face discrimination and stigma in various aspects of their lives, including at school, work, and within their communities. This can lead to feelings of rejection, isolation, and low self-esteem, which can contribute to mental health issues.\n\n2. Minority stress: LGBTIQA+ individuals may experience minority stress, which is the chronic stress that comes from being a member of a marginalized group. This can lead to higher rates of anxiety, depression, and other mental health issues.\n\n3. Bullying and harassment: LGBTIQA+ individuals are more likely to experience bullying and harassment, both in person and online, 

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:13, 13.23s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response perfectly addressed the factors contributing to increased mental illness and distress rates among LGBTIQA+ Australians without any irrelevant statements. Great job on staying focused and relevant!, error: None)

For test case:

  - input: What factors contribute to the increased mental illness and distress rates among LGBTIQA+ Australians compared to their peers?
  - actual output: There are several factors that contribute to the increased mental illness and distress rates among LGBTIQA+ Australians compared to their peers, including:

1. Discrimination and stigma: LGBTIQA+ individuals often face discrimination and stigma in various aspects of their lives, including at school, work, and within their communities. This can lead to feelings of rejection, isolation, and low self-esteem, which can contribute to mental health issues.






LLMTestCase(input='What specific strategies can be implemented to enhance the effectiveness of the current practices in soliciting diverse viewpoints and experiences?', actual_output='1. Actively seek out diverse perspectives: Actively reach out to individuals from different backgrounds and experiences to ensure that a wide range of viewpoints are represented in decision-making processes.\n\n2. Create a safe and inclusive environment: Foster an environment where individuals feel comfortable sharing their perspectives without fear of judgment or discrimination. This can be achieved through training, open communication, and promoting a culture of respect and inclusion.\n\n3. Use diverse communication channels: Utilize a variety of communication channels, such as surveys, focus groups, and one-on-one meetings, to gather input from a diverse range of individuals.\n\n4. Provide training and education: Offer training and education on diversity and inclusion to help individuals understand the

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:07,  7.69s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response perfectly aligns with the input, providing relevant strategies to enhance the effectiveness of current practices in soliciting diverse viewpoints and experiences. There are no irrelevant statements, showcasing a clear and focused answer., error: None)

For test case:

  - input: What specific strategies can be implemented to enhance the effectiveness of the current practices in soliciting diverse viewpoints and experiences?
  - actual output: 1. Actively seek out diverse perspectives: Actively reach out to individuals from different backgrounds and experiences to ensure that a wide range of viewpoints are represented in decision-making processes.

2. Create a safe and inclusive environment: Foster an environment where individuals feel comfortable sharing their perspectives without fear of judgment or discrimination. This can be a




LLMTestCase(input='\nExamine the potential impacts of emerging accommodation providers, like hostels, on homelessness services and consider their effects on the wellbeing of multicultural communities.', actual_output='The emergence of new accommodation providers, such as hostels, can have both positive and negative impacts on homelessness services and the wellbeing of multicultural communities.\n\nPositive impacts:\n1. Increased availability of affordable accommodation: Hostels can provide affordable accommodation options for individuals experiencing homelessness, especially those from multicultural communities who may face additional barriers in accessing traditional housing options.\n\n2. Support services: Some hostels may offer support services such as counseling, job training, and language classes, which can benefit individuals from multicultural communities who may need additional support in navigating a new country and culture.\n\n3. Diverse community integration: Hostels can pro

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:07,  7.36s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response perfectly addresses the potential impacts of emerging accommodation providers on homelessness services and the wellbeing of multicultural communities, without any irrelevant statements. Great job!, error: None)

For test case:

  - input: 
Examine the potential impacts of emerging accommodation providers, like hostels, on homelessness services and consider their effects on the wellbeing of multicultural communities.
  - actual output: The emergence of new accommodation providers, such as hostels, can have both positive and negative impacts on homelessness services and the wellbeing of multicultural communities.

Positive impacts:
1. Increased availability of affordable accommodation: Hostels can provide affordable accommodation options for individuals experiencing homelessness, especially those from multicultural communities who 




LLMTestCase(input='Describe the wide-ranging activities and accomplishments of VICSERV during the 2010-2011 period.', actual_output="During the 2010-2011 period, VICSERV, the peak body for community managed mental health services in Victoria, Australia, engaged in a wide range of activities and accomplishments. Some of the key highlights include:\n\n1. Advocacy and policy development: VICSERV played a leading role in advocating for the rights and needs of people living with mental illness. They actively engaged with government and other stakeholders to influence policy and funding decisions, and to ensure that the voices of consumers and carers were heard.\n\n2. Training and workforce development: VICSERV provided a range of training and professional development opportunities for mental health workers, including workshops, seminars, and conferences. They also worked to promote best practice in the sector and to build the capacity of the mental health workforce.\n\n3. Consumer and carer

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:34, 34.94s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response perfectly aligns with the input, providing a comprehensive overview of VICSERV's activities and accomplishments during the 2010-2011 period without any irrelevant information. Great job!, error: None)

For test case:

  - input: Describe the wide-ranging activities and accomplishments of VICSERV during the 2010-2011 period.
  - actual output: During the 2010-2011 period, VICSERV, the peak body for community managed mental health services in Victoria, Australia, engaged in a wide range of activities and accomplishments. Some of the key highlights include:

1. Advocacy and policy development: VICSERV played a leading role in advocating for the rights and needs of people living with mental illness. They actively engaged with government and other stakeholders to influence policy and funding decisions, and to ensure that the voices of




LLMTestCase(input='Share your experiences and thoughts on potential enhancements in the existing procedures.', actual_output='One potential enhancement in existing procedures could be the implementation of technology to streamline and automate certain tasks. For example, in a business setting, using software to automate repetitive administrative tasks could free up time for employees to focus on more strategic and creative work. This could lead to increased productivity and efficiency.\n\nAnother potential enhancement could be the introduction of more flexible work arrangements. With the rise of remote work and flexible scheduling, companies could consider allowing employees to work from home or adjust their hours to better accommodate their personal lives. This could lead to higher job satisfaction and better work-life balance for employees.\n\nIn healthcare, an enhancement could be the implementation of telemedicine services to provide more accessible and convenient care for patients

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:10, 10.42s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response perfectly aligns with the input, providing relevant insights and suggestions for enhancing existing procedures without any irrelevant statements. Great job!, error: None)

For test case:

  - input: Share your experiences and thoughts on potential enhancements in the existing procedures.
  - actual output: One potential enhancement in existing procedures could be the implementation of technology to streamline and automate certain tasks. For example, in a business setting, using software to automate repetitive administrative tasks could free up time for employees to focus on more strategic and creative work. This could lead to increased productivity and efficiency.

Another potential enhancement could be the introduction of more flexible work arrangements. With the rise of remote work and flexible scheduling, companies could consi




LLMTestCase(input='What lack of support for students did Dr. Correa-Velez note during COVID-19, while considering the implications of language barriers in psychiatric diagnoses for migrants and refugees?', actual_output='Dr. Correa-Velez noted that during COVID-19, there was a lack of support for students, particularly those from migrant and refugee backgrounds who faced language barriers. This lack of support could have implications for their mental health, as language barriers may affect the accuracy of psychiatric diagnoses and access to appropriate mental health services.', expected_output='Dr. Correa-Velez noted a lack of supports for students and how language barriers could undermine service and better health outcomes for migrants and refugees, with language proficiency particularly important in psychiatric diagnoses.', context=None, retrieval_context=None, additional_metadata=None, comments=None, tools_called=None, expected_tools=None, token_cost=None, completion_time=None, name

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.02s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response perfectly addressed the question without any irrelevant statements, demonstrating a clear understanding of the topic and maintaining focus on the key issues raised., error: None)

For test case:

  - input: What lack of support for students did Dr. Correa-Velez note during COVID-19, while considering the implications of language barriers in psychiatric diagnoses for migrants and refugees?
  - actual output: Dr. Correa-Velez noted that during COVID-19, there was a lack of support for students, particularly those from migrant and refugee backgrounds who faced language barriers. This lack of support could have implications for their mental health, as language barriers may affect the accuracy of psychiatric diagnoses and access to appropriate mental health services.
  - expected output: Dr. Correa-Velez noted a lack of supports for s




LLMTestCase(input='Imagine a year where VICSERV facilitated consultations, represented sector issues, and provided workplace consultations to enhance mental health services.', actual_output="In this hypothetical year, VICSERV, as a leading mental health advocacy organization, played a crucial role in facilitating consultations with various stakeholders in the mental health sector. These consultations aimed to gather input and feedback from individuals with lived experience, mental health professionals, and community organizations to better understand the current challenges and needs within the mental health system.\n\nAdditionally, VICSERV actively represented sector issues at various forums, including government meetings, policy discussions, and public events. By advocating for improved mental health services and resources, VICSERV worked to ensure that the voices of those affected by mental illness were heard and considered in decision-making processes.\n\nFurthermore, VICSERV provid

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:07,  7.42s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the output perfectly aligns with the input, addressing all aspects without any irrelevant statements. Great job on maintaining focus and relevance!, error: None)

For test case:

  - input: Imagine a year where VICSERV facilitated consultations, represented sector issues, and provided workplace consultations to enhance mental health services.
  - actual output: In this hypothetical year, VICSERV, as a leading mental health advocacy organization, played a crucial role in facilitating consultations with various stakeholders in the mental health sector. These consultations aimed to gather input and feedback from individuals with lived experience, mental health professionals, and community organizations to better understand the current challenges and needs within the mental health system.

Additionally, VICSERV actively represented sector issues 




In [35]:
# Evaluate end-to-end
evaluate(test_cases=test_cases, metrics=[AnswerRelevancyMetric()])


ValueError: You cannot supply a mixture of `LLMTestCase`/`MLLMTestCase`(s) and `ConversationalTestCase`(s) as the list of test cases.

# Testing

In [None]:
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric

answer_relevancy_metric = AnswerRelevancyMetric()
test_case = LLMTestCase(
  input="Who is the current president of the United States of America?",
  actual_output="Joe Bison",
  retrieval_context=["Joe Biden serves as the current president of America."]
)

answer_relevancy_metric.measure(test_case)
print(answer_relevancy_metric.score)

Output()

0.0


In [None]:

from deepeval.synthesizer import Synthesizer
from deepeval.dataset import EvaluationDataset

synthesizer = Synthesizer()
goldens = synthesizer.generate_goldens_from_docs(document_paths=['/content/Diwali - Wikipedia.pdf'])

dataset = EvaluationDataset(goldens=goldens)

✨ 🚀 ✨ Loading Documents: 100%|██████████| 1/1 [00:09<00:00,  9.82s/it]
✨ 📚 ✨ Chunking Documents: 100%|██████████| 1/1 [00:03<00:00,  3.70s/it]
✨ 🧩 ✨ Generating Contexts:   0%|          | 0/9 [00:00<?, ?it/s]
✨ 🧩 ✨ Generating Contexts:  56%|█████▌    | 5/9 [00:01<00:01,  3.90it/s]
  ✨ 🫗 ✨ Filling Contexts:  17%|█▋        | 1/6 [00:02<00:11,  2.39s/it][A
  ✨ 🫗 ✨ Filling Contexts:  50%|█████     | 3/6 [00:02<00:02,  1.31it/s][A
  ✨ 🫗 ✨ Filling Contexts:  83%|████████▎ | 5/6 [00:03<00:00,  1.77it/s][A
✨ 🧩 ✨ Generating Contexts: 100%|██████████| 9/9 [00:03<00:00,  2.54it/s]


✨ Generating up to 6 goldens using DeepEval (using gpt-4o and text-embedding-3-small, method=docs):  17%|█▋        | 1/6 [00:08<00:42,  8.48s/it]ERROR:root:OpenAI Error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-VxwfSaIZ7gLnAsbaBUs3WgIR on tokens per min (TPM): Limit 30000, Used 28947, Requested 2940. Please try again in 3.774s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}} Retrying: 1 time(s)...
ERROR:root:OpenAI Error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-VxwfSaIZ7gLnAsbaBUs3WgIR on tokens per min (TPM): Limit 30000, Used 28683, Requested 3198. Please try again in 3.762s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}} Retrying: 1 time(s)...
✨ Generating up to 6 goldens using DeepEval (using gpt-4o and text-

In [None]:
dataset

EvaluationDataset(test_cases=[], goldens=[Golden(input='How do global Diwali festivities in the UK, US, and Pakistan highlight cultural exchange and impact?', actual_output=None, expected_output="Global Diwali festivities serve as a platform for cultural exchange and highlight the impact of multicultural engagement in several countries. In the UK, Diwali celebrations, such as those at the Swaminarayan Temple in Neasden, are attended by national figures like former Prince Charles, emphasizing the contributions of the Hindu community to British society. The event is marked by lights, decorations, and cultural festivities, fostering mutual appreciation and integration. In the US, Diwali's significance was formally recognized by Congress in 2007, with President Obama participating in White House celebrations in 2009, reflecting recognition and respect for cultural diversity. In Pakistan, the exchange of traditional sweets between Indian and Pakistani border troops during Diwali symbolizes 

In [None]:
goldens[0]

Golden(input='How do global Diwali festivities in the UK, US, and Pakistan highlight cultural exchange and impact?', actual_output=None, expected_output="Global Diwali festivities serve as a platform for cultural exchange and highlight the impact of multicultural engagement in several countries. In the UK, Diwali celebrations, such as those at the Swaminarayan Temple in Neasden, are attended by national figures like former Prince Charles, emphasizing the contributions of the Hindu community to British society. The event is marked by lights, decorations, and cultural festivities, fostering mutual appreciation and integration. In the US, Diwali's significance was formally recognized by Congress in 2007, with President Obama participating in White House celebrations in 2009, reflecting recognition and respect for cultural diversity. In Pakistan, the exchange of traditional sweets between Indian and Pakistani border troops during Diwali symbolizes harmonious gestures that facilitate camara