<a href="https://colab.research.google.com/github/sarunsmenon/llm/blob/main/LLM_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Libraries

In [1]:
! pip install -q deepeval python-dotenv openai langchain-openai langchain langchain_community faiss-cpu  uuid7 langgraph chromadb aiobotocore

# Load Libraries

In [2]:
from google.colab import userdata
import os
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import chromadb
import random

In [3]:
from deepeval.synthesizer import Synthesizer
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.dataset import EvaluationDataset
from deepeval.test_case import LLMTestCase

In [4]:
import requests
import chromadb
from bs4 import BeautifulSoup
import openai
import hashlib
import time
import re

# Load Variables

In [5]:
os.environ['OPENAI_API_KEY'] = userdata.get('open_ai_key')

In [6]:
llm_model = 'gpt-3.5-turbo-1106'
llm = ChatOpenAI(model=llm_model, temperature=0)

In [7]:
# Start crawling from the initial URL
ignore_lst = []
max_pg_lmt = 5000
fldr = '/content/drive/MyDrive/GenAI - Hack/mhdb'

# Load Contexts

## Load Chroma Collection

In [8]:
chroma_client = chromadb.PersistentClient(path=fldr)
collection = chroma_client.get_or_create_collection(name="semantic")

## Load Contexts

In [9]:
all_contexts = collection.get(include=["documents"])["documents"]

In [None]:
collection.get(include=["documents"])

In [11]:
len(all_contexts)

12331

In [12]:
sample_context = random.sample(all_contexts, 10)

In [13]:
len(sample_context)

10

In [14]:
sample_context_lst = []
for context in sample_context:
  sample_context_lst.append([context])

# Synthesizer

In [39]:
len(sample_context_lst[0])

1

In [25]:
synthesizer = Synthesizer(model=llm_model)
test_cases = synthesizer.generate_goldens_from_contexts(
    # Provide a list of context for synthetic data generation
    contexts=sample_context_lst,
    max_goldens_per_context = 2
)

Output()

In [34]:
synth_df = synthesizer.to_pandas()
synth_df

Unnamed: 0,input,actual_output,expected_output,context,retrieval_context,n_chunks_per_context,context_length,evolutions,context_quality,synthetic_input_quality,source_file
0,Identify the tags associated with the Reportin...,,The tags associated with the Reporting Framewo...,[Publications\n\nReporting Framework Summary R...,,1,333,[In-Breadth],,1.0,
1,When was the last update on The Reporting Fram...,,The last update on the Reporting Framework Sum...,[Publications\n\nReporting Framework Summary R...,,1,333,[Hypothetical],,1.0,
2,Identify the tags associated with the Reportin...,,The tags associated with the Reporting Framewo...,[Publications\n\nReporting Framework Summary R...,,1,333,[In-Breadth],,1.0,
3,When was the last update on The Reporting Fram...,,The last update on the Reporting Framework Sum...,[Publications\n\nReporting Framework Summary R...,,1,333,[Hypothetical],,1.0,
4,Examine the characteristics and symptoms of bi...,,The bipolar mood disorder is a mental health c...,[• Cos’è il disturbo bipolare dell’umore?],,1,40,[In-Breadth],,1.0,
...,...,...,...,...,...,...,...,...,...,...,...
145,How might MHV further enhance the Strategy thr...,,MHV can further enhance the Strategy through e...,[6\n\ntargeting these communities. Equitable r...,,1,4923,[Hypothetical],,0.6,
146,On what day will the event be held to ensure m...,,"The event will be held on a Sunday, which is t...",[Or a Sunday.],,1,12,[Multi-context],,0.6,
147,How would people's schedules and leisure activ...,,People's schedules and leisure activities woul...,[Or a Sunday.],,1,12,[Hypothetical],,1.0,
148,Explore the housing status of individuals usin...,,The publication explores the housing circumsta...,[APA\n\nDetail\n\nCite\n\nCitation\n\nPrint\n\...,,1,1872,[In-Breadth],,1.0,


# Evaluation

In [27]:
synthesizer.synthetic_goldens[0].input

'Identify the tags associated with the Reporting Framework Summary Report - Adelaide.'

In [28]:
synthesizer.synthetic_goldens[0]

Golden(input='Identify the tags associated with the Reporting Framework Summary Report - Adelaide.', actual_output=None, expected_output='The tags associated with the Reporting Framework Summary Report - Adelaide are "Suicide prevention."', context=['Publications\n\nReporting Framework Summary Report - Adelaide\n\nScroll down to access downloads and media. Download [Publication] Reporting Framework Summary Report - Adelaide (PDF) as PDF - 853.52 KB - 2 pages\n\nPublication type:\n\nFact Sheet\n\nPublication date:\n\n7 November 2017\n\nLast updated:\n\n31 October 2019\n\nTags:\n\nSuicide prevention'], retrieval_context=None, additional_metadata={'evolutions': ['In-Breadth'], 'synthetic_input_quality': 1.0}, comments=None, tools_called=None, expected_tools=None, source_file=None, name=None, custom_column_key_values=None)

In [43]:

# Prepare evaluation metrics
metrics_lst = [
    AnswerRelevancyMetric(threshold=0.7),
    # FaithfulnessMetric(threshold=0.7)
]

# Create test cases from goldens
test_case = []
for golden in synthesizer.synthetic_goldens:
    res = llm.invoke(golden.input).content
    test_case = LLMTestCase(input=golden.input, actual_output=res, expected_output=golden.expected_output)
    # print(test_case)
    test_cases.append(test_case)
    # evaluate(test_cases=[test_case], metrics=metrics_lst)



# # Evaluate end-to-end
# # evaluate(test_cases=test_cases, metrics=[AnswerRelevancyMetric()])



In [44]:
clean_test_case_lst = []
for test_case in test_cases:
  if isinstance(test_case, LLMTestCase):
    clean_test_case_lst.append(test_case)


In [46]:
print(len(test_cases))
print(len(clean_test_case_lst))

230
210


In [50]:
random_test_case_lst = random.sample(clean_test_case_lst, 30)
all_results = []
batch_size = 5 # Adjust this number to control concurrency


metrics_lst = [
    AnswerRelevancyMetric(threshold=0.7, verbose_mode=False), # Set verbose_mode to False
    # FaithfulnessMetric(threshold=0.7, verbose_mode=False) # Do the same for other metrics
]

for i in range(0, len(random_test_case_lst), batch_size):
    batch = random_test_case_lst[i:i + batch_size]
    print(f"Evaluating batch {i // batch_size + 1}...")
    results = evaluate(test_cases=batch, metrics=metrics_lst)

    all_results.extend(results)

print("Evaluation complete for all batches.")

Evaluating batch 1...


Output()



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the answer was fully relevant and addressed the input directly without any irrelevant statements. Great job staying focused and concise!, error: None)

For test case:

  - input: When was the last update on The Reporting Framework Summary Report - Adelaide?
  - actual output: I'm sorry, but I don't have access to the specific information about the last update on The Reporting Framework Summary Report - Adelaide. You may need to check the official website or contact the relevant organization for the most recent update.
  - expected output: The last update on the Reporting Framework Summary Report - Adelaide was on October 31, 2019.
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Answer Relevancy: 100.00% pass rate




Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation mode

Evaluating batch 2...


Output()



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the answer was fully relevant and addressed the input directly without including any irrelevant information. Great job staying focused and on-topic!, error: None)

For test case:

  - input: How would people's schedules and leisure activities change if Saturdays did not exist?
  - actual output: If Saturdays did not exist, people's schedules and leisure activities would likely shift to accommodate the loss of a weekend day. 

- Work schedules may be adjusted to include an additional day off during the week, such as a mid-week break. This could potentially lead to a more balanced work-life schedule for some individuals.

- Leisure activities that are typically reserved for Saturdays, such as weekend getaways, outdoor adventures, and social gatherings, may be rescheduled to other days of the week. This could lead to a more spread out distribut

Evaluating batch 3...


Output()



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the answer was fully relevant and addressed the input directly without any irrelevant statements. Great job staying focused and concise!, error: None)

For test case:

  - input: Identify the tags associated with the Reporting Framework Summary Report - Adelaide.
  - actual output: - Reporting
- Framework
- Summary Report
- Adelaide
  - expected output: The tags associated with the Reporting Framework Summary Report - Adelaide are "Suicide prevention."
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Answer Relevancy: 100.00% pass rate




Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the answer was fully relevant and addressed the input directly without any irrelevant information. Great job staying focused and concise

Evaluating batch 4...


Output()



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the answer was fully relevant and addressed the question directly without any irrelevant information. Great job staying focused and helpful!, error: None)

For test case:

  - input: Where can I find someone to talk to about my mental health in healthcare services available in Victoria?
  - actual output: You can find someone to talk to about your mental health in healthcare services available in Victoria by contacting the following resources:

1. Your General Practitioner (GP): Your GP can provide initial assessment and support for mental health concerns and can refer you to a mental health professional if needed.

2. Mental Health Services Helpline: You can call the Mental Health Services Helpline at 1300 651 251 for information and support regarding mental health services in Victoria.

3. Headspace: Headspace provides mental health suppor

Evaluating batch 5...


Output()



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the answer was fully relevant and addressed the question directly with no irrelevant information. Great job staying focused and concise!, error: None)

For test case:

  - input: When was the last update on The Reporting Framework Summary Report - Adelaide?
  - actual output: I'm sorry, but I don't have access to the specific information about the last update on The Reporting Framework Summary Report - Adelaide. You may need to check the official website or contact the relevant organization for the most recent information.
  - expected output: The last update on the Reporting Framework Summary Report - Adelaide was on October 31, 2019.
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Answer Relevancy: 80.00% pass rate




Metrics Summary

  - ❌ Answer Relevancy (score: 0.6153846153846154, threshold: 0.7, strict: Fal

Evaluating batch 6...


Output()

ERROR:root:OpenAI Error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-VxwfSaIZ7gLnAsbaBUs3WgIR on tokens per min (TPM): Limit 30000, Used 30000, Requested 1202. Please try again in 2.404s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}} Retrying: 1 time(s)...
ERROR:root:OpenAI Error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1 in organization org-VxwfSaIZ7gLnAsbaBUs3WgIR on tokens per min (TPM): Limit 30000, Used 28843, Requested 1202. Please try again in 90ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}} Retrying: 2 time(s)...




Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the answer was fully relevant and addressed the question directly, with no irrelevant statements. Great job staying focused and helpful!, error: None)

For test case:

  - input: Where can I find someone to talk to about my mental health in healthcare services available in Victoria?
  - actual output: You can find someone to talk to about your mental health in healthcare services available in Victoria by contacting the following resources:

1. Your General Practitioner (GP): Your GP can provide initial assessment and support for mental health concerns and can refer you to a mental health professional if needed.

2. Mental Health Services Helpline: You can call the Mental Health Services Helpline at 1300 651 251 for information and support regarding mental health services in Victoria.

3. Headspace: Headspace provides mental health support fo

Evaluation complete for all batches.


In [59]:
all_results[0][1][0]

TestResult(name='test_case_1', success=True, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.7, success=True, score=1.0, reason='The score is 1.00 because the answer was fully relevant and addressed the input directly without any irrelevant statements. Great job staying focused and concise!', strict_mode=False, evaluation_model='gpt-4.1', error=None, evaluation_cost=0.003956, verbose_logs='Statements:\n[\n    "I don\'t have access to the specific information about the last update on The Reporting Framework Summary Report - Adelaide.",\n    "You may need to check the official website for the most recent update.",\n    "You may need to contact the relevant organization for the most recent update."\n] \n \nVerdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": null\n    },\n    {\n        "verdict": "yes",\n        "reason": null\n    },\n    {\n        "verdict": "yes",\n        "reason": null\n    }\n]')], conversational=False, multimodal=False, input='When was 

# Testing

In [None]:
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric

answer_relevancy_metric = AnswerRelevancyMetric()
test_case = LLMTestCase(
  input="Who is the current president of the United States of America?",
  actual_output="Joe Bison",
  retrieval_context=["Joe Biden serves as the current president of America."]
)

answer_relevancy_metric.measure(test_case)
print(answer_relevancy_metric.score)

Output()

0.0


In [None]:

from deepeval.synthesizer import Synthesizer
from deepeval.dataset import EvaluationDataset

synthesizer = Synthesizer()
goldens = synthesizer.generate_goldens_from_docs(document_paths=['/content/Diwali - Wikipedia.pdf'])

dataset = EvaluationDataset(goldens=goldens)

✨ 🚀 ✨ Loading Documents: 100%|██████████| 1/1 [00:09<00:00,  9.82s/it]
✨ 📚 ✨ Chunking Documents: 100%|██████████| 1/1 [00:03<00:00,  3.70s/it]
✨ 🧩 ✨ Generating Contexts:   0%|          | 0/9 [00:00<?, ?it/s]
✨ 🧩 ✨ Generating Contexts:  56%|█████▌    | 5/9 [00:01<00:01,  3.90it/s]
  ✨ 🫗 ✨ Filling Contexts:  17%|█▋        | 1/6 [00:02<00:11,  2.39s/it][A
  ✨ 🫗 ✨ Filling Contexts:  50%|█████     | 3/6 [00:02<00:02,  1.31it/s][A
  ✨ 🫗 ✨ Filling Contexts:  83%|████████▎ | 5/6 [00:03<00:00,  1.77it/s][A
✨ 🧩 ✨ Generating Contexts: 100%|██████████| 9/9 [00:03<00:00,  2.54it/s]


✨ Generating up to 6 goldens using DeepEval (using gpt-4o and text-embedding-3-small, method=docs):  17%|█▋        | 1/6 [00:08<00:42,  8.48s/it]ERROR:root:OpenAI Error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-VxwfSaIZ7gLnAsbaBUs3WgIR on tokens per min (TPM): Limit 30000, Used 28947, Requested 2940. Please try again in 3.774s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}} Retrying: 1 time(s)...
ERROR:root:OpenAI Error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-VxwfSaIZ7gLnAsbaBUs3WgIR on tokens per min (TPM): Limit 30000, Used 28683, Requested 3198. Please try again in 3.762s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}} Retrying: 1 time(s)...
✨ Generating up to 6 goldens using DeepEval (using gpt-4o and text-