In [2]:
# Imports
import os
import json
from langchain_core.documents import Document

#!pip install ragas

# Generation using RAGAS TestGenerator

In [3]:
os.environ["OPENAI_API_KEY"] = "To Fill In"

In [7]:
# Load documents
documents = []
for filename in os.listdir('C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\mockdata\\'):
    if filename.endswith(".json"):
        with open(f'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\backend\mockdata\\{filename}') as f:
            data = json.load(f)
            for response_label, conversation in data.items():
                doc_content = json.dumps(conversation)
                doc_metadata = {"label": response_label, "source": filename}
                documents.append(Document(page_content=doc_content, metadata=doc_metadata))

print(len(documents))

43


In [9]:
for doc in documents:
    doc.metadata['file_name'] = doc.metadata['source']

In [13]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# generator with openai models
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4")
embeddings = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# generate testset
testset = generator.generate_with_langchain_docs(documents, test_size=2, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})

Filename and doc_id are the same for all nodes.                 
Generating: 100%|██████████| 2/2 [05:35<00:00, 167.95s/it]


In [16]:
panda_test = testset.to_pandas()

In [18]:
panda_test.to_csv('testset.csv')

# Generating of test dataset (Customised)

## RAGAS

In [21]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_correctness, context_recall, context_precision

In [22]:
os.environ["OPENAI_API_KEY"] = "To Fill In"

In [26]:
data_sample = {
    'question': [
        'How have you been Roydon?'
    ],
    'answer': [
        "Response 1: I have been good, how about you? Response 2: I've been doing well thanks for asking. Response 3: Not too bad how about you?"
    ],
    'contexts': [
        ["""{'Roydon": "Hey there! Can't wait for the new football season to start, hoping for a great one for Arsenal!", "John": "Hey Roydon! Yeah, it's always exciting to see how your team will perform. Optimistic as always, I see!},
         {"Roydon": "I can't wait to immerse myself in everything Japan has to offer and create lasting memories that will overshadow my Thailand trip.", "Yas": "Your positive outlook will surely make this trip one for the books! Japan is lucky to have you as a visitor."},
         {"Roydon": "Guess what, I just got a new pet dog!", "Jacob": "That's awesome! What breed is it?"}"""]
    ],
    'ground_truth': [
        "Response 1: I've been watching Arsenal games hoping they will win. Response 2: I've been looking at a trip to Japan. Response 3: I just got a new pet dog. How about you?"
    ]
}

dataset = Dataset.from_dict(data_sample)

score = evaluate(dataset, metrics=[faithfulness, answer_relevancy, answer_correctness])

Evaluating: 100%|██████████| 3/3 [00:07<00:00,  2.65s/it]


In [27]:
df = score.to_pandas()
df.head()

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,answer_correctness
0,How have you been Roydon?,"Response 1: I have been good, how about you? R...","[{'Roydon"": ""Hey there! Can't wait for the new...",Response 1: I've been watching Arsenal games h...,0.0,0.0,0.583383


# Deep-Eval

In [29]:
!pip install pytest
!pip install deepeval

Collecting pytest
  Downloading pytest-8.3.2-py3-none-any.whl (341 kB)
                                              0.0/341.8 kB ? eta -:--:--
     ---                                   30.7/341.8 kB 660.6 kB/s eta 0:00:01
     ---                                   30.7/341.8 kB 660.6 kB/s eta 0:00:01
     -------                               71.7/341.8 kB 563.7 kB/s eta 0:00:01
     -------                               71.7/341.8 kB 563.7 kB/s eta 0:00:01
     -------                               71.7/341.8 kB 563.7 kB/s eta 0:00:01
     ---------------------                204.8/341.8 kB 778.2 kB/s eta 0:00:01
     --------------------------           256.0/341.8 kB 749.3 kB/s eta 0:00:01
     ------------------------------       286.7/341.8 kB 803.7 kB/s eta 0:00:01
     ------------------------------       286.7/341.8 kB 803.7 kB/s eta 0:00:01
     ------------------------------------ 341.8/341.8 kB 731.9 kB/s eta 0:00:00
Collecting iniconfig (from pytest)
  Downloading iniconf


[notice] A new release of pip is available: 23.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting deepeval
  Downloading deepeval-1.0.6-py3-none-any.whl (302 kB)
                                              0.0/302.4 kB ? eta -:--:--
     -----                                    41.0/302.4 kB ? eta -:--:--
     ---------------                        122.9/302.4 kB 1.8 MB/s eta 0:00:01
     ---------------                        122.9/302.4 kB 1.8 MB/s eta 0:00:01
     ---------------                        122.9/302.4 kB 1.8 MB/s eta 0:00:01
     --------------------------------       256.0/302.4 kB 1.2 MB/s eta 0:00:01
     --------------------------------       256.0/302.4 kB 1.2 MB/s eta 0:00:01
     -------------------------------------- 302.4/302.4 kB 1.0 MB/s eta 0:00:00
Collecting typer (from deepeval)
  Downloading typer-0.12.4-py3-none-any.whl (47 kB)
                                              0.0/47.4 kB ? eta -:--:--
     ---------------------------------------- 47.4/47.4 kB 2.3 MB/s eta 0:00:00
Collecting rich (from deepeval)
  Using cached rich-13.7.1-py


[notice] A new release of pip is available: 23.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [28]:
import pytest
from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

ModuleNotFoundError: No module named 'pytest'