In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings


evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [None]:
import os
from datasets import load_dataset
from ragas import evaluate, EvaluationDataset


dataset = load_dataset("explodinggradients/ELI5",split="test")
eval_dataset = EvaluationDataset.from_hf_dataset(dataset)

In [None]:
eval_dataset[0]

In [None]:
from ragas.metrics import SimpleCriteriaScore, RubricsScore

simple_criteria = SimpleCriteriaScore(
    name="Answer Strength Rating", 
    definition="A metric that evaluates the quality of an answer based on its strengths and weaknesses, rating it as Excellent (4), Acceptable (3), Could be Improved (2), or Bad (1)."
    )

# Attempt 2

using dataset from 
- https://github.com/McGill-NLP/feedbackqa
- https://huggingface.co/datasets/McGill-NLP/feedbackQA (test)

In [3]:
from ragas.config import InstructionConfig,DemonstrationConfig
import pandas as pd
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from sklearn.metrics import cohen_kappa_score

from ragas.metrics import RubricsScore
from ragas import evaluate

llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())


demo_config = DemonstrationConfig(embedding = embeddings)
inst_config = InstructionConfig(llm=llm)

df = pd.read_json("feedback_test.json")[:10]

from ragas.dataset_schema import EvaluationDataset, SingleTurnSample

samples = []
for i, row in df.iterrows():
	sample = SingleTurnSample(
		user_input=row["question"],
		response=row["passage"]["reference"]["section_content"],
	)
	samples.append(sample)


ragas_dataset = EvaluationDataset(samples=samples)

ragas_dataset.to_pandas()

Unnamed: 0,user_input,response
0,What are my options if I can not support mysel...,You should only apply for this visa is you are...
1,I was a working Holiday Maker but lost my job;...,You should only apply for this visa is you are...
2,Is it practical to expect children to practice...,"If your child is sick, they must not go to sch..."
3,Do I have to continue making gym membership pa...,\nMembership ‘freeze’ or ‘holding’ fees may be...
4,Can I keep on working while waiting to see if ...,"In response to the current COVID-19 pandemic, ..."
5,Can I continue to work while I am waiting for ...,The COVID-19 Pandemic event visa can only be g...
6,Can I get a refund for the tickets I bought be...,\nIf you no longer wish to attend an event due...
7,Where and how will the Economic Support Paymen...,If you are in receipt of an eligible income su...
8,what ways can I apply for a temporal activity ...,"Under recently announced measures, New Zealand..."
9,If I am in Australia on temporary visa and hav...,"Temporary visa holders, including bridging vis..."


In [11]:
import json

def read_from_json(path):
	with open(path, 'r') as file:
		data = json.load(file)
		for i in data["answer_strength_rating"]:
			print(i["metric_output"])
    

In [4]:
human_scores = []

category_to_number = {
    'Excellent': 4,
    'Acceptable': 3,
    'Could be Improved': 2,
    'Bad': 1
}

for i, row in df.iterrows():
    score = {
        "human 1": category_to_number[row["rating"][0]],
		"human 2": category_to_number[row["rating"][1]],
		"human 3": category_to_number[row["rating"][2]],
	}
    human_scores.append(score)
    
human_judges = pd.DataFrame(human_scores)
human_judges

Unnamed: 0,human 1,human 2,human 3
0,1,1,1
1,1,1,1
2,1,2,1
3,4,2,4
4,1,2,4
5,1,1,1
6,4,3,4
7,3,4,2
8,1,1,1
9,1,1,1


### Human1 allignment test

In [5]:
h1_answer_strength_rating = RubricsScore(
    name="answer_strength_rating",
    rubrics={
        "4": "Answer is excellent, highly relevant, and fully addresses the question with clarity and detail.",
        "3": "Answer is acceptable, mostly relevant but could be more comprehensive or refined.",
        "2": "Answer could be improved, as it lacks detail, may have minor inaccuracies, or is somewhat unclear.",
        "1": "Answer is bad, irrelevant, misleading, or lacks useful content."
    },
    llm=evaluator_llm,
)

In [6]:
human_judges[["human 1"]]

Unnamed: 0,human 1
0,1
1,1
2,1
3,4
4,1
5,1
6,4
7,3
8,1
9,1


In [7]:
h11_results = evaluate(dataset=ragas_dataset, metrics=[h1_answer_strength_rating])
h11_results.upload()

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation results uploaded! View at https://app.ragas.io/dashboard/alignment/evaluation/7126d2cc-4bf1-47c4-bb93-36ea681f6e77


'https://app.ragas.io/dashboard/alignment/evaluation/7126d2cc-4bf1-47c4-bb93-36ea681f6e77'

In [12]:
read_from_json("h1_annotated.json")

2
2
2
4
2
1
4
3
1
3


In [8]:
h11_results.to_pandas()["answer_strength_rating"]

0    2
1    2
2    2
3    4
4    2
5    1
6    4
7    3
8    1
9    3
Name: answer_strength_rating, dtype: int64

In [13]:
h1_answer_strength_rating.train(path="h1_annotated.json")

In [14]:
h12_results = evaluate(dataset=ragas_dataset, metrics=[h1_answer_strength_rating])
h12_results.to_pandas()["answer_strength_rating"]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

0    2
1    2
2    2
3    4
4    2
5    1
6    4
7    3
8    1
9    3
Name: answer_strength_rating, dtype: int64

In [15]:
original = human_judges["human 1"]
rh11 = h11_results.to_pandas()["answer_strength_rating"]
rh12 = h12_results.to_pandas()["answer_strength_rating"]

r1 = cohen_kappa_score(y1=original, y2=rh11)
r2 = cohen_kappa_score(y1=original, y2=rh12)

In [16]:
print(r1)
print(r2)

0.375
0.375


### Human2 allignment test

In [17]:
h2_answer_strength_rating = RubricsScore(
    name="answer_strength_rating",
    rubrics={
        "4": "Answer is excellent, highly relevant, and fully addresses the question with clarity and detail.",
        "3": "Answer is acceptable, mostly relevant but could be more comprehensive or refined.",
        "2": "Answer could be improved, as it lacks detail, may have minor inaccuracies, or is somewhat unclear.",
        "1": "Answer is bad, irrelevant, misleading, or lacks useful content."
    },
    llm=evaluator_llm,
)

In [18]:
human_judges[["human 2"]]

Unnamed: 0,human 2
0,1
1,1
2,2
3,2
4,2
5,1
6,3
7,4
8,1
9,1


In [19]:
h21_results = evaluate(dataset=ragas_dataset, metrics=[h2_answer_strength_rating])
h21_results.upload()

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation results uploaded! View at https://app.ragas.io/dashboard/alignment/evaluation/4881a866-2319-4f72-9ef3-e2c84fde3847


'https://app.ragas.io/dashboard/alignment/evaluation/4881a866-2319-4f72-9ef3-e2c84fde3847'

In [20]:
read_from_json("h2_annotated.json")

2
2
2
4
2
1
4
3
1
3


In [21]:
h21_results.to_pandas()["answer_strength_rating"]

0    2
1    2
2    2
3    4
4    2
5    1
6    4
7    3
8    1
9    3
Name: answer_strength_rating, dtype: int64

In [22]:
h2_answer_strength_rating.train(path="h2_annotated.json")

In [23]:
h22_results = evaluate(dataset=ragas_dataset, metrics=[h2_answer_strength_rating])
h22_results.to_pandas()["answer_strength_rating"]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

0    2
1    2
2    2
3    4
4    2
5    1
6    4
7    3
8    1
9    3
Name: answer_strength_rating, dtype: int64

In [24]:
original = human_judges["human 2"]
rh21 = h21_results.to_pandas()["answer_strength_rating"]
rh22 = h22_results.to_pandas()["answer_strength_rating"]

r1 = cohen_kappa_score(y1=original, y2=rh21)
r2 = cohen_kappa_score(y1=original, y2=rh22)

In [25]:
print(r1)
print(r2)

0.18918918918918926
0.18918918918918926


### Human3 allignment test

In [26]:
h3_answer_strength_rating = RubricsScore(
    name="answer_strength_rating",
    rubrics={
        "4": "Answer is excellent, highly relevant, and fully addresses the question with clarity and detail.",
        "3": "Answer is acceptable, mostly relevant but could be more comprehensive or refined.",
        "2": "Answer could be improved, as it lacks detail, may have minor inaccuracies, or is somewhat unclear.",
        "1": "Answer is bad, irrelevant, misleading, or lacks useful content."
    },
    llm=evaluator_llm,
)

In [27]:
human_judges[["human 3"]]

Unnamed: 0,human 3
0,1
1,1
2,1
3,4
4,4
5,1
6,4
7,2
8,1
9,1


In [28]:
h31_results = evaluate(dataset=ragas_dataset, metrics=[h3_answer_strength_rating])
h31_results.upload()

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation results uploaded! View at https://app.ragas.io/dashboard/alignment/evaluation/4e3c4fdd-7ec5-4c76-a590-81d2e1e738e3


'https://app.ragas.io/dashboard/alignment/evaluation/4e3c4fdd-7ec5-4c76-a590-81d2e1e738e3'

In [29]:
read_from_json("h3_annotated.json")

2
2
2
4
2
1
4
3
1
3


In [30]:
h31_results.to_pandas()["answer_strength_rating"]

0    2
1    2
2    2
3    4
4    2
5    1
6    4
7    3
8    1
9    3
Name: answer_strength_rating, dtype: int64

In [31]:
h3_answer_strength_rating.train(path="h3_annotated.json")

In [32]:
h32_results = evaluate(dataset=ragas_dataset, metrics=[h3_answer_strength_rating])
h32_results.to_pandas()["answer_strength_rating"]

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

0    2
1    2
2    2
3    4
4    2
5    1
6    4
7    3
8    1
9    3
Name: answer_strength_rating, dtype: int64

In [33]:
original = human_judges["human 3"]
rh31 = h31_results.to_pandas()["answer_strength_rating"]
rh32 = h32_results.to_pandas()["answer_strength_rating"]

r1 = cohen_kappa_score(y1=original, y2=rh31)
r2 = cohen_kappa_score(y1=original, y2=rh32)

In [34]:
print(r1)
print(r2)

0.23076923076923073
0.23076923076923073
