See `Dataset for Metric Alignment` in Notes.

## Set LLM to use

In [1]:
from langchain_openai.chat_models import ChatOpenAI
from ragas.llms import LangchainLLMWrapper

# Shared LLM handle: a LangChain `ChatOpenAI` chat model wrapped in the ragas
# `LangchainLLMWrapper`. Every `prompt.generate(...)` call below receives it.
llm = LangchainLLMWrapper(
    ChatOpenAI(model="gpt-4o"),
)

## Metric - Factual correctness


In [2]:
from ragas.prompt import PydanticPrompt
from pydantic import BaseModel

# Input schema for QuestionAnswerPrompt: a seed term plus the requested answer length.
class CreateQuestion(BaseModel):
    # Seed term the question is generated from.
    term: str
    # Typed as a plain `str`, so the allowed labels below are a convention, not validated.
    answer_length: str  # Expected to be one of ["long (20-50 words)", "short (1-5 words)", "medium (10-20 words)"]

# Output schema for QuestionAnswerPrompt: the generated question and its answer.
class QuestionAnswer(BaseModel):
    # Question generated from the seed term.
    question: str
    # Answer to `question`; meant to fit the requested answer_length.
    answer: str

# Prompt that maps a CreateQuestion (seed term + answer length) to a QuestionAnswer.
class QuestionAnswerPrompt(PydanticPrompt[CreateQuestion, QuestionAnswer]):
    # Instruction text for the prompt; it refers to the `answer_length` input field by name.
    instruction: str = "Given a seed term, generate a question and provide an answer that fits the specified answer_length."
    input_model = CreateQuestion
    output_model = QuestionAnswer
    # Example (input, output) pairs: one for each of the three answer_length labels.
    examples = [
        (
            CreateQuestion(term="gravity", answer_length="short (1-5 words)"),
            QuestionAnswer(
                question="What is gravity?",
                answer="A force of attraction."
            )
        ),
        (
            CreateQuestion(term="thermodynamics", answer_length="long (20-50 words)"),
            QuestionAnswer(
                question="What are the laws of thermodynamics?",
                answer="The laws of thermodynamics include the zeroth, first, second, and third laws, which describe the principles of temperature, energy conservation, entropy, and absolute zero."
            )
        ),
        (
            CreateQuestion(term="entropy", answer_length="medium (10-20 words)"),
            QuestionAnswer(
                question="What is entropy?",
                answer="Entropy measures the disorder or randomness of a system in thermodynamics."
            )
        )
    ]

In [3]:
# Input schema for ComparisonResponsePrompt: a reference answer plus the desired degree of agreement.
class AnswerComparisonInput(BaseModel):
    answer: str  # The original answer given
    # Typed as a plain `str`, so the allowed labels below are a convention, not validated.
    comparison_type: str  # Expected to be one of ["full match", "partial match", "full mismatch"]

# Output schema for ComparisonResponsePrompt: a single generated response string.
class GeneratedComparisonResponse(BaseModel):
    response: str  # The new response based on the comparison type

# Prompt that maps an AnswerComparisonInput (answer + comparison type) to a GeneratedComparisonResponse.
class ComparisonResponsePrompt(PydanticPrompt[AnswerComparisonInput, GeneratedComparisonResponse]):
    # Instruction text for the prompt; it names the three comparison behaviours.
    instruction: str = "Given an answer and comparison type, generate a new response that either fully matches, partially matches, or fully mismatches the given answer."
    input_model = AnswerComparisonInput
    output_model = GeneratedComparisonResponse
    # Example (input, output) pairs: one for each of the three comparison_type labels.
    examples = [
        (
            AnswerComparisonInput(
                answer="Gravity is a force of attraction between masses.",
                comparison_type="full match"
            ),
            GeneratedComparisonResponse(
                response="Gravity is a force that pulls masses toward each other."
            )
        ),
        (
            AnswerComparisonInput(
                answer="Entropy measures disorder in a system.",
                comparison_type="partial match"
            ),
            GeneratedComparisonResponse(
                response="Entropy is related to disorder, but it also involves energy in systems."
            )
        ),
        (
            AnswerComparisonInput(
                answer="Thermodynamics describes the relationships between heat and energy.",
                comparison_type="full mismatch"
            ),
            GeneratedComparisonResponse(
                response="Thermodynamics explains the movement of planets in space."
            )
        )
    ]

In [4]:
prompt = QuestionAnswerPrompt()
prompt_input = CreateQuestion(term="Entropy", answer_length="long (20-50 words)")
output = await prompt.generate(data=prompt_input, llm=llm)

In [5]:
# Display the generated question and answer as a tuple.
output.question, output.answer

('What is entropy in the context of thermodynamics?',
 "Entropy is a measure of the disorder or randomness in a system, reflecting the number of microscopic configurations that correspond to a thermodynamic system's macroscopic state. It is a central concept in the second law of thermodynamics, which states that the total entropy of an isolated system can never decrease over time.")

In [6]:
prompt = ComparisonResponsePrompt()
prompt_input = AnswerComparisonInput(answer=output.answer,comparison_type="partial match")
output = await prompt.generate(data=prompt_input, llm=llm)

In [7]:
# Display the comparison prompt's result.
output

GeneratedComparisonResponse(response='Entropy is often associated with disorder in a system, and it plays a key role in the second law of thermodynamics, which suggests that entropy tends to increase in isolated systems.')

In [8]:
# Define your percentages for each answer type
# Sampling weights, written as fractions of the dataset. Each table is passed
# as `p=` to the random sampler in the generation loop, so its weights add up to 1.

# Requested answer length -> share of samples.
answer_type_dict = dict(
    [
        ("long (20-50 words)", 0.4),    # 40%
        ("short (1-5 words)", 0.3),     # 30%
        ("medium (10-20 words)", 0.3),  # 30%
    ]
)

# Agreement between response and reference -> share of samples.
match_type_dict = dict(
    [
        ("full match", 0.5),     # 50%
        ("partial match", 0.3),  # 30%
        ("full mismatch", 0.2),  # 20%
    ]
)


In [9]:
def load_terms(file_path: str = 'physics_terms.txt') -> list[str]:
    """Read seed terms from a text file, one term per line.

    Args:
        file_path: Path of the terms file. Defaults to ``physics_terms.txt``
            in the current working directory, so ``load_terms()`` keeps
            working as before.

    Returns:
        The terms in file order, with surrounding whitespace removed. Blank
        and whitespace-only lines are skipped so that no empty term is ever
        turned into an LLM request downstream.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Explicit encoding: the platform default differs between systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return [term for term in stripped_lines if term]

In [10]:
# Sanity check: number of seed terms read from the terms file.
len(load_terms())

130

In [11]:
import numpy as np

In [12]:

samples = []
for term in load_terms():
    sample = {}    
    answer_type = np.random.choice(list(answer_type_dict.keys()),1,p=list(answer_type_dict.values()))[0]
    match_type = np.random.choice(list(match_type_dict.keys()),1,p=list(match_type_dict.values()))[0]
    prompt = QuestionAnswerPrompt()
    prompt_input = CreateQuestion(term=term, answer_length=answer_type)
    output = await prompt.generate(data=prompt_input, llm=llm)
    sample.update(
        {
            "user_input":output.question,
            "reference":output.answer,
            "term":term,
            "answer_type":answer_type
        }
    )

    prompt = ComparisonResponsePrompt()
    prompt_input = AnswerComparisonInput(answer=output.answer,comparison_type=match_type)
    output = await prompt.generate(data=prompt_input, llm=llm)
    sample.update({
        "response":output.response,
        "match_type":match_type
    })

    samples.append(sample)
    
        

In [13]:
from datasets import Dataset
# Build a `datasets.Dataset` from the list of per-term sample dicts.
dataset = Dataset.from_list(samples)

In [14]:
# Upload the dataset to the Hub repository named below.
# NOTE(review): presumably requires write credentials for this namespace and
# replaces the published data on every run — confirm before re-running.
dataset.push_to_hub("explodinggradients/physics_metrics_alignment")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/638 [00:00<?, ?B/s]

In [16]:
# Display the dataset summary (column names and row count).
dataset

Dataset({
    features: ['user_input', 'reference', 'term', 'answer_type', 'response', 'match_type'],
    num_rows: 130
})

In [20]:
# Inspect the first generated sample.
dataset[0]

{'user_input': 'How does gravity affect objects on Earth?',
 'reference': "Gravity pulls objects toward the Earth's center, giving them weight and influencing their motion.",
 'term': 'gravity',
 'answer_type': 'medium (10-20 words)',
 'response': "Gravity causes objects to float away from the Earth's surface, reducing their weight and having no effect on their motion.",
 'match_type': 'full mismatch'}