## LLM
- the idea to experiment the same methods on multiple LLMs to make sure the result are LLM agnostic, ideally. Practically measure the std b/w different models

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.chdir("/Users/shahules/ragas/alingment-exp/")

In [3]:
from tqdm import tqdm

In [4]:
from langchain_openai.chat_models import ChatOpenAI
from ragas.llms import LangchainLLMWrapper

llm_4o = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
llm_4o_mini = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from langchain_openai.embeddings import OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper


embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
langchain_embeddings = LangchainEmbeddingsWrapper(embeddings=embeddings) # any langchain Embeddings instance


In [6]:
from langchain_aws import ChatBedrockConverse
from langchain_aws import BedrockEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

config = {
    "credentials_profile_name": "default",  # E.g "default"
    "region_name": "us-east-1",  # E.g. "us-east-1"
    "llm": "anthropic.claude-3-haiku-20240307-v1:0",  # E.g "anthropic.claude-3-5-sonnet-20240620-v1:0"
}

bedrock_llm = ChatBedrockConverse(
    credentials_profile_name=config["credentials_profile_name"],
    region_name=config["region_name"],
    base_url=f"https://bedrock-runtime.{config['region_name']}.amazonaws.com",
    model=config["llm"],
)

bedrock_llm = LangchainLLMWrapper(bedrock_llm)

## tracing

In [7]:
import os
os.environ["LANGCHAIN_PROJECT"]= "Alignment"
os.environ["LANGCHAIN_TRACING_V2"] = "true"

## Evaluation

In [8]:
from ragas.metrics._aspect_critic import AspectCriticWithReference
from ragas import EvaluationDataset
from datasets import load_dataset, Dataset
import numpy as np

In [9]:
def prepare_eval_dataset(path,postive_clas=0.6, seed=42):
    samples = []
    np.random.seed(seed)
    dataset = Dataset.from_json(path)
    num = int(len(dataset)*postive_clas)
    positive_sample_indices = np.random.randint(0,50,num).tolist()
    negative_sample_indices = [i for i in range(len(dataset)) if i not in positive_sample_indices]
    for i,row in enumerate(dataset):
        dic = {
                "user_input":row["user_input"],
                "reference": row["reference"],
                                
            }
        if i in positive_sample_indices:
            dic.update(
                {
                    "response": row["response"],
                    "target": 1
                }
            )
        else:
             dic.update(
                {
                    "response": row["errored_response"],
                    "target": 0
                }
            )
        samples.append(dic)
    return Dataset.from_list(samples)
            
            
    

In [12]:
# dataset = prepare_eval_dataset("../datasets/dataset_v4.json",postive_clas=0.6)

Generating train split: 50 examples [00:00, 4161.51 examples/s]


In [13]:
# dataset.to_json("../datasets/dataset_v4_test.json")

Creating json from Arrow format: 100%|███████████████████| 1/1 [00:00<00:00, 140.19ba/s]


163988

In [14]:
len(dataset)

50

In [15]:
y_true = dataset["target"]

In [1]:
dataset = Dataset.from_json("datasets/dataset_eli5.json")

NameError: name 'Dataset' is not defined

In [12]:
eval_dataset = EvaluationDataset.from_hf_dataset(dataset)

In [13]:
eval_dataset

EvaluationDataset(features=['user_input', 'response', 'reference'], len=10)

## dataset modification

In [6]:
from datasets import load_dataset, Dataset
import json

In [46]:
original_dataset = Dataset.from_json("../datasets/dataset_eli5.json")
dataset = load_dataset("explodinggradients/ELI5")


In [47]:
original_dataset

Dataset({
    features: ['user_input', 'reference', 'term', 'response', 'errored_response', 'error_description'],
    num_rows: 156
})

In [48]:
for split in dataset:
    samples = []
    val = len(dataset[split]) * [None]
    dataset[split] = dataset[split].add_column("error_description",val)
    for row in dataset[split]:
        user_input = row['user_input']
        error_description = None
        if row['target'] == 0:
            for sample in original_dataset:
                if sample['user_input'] == user_input:
                    print('here')
                    error_description = sample["error_description"]
        
        row["error_description"] = error_description
        samples.append(row)
    Dataset.from_list(samples).to_json(f"../datasets/ELI5_{split}")

        

here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here


Creating json from Arrow format: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 157.70ba/s]


here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here
here


Creating json from Arrow format: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 584.16ba/s]


In [49]:
train = Dataset.from_json('../datasets/ELI5_test')

Generating train split: 100 examples [00:00, 22083.42 examples/s]


In [63]:
train.filter(lambda x: x['target'] == 0)[19]

{'user_input': 'What is the Theory of Mutation Rates and how does it impact evolutionary biology?',
 'reference': 'The Theory of Mutation Rates explores how often mutations occur in a given genome over a specific period. It is crucial in evolutionary biology because it helps explain genetic variation within populations. Mutations can introduce new traits, some of which may offer a survival advantage or disadvantage, influencing natural selection. Understanding mutation rates allows scientists to predict evolutionary trends, study genetic diseases, and explore the mechanisms of adaptation. The theory also aids in estimating the timeframes of evolutionary events by comparing genetic differences between species.',
 'response': "The Theory of Mutation Rates is like counting how often little changes happen in the DNA of living things over time. These changes are important because they can make new traits, like superpowers, that help or don't help animals and plants survive. By knowing how o

## Aspect Critic

In [17]:
def random_examples(n=3):
    from ragas.metrics._aspect_critic import AspectCriticInputWithReference,AspectCriticOutputWithReference
    examples = [(
    AspectCriticInputWithReference(
        user_input="What is the main ingredient in traditional Japanese miso soup?",
        response="The main ingredient in traditional Japanese miso soup is soybeans.",
        reference="The main ingredient in traditional Japanese miso soup is miso paste, which is made from fermented soybeans.",
        criteria="Does the response accurately convey the main ingredient of the soup?",
    ),
    AspectCriticOutputWithReference(
        reason="The response mentions soybeans, which is part of the ingredient but does not fully capture the main ingredient, miso paste.",
        verdict=0,
    ),
),
     (
    AspectCriticInputWithReference(
        user_input="When was the Declaration of Independence signed?",
        response="The Declaration of Independence was signed in 1776.",
        reference="The Declaration of Independence was signed on July 4, 1776.",
        criteria="Is the response factually complete, including both month and year?",
    ),
    AspectCriticOutputWithReference(
        reason="The response provides the correct year but omits the specific date, July 4, making it incomplete.",
        verdict=0,
    ),
),
     (
    AspectCriticInputWithReference(
        user_input="What is the main ingredient in traditional Japanese miso soup?",
        response="The main ingredient in traditional Japanese miso soup is soybeans.",
        reference="The main ingredient in traditional Japanese miso soup is miso paste, which is made from fermented soybeans.",
        criteria="Does the response accurately convey the main ingredient of the soup?",
    ),
    AspectCriticOutputWithReference(
        reason="The response mentions soybeans, which is part of the ingredient but does not fully capture the main ingredient, miso paste.",
        verdict=0,
    ),
)]
    return examples[:n]
     

In [19]:
# new_prompt = "Please evaluate the provided responses based on their accuracy and relevance to the user inputs and references. Assign a verdict of 1 for responses that align closely with the reference, even if they contain some additional context or elaboration, and 0 for incorrect responses that contain factual inaccuracies or significant deviations from the reference. Ensure that all critical details from the reference, including specific phrases or terms essential for accuracy, are included in the response to avoid missing key information that could affect the verdict. Provide a brief reason for your verdict, highlighting both strengths and weaknesses in the responses, and ensure that the evaluation remains concise, focusing on core factual accuracy and relevance to the reference. For guidance, examples of significant deviations include discrepancies in key dates or misinterpretations of critical facts"

In [14]:
new_prompt = "Given the user_input, reference and response. Is the response correct compared with the reference"

In [15]:
critic = AspectCriticWithReference(name="answer correctness",
                      definition=new_prompt,dynamic_retrieval=False)


In [16]:
critic.single_turn_prompt.examples = []

In [17]:
critic.llm = llm_4o_mini
critic.embedding_model = langchain_embeddings

In [18]:
from ragas import evaluate

In [19]:
result = evaluate(dataset=eval_dataset[:],metrics=[critic])

Evaluating: 100%|███████████████████████████████████████| 10/10 [00:02<00:00,  3.48it/s]


In [20]:
result.upload()

Evaluation results uploaded! View at https://app.ragas.io/alignment/evaluation/3b57bd77-aed2-4699-90ca-955dfbaad49f


'https://app.ragas.io/alignment/evaluation/3b57bd77-aed2-4699-90ca-955dfbaad49f'

In [26]:
df = result.to_pandas()

In [27]:
df["target"] = y_true
df.head()

Unnamed: 0,user_input,response,reference,answer correctness,target
0,How did the invention of the wheel impact anci...,# The Revolutionary Impact of the Wheel on Anc...,The invention of the wheel was a pivotal momen...,1,0
1,How did the discovery of fire impact early hum...,# The Transformative Power of Fire in Early Hu...,The discovery of fire was a pivotal moment in ...,1,1
2,What were the major impacts of the Agricultura...,# The Transformative Impacts of the Agricultur...,"The Agricultural Revolution, which began aroun...",1,1
3,What were the key events and consequences of t...,# The Fall of Constantinople: A Turning Point ...,"The Fall of Constantinople occurred on May 29,...",0,0
4,How did the birth of democracy in Athens shape...,# The Birth of Democracy in Athens: A Revoluti...,"The birth of democracy in Athens, around the 5...",1,0


In [28]:
y_pred = df["answer correctness"].values.tolist()

## evaluate the evaluation

In [29]:
from sklearn.metrics import f1_score,precision_score,recall_score

In [30]:
print("f1",f1_score(y_true,y_pred))
print("precision",precision_score(y_true,y_pred))
print("recall",recall_score(y_true,y_pred))

f1 0.7096774193548387
precision 0.55
recall 1.0


## json serializaition

In [63]:
from typing import Any
def serialize_for_json(data: Any) -> Any:
    """
    Convert custom objects into a JSON-serializable format.

    Parameters:
    data (Any): The data to be converted, which may contain custom objects.

    Returns:
    Any: A JSON-serializable version of the input data.
    """
    if isinstance(data, dict):
        return {key: serialize_for_json(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [serialize_for_json(item) for item in data]
    elif hasattr(data, "__dict__"):
        # Convert custom objects by serializing their attributes (assumes they use __dict__)
        return serialize_for_json(data.__dict__)
    else:
        return data  # Return the data as-is if it is already JSON-serializable


def format_dict(dic):
    string = ""
    for key, val in dic.items():
        string += f"\n{key}:\n\t{val}\n"
    return string


## Feedback samples

In [37]:
wrong_predictions = [i for i in range(len(dataset)) if y_pred[i]!=y_true[i]]
traces = result.traces

In [59]:
np.random.seed(42)
subset_indices= np.random.choice(wrong_predictions,4)

In [60]:
subset_indices

array([17, 44, 30, 25])

In [61]:
traces_for_wrong = [traces[i] for i in subset_indices]

[17, 30, 25]

In [131]:
new_prompt

'Please evaluate the provided responses based on their accuracy and relevance to the user inputs and references. Assign a verdict of 1 for responses that align closely with the reference, even if they contain some additional context or elaboration, and 0 for incorrect responses that contain factual inaccuracies or significant deviations from the reference. Ensure that all critical details from the reference, including specific phrases or terms essential for accuracy, are included in the response to avoid missing key information that could affect the verdict. Provide a brief reason for your verdict, highlighting both strengths and weaknesses in the responses, and ensure that the evaluation remains concise, focusing on core factual accuracy and relevance to the reference. For guidance, examples of significant deviations include discrepancies in key dates or misinterpretations of critical facts'

In [151]:
import typing as t
from pydantic import BaseModel
from ragas.prompt import PydanticPrompt

class SingleInputOutput(BaseModel):
    input: str
    incorrect_output: t.Dict[str, t.Any]
    
class ErrorAnalysisInput(BaseModel):
    prompt: str
    examples: t.List[SingleInputOutput]
    
class ErrorAnalysisOutput(BaseModel):
    reason: str
    
class ErrorAnalysisPrompt(PydanticPrompt[ErrorAnalysisInput, ErrorAnalysisOutput]):
    name: str = "error_analysis"
    instruction: str = (
        "You're an expert reviewer. Given a prompt and (input  containing (user_input, response, reference), output) example, one of the expert annotaters in your team marked the output as incorrect."
        "Analyse input, output pair to understand why the ouput was marked as incorrect by the annotator"
    )
    input_model = ErrorAnalysisInput
    output_model = ErrorAnalysisOutput

In [178]:
class PatternAnalysisInput(BaseModel):
    reasons: t.List[str]

class PatternAnalysisOutput(BaseModel):
    output: t.List[str]


class PatternAnalysisPrompt(PydanticPrompt[PatternAnalysisInput, PatternAnalysisOutput]):
    instruction = (
        "Given a list of reasons for specific example for which a prompt yeilded incorrect output"
        "do pattern analyse on the provided reasons and provide a general error statement"
    )
    input_model = PatternAnalysisInput
    output_model = PatternAnalysisOutput

In [179]:
def conver_to_single_input(trace):
    data = serialize_for_json(trace['answer correctness'])['0_single_turn_aspect_critic_prompt_with_reference']
    input_ = data["input"]
    input_ = format_dict(input_)
    output = data["output"][0]
    return SingleInputOutput(input=input_,  incorrect_output=output)
                
    
    

In [175]:
reasons = []
for idx in [25,30]:
    error_analysis_input = [conver_to_single_input(traces[idx])]
    prompt_input = ErrorAnalysisInput(prompt=new_prompt,examples=error_analysis_input)
    prompt = ErrorAnalysisPrompt()
    response = await prompt.generate(data=prompt_input,llm=llm_4o_mini)
    reasons.append(response.reason)

In [180]:
prompt = PatternAnalysisPrompt()
prompt_input = PatternAnalysisInput(reasons=reasons)
await prompt.generate(data=prompt_input,llm=llm_4o_mini)

PatternAnalysisOutput(output=['The output was marked as incorrect due to critical factual inaccuracies in the response, including incorrect dates and deviations from the required focus. These inaccuracies led to a failure in accurately reflecting the specific details from the reference material, resulting in a lower evaluation score.'])

In [165]:
response

ErrorAnalysisOutput(reason="The output was marked as incorrect because it failed to accurately reflect the specific details from the reference. While the response provided a comprehensive overview of the factors leading to the end of Apartheid, it incorrectly stated the year of the Soweto Uprising as 1975 instead of 1976, which is a critical factual inaccuracy. Additionally, the response included excessive elaboration that, while informative, deviated from the concise nature required by the prompt. The inclusion of the phrase 'the end of the Cold War' without specifying its relevance to the context of Apartheid also contributed to the deviation from the reference.")

In [160]:
response

ErrorAnalysisOutput(reason="The output was marked as incorrect because, despite the response providing a comprehensive overview of the impact of Gutenberg's printing press, it contains inaccuracies regarding the timeline. The reference states that the printing press was invented in the mid-15th century, while the response claims it was invented in the early 15th century. This discrepancy in key dates is a significant deviation from the reference, which affects the overall factual accuracy of the response.")

In [161]:
new_prompt

'Please evaluate the provided responses based on their accuracy and relevance to the user inputs and references. Assign a verdict of 1 for responses that align closely with the reference, even if they contain some additional context or elaboration, and 0 for incorrect responses that contain factual inaccuracies or significant deviations from the reference. Ensure that all critical details from the reference, including specific phrases or terms essential for accuracy, are included in the response to avoid missing key information that could affect the verdict. Provide a brief reason for your verdict, highlighting both strengths and weaknesses in the responses, and ensure that the evaluation remains concise, focusing on core factual accuracy and relevance to the reference. For guidance, examples of significant deviations include discrepancies in key dates or misinterpretations of critical facts'

## Analysis

In [8]:
def get_confusion_indices(y_true, y_pred):
    """
    Function to find the indices of TP, FP, FN, and TN.

    Parameters:
    y_true (list or np.array): Ground truth binary labels (0 or 1).
    y_pred (list or np.array): Predicted binary labels (0 or 1).

    Returns:
    dict: A dictionary containing lists of indices for TP, FP, FN, and TN.
    """
    tp = [i for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true == 1 and pred == 1]
    fp = [i for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true == 0 and pred == 1]
    fn = [i for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true == 1 and pred == 0]
    tn = [i for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true == 0 and pred == 0]

    return {
        'TP': tp,
        'FP': fp,
        'FN': fn,
        'TN': tn
    }

In [26]:
confusion_matrix = get_confusion_indices(y_true,y_pred)

In [28]:
from collections import defaultdict
samples = defaultdict(list)
traces = result.traces
for qdrant,indices in confusion_matrix.items():
    for idx in indices:
        samples[qdrant].append(serialize_for_json(traces[idx]['answer correctness']))

In [29]:
[len(val) for val in samples.values()]

[17, 7, 1, 19]

In [30]:
import json
with open("datasets/dataset_v4_training_aspect_critic_annotated.json","w") as file:
    json.dump(samples,file)

## Embed and store samples

In [23]:
import json
import numpy as np
samples = json.load(open("../datasets/dataset_v4_training_aspect_critic_annotated.json"))
                     
# embeddings.aembed_documents()

In [24]:
def organise_training_samples(samples):

    tp = samples["TP"]
    tn = samples["TN"]
    fp = samples['FP']
    fn = samples['FN']
    all_samples = tp + tn + fp + fn
    final_samples = []
    for sample in all_samples:
        if sample in tp:
            qdrant = 'TP'
            target = 1
        elif sample in fp:
            qdrant = 'FP'
            target = 0
        elif sample in tn:
            qdrant = 'TN'
            target = 0
        else:
            qdrant = 'FN'
            target = 1
        sample = list(sample.values())[0]
        sample['qdrant'] = qdrant
        sample['target'] = target
        _ = sample['input'].pop('criteria')
        sample['output'] = sample['output'][0]
        final_samples.append(sample)

    return final_samples

In [25]:
samples = organise_training_samples(samples)

In [29]:
from datasets import Dataset
Dataset.from_list(samples).to_json("../datasets/dataset_v4_training_annotated_full.json")

Creating json from Arrow format: 100%|██| 1/1 [00:00<00:00, 89.98ba/s]


153319

In [30]:
Dataset.from_json("../datasets/dataset_v4_training_annotated_full.json")

Generating train split: 44 examples [00:00, 1448.89 examples/s]


Dataset({
    features: ['input', 'output', 'qdrant', 'target'],
    num_rows: 44
})

In [37]:
def embed(num_samples=20,seed=42):
    tp = samples["TP"]
    tn = samples["TN"]
    fp = samples['FP']
    fn = samples['FN']
    all_samples = tp + tn + fp + fn
    final_samples = []
    for sample in all_samples:
        if sample in tp:
            qdrant = 'TP'
            target = 1
        elif sample in fp:
            qdrant = 'FP'
            target = 0
        elif sample in tn:
            qdrant = 'TN'
            target = 0
        else:
            qdrant = 'FN'
            target = 1
        sample = list(sample.values())[0]
        sample['qdrant'] = qdrant
        sample['target'] = target
        final_samples.append(sample)

    np.random.seed(seed)
    num_samples = min(num_samples,len(all_samples))
    selected_samples = list(np.random.choice(final_samples,size=num_samples))
    embed_points = [
        "\n".join(list(data['input'].values()))
        for data in selected_samples
    ]
    vectors = embeddings.embed_documents(embed_points)
    return selected_samples,np.array(vectors)

In [364]:
def json_save(data,path):
    with open(path,"w") as file:
        json.dump(data,file)

In [39]:
vector_samples, vectors = embed(num_samples=20)


In [135]:
from collections import Counter
Counter([sample['qdrant'] for sample in vector_samples]).most_common()

[('TN', 9), ('TP', 6), ('FP', 4), ('FN', 1)]

In [41]:
np.save('indices/selected_20_indices_input_resp_ref.npy', vectors)
json_save(vector_samples,'indices/selected_20_indices_input_resp_ref.json')

In [11]:
data = json.load(open("indices/selected_20_indices.json"))

In [5]:
def add_target(data):
    new_data = []
    for item in data:
        if item['qdrant'] in ['TP','FN']:
            item['target'] = 1
        elif item['qdrant'] in ['TN','FP']:
            item['target'] = 0
        new_data.append(item)
    return new_data

data = add_target(data)
        
    

### Test retrival

In [31]:
from ragas.metrics._aspect_critic import AspectCriticWithReference
critic = AspectCriticWithReference(name="answer correctness",
                      definition="Given the user_input, reference and response. Is the response correct compared with the reference",)

th

In [32]:
critic.embedding_model = langchain_embeddings
critic.llm = llm_4o_mini

In [33]:
from ragas import SingleTurnSample
dataset = Dataset.from_json("datasets/dataset_v4.json")
eval_sample = SingleTurnSample(**dataset[0])
eval_sample.user_input = "What were the causes and consequences of the Chernobyl Disaster?"

In [34]:
await critic.single_turn_ascore(eval_sample)

0

## prompt optimisation

In [96]:
from sampling import SingleExample,PromptFromCorrectExample
from ragas import evaluate

In [97]:
from ragas.metrics._aspect_critic import AspectCriticInputWithReference, AspectCriticOutputWithReference


In [98]:
def format_dict(dic):
    string = ""
    for key, val in dic.items():
        string += f"\n{key}:\n\t{val}\n"
    return string

async def reverse_engineer_instruction_from_correct_examples(data,llm,num_instructions=5, num_samples=10,seed=42):
    """
    reverse engineer prompts using correct example
    """
    generated_prompts = []
    
    np.random.seed(seed=seed)
    correct_examples = [sample for sample in data if sample['qdrant'] in ['TN','TP']]

    prompt = PromptFromCorrectExample()
    for i in tqdm(range(num_instructions)):
        input_list, output_list = [],[]
        
        examples  = np.random.choice(correct_examples,size=3)
        curr_examples = []
        for example in examples:
            _ = example['input'].pop('criteria') if 'criteria' in example['input'] else None
            curr_examples.append((format_dict(example["input"]),example['output'][0]))
        prompt_input = SingleExample(examples=curr_examples)
        response = await prompt.generate(data=prompt_input,llm=llm)
        generated_prompts.append(response.instruction)

    return generated_prompts


    

In [99]:
candidate_prompts=await reverse_engineer_instruction_from_correct_examples(data,llm_4o_mini,num_samples=5)

100%|███████████████████████| 5/5 [00:12<00:00,  2.53s/it]


In [114]:
from datasets import Dataset
import random
from collections import defaultdict

def stratified_sample_with_fallback(dataset: Dataset, batch_size: int, target_column: str = 'target') -> Dataset:
    """
    Function to sample a batch of data with as equal distribution of target values as possible.
    
    Parameters:
        dataset (Dataset): The Hugging Face dataset to sample from.
        batch_size (int): The number of samples in the output batch.
        target_column (str): The column name containing the target values.
        
    Returns:
        Dataset: A batch of data with as close to equal distribution of target values as possible.
    """
    np.random.seed(seed=42)
    # Get all unique target values and count samples per target
    target_values = dataset.unique(target_column)
    target_counts = {target: 0 for target in target_values}
    for example in dataset:
        target_counts[example[target_column]] += 1
    
    # Calculate initial number of samples per class
    num_classes = len(target_values)
    samples_per_class = batch_size // num_classes
    remaining_slots = batch_size

    # Track sampled indices
    sampled_indices = []
    
    # Sampling loop with fallback for insufficient samples
    for target in target_values:
        available_indices = [idx for idx, example in enumerate(dataset) if example[target_column] == target]
        num_to_sample = min(samples_per_class, len(available_indices))
        
        # Sample available indices
        sampled_indices.extend(random.sample(available_indices, num_to_sample))
        remaining_slots -= num_to_sample
    
    # Handle remaining slots by resampling from classes with available samples
    while remaining_slots > 0:
        for target in target_values:
            if remaining_slots <= 0:
                break
            available_indices = [idx for idx, example in enumerate(dataset) if example[target_column] == target]
            if available_indices:
                # Sample one additional index from available samples
                sampled_indices.append(random.choice(available_indices))
                remaining_slots -= 1

    # Shuffle sampled indices to ensure randomness
    random.shuffle(sampled_indices)
    return dataset.select(sampled_indices)

In [165]:
def score_prompts(dataset,candidate_prompts,batch_size,metric_kwargs={}):
    result_scores = []
    for prompt in candidate_prompts:
        critic = AspectCriticWithReference(name="answer correctness",
                          definition=prompt,**metric_kwargs)
        dataset = stratified_sample_with_fallback(dataset,batch_size,target_column='qdrant')
        y_true = dataset['target']
        eval_dataset = [data['input'] for data in dataset]
        eval_dataset = EvaluationDataset.from_list(eval_dataset)
        result = evaluate(dataset=eval_dataset,metrics=[critic])
        traces = result.traces
        df = result.to_pandas()
        y_pred = df["answer correctness"].values.tolist()
        incorrect_indices = [i for i in range(len(y_pred)) if y_pred[i]!=y_true[i]]
        feedback_samples = []
        for idx in incorrect_indices:
            if dataset[idx]['qdrant'] in ['TP','TN']:
                dic ={
                    "input":dataset[idx]["input"],
                    "incorrect_output":serialize_for_json(traces[idx]['answer correctness']),
                    "expected_output":dataset[idx]["output"][0]
                }
                dic['incorrect_output'] = dic['incorrect_output']['0_single_turn_aspect_critic_prompt_with_reference']['output'][0]
                feedback_samples.append(dic)
        fscore = f1_score(y_true,y_pred)
        result_scores.append({"prompt":prompt,"score":fscore,"feedback":feedback_samples})
    return result_scores
        

In [168]:
training_dataset = Dataset.from_list(data)
metric_kwargs = {
    "llm":llm_4o_mini,
    "embedding_model":embeddings,
    "dynamic_retrevial": False
}
prompt_scores = score_prompts(training_dataset,candidate_prompts[:],10,metric_kwargs)

Evaluating: 100%|█████████| 10/10 [00:03<00:00,  2.90it/s]
Flattening the indices: 100%|█| 10/10 [00:00<00:00, 2198.8
Evaluating: 100%|█████████| 10/10 [00:02<00:00,  4.09it/s]
Flattening the indices: 100%|█| 10/10 [00:00<00:00, 2354.5
Evaluating: 100%|█████████| 10/10 [00:03<00:00,  2.57it/s]
Flattening the indices: 100%|█| 10/10 [00:00<00:00, 2256.7
Evaluating: 100%|█████████| 10/10 [00:04<00:00,  2.42it/s]
Flattening the indices: 100%|█| 10/10 [00:00<00:00, 1755.3
Evaluating: 100%|█████████| 10/10 [00:03<00:00,  2.90it/s]


In [236]:
from sampling import FeedbackExample, FeedbackMutationInput, FeedbackMutationPrompt

In [237]:
async def get_feedback_for_prompt(prompt,llm):
    mutation_prompt = FeedbackMutationPrompt()
    original_prompt = prompt['prompt']
    examples = []
    for item in prompt['feedback']:
        input_ = item['input']
        if 'criteria' in input_:
            _ = input_.pop("criteria")
        input_ = format_dict(input_)
        prompt_input = FeedbackExample(input=input_,output=item["incorrect_output"],expected_output=item["expected_output"])
        examples.append(prompt_input)
    prompt_input = FeedbackMutationInput(prompt=original_prompt,examples=examples)
    response = await mutation_prompt.generate(data=prompt_input,llm=llm)
    return response.feedbacks
        

In [238]:
feedbacks = await get_feedback_for_prompt(prompt_scores[0],llm_4o_mini)

In [239]:
feedbacks

['Clarify the criteria for assigning a verdict of 1 or 0, ensuring that the prompt explicitly states that a verdict of 1 should be given for responses that align closely with the reference, even if they contain some additional context or elaboration.',
 'Emphasize the importance of including all critical details from the reference in the response, such as specific phrases or terms that are essential for accuracy, to avoid missing key information that could affect the verdict.',
 'Encourage a more concise evaluation of responses, suggesting that while additional context can be informative, it should not detract from the core factual accuracy and relevance to the reference.']

In [240]:
from sampling import FeedbackMutationPromptGeneration,FeedbackMutationPromptInput

In [241]:
async def generate_prompt_from_feedback(prompt,feedbacks,llm):
    improvement_prompt = FeedbackMutationPromptGeneration()
    prompt_input = FeedbackMutationPromptInput(prompt=prompt,feedbacks=feedbacks)
    response = await improvement_prompt.generate(data=prompt_input,llm=llm)
    return response.instruction

In [242]:
new_prompt = await generate_prompt_from_feedback(prompt_scores[0]['prompt'],feedbacks,llm_4o_mini)

In [247]:
prompt_scores = score_prompts(training_dataset,[new_prompt],10,metric_kwargs)

Evaluating: 100%|█████████| 10/10 [00:04<00:00,  2.18it/s]


In [249]:
prompt_scores

[{'prompt': 'Please evaluate the provided responses based on their accuracy and relevance to the user inputs and references. Assign a verdict of 1 for responses that align closely with the reference, even if they contain some additional context or elaboration, and 0 for incorrect responses that contain factual inaccuracies or significant deviations from the reference. Ensure that all critical details from the reference, including specific phrases or terms essential for accuracy, are included in the response to avoid missing key information that could affect the verdict. Provide a brief reason for your verdict, highlighting both strengths and weaknesses in the responses, and ensure that the evaluation remains concise, focusing on core factual accuracy and relevance to the reference. For guidance, examples of significant deviations include discrepancies in key dates or misinterpretations of critical facts.',
  'score': 0.7272727272727273,
  'feedback': []}]

In [1]:
import json

In [3]:
data = json.load(open("../indices/selected_20_indices_input_resp_ref.json"))

In [6]:
add_target(data)

[{'input': {'user_input': 'What were the main causes and outcomes of the Wars of the Roses?',
   'response': '# The Wars of the Roses: Causes and Consequences\nThe Wars of the Roses, a tumultuous period in English history, were marked by fierce battles and political maneuvering. Here’s a closer look at the causes and outcomes of this dynastic struggle:\n\n## Causes of the Wars\n- **Weak Leadership:** The reign of King Henry VI was marred by his inability to assert control over the nobility. His weak leadership created a power vacuum, leading to intense rivalries among the English aristocracy.\n- **Economic and Social Strife:** England was grappling with economic difficulties and social unrest, which fueled discontent and made the country ripe for conflict. These issues exacerbated the tensions between the rival factions.\n- **Dynastic Rivalry:** The core of the conflict lay in the rivalry between the houses of Lancaster and York, both of which had claims to the English throne. This dyn

In [4]:
data[0]

{'input': {'user_input': 'What were the main causes and outcomes of the Wars of the Roses?',
  'response': '# The Wars of the Roses: Causes and Consequences\nThe Wars of the Roses, a tumultuous period in English history, were marked by fierce battles and political maneuvering. Here’s a closer look at the causes and outcomes of this dynastic struggle:\n\n## Causes of the Wars\n- **Weak Leadership:** The reign of King Henry VI was marred by his inability to assert control over the nobility. His weak leadership created a power vacuum, leading to intense rivalries among the English aristocracy.\n- **Economic and Social Strife:** England was grappling with economic difficulties and social unrest, which fueled discontent and made the country ripe for conflict. These issues exacerbated the tensions between the rival factions.\n- **Dynastic Rivalry:** The core of the conflict lay in the rivalry between the houses of Lancaster and York, both of which had claims to the English throne. This dynas