## LLM
- the idea to experiment the same methods on multiple LLMs to make sure the result are LLM agnostic, ideally. Practically measure the std b/w different models

In [1]:
%load_ext autoreload
%autoreload 2

In [45]:
from tqdm import tqdm

In [2]:
from langchain_openai.chat_models import ChatOpenAI
from ragas.llms import LangchainLLMWrapper

llm_4o = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
llm_4o_mini = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from langchain_openai.embeddings import OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper


embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
langchain_embeddings = LangchainEmbeddingsWrapper(embeddings=embeddings) # any langchain Embeddings instance


In [4]:
from langchain_aws import ChatBedrockConverse
from langchain_aws import BedrockEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

config = {
    "credentials_profile_name": "default",  # E.g "default"
    "region_name": "us-east-1",  # E.g. "us-east-1"
    "llm": "anthropic.claude-3-haiku-20240307-v1:0",  # E.g "anthropic.claude-3-5-sonnet-20240620-v1:0"
}

bedrock_llm = ChatBedrockConverse(
    credentials_profile_name=config["credentials_profile_name"],
    region_name=config["region_name"],
    base_url=f"https://bedrock-runtime.{config['region_name']}.amazonaws.com",
    model=config["llm"],
)

bedrock_llm = LangchainLLMWrapper(bedrock_llm)

## tracing

In [5]:
import os
os.environ["LANGCHAIN_PROJECT"]= "Alignment"
os.environ["LANGCHAIN_TRACING_V2"] = "true"

## Evaluation

In [6]:
from ragas.metrics._aspect_critic import AspectCriticWithReference
from ragas import EvaluationDataset
from datasets import load_dataset, Dataset
import numpy as np

In [7]:
def prepare_eval_dataset(path,postive_clas=0.6, seed=42):
    samples = []
    np.random.seed(seed)
    dataset = Dataset.from_json(path)
    num = int(len(dataset)*postive_clas)
    positive_sample_indices = np.random.randint(0,50,num).tolist()
    negative_sample_indices = [i for i in range(len(dataset)) if i not in positive_sample_indices]
    for i,row in enumerate(dataset):
        dic = {
                "user_input":row["user_input"],
                "reference": row["reference"],
                                
            }
        if i in positive_sample_indices:
            dic.update(
                {
                    "response": row["response"],
                    "target": 1
                }
            )
        else:
             dic.update(
                {
                    "response": row["errored_response"],
                    "target": 0
                }
            )
        samples.append(dic)
    return Dataset.from_list(samples)
            
            
    

In [8]:
dataset = prepare_eval_dataset("datasets/dataset_v4.json",postive_clas=0.6)

In [9]:
len(dataset)

50

In [10]:
y_true = dataset["target"]

In [11]:
eval_dataset = EvaluationDataset.from_hf_dataset(dataset)

## Aspect Critic

In [12]:
def random_examples(n=3):
    from ragas.metrics._aspect_critic import AspectCriticInputWithReference,AspectCriticOutputWithReference
    examples = [(
    AspectCriticInputWithReference(
        user_input="What is the main ingredient in traditional Japanese miso soup?",
        response="The main ingredient in traditional Japanese miso soup is soybeans.",
        reference="The main ingredient in traditional Japanese miso soup is miso paste, which is made from fermented soybeans.",
        criteria="Does the response accurately convey the main ingredient of the soup?",
    ),
    AspectCriticOutputWithReference(
        reason="The response mentions soybeans, which is part of the ingredient but does not fully capture the main ingredient, miso paste.",
        verdict=0,
    ),
),
     (
    AspectCriticInputWithReference(
        user_input="When was the Declaration of Independence signed?",
        response="The Declaration of Independence was signed in 1776.",
        reference="The Declaration of Independence was signed on July 4, 1776.",
        criteria="Is the response factually complete, including both month and year?",
    ),
    AspectCriticOutputWithReference(
        reason="The response provides the correct year but omits the specific date, July 4, making it incomplete.",
        verdict=0,
    ),
),
     (
    AspectCriticInputWithReference(
        user_input="What is the main ingredient in traditional Japanese miso soup?",
        response="The main ingredient in traditional Japanese miso soup is soybeans.",
        reference="The main ingredient in traditional Japanese miso soup is miso paste, which is made from fermented soybeans.",
        criteria="Does the response accurately convey the main ingredient of the soup?",
    ),
    AspectCriticOutputWithReference(
        reason="The response mentions soybeans, which is part of the ingredient but does not fully capture the main ingredient, miso paste.",
        verdict=0,
    ),
)]
    return examples[:n]
     

In [13]:
critic = AspectCriticWithReference(name="answer correctness",
                      definition="Given the user_input, reference and response. Is the response correct compared with the reference",)


In [14]:
critic.single_turn_prompt.examples = []

In [23]:
critic.llm = llm_4o
critic.embedding_model = langchain_embeddings

In [24]:
from ragas import evaluate

In [25]:
result = evaluate(dataset=dataset,metrics=[critic])

Evaluating: 100%|█████████| 50/50 [00:08<00:00,  6.03it/s]


In [26]:
df = result.to_pandas()

In [27]:
df["target"] = y_true
df.head()

Unnamed: 0,user_input,response,reference,answer correctness,target
0,How did the invention of the wheel impact anci...,# The Revolutionary Impact of the Wheel on Anc...,The invention of the wheel was a pivotal momen...,1,0
1,How did the discovery of fire impact early hum...,# The Transformative Power of Fire in Early Hu...,The discovery of fire was a pivotal moment in ...,1,1
2,What were the major impacts of the Agricultura...,# The Transformative Impacts of the Agricultur...,"The Agricultural Revolution, which began aroun...",1,1
3,What were the key events and consequences of t...,# The Fall of Constantinople: A Turning Point ...,"The Fall of Constantinople occurred on May 29,...",1,0
4,How did the birth of democracy in Athens shape...,# The Birth of Democracy in Athens: A Revoluti...,"The birth of democracy in Athens, around the 5...",1,0


In [28]:
y_pred = df["answer correctness"].values.tolist()

## evaluate the evaluation

In [29]:
from sklearn.metrics import f1_score,precision_score,recall_score

In [30]:
print("f1",f1_score(y_true,y_pred))
print("precision",precision_score(y_true,y_pred))
print("recall",recall_score(y_true,y_pred))

f1 0.7096774193548387
precision 0.55
recall 1.0


## Analysis

In [8]:
def get_confusion_indices(y_true, y_pred):
    """
    Function to find the indices of TP, FP, FN, and TN.

    Parameters:
    y_true (list or np.array): Ground truth binary labels (0 or 1).
    y_pred (list or np.array): Predicted binary labels (0 or 1).

    Returns:
    dict: A dictionary containing lists of indices for TP, FP, FN, and TN.
    """
    tp = [i for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true == 1 and pred == 1]
    fp = [i for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true == 0 and pred == 1]
    fn = [i for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true == 1 and pred == 0]
    tn = [i for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true == 0 and pred == 0]

    return {
        'TP': tp,
        'FP': fp,
        'FN': fn,
        'TN': tn
    }

In [26]:
confusion_matrix = get_confusion_indices(y_true,y_pred)

In [27]:
from typing import Any
def serialize_for_json(data: Any) -> Any:
    """
    Convert custom objects into a JSON-serializable format.

    Parameters:
    data (Any): The data to be converted, which may contain custom objects.

    Returns:
    Any: A JSON-serializable version of the input data.
    """
    if isinstance(data, dict):
        return {key: serialize_for_json(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [serialize_for_json(item) for item in data]
    elif hasattr(data, "__dict__"):
        # Convert custom objects by serializing their attributes (assumes they use __dict__)
        return serialize_for_json(data.__dict__)
    else:
        return data  # Return the data as-is if it is already JSON-serializable


In [28]:
from collections import defaultdict
samples = defaultdict(list)
traces = result.traces
for qdrant,indices in confusion_matrix.items():
    for idx in indices:
        samples[qdrant].append(serialize_for_json(traces[idx]['answer correctness']))

In [29]:
[len(val) for val in samples.values()]

[17, 7, 1, 19]

In [30]:
import json
with open("datasets/dataset_v4_training_aspect_critic_annotated.json","w") as file:
    json.dump(samples,file)

## Embed and store samples

In [34]:
import json
import numpy as np
samples = json.load(open("datasets/dataset_v4_training_aspect_critic_annotated.json"))
                     
# embeddings.aembed_documents()

In [37]:
def embed(num_samples=20,seed=42):
    tp = samples["TP"]
    tn = samples["TN"]
    fp = samples['FP']
    fn = samples['FN']
    all_samples = tp + tn + fp + fn
    final_samples = []
    for sample in all_samples:
        if sample in tp:
            qdrant = 'TP'
        elif sample in fp:
            qdrant = 'FP'
        elif sample in tn:
            qdrant = 'TN'
        else:
            qdrant = 'FN'
        sample = list(sample.values())[0]
        sample['qdrant'] = qdrant
        final_samples.append(sample)

    np.random.seed(seed)
    selected_samples = list(np.random.choice(final_samples,size=num_samples))
    embed_points = [
        "\n".join(list(data['input'].values()))
        for data in selected_samples
    ]
    vectors = embeddings.embed_documents(embed_points)
    return selected_samples,np.array(vectors)

In [38]:
def json_save(data,path):
    with open(path,"w") as file:
        json.dump(data,file)

In [39]:
vector_samples, vectors = embed(num_samples=20)


In [135]:
from collections import Counter
Counter([sample['qdrant'] for sample in vector_samples]).most_common()

[('TN', 9), ('TP', 6), ('FP', 4), ('FN', 1)]

In [41]:
np.save('indices/selected_20_indices_input_resp_ref.npy', vectors)
json_save(vector_samples,'indices/selected_20_indices_input_resp_ref.json')

In [35]:
data = json.load(open("indices/selected_20_indices.json"))

### Test retrival

In [31]:
from ragas.metrics._aspect_critic import AspectCriticWithReference
critic = AspectCriticWithReference(name="answer correctness",
                      definition="Given the user_input, reference and response. Is the response correct compared with the reference",)

th

In [32]:
critic.embedding_model = langchain_embeddings
critic.llm = llm_4o_mini

In [33]:
from ragas import SingleTurnSample
dataset = Dataset.from_json("datasets/dataset_v4.json")
eval_sample = SingleTurnSample(**dataset[0])
eval_sample.user_input = "What were the causes and consequences of the Chernobyl Disaster?"

In [34]:
await critic.single_turn_ascore(eval_sample)

0

## prompt optimisation

[{'reason': 'The response accurately outlines the main causes and outcomes of the Wars of the Roses, aligning well with the details provided in the reference.',
  'verdict': 1}]

In [101]:
from sampling import SingleExample,CorrectIncorrectExample,PromptFromCorrectIncorrectExamples,PromptFromCorrectExample

In [102]:
from ragas.metrics._aspect_critic import AspectCriticInputWithReference, AspectCriticOutputWithReference


In [162]:
async def reverse_engineer_instruction_from_correct_examples(data,llm,num_instructions=5, num_samples=10,seed=42):
    """
    reverse engineer prompts using correct example
    """
    generated_prompts = []
    
    np.random.seed(seed=seed)
    correct_examples = [sample for sample in data if sample['qdrant'] in ['TN','TP']]

    prompt = PromptFromCorrectExample()
    for i in tqdm(range(num_instructions)):
        input_list, output_list = [],[]
        
        examples  = np.random.choice(correct_examples,size=3)
        for example in examples:
            _ = example['input'].pop('criteria') if 'criteria' in example['input'] else None
            input_list.append(example['input'])
            output_list.append(example['output'][0])
        prompt_input = SingleExample(inputs=input_list,outputs=output_list)
        response = await prompt.generate(data=prompt_input,llm=llm)
        generated_prompts.append(response.instruction)

    return generated_prompts


async def reverse_engineer_instruction_from_all_examples(data,llm,num_instructions=5, num_samples=10,seed=42):
    """
    reverse engineer prompts using correct and incorrect examples
    """

    def prepare_examples(examples):
        input_list, output_list = [],[]
        examples  = np.random.choice(examples,size=2)
        for example in examples:
            _ = example['input'].pop('criteria') if 'criteria' in example['input'] else None
            input_list.append(example['input'])
            output_list.append(example['output'][0])
        prompt_input = SingleExample(inputs=input_list,outputs=output_list)
        return prompt_input


    generated_prompts = []
    
    np.random.seed(seed=seed)
    correct_examples = [sample for sample in data if sample['qdrant'] in ['TN','TP']]
    incorrect_examples = [sample for sample in data if sample['qdrant'] in ['FP','FN']]
    prompt = PromptFromCorrectIncorrectExamples()
    for i in tqdm(range(num_instructions)):
        correct_input = prepare_examples(correct_examples)
        incorrect_input = prepare_examples(incorrect_examples)
        # return correct_input,incorrect_input
        prompt_input = CorrectIncorrectExample(correct_examples=correct_input,incorrect_examples=incorrect_input)
        response = await prompt.generate(data=prompt_input,llm=llm)
        generated_prompts.append(response.instruction)

    return generated_prompts

In [157]:
output=await reverse_engineer_instruction_from_correct_examples(data,llm_4o_mini,num_samples=5)

100%|███████████████████████| 5/5 [00:13<00:00,  2.67s/it]


In [158]:
output

['Provide a detailed explanation of the historical significance of the Domesday Book, including its commission by William the Conqueror, insights into 11th-century England, and its legacy as a crucial historical document. Additionally, outline the main causes and consequences of the Arab Spring, focusing on the role of authoritarian regimes, political corruption, human rights violations, economic decline, and the impact of social media, as well as the varied outcomes in different countries. Finally, discuss the causes and consequences of the Iranian Revolution of 1979, emphasizing dissatisfaction with the Shah, economic struggles, cultural disconnect, and the leadership of Ayatollah Khomeini, along with the establishment of the Islamic Republic and its impact on Middle Eastern geopolitics.',
 'Evaluate the accuracy and completeness of the responses provided to the user inputs, comparing them against the reference texts. Assign a verdict of 1 for correct and complete responses, and 0 fo

In [163]:
output=await reverse_engineer_instruction_from_all_examples(data,llm_4o_mini,num_samples=5)

100%|███████████████████████| 5/5 [00:08<00:00,  1.74s/it]


In [164]:
output

['Provide a detailed explanation of the causes and consequences of a historical event, ensuring that the response aligns closely with a given reference while maintaining factual accuracy. Avoid including excessive elaboration or interpretative content that may lead to discrepancies with the reference.',
 "Provide a detailed explanation of historical events, including their causes and consequences, while ensuring factual accuracy and relevance to the user's query. Responses should align closely with provided references, avoiding unnecessary elaboration unless it enhances understanding.",
 'Provide a detailed explanation of the causes and outcomes of historical events, ensuring that the response aligns closely with the reference material while maintaining clarity and coherence.',
 'Provide a detailed explanation of historical events, including their significance, causes, and outcomes, while ensuring accuracy and alignment with provided references. Avoid unnecessary elaboration or metapho