## LLM
- the idea to experiment the same methods on multiple LLMs to make sure the result are LLM agnostic, ideally. Practically measure the std b/w different models

In [43]:
!pwd

/Users/shahules/ragas/alingment-exp


In [1]:
from langchain_openai.chat_models import ChatOpenAI
from ragas.llms import LangchainLLMWrapper

llm_4o = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
llm_4o_mini = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain_aws import ChatBedrockConverse
from langchain_aws import BedrockEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

config = {
    "credentials_profile_name": "default",  # E.g "default"
    "region_name": "us-east-1",  # E.g. "us-east-1"
    "llm": "anthropic.claude-3-haiku-20240307-v1:0",  # E.g "anthropic.claude-3-5-sonnet-20240620-v1:0"
}

bedrock_llm = ChatBedrockConverse(
    credentials_profile_name=config["credentials_profile_name"],
    region_name=config["region_name"],
    base_url=f"https://bedrock-runtime.{config['region_name']}.amazonaws.com",
    model=config["llm"],
)

bedrock_llm = LangchainLLMWrapper(bedrock_llm)

## tracing

In [3]:
import os
os.environ["LANGCHAIN_PROJECT"]= "Alignment"
os.environ["LANGCHAIN_TRACING_V2"] = "true"

## Evaluation

In [7]:
from ragas.metrics._aspect_critic import AspectCriticWithReference
from ragas import EvaluationDataset
from datasets import load_dataset, Dataset
import numpy as np

In [8]:
def prepare_eval_dataset(path,postive_clas=0.6, seed=42):
    samples = []
    np.random.seed(seed)
    dataset = Dataset.from_json(path)
    num = int(len(dataset)*postive_clas)
    positive_sample_indices = np.random.randint(0,50,num).tolist()
    negative_sample_indices = [i for i in range(len(dataset)) if i not in positive_sample_indices]
    for i,row in enumerate(dataset):
        dic = {
                "user_input":row["user_input"],
                "reference": row["reference"],
                                
            }
        if i in positive_sample_indices:
            dic.update(
                {
                    "response": row["response"],
                    "target": 1
                }
            )
        else:
             dic.update(
                {
                    "response": row["errored_response"],
                    "target": 0
                }
            )
        samples.append(dic)
    return Dataset.from_list(samples)
            
            
    

In [9]:
dataset = prepare_eval_dataset("datasets/dataset_v4_training.json",postive_clas=0.5)

Generating train split: 44 examples [00:00, 8897.81 examples/s]


In [10]:
y_true = dataset["target"]

In [11]:
eval_dataset = EvaluationDataset.from_hf_dataset(dataset)

## Aspect Critic

In [12]:
def random_examples(n=3):
    from ragas.metrics._aspect_critic import AspectCriticInputWithReference,AspectCriticOutputWithReference
    examples = [(
    AspectCriticInputWithReference(
        user_input="What is the main ingredient in traditional Japanese miso soup?",
        response="The main ingredient in traditional Japanese miso soup is soybeans.",
        reference="The main ingredient in traditional Japanese miso soup is miso paste, which is made from fermented soybeans.",
        criteria="Does the response accurately convey the main ingredient of the soup?",
    ),
    AspectCriticOutputWithReference(
        reason="The response mentions soybeans, which is part of the ingredient but does not fully capture the main ingredient, miso paste.",
        verdict=0,
    ),
),
     (
    AspectCriticInputWithReference(
        user_input="When was the Declaration of Independence signed?",
        response="The Declaration of Independence was signed in 1776.",
        reference="The Declaration of Independence was signed on July 4, 1776.",
        criteria="Is the response factually complete, including both month and year?",
    ),
    AspectCriticOutputWithReference(
        reason="The response provides the correct year but omits the specific date, July 4, making it incomplete.",
        verdict=0,
    ),
),
     (
    AspectCriticInputWithReference(
        user_input="What is the main ingredient in traditional Japanese miso soup?",
        response="The main ingredient in traditional Japanese miso soup is soybeans.",
        reference="The main ingredient in traditional Japanese miso soup is miso paste, which is made from fermented soybeans.",
        criteria="Does the response accurately convey the main ingredient of the soup?",
    ),
    AspectCriticOutputWithReference(
        reason="The response mentions soybeans, which is part of the ingredient but does not fully capture the main ingredient, miso paste.",
        verdict=0,
    ),
)]
    return examples[:n]
     

In [13]:
critic = AspectCriticWithReference(name="answer correctness",
                      definition="Given the user_input, reference and response. Is the response correct compared with the reference",)


In [14]:
critic.single_turn_prompt.examples = random_examples()

In [15]:
critic.llm = llm_4o_mini


In [18]:
from ragas import evaluate

In [19]:
result = evaluate(dataset=dataset,metrics=[critic])

Evaluating: 100%|███████████████████████████████████████| 44/44 [00:04<00:00,  9.11it/s]


In [20]:
df = result.to_pandas()

In [21]:
df["target"] = y_true
df

Unnamed: 0,user_input,response,reference,answer correctness,target
0,What were the key factors and events that led ...,# The Path to Abolishing Slavery: Key Factors ...,The abolition of slavery was a complex process...,0,0
1,What were the key events and impacts of the Ci...,# Key Events and Impacts of the Civil Rights M...,The Civil Rights Movement was a pivotal era in...,1,1
2,How was penicillin discovered and what impact ...,# The Accidental Discovery of Penicillin and I...,Penicillin was discovered by Alexander Fleming...,1,1
3,What were the significant impacts of the Hiros...,# The Profound Impacts of the Hiroshima and Na...,The bombings of Hiroshima and Nagasaki in Augu...,0,0
4,What was the Manhattan Project and what were i...,# The Manhattan Project: A Turning Point in Hi...,The Manhattan Project was a research and devel...,0,0
5,What were the key factors that led to the end ...,# The Demise of Feudalism in Europe: Key Catal...,The end of feudalism in Europe was influenced ...,1,0
6,What were the causes and consequences of the S...,# The Stock Market Crash of 1929: Causes and C...,"The Stock Market Crash of 1929, also known as ...",1,0
7,What were the major causes and impacts of the ...,# The Great Depression: Causes and Impacts\nTh...,"The Great Depression, which began in 1929 and ...",1,1
8,What were the main causes and consequences of ...,# The Arab Spring: Causes and Consequences\nTh...,The Arab Spring was a series of anti-governmen...,0,0
9,What were the main causes and consequences of ...,# The Iranian Revolution of 1980: Causes and C...,The Iranian Revolution of 1979 was primarily c...,0,0


In [22]:
y_pred = df["answer correctness"].values.tolist()

## evaluate the evaluation

In [23]:
from sklearn.metrics import f1_score

In [24]:
f1_score(y_true,y_pred)

0.8095238095238095

## Analysis

In [25]:
def get_confusion_indices(y_true, y_pred):
    """
    Function to find the indices of TP, FP, FN, and TN.

    Parameters:
    y_true (list or np.array): Ground truth binary labels (0 or 1).
    y_pred (list or np.array): Predicted binary labels (0 or 1).

    Returns:
    dict: A dictionary containing lists of indices for TP, FP, FN, and TN.
    """
    tp = [i for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true == 1 and pred == 1]
    fp = [i for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true == 0 and pred == 1]
    fn = [i for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true == 1 and pred == 0]
    tn = [i for i, (true, pred) in enumerate(zip(y_true, y_pred)) if true == 0 and pred == 0]

    return {
        'TP': tp,
        'FP': fp,
        'FN': fn,
        'TN': tn
    }

In [26]:
confusion_matrix = get_confusion_indices(y_true,y_pred)

In [27]:
from typing import Any
def serialize_for_json(data: Any) -> Any:
    """
    Convert custom objects into a JSON-serializable format.

    Parameters:
    data (Any): The data to be converted, which may contain custom objects.

    Returns:
    Any: A JSON-serializable version of the input data.
    """
    if isinstance(data, dict):
        return {key: serialize_for_json(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [serialize_for_json(item) for item in data]
    elif hasattr(data, "__dict__"):
        # Convert custom objects by serializing their attributes (assumes they use __dict__)
        return serialize_for_json(data.__dict__)
    else:
        return data  # Return the data as-is if it is already JSON-serializable


In [28]:
from collections import defaultdict
samples = defaultdict(list)
traces = result.traces
for qdrant,indices in confusion_matrix.items():
    for idx in indices:
        samples[qdrant].append(serialize_for_json(traces[idx]['answer correctness']))



    
    
    

In [29]:
[len(val) for val in samples.values()]

[17, 7, 1, 19]

In [30]:
import json
with open("datasets/dataset_v4_training_aspect_critic_annotated.json","w") as file:
    json.dump(samples,file)