<h1><center>ARES Evaluation Strategies</center></h1>

<h2>IDP + UES</h2>
<p>Uses targeted prompts to enable pre-trained models to assess content relevance and accuracy in a zero-shot manner.</p>

In [None]:
import sys

# Make the ARES package importable without a machine-specific absolute path.
# The dataset paths in this cell are written as "../...", i.e. relative to a
# working directory one level below the repository root, so the parent
# directory is the one that contains the `ares` package.
sys.path.append("..")

from ares.ares import ARES

# Settings handed to ARES(ues_idp=...) by the cell that follows.
ues_idp_config = dict(
    # TSV holding the in-domain prompts
    in_domain_prompts_dataset="../datasets/multirc_few_shot_prompt_for_synthetic_query_generation_v1.tsv",
    # TSV holding the unlabeled evaluation set
    unlabeled_evaluation_set="../datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv",
    # Default model; replace with one of the alternatives listed below if desired
    model_choice="gpt-3.5-turbo-1106",
)

# Alternative values for "model_choice":
#   mistralai/Mistral-7B-Instruct-v0.2
#   mistralai/Mixtral-8x7B-Instruct-v0.1
#   gpt-4-turbo-preview
#   microsoft/deberta-v3-large
#   openlm-research/open_llama_7b_v2
#   mosaicml/mpt-7b-instruct

In [2]:
# Build an ARES instance from the UES/IDP settings and print what ues_idp() returns.
# In the recorded run this was a dict of context-relevance, answer-faithfulness
# and answer-relevance scores.
ares = ARES(ues_idp=ues_idp_config)
results = ares.ues_idp()
print(results)

{'Context Relevance Scores': 0.25, 'Answer Faithfulness Scores': 0.005, 'Answer Relevance Scores': 0.004}


<h2>Training Classifier + UES</h2>

In [None]:
import sys
# Add the parent directory to the import search path so the `ares` import below
# can resolve (presumably the parent is the ARES repository root — confirm for your checkout).
sys.path.append("..") 

from ares.ares import ARES

# UES/IDP settings. The training cell that follows passes only `classifier_config`
# to ARES; this dict is defined here but not consumed by it.
ues_idp_config = dict(
    # TSV holding the in-domain prompts
    in_domain_prompts_dataset="./datasets/multirc_few_shot_prompt_for_synthetic_query_generation_v1.tsv",
    # TSV holding the unlabeled evaluation set
    unlabeled_evaluation_set="./datasets_v2/nq/ratio_0.5_reformatted_full_articles_False_validation_with_negatives.tsv",
    # Default model choice
    model_choice="gpt-3.5-turbo-1106",
)

# Settings handed to ARES(classifier_model=...) for classifier training.
classifier_config = dict(
    classification_dataset="output/synthetic_queries_1.tsv",
    test_set_selection="./datasets_v2/nq/ratio_0.5_reformatted_full_articles_False_validation_with_negatives.tsv",  # UES file
    label_column="Context_Relevance_Label",
    num_epochs=10,
    patience_value=3,
    learning_rate=5e-6,
)

In [3]:
# Build an ARES instance from the classifier settings, run train_classifier(),
# and print whatever it returns.
ares = ARES(classifier_model=classifier_config)
results = ares.train_classifier()
print(results)

# Output recorded from a previous run:
# Accuracy for Test Set: 0.8137082601054482

Accuracy for Test Set: 0.8137082601054482


<h2>Training Classifier + PPI + UES</h2>

<h3>UES</h3>

In [None]:
from ares.ares import ARES

# Settings handed to ARES(ues_idp=...) by the cell that follows.
ues_idp_config = dict(
    # TSV holding the in-domain prompts
    in_domain_prompts_dataset="../datasets/multirc_few_shot_prompt_for_synthetic_query_generation_v1.tsv",
    # TSV holding the unlabeled evaluation set
    unlabeled_evaluation_set="../datasets_v2/nq/ratio_0.5_reformatted_full_articles_False_validation_with_negatives.tsv",
    # Default model choice
    model_choice="gpt-3.5-turbo-1106",
)

In [None]:
# Build an ARES instance from the UES/IDP settings and print what ues_idp() returns.
# No output is recorded for this cell (it was saved unexecuted).
ares = ARES(ues_idp=ues_idp_config)
results = ares.ues_idp()
print(results)

<h3>Training Classifier</h3>

<p>Generates the checkpoint that is used in the PPI step below.</p>

In [None]:
# Import from the `ares` package, the same module path every other cell in this
# notebook uses; `ARES.ares.ares` does not resolve with the sys.path entries
# the notebook sets up.
from ares.ares import ARES

# Settings handed to ARES(classifier_model=...) for classifier training.
classifier_config = dict(
    classification_dataset="output/synthetic_queries_1.tsv",
    test_set_selection="./datasets_v2/nq/ratio_0.5_reformatted_full_articles_False_validation_with_negatives.tsv",  # UES file
    label_column="Context_Relevance_Label",
    model_choice="microsoft/deberta-v3-large",
    num_epochs=10,
    patience_value=3,
    learning_rate=5e-6,
)

In [None]:
# Build an ARES instance from the classifier settings, run train_classifier(),
# and print whatever it returns. No output is recorded for this cell.
ares = ARES(classifier_model=classifier_config)
results = ares.train_classifier()
print(results)

<h3>PPI</h3>

In [6]:
import sys

# Make the ARES package importable without a machine-specific absolute path.
# The dataset and checkpoint paths in this cell are written as "../...", i.e.
# relative to a working directory one level below the repository root, so the
# parent directory is the one that contains the `ares` package.
sys.path.append("..")

from ares.ares import ARES

# Settings handed to ARES(ppi=...) by the evaluation cell that follows.
# The evaluation set and the gold-label file point at the same TSV.
ppi_config = dict(
    evaluation_datasets=['../datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv'],
    few_shot_examples_filepath="../datasets/multirc_few_shot_prompt_for_synthetic_query_generation_v1.tsv",
    checkpoints=["../checkpoints/microsoft-deberta-v3-large/output-synthetic_queries_1.tsv/5e-06_1_True_Context_Relevance_Label_ratio_0.6_reformatted_full_articles_False_validation_with_negatives_428380.pt"],
    labels=["Context_Relevance_Label"],
    GPT_scoring=False,
    gold_label_path="../datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv",
    swap_human_labels_for_gpt4_labels=False,
)

In [10]:
# Build an ARES instance from the PPI settings and run evaluate_RAG().
# In the recorded run, evaluate_RAG() printed its own report and the final
# print() emitted "None", i.e. the call returned None there.
ares = ARES(ppi=ppi_config)
results = ares.evaluate_RAG()
print(results)

--------------------------------------------------------
Evaluation Sets: ['../datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv']
Checkpoints: ['../checkpoints/microsoft-deberta-v3-large/output-synthetic_queries_1.tsv/5e-06_1_True_Context_Relevance_Label_ratio_0.6_reformatted_full_articles_False_validation_with_negatives_428380.pt']
Labels: ['Context_Relevance_Label']
GPT Scoring: False
--------------------------------------------------------
few_shot_examples
4
                                                                           Query  \
0           On what day did the event happen that the most people took pride it?   
1         With how many of the children does Christina commit a sexual act with?   
2                  How can a ball that is not moving possess energy of position?   
3  Before he murder the doctor and Ralph Smith, where did the stepfather reside?   

                                                                          



DebertaV2TokenizerFast(name_or_path='microsoft/deberta-v3-large', vocab_size=128000, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	128000: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
Loading the Best Finetuned-LLM Checkpoint
This is the model, tokenizer,

Map:   0%|          | 0/4421 [00:00<?, ? examples/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


  0%|          | 0/4421 [00:00<?, ?it/s]

Gathering ML predictions for Y_labeled_dataset in PPI!


Map:   0%|          | 0/153 [00:00<?, ? examples/s]

  0%|          | 0/153 [00:00<?, ?it/s]

Y_labeled, Yhat_labeled, Yhat_unlabeled for ../datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv - Context_Relevance_Label
153
153
4421
Y_labeled_dataset Label Distribution: 
95
58
Y_labeled_dataset Prediction Distribution: 
87
66
Yhat_unlabeled_dataset Prediction Distribution: 
2491
1930


  0%|          | 0/20 [00:00<?, ?it/s]

  rechat = (Yhat_labeled - Y_labeled).mean()
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ci_classical[j,i,:] = binomial_iid(n,alpha,y.mean())


--------------------------------------------------
Context_Relevance_Label Scoring
ARES Ranking
[0]
Avg. PPIs: [0.6157347655943952]
PPI Confidence Intervals: [[0.5422749567760038, 0.6891945744127868]]
Evaluation Set Lengths: [4421]
Evaluation Set Ratio: [0.6]
Test Accuracy Scores: [0.7733544446957702]
Y-Labeled Example Count: 153
--------------------------------------------------

None


<h2>ARES Comparison to RAGAS and Zero-shot Mistral</h2>

<h3>ARES Configuration</h3>

<p>Synthetic Generator</p>

In [None]:
# Settings handed to ARES(synthetic_query_generator=...) in this cell.
synth_config = dict(
    document_filepath="datasets_v2/nq/ratio_0.5_reformatted_full_articles_False_validation_with_negatives.tsv",
    few_shot_prompt_filename="datasets/multirc_few_shot_prompt_for_synthetic_query_generation_v1.tsv",
    synthetic_queries_filename="output/synthetic_queries_1.tsv",
    model_choice="google/flan-t5-xxl",  # Default model choice; substitute another model if desired
    documents_sampled=6381,
)

# Build an ARES instance from the synthetic-generator settings, run
# generate_synthetic_data(), and print whatever it returns.
# This cell has no import of its own: `ARES` must already be in the kernel
# namespace from an earlier cell.
ares = ARES(synthetic_query_generator=synth_config)
results = ares.generate_synthetic_data()
print(results)

<p>Training Classifier</p>

In [None]:
# Settings handed to ARES(classifier_model=...) in this cell.
# NOTE(review): `test_set_selection` is an absolute path on the original author's
# machine, while `classification_dataset` is relative — adjust for your checkout.
classifier_config = dict(
    classification_dataset="output/synthetic_queries_1.tsv",
    test_set_selection="/future/u/manihani/ARES/datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv",
    label_column="Answer_Relevance_Label",
    model_choice="microsoft/deberta-v3-large",  # Default model choice; substitute another model if desired
    num_epochs=10,
    patience_value=3,
    learning_rate=5e-6,
)

# Build an ARES instance from the classifier settings, run train_classifier(),
# and print whatever it returns.
# This cell has no import of its own: `ARES` must already be in the kernel
# namespace from an earlier cell.
ares = ARES(classifier_model=classifier_config)
results = ares.train_classifier()
print(results)

<p>PPI</p>

In [None]:
# PPI settings for the ARES side of the comparison.
# NOTE(review): every path below is absolute on the original author's machine —
# adjust for your checkout before running.
ppi_config = {
    "evaluation_datasets": ['/future/u/manihani/ARES/datasets_v2/nq/ratio_0.5_reformatted_full_articles_False_validation_with_negatives.tsv'],
    "few_shot_examples_filepath": "/future/u/manihani/ARES/datasets/multirc_few_shot_prompt_for_synthetic_query_generation_v1.tsv",
    "checkpoints": ["/future/u/manihani/ARES/checkpoints/microsoft-deberta-v3-large/output-synthetic_queries_1.tsv/5e-06_1_True_Context_Relevance_Label_ratio_0.6_reformatted_full_articles_False_validation_with_negatives_428380.pt"],
    "labels": ["Context_Relevance_Label"],
    # Default model choice; substitute another model if desired.
    # (The comma after this entry was missing, which made the whole cell a SyntaxError.)
    "model_choice": "microsoft/deberta-v3-large",
    "GPT_scoring": False,
    "gold_label_path": "/future/u/manihani/ARES/datasets_v2/nq/ratio_0.5_reformatted_full_articles_False_validation_with_negatives.tsv",
    "swap_human_labels_for_gpt4_labels": False
}

<h3>RAGAS Configuration</h3>

<p>Data Cleaning | Context Relevance Label Filter</p>

In [10]:
from datasets import load_dataset, Dataset
import pandas as pd
import os

def load_and_prepare_dataset(file_path, label_column='Context_Relevance_Label'):
    """Load a TSV evaluation file and reshape it into the columns passed to RAGAS.

    Args:
        file_path: Path to a tab-separated file containing 'Query', 'Document'
            and 'Answer' columns plus the column named by ``label_column``.
        label_column: Name of the column exposed as 'ground_truth' (values are
            cast to ``str``). Defaults to 'Context_Relevance_Label', which is
            what this function used before the parameter existed.

    Returns:
        A HuggingFace ``Dataset`` with the columns 'question', 'contexts'
        (one single-element list per row), 'answer' and 'ground_truth'.

    Raises:
        KeyError: If any of the required columns is missing from the file.
    """
    # Load the dataset from the TSV file
    dataset_df = pd.read_csv(file_path, sep='\t')

    prepared_data = {
        'question': dataset_df['Query'].tolist(),
        # Each row's single document is wrapped in its own list (list of lists)
        'contexts': [[doc] for doc in dataset_df['Document'].tolist()],
        'answer': dataset_df['Answer'].tolist(),
        # Labels are converted to strings before being used as 'ground_truth'
        'ground_truth': dataset_df[label_column].astype(str).tolist(),
    }

    # Convert to HuggingFace's Dataset format
    return Dataset.from_dict(prepared_data)


<p>Context Relevance Accuracy</p>

In [12]:
from ragas import evaluate
from ragas.metrics import faithfulness, context_recall, context_precision
import os

file_path = "./datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv"  # Update this path
prepared_dataset = load_and_prepare_dataset(file_path)

# Metrics to compute: add or remove entries here based on your evaluation plan.
metrics = [
    context_precision,
    context_recall,
]

# Evaluate. The list above is passed through directly so that editing `metrics`
# actually changes what is computed (previously a duplicate literal list was used).
result = evaluate(prepared_dataset, metrics=metrics)

# Print the aggregate scores returned by RAGAS
print(result)

Evaluating:   0%|          | 0/12378 [00:00<?, ?it/s]

{'context_precision': 0.4424, 'context_recall': 0.4830}


<p>Data Cleaning | Answer Faithfulness Label Filter</p>

In [16]:
from datasets import load_dataset, Dataset
import pandas as pd
import os

def load_and_prepare_dataset(file_path, label_column='Answer_Faithfulness_Label'):
    """Load a TSV evaluation file and reshape it into the columns passed to RAGAS.

    This redefinition replaces any earlier function of the same name in the
    kernel; its default label column is 'Answer_Faithfulness_Label'.

    Args:
        file_path: Path to a tab-separated file containing 'Query', 'Document'
            and 'Answer' columns plus the column named by ``label_column``.
        label_column: Name of the column exposed as 'ground_truth' (values are
            cast to ``str``). Defaults to 'Answer_Faithfulness_Label', which is
            what this function used before the parameter existed.

    Returns:
        A HuggingFace ``Dataset`` with the columns 'question', 'contexts'
        (one single-element list per row), 'answer' and 'ground_truth'.

    Raises:
        KeyError: If any of the required columns is missing from the file.
    """
    # Load the dataset from the TSV file
    dataset_df = pd.read_csv(file_path, sep='\t')

    prepared_data = {
        'question': dataset_df['Query'].tolist(),
        # Each row's single document is wrapped in its own list (list of lists)
        'contexts': [[doc] for doc in dataset_df['Document'].tolist()],
        'answer': dataset_df['Answer'].tolist(),
        # Labels are converted to strings before being used as 'ground_truth'
        'ground_truth': dataset_df[label_column].astype(str).tolist(),
    }

    # Convert to HuggingFace's Dataset format
    return Dataset.from_dict(prepared_data)


In [17]:
from ragas import evaluate
from ragas.metrics import faithfulness, context_recall, context_precision
import os

file_path = "./datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv"
prepared_dataset = load_and_prepare_dataset(file_path)

# Metrics to compute for this run
metrics = [
    faithfulness
]

# Evaluate. The list above is passed through directly so that editing `metrics`
# actually changes what is computed (previously a duplicate literal list was used).
result = evaluate(prepared_dataset, metrics=metrics)

print(result)

Evaluating:   0%|          | 0/6189 [00:00<?, ?it/s]

{'faithfulness': 0.4478}


<h3>Zero-shot Mistral Configuration</h3>

In [1]:
import sys
import os

# Make the ARES package importable without a machine-specific absolute path.
# The dataset paths in this cell are written as "../...", i.e. relative to a
# working directory one level below the repository root, so the parent
# directory is the one that contains the `ares` package.
sys.path.append("..")

from ares.ares import ARES

# Settings handed to ARES(ues_idp=...) by the cell that follows.
ues_idp_config = dict(
    # TSV holding the in-domain prompts
    in_domain_prompts_dataset="../datasets/multirc_few_shot_prompt_for_synthetic_query_generation_v1.tsv",
    # TSV holding the unlabeled evaluation set
    unlabeled_evaluation_set="../datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv",
    # NOTE(review): this section is presented as a Mistral 7B configuration, but the
    # model configured here is CodeLlama-13B-Instruct — confirm which one was intended.
    model_choice="codellama/CodeLlama-13b-Instruct-hf",
)



In [2]:
# Build an ARES instance from the UES/IDP settings and print what ues_idp() returns.
# In the recorded run this emitted several "Didn't extract Yes or No!" messages
# before printing the score dict.
ares = ARES(ues_idp=ues_idp_config)
results = ares.ues_idp()
print(results)

Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
{'Context Relevance Scores': 0.94, 'Answer Faithfulness Scores': 0.013, 'Answer Relevance Scores': 0.012}
