<h1><center>ARES Evaluation Strategies</h1></center>

<h2>IDP + UES</h2>
<p>Uses targeted prompts to enable pre-trained models to assess content relevance and accuracy in a zero-shot manner.</p>

In [None]:
from ares import ARES

ues_idp_config = {
    # Dataset for in-domain prompts
    "in_domain_prompts_dataset": "../datasets/multirc_few_shot_prompt_for_synthetic_query_generation_v1.tsv",
    
    # Dataset for unlabeled evaluation
    "unlabeled_evaluation_set": "../datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv", 
    
    # Default model choice
    "model_choice" : "gpt-3.5-turbo-1106"
}

# Optional: Provide an alternative model of your choice below.
# Here are some models you can choose from:
# - mistralai/Mistral-7B-Instruct-v0.2
# - mistralai/Mixtral-8x7B-Instruct-v0.1
# - gpt-4-turbo-preview
# - microsoft/deberta-v3-large
# - openlm-research/open_llama_7b_v2
# - mosaicml/mpt-7b-instruct

In [2]:
ares_module = ARES(ues_idp=ues_idp_config)
results = ares_module.ues_idp()
print(results)

{'Context Relevance Scores': 0.25, 'Answer Faithfulness Scores': 0.005, 'Answer Relevance Scores': 0.004}


<h2>Training Classifier + UES</h2>

In [None]:
from ares import ARES

ues_idp_config = {
    # Dataset for in-domain prompts
    "in_domain_prompts_dataset": "./datasets/multirc_few_shot_prompt_for_synthetic_query_generation_v1.tsv",
    
    # Dataset for unlabeled evaluation
    "unlabeled_evaluation_set": "./datasets_v2/nq/ratio_0.5_reformatted_full_articles_False_validation_with_negatives.tsv", 
    
    # Default model choice
    "model_choice": "gpt-3.5-turbo-1106"
}

classifier_config = {
    "classification_dataset": "output/synthetic_queries_1.tsv", 
    "test_set_selection": "./datasets_v2/nq/ratio_0.5_reformatted_full_articles_False_validation_with_negatives.tsv", #UES File
    "label_column": "Context_Relevance_Label", 
    "num_epochs": 10, 
    "patience_value": 3, 
    "learning_rate": 5e-6
}

In [3]:
ares_modile = ARES(classifier_model=classifier_config)
results = ares_module.train_classifier()
print(results)

Accuracy for Test Set: 0.8137082601054482


<h2>Training Classifier + PPI + UES</h2>

<h3>UES</h3>

In [None]:
from ares import ARES

ues_idp_config = {
    # Dataset for in-domain prompts
    "in_domain_prompts_dataset": "../datasets/multirc_few_shot_prompt_for_synthetic_query_generation_v1.tsv",
    
    # Dataset for unlabeled evaluation
    "unlabeled_evaluation_set": "../datasets_v2/nq/ratio_0.5_reformatted_full_articles_False_validation_with_negatives.tsv", 
    
    # Default model choice
    "model_choice" : "gpt-3.5-turbo-1106"
}

In [None]:
ares = ARES(ues_idp=ues_idp_config)
results = ares.ues_idp()
print(results)

<h3>Training Classifier</h3>

<p>Generates checkpoint which is used in PPI below</p>

In [None]:
from ares import ARES

classifier_config = {
    "classification_dataset": "output/synthetic_queries_1.tsv", 
    "test_set_selection": "./datasets_v2/nq/ratio_0.5_reformatted_full_articles_False_validation_with_negatives.tsv", #UES File
    "label_column": "Context_Relevance_Label", 
    "model_choice": "microsoft/deberta-v3-large",
    "num_epochs": 10, 
    "patience_value": 3, 
    "learning_rate": 5e-6
}

In [None]:
ares = ARES(classifier_model=classifier_config)
results = ares.train_classifier()
print(results)

<h3>PPI</h3>

In [1]:
from ares import ARES
import warnings

warnings.filterwarnings("ignore")

ppi_config = { 
    "evaluation_datasets": ['../data/datasets_v2/nq/nq_ratio_0.6_.tsv'], 
    "few_shot_examples_filepath": "../data/datasets/multirc_few_shot_prompt_for_synthetic_query_generation_v1.tsv",
    "checkpoints": ["../data/checkpoints/microsoft-deberta-v3-large/output-synthetic_queries_1.tsv/5e-06_1_True_Context_Relevance_Label_ratio_0.6_reformatted_full_articles_False_validation_with_negatives_428380.pt", "../data/checkpoints/microsoft-deberta-v3-large/output-synthetic_queries_1.tsv/5e-06_1_True_Answer_Relevance_Label_ratio_0.6_reformatted_full_articles_False_validation_with_negatives_428380.pt"],
    "labels": ["Context_Relevance_Label", "Answer_Relevance_Label"], 
    "GPT_scoring": False, 
    "gold_label_path": "../data/datasets_v2/nq/nq_ratio_0.6_.tsv", 
    "swap_human_labels_for_gpt4_labels": False
}


In [4]:
ares_module = ARES(ppi=ppi_config)
results = ares_module.evaluate_RAG()
print(results)

--------------------------------------------------------
Evaluation Sets: ['../data/datasets_v2/nq/nq_ratio_0.6_']
Checkpoints: ['../data/checkpoints/microsoft-deberta-v3-large/output-synthetic_queries_1.tsv/5e-06_1_True_Context_Relevance_Label_ratio_0.6_reformatted_full_articles_False_validation_with_negatives_428380.pt', '../data/checkpoints/microsoft-deberta-v3-large/output-synthetic_queries_1.tsv/5e-06_1_True_Answer_Relevance_Label_ratio_0.6_reformatted_full_articles_False_validation_with_negatives_428380.pt']
Labels: ['Context_Relevance_Label', 'Answer_Relevance_Label']
GPT Scoring: False
--------------------------------------------------------


FileNotFoundError: [Errno 2] No such file or directory: '../data/datasets_v2/nq/nq_ratio_0.6_'

<h2>ARES Comparison to RAGAS and Zeroshot Llama</h2>

<h3>ARES Configuration</h3>

<p>Synthetic Generator</p>

In [None]:
from ares import ARES
    
synth_config = { 
    "document_filepath": "datasets_v2/nq/ratio_0.5_reformatted_full_articles_False_validation_with_negatives.tsv",
    "few_shot_prompt_filename": "datasets/multirc_few_shot_prompt_for_synthetic_query_generation_v1.tsv",
    "synthetic_queries_filename": "output/synthetic_queries_1.tsv",
    "model_choice": "google/flan-t5-xxl", # Default Model Choice, provide alternative model as you wish. 
    "documents_sampled": 6381
}

ares = ARES(synthetic_query_generator=synth_config)
results = ares.generate_synthetic_data()
print(results)

<p>Training Classifier</p>

In [None]:
from ares import ARES

classifier_config = {
    "classification_dataset": "output/synthetic_queries_1.tsv", 
    "test_set_selection": "../datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv", 
    "label_column": "Answer_Relevance_Label", 
    "model_choice": "microsoft/deberta-v3-large", # Default Model Choice, provide alternative model as you wish. 
    "num_epochs": 10, 
    "patience_value": 3, 
    "learning_rate": 5e-6
}

ares = ARES(classifier_model=classifier_config)
results = ares.train_classifier()
print(results)

<p>PPI</p>

In [8]:
from ares import ARES

ppi_config = { 
    "evaluation_datasets": ['../data/datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv'], 
    "few_shot_examples_filepath": "../data/datasets/multirc_few_shot_prompt_for_synthetic_query_generation_v1.tsv",
    "checkpoints": ["../data/checkpoints/microsoft-deberta-v3-large/output-synthetic_queries_1.tsv/5e-06_1_True_Context_Relevance_Label_ratio_0.6_reformatted_full_articles_False_validation_with_negatives_428380.pt", "/future/u/manihani/ARES/data/checkpoints/microsoft-deberta-v3-large/output-synthetic_queries_1.tsv/5e-06_1_True_Answer_Relevance_Label_ratio_0.6_reformatted_full_articles_False_validation_with_negatives_428380.pt"],
    "labels": ["Context_Relevance_Label", "Answer_Relevance_Label"], 
    "model_choice": "microsoft/deberta-v3-large", # Default Model Choice, provide alternative model as you wish. 
    "GPT_scoring": False, 
    "gold_label_path": "../data/datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv", 
    "swap_human_labels_for_gpt4_labels": False
}

ares_module = ARES(ppi=ppi_config)
results = ares_module.evaluate_RAG()
print(results)

 
--------------------------------------------------
Context_Relevance_Label Scoring
ARES Ranking
[0]
Avg. PPIs: [0.6157347655943952]
PPI Confidence Intervals: [[0.5422749567760038, 0.6891945744127868]]
Evaluation Set Lengths: [4421]
Evaluation Set Ratio: [0.6]
Test Accuracy Scores: [0.7733544446957702]
Y-Labeled Example Count: 153
--------------------------------------------------


--------------------------------------------------
Answer_Relevance_Label Scoring
ARES Ranking
[0]
Avg. PPIs: [0.6066581096943275]
PPI Confidence Intervals: [[0.5541757588683177, 0.6591404605203375]]
Evaluation Set Lengths: [4421]
Evaluation Set Ratio: [0.6]
Test Accuracy Scores: [0.8787604614340647]
Y-Labeled Example Count: 169
--------------------------------------------------


<h3>RAGAS Configuration</h3>

<p>Data Cleaning | Context Relevance Label Filter</p>

In [10]:
from datasets import load_dataset, Dataset
import pandas as pd
import os

def load_and_prepare_dataset(file_path):
    # Load the dataset from the TSV file
    dataset_df = pd.read_csv(file_path, delimiter='\t')
    
    # Convert 'Context_Relevance_Label' to string if it is not already
    dataset_df['Context_Relevance_Label'] = dataset_df['Context_Relevance_Label'].astype(str)
    
    # Use 'Context_Relevance_Label' as 'ground_truth'
    prepared_data = {
        'question': dataset_df['Query'].tolist(),
        'contexts': [[doc] for doc in dataset_df['Document'].tolist()],  # Contexts are expected to be list of lists
        'answer': dataset_df['Answer'].tolist(),
        'ground_truth': dataset_df['Context_Relevance_Label'].tolist(),  # Using 'Context_Relevance_Label' as 'ground_truth'
    }
    
    # Convert to HuggingFace's Dataset format
    dataset = Dataset.from_dict(prepared_data)
    return dataset


<p>Context Relevance Accuracy</p>

In [12]:
from ragas import evaluate
from ragas.metrics import faithfulness, context_recall, context_precision
import os

file_path = "./datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv"  # Update this path
prepared_dataset = load_and_prepare_dataset(file_path)

# Specify metrics
metrics = [
    # Add or remove metrics based on your evaluation plan
    context_precision,
    context_recall,
]

# Evaluate
result = evaluate(prepared_dataset, metrics=[context_precision, context_recall])  # Update metrics as needed

# Assuming 'result' can be explored or exported as needed
print(result)

Evaluating:   0%|          | 0/12378 [00:00<?, ?it/s]

{'context_precision': 0.4424, 'context_recall': 0.4830}


<p>Data Cleaning | Answer Faithfulness Label Filter</p>

In [16]:
from datasets import load_dataset, Dataset
import pandas as pd
import os

def load_and_prepare_dataset(file_path):
    # Load the dataset from the TSV file
    dataset_df = pd.read_csv(file_path, delimiter='\t')
    
    # Convert 'Context_Relevance_Label' to string if it is not already
    dataset_df['Answer_Faithfulness_Label'] = dataset_df['Answer_Faithfulness_Label'].astype(str)
    
    # Use 'Context_Relevance_Label' as 'ground_truth'
    prepared_data = {
        'question': dataset_df['Query'].tolist(),
        'contexts': [[doc] for doc in dataset_df['Document'].tolist()], 
        'answer': dataset_df['Answer'].tolist(),
        'ground_truth': dataset_df['Answer_Faithfulness_Label'].tolist(), 
    }
    
    # Convert to HuggingFace's Dataset format
    dataset = Dataset.from_dict(prepared_data)
    return dataset


In [None]:
from ragas import evaluate
from ragas.metrics import faithfulness, context_recall, context_precision
import os

file_path = "./datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv" 
prepared_dataset = load_and_prepare_dataset(file_path)

# Specify metrics
metrics = [
    faithfulness
]

# Evaluate
result = evaluate(prepared_dataset, metrics=[faithfulness]) 

print(result)

<h3>Zeroshot Llama Configuration</h3>

In [1]:
from ares import ARES


ues_idp_config = {
    # Dataset for in-domain prompts
    "in_domain_prompts_dataset": "../datasets/multirc_few_shot_prompt_for_synthetic_query_generation_v1.tsv",
    
    # Dataset for unlabeled evaluation
    "unlabeled_evaluation_set": "../datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv", 
    
    # Model: Mistral 7B
    "model_choice" : "codellama/CodeLlama-13b-Instruct-hf"
}



In [2]:
ares = ARES(ues_idp=ues_idp_config)
results = ares.ues_idp()
print(results)

Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
Didn't extract Yes or No!
{'Context Relevance Scores': 0.94, 'Answer Faithfulness Scores': 0.013, 'Answer Relevance Scores': 0.012}


<h3>Zeroshot Mistral Configuration</h3>

In [8]:
from ares import ARES
import os

ues_idp_config = {
    # Dataset for in-domain prompts
    "in_domain_prompts_dataset": "../data/datasets/multirc_few_shot_prompt_for_synthetic_query_generation_v1.tsv",
    
    # Dataset for unlabeled evaluation
    "unlabeled_evaluation_set": "../data/datasets_v2/nq/ratio_0.6_reformatted_full_articles_False_validation_with_negatives.tsv", 
    
    # Model: Mistral 7B
    "model_choice" : "mistralai/Mixtral-8x7B-v0.1"
}

In [9]:
ares = ARES(ues_idp=ues_idp_config)
results = ares.ues_idp()
print(results)

InternalServerError: <!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>


<title>api.together.xyz | 524: A timeout occurred</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/main.css" />


</head>
<body>
<div id="cf-wrapper">
    <div id="cf-error-details" class="p-0">
        <header class="mx-auto pt-10 lg:pt-6 lg:px-8 w-240 lg:w-full mb-8">
            <h1 class="inline-block sm:block sm:mb-2 font-light text-60 lg:text-4xl text-black-dark leading-tight mr-2">
              <span class="inline-block">A timeout occurred</span>
              <span class="code-label">Error code 524</span>
            </h1>
            <div>
               Visit <a href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_524&utm_campaign=api.together.xyz" target="_blank" rel="noopener noreferrer">cloudflare.com</a> for more information.
            </div>
            <div class="mt-3">2024-03-15 07:29:38 UTC</div>
        </header>
        <div class="my-8 bg-gradient-gray">
            <div class="w-240 lg:w-full mx-auto">
                <div class="clearfix md:px-8">
                  
<div id="cf-browser-status" class=" relative w-1/3 md:w-full py-15 md:p-0 md:py-8 md:text-left md:border-solid md:border-0 md:border-b md:border-gray-400 overflow-hidden float-left md:float-none text-center">
  <div class="relative mb-10 md:m-0">
    
    <span class="cf-icon-browser block md:hidden h-20 bg-center bg-no-repeat"></span>
    <span class="cf-icon-ok w-12 h-12 absolute left-1/2 md:left-auto md:right-0 md:top-0 -ml-6 -bottom-4"></span>
    
  </div>
  <span class="md:block w-full truncate">You</span>
  <h3 class="md:inline-block mt-3 md:mt-0 text-2xl text-gray-600 font-light leading-1.3">
    
    Browser
    
  </h3>
  <span class="leading-1.3 text-2xl text-green-success">Working</span>
</div>

<div id="cf-cloudflare-status" class=" relative w-1/3 md:w-full py-15 md:p-0 md:py-8 md:text-left md:border-solid md:border-0 md:border-b md:border-gray-400 overflow-hidden float-left md:float-none text-center">
  <div class="relative mb-10 md:m-0">
    <a href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_524&utm_campaign=api.together.xyz" target="_blank" rel="noopener noreferrer">
    <span class="cf-icon-cloud block md:hidden h-20 bg-center bg-no-repeat"></span>
    <span class="cf-icon-ok w-12 h-12 absolute left-1/2 md:left-auto md:right-0 md:top-0 -ml-6 -bottom-4"></span>
    </a>
  </div>
  <span class="md:block w-full truncate">San Jose</span>
  <h3 class="md:inline-block mt-3 md:mt-0 text-2xl text-gray-600 font-light leading-1.3">
    <a href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_524&utm_campaign=api.together.xyz" target="_blank" rel="noopener noreferrer">
    Cloudflare
    </a>
  </h3>
  <span class="leading-1.3 text-2xl text-green-success">Working</span>
</div>

<div id="cf-host-status" class="cf-error-source relative w-1/3 md:w-full py-15 md:p-0 md:py-8 md:text-left md:border-solid md:border-0 md:border-b md:border-gray-400 overflow-hidden float-left md:float-none text-center">
  <div class="relative mb-10 md:m-0">
    
    <span class="cf-icon-server block md:hidden h-20 bg-center bg-no-repeat"></span>
    <span class="cf-icon-error w-12 h-12 absolute left-1/2 md:left-auto md:right-0 md:top-0 -ml-6 -bottom-4"></span>
    
  </div>
  <span class="md:block w-full truncate">api.together.xyz</span>
  <h3 class="md:inline-block mt-3 md:mt-0 text-2xl text-gray-600 font-light leading-1.3">
    
    Host
    
  </h3>
  <span class="leading-1.3 text-2xl text-red-error">Error</span>
</div>

                </div>
            </div>
        </div>

        <div class="w-240 lg:w-full mx-auto mb-8 lg:px-8">
            <div class="clearfix">
                <div class="w-1/2 md:w-full float-left pr-6 md:pb-10 md:pr-0 leading-relaxed">
                    <h2 class="text-3xl font-normal leading-1.3 mb-4">What happened?</h2>
                    <p>The origin web server timed out responding to this request.</p>
                </div>
                <div class="w-1/2 md:w-full float-left leading-relaxed">
                    <h2 class="text-3xl font-normal leading-1.3 mb-4">What can I do?</h2>
                          <h3 class="text-15 font-semibold mb-2">If you're a visitor of this website:</h3>
      <p class="mb-6">Please try again in a few minutes.</p>

      <h3 class="text-15 font-semibold mb-2">If you're the owner of this website:</h3>
      <p><span>The connection to the origin web server was made, but the origin web server timed out before responding. The likely cause is an overloaded background task, database or application, stressing the resources on your web server. To resolve, please work with your hosting provider or web development team to free up resources for your database or overloaded application.</span> <a rel="noopener noreferrer" href="https://support.cloudflare.com/hc/en-us/articles/200171926-Error-524">Additional troubleshooting information here.</a></p>
                </div>
            </div>
        </div>

        <div class="cf-error-footer cf-wrapper w-240 lg:w-full py-10 sm:py-4 sm:px-8 mx-auto text-center sm:text-left border-solid border-0 border-t border-gray-300">
  <p class="text-13">
    <span class="cf-footer-item sm:block sm:mb-1">Cloudflare Ray ID: <strong class="font-semibold">864ac8b58cd306a9</strong></span>
    <span class="cf-footer-separator sm:hidden">&bull;</span>
    <span id="cf-footer-item-ip" class="cf-footer-item hidden sm:block sm:mb-1">
      Your IP:
      <button type="button" id="cf-footer-ip-reveal" class="cf-footer-ip-reveal-btn">Click to reveal</button>
      <span class="hidden" id="cf-footer-ip">171.66.10.9</span>
      <span class="cf-footer-separator sm:hidden">&bull;</span>
    </span>
    <span class="cf-footer-item sm:block sm:mb-1"><span>Performance &amp; security by</span> <a rel="noopener noreferrer" href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_524&utm_campaign=api.together.xyz" id="brand_link" target="_blank">Cloudflare</a></span>
    
  </p>
  <script>(function(){function d(){var b=a.getElementById("cf-footer-item-ip"),c=a.getElementById("cf-footer-ip-reveal");b&&"classList"in b&&(b.classList.remove("hidden"),c.addEventListener("click",function(){c.classList.add("hidden");a.getElementById("cf-footer-ip").classList.remove("hidden")}))}var a=document;document.addEventListener&&a.addEventListener("DOMContentLoaded",d)})();</script>
</div><!-- /.error-footer -->


    </div>
</div>
</body>
</html>