In [1]:
!pip3 install mlflow



In [2]:
import pandas as pd
import mlflow
from mlflow.metrics.genai import faithfulness, relevance
import openai
import os

In [3]:
# Do not hardcode the OpenAI key in the notebook, and do not clobber a key that
# is already exported in the environment. Only when no key is set, prompt for
# one interactively so it never appears in the saved notebook source or output.
if not os.environ.get("OPENAI_API_KEY"):
    import getpass  # local import: only needed when the key must be typed in

    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [4]:
# Read the RAG evaluation set from disk, decoding the file as Latin-1
df = pd.read_csv('RAG_evaluation.csv', encoding='latin1')
print(df.head())

# Pull each column used for evaluation out as a plain Python list
inputs = df['input'].tolist()
actual_outputs = df['actual_output'].tolist()
expected_outputs = df['expected_output'].tolist()

# Wrap every string context in a single-element list; any non-string value
# (presumably a missing entry) is replaced by None
retrieval_context = [
    [ctx] if isinstance(ctx, str) else None
    for ctx in df['retrieval_context']
]

                                               input  \
0       What is the National Hydrogen Strategy 2050?   
1                    What is the Environmental Code?   
2  What are the targets of the Energy Policy Fram...   
3                       What is the Electricity Act?   
4  What is the Vision 2030 strategy for Saudi Ara...   

                                     expected_output  \
0  The National Hydrogen Strategy 2050 aims to su...   
1  The Environmental Code regulates activities im...   
2  The Energy Policy Framework aims to reduce ene...   
3  The Electricity Act sets regulations on energy...   
4  Saudi Arabia aims to increase the share of nat...   

                                   retrieval_context  \
0  ten points that the government will place effo...   
1  a number of policies affecting road traffic ha...   
2  the UK's 2nd Energy Efficiency Action Plan (EE...   
3  a number of policies affecting road traffic ha...   
4  ten points that the government will place e

In [7]:
# Assemble the evaluation frame; the retrieval context is stored under "context"
eval_columns = {
    "inputs": inputs,
    "outputs": actual_outputs,
    "ground_truth": expected_outputs,
    "context": retrieval_context,
}
eval_df = pd.DataFrame(eval_columns)

# Wrap the frame as an MLflow dataset, marking the prediction and target columns
eval_dataset = mlflow.data.from_pandas(
    eval_df,
    predictions="outputs",
    targets="ground_truth",
)

# Build the relevance and faithfulness metrics, both judged by the same model
judge_model = "openai:/gpt-4o-mini"
relevance_metric = relevance(model=judge_model)
faithfulness_metric = faithfulness(model=judge_model)

# Show each metric's definition
for metric in (relevance_metric, faithfulness_metric):
    print(metric)

EvaluationMetric(name=relevance, greater_is_better=True, long_name=relevance, version=v1, metric_details=
Task:
You must return the following fields in your response in two lines, one below the other:
score: Your numerical score for the model's relevance based on the rubric
justification: Your reasoning about the model's relevance score

You are an impartial judge. You will be given an input that was sent to a machine
learning model, and you will be given an output that the model produced. You
may also be given additional information that was used by the model to generate the output.

Your task is to determine a numerical score called relevance based on the input and output.
A definition of relevance and a grading rubric are provided below.
You must use the grading rubric to determine your score. You must also justify your score.

Examples could be included below for reference. Make sure to use them as references and to
understand them before completing the task.

Input:
{input}

Outpu

In [9]:
with mlflow.start_run(run_name="RAG_Evaluation"):
    # Run evaluation using the default evaluator plus the two LLM-judged metrics
    results = mlflow.evaluate(
        data=eval_dataset,
        model_type="question-answering",
        evaluators=["default"],  # Use default evaluator
        extra_metrics=[relevance_metric, faithfulness_metric],
    )

    # Aggregate scores are reported under "<name>/<version>/<aggregation>" keys
    # (the per-row columns in the results table follow the same
    # "<name>/<version>/score" scheme). The previous keys "relevance_metric" and
    # "faithfulness" never exist, so these lookups always returned None.
    relevance_score = results.metrics.get("relevance/v1/mean", None)
    faithfulness_score = results.metrics.get("faithfulness/v1/mean", None)

    # Persist the per-row evaluation table (when present) to a CSV file
    results_table = results.tables.get("eval_results_table")
    if results_table is not None:
        results_file_path = "rag_evaluation_results.csv"
        results_table.to_csv(results_file_path, index=False)
        print(f"Evaluation results saved to {results_file_path}")
    else:
        print("No results table available.")

2024/12/29 20:53:10 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.70s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.55s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 82/82 [00:12<00:00,  6.81it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 82/82 [00:21<00:00,  3.90it/s]
Downloading artifacts: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 897.37it/s]
Downloading artifacts: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 834.36it/s]

Evaluation results saved to rag_evaluation_results.csv





In [10]:
# Read the saved evaluation table back from disk
results_file_path = "rag_evaluation_results.csv"
df = pd.read_csv(results_file_path)

# List the columns available in the results table
column_names = list(df.columns)
print("Column Names:", column_names)

Column Names: ['inputs', 'context', 'ground_truth', 'outputs', 'token_count', 'flesch_kincaid_grade_level/v1/score', 'ari_grade_level/v1/score', 'relevance/v1/score', 'relevance/v1/justification', 'faithfulness/v1/score', 'faithfulness/v1/justification']


In [12]:
# Report the mean of the per-row relevance scores stored in the results CSV
relevance_col = "relevance/v1/score"
if relevance_col not in df.columns:
    print("Relevance metric column not found in the CSV file.")
else:
    average_relevance_score = df[relevance_col].mean()
    print(f"Average Relevance Score from CSV: {average_relevance_score}")

# Report the mean of the per-row faithfulness scores in the same way
faithfulness_col = "faithfulness/v1/score"
if faithfulness_col not in df.columns:
    print("Faithfulness column not found in the CSV file.")
else:
    average_faithfulness_score = df[faithfulness_col].mean()
    print(f"Average Faithfulness Score from CSV: {average_faithfulness_score}")

Average Relevance Score from CSV: 4.170731707317073
Average Faithfulness Score from CSV: 3.6951219512195124
