In [2]:
!pip3 install mlflow

Collecting mlflow
  Downloading mlflow-2.19.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.19.0 (from mlflow)
  Downloading mlflow_skinny-2.19.0-py3-none-any.whl.metadata (31 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting markdown<4,>=3.3 (from mlflow)
  Downloading Markdown-3.7-py3-none-any.whl.metadata (7.0 kB)
Collecting matplotlib<4 (from mlflow)
  Downloading matplotlib-3.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting pyarrow<19,>=4.0.0 (from mlflow

In [3]:
import os

import mlflow
import pandas as pd
from mlflow.metrics.genai import faithfulness, relevance

In [None]:
# Read the OpenAI key from the OPENAI_API_KEY environment variable instead of
# pasting it into the notebook, where it could leak when the file is shared.
# The OpenAI-backed judge metrics are expected to pick the key up from that
# same environment variable, so nothing else needs to be passed around.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
if not openai_api_key:
    # Surface the problem early rather than deep inside the evaluation run.
    print(
        "WARNING: OPENAI_API_KEY is not set; "
        "the OpenAI-judged metrics are expected to fail until it is exported."
    )

In [4]:
# Read the evaluation set from disk (the file is decoded as latin1).
df = pd.read_csv('deepeval_evaluation.csv', encoding='latin1')

# Pull the three plain columns out as Python lists in one pass.
inputs, actual_outputs, expected_outputs = (
    df[column].tolist()
    for column in ('input', 'actual_output', 'expected_output')
)

# Each retrieval_context cell becomes a list: string cells are split on ';',
# any non-string cell is wrapped unchanged as a single-element list.
retrieval_context = [
    cell.split(';') if isinstance(cell, str) else [cell]
    for cell in df['retrieval_context'].tolist()
]

In [None]:
# Assemble the evaluation table column by column; the retrieval context is
# exposed under the column name "context".
eval_columns = {
    "inputs": inputs,
    "outputs": actual_outputs,
    "ground_truth": expected_outputs,
    "context": retrieval_context,
}
eval_df = pd.DataFrame(eval_columns)

# Wrap the frame as an MLflow dataset, declaring which columns hold the
# predictions and which hold the targets.
eval_dataset = mlflow.data.from_pandas(
    eval_df,
    predictions="outputs",
    targets="ground_truth",
)

# Both judge metrics are built against the same model URI.
judge_model = "openai:/gpt-4"
relevance_metric = relevance(model=judge_model)
faithfulness_metric = faithfulness(model=judge_model)

# Show the two metric definitions, one per line.
print(relevance_metric, faithfulness_metric, sep="\n")

In [None]:
def _mean_metric(metrics, metric_name):
    """Return the aggregated mean score for ``metric_name`` from a metrics dict.

    MLflow reports aggregated GenAI metrics under keys of the form
    ``"<name>/<version>/<aggregation>"`` (for example ``"faithfulness/v1/mean"``),
    so a lookup by the bare metric name finds nothing. This helper searches for
    the ``.../mean`` entry of the given metric regardless of its version.

    Args:
        metrics: Mapping of metric key to value (``results.metrics``).
        metric_name: Base name of the metric, e.g. ``"relevance"``.

    Returns:
        The mean score if present; otherwise whatever is stored under the bare
        ``metric_name`` key, or ``None`` when neither exists.
    """
    prefix = f"{metric_name}/"
    for key, value in metrics.items():
        if key.startswith(prefix) and key.endswith("/mean"):
            return value
    # Fall back to the bare name so a differently keyed result is still found.
    return metrics.get(metric_name)


with mlflow.start_run(run_name="RAG_Evaluation"):
    # Run evaluation using the default evaluator and the two judge metrics
    results = mlflow.evaluate(
        data=eval_dataset,
        model_type="question-answering",
        evaluators=["default"],  # Use default evaluator
        extra_metrics=[relevance_metric, faithfulness_metric],
    )

    # Extract the aggregated (mean) relevance and faithfulness scores
    relevance_score = _mean_metric(results.metrics, "relevance")
    faithfulness_score = _mean_metric(results.metrics, "faithfulness")

    # Print the relevance and faithfulness scores
    print(f"Relevance Score: {relevance_score}")
    print(f"Faithfulness Score: {faithfulness_score}")

    # Save the per-row evaluation table, when one was produced, as a CSV file
    results_table = results.tables.get("eval_results_table")
    if results_table is not None:
        results_file_path = "rag_evaluation_results.csv"
        results_table.to_csv(results_file_path, index=False)
        print(f"Evaluation results saved to {results_file_path}")
    else:
        print("No results table available.")
