In [None]:

# Install the third-party packages this notebook imports into the active kernel's environment.
# NOTE(review): no versions are pinned, so a fresh install may pull releases whose APIs differ
# from the ones this notebook was written against — consider pinning once versions are confirmed.
%pip install ragas datasets python-dotenv ipywidgets


from dotenv import load_dotenv
import os
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, context_precision, context_recall, answer_relevancy
import ipywidgets as widgets
from IPython.display import display, clear_output

# Load settings through python-dotenv, then stop right away when the
# OpenAI key is still unset or empty in the process environment.
load_dotenv()
if not os.environ.get("OPENAI_API_KEY"):
    raise ValueError(
        "OPENAI_API_KEY not found in .env file. Please create one with OPENAI_API_KEY=sk-..."
    )


def _build_textarea(label, hint):
    """Return an empty, full-width, 50px-high Textarea with the given label and placeholder.

    A new Layout is created on every call so the returned widgets never share one.
    """
    return widgets.Textarea(
        value="",
        placeholder=hint,
        description=label,
        layout=widgets.Layout(width="100%", height="50px"),
    )


# Free-text inputs for the question and for the LLM answer to be scored.
question_widget = _build_textarea("Question:", "Type the question here...")
response_widget = _build_textarea("Response:", "Type the LLM response here...")

# Trigger button, plus the output area the click handler prints into.
run_button = widgets.Button(button_style="success", description="Run Evaluation")
output = widgets.Output()

def on_button_clicked(b):
    """Click handler: score the typed question/response pair with ragas.

    Reads and strips both text areas, prints a summary of the inputs, builds a
    one-row ``Dataset`` and passes it to ``evaluate`` with the faithfulness,
    context precision, context recall and answer relevancy metrics. Everything
    printed goes to the ``output`` widget, which is cleared first.

    The clicked button is disabled for the duration of the handler and
    re-enabled afterwards (including on early return or error), so that extra
    clicks during a slow evaluation are not turned into additional runs.

    Args:
        b: The button instance passed by ``on_click``. May be ``None`` when the
            handler is invoked manually, in which case no button is toggled.
    """
    if b is not None:
        b.disabled = True
    try:
        with output:
            clear_output()

            question = question_widget.value.strip()
            response = response_widget.value.strip()

            if not question or not response:
                print("❌ Please provide both question and LLM response.")
                return

            # No separate retrieved context or reference answer is collected,
            # so the response stands in for both. Metrics that compare the
            # answer against the context/reference therefore compare the text
            # with itself.
            context = response
            ground_truth = response

            print("[📝 INPUT SUMMARY]")
            print(f"Question:     {question}")
            print(f"Context:      {context}")
            print(f"Response:     {response}")
            print(f"Ground Truth: {ground_truth}")

            # One-row dataset; every column holds a single-element list.
            ragas_data = Dataset.from_dict({
                "question": [question],
                "contexts": [[context]],
                "answer": [response],
                "ground_truths": [[ground_truth]],
                "reference": [ground_truth]
            })

            # The column_map below maps every column name to itself.
            results = evaluate(
                ragas_data,
                metrics=[faithfulness, context_precision, context_recall, answer_relevancy],
                column_map={
                    "question": "question",
                    "contexts": "contexts",
                    "answer": "answer",
                    "ground_truths": "ground_truths",
                    "reference": "reference"
                }
            )

            print("\n=== AUTO-EVALUATION RESULTS ===")
            print(results.scores)
    finally:
        # Always hand control back to the user, whatever happened above.
        if b is not None:
            b.disabled = False
# Register the click handler, then render the two text areas, the button and the output area in that order.
run_button.on_click(on_button_clicked)
display(question_widget, response_widget, run_button, output)


Note: you may need to restart the kernel to use updated packages.


Textarea(value='', description='Question:', layout=Layout(height='50px', width='100%'), placeholder='Type the …

Textarea(value='', description='Response:', layout=Layout(height='50px', width='100%'), placeholder='Type the …

Button(button_style='success', description='Run Evaluation', style=ButtonStyle())

Output()

In [None]:

from dotenv import load_dotenv
import os
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, context_precision, context_recall, answer_relevancy

# Load settings through python-dotenv and fail fast when the OpenAI key is unset or empty.
load_dotenv()
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY not found in .env file.")


# Collect the pair to score from interactive prompts.
question = input("Type your question: ").strip()
response = input("Type the LLM response: ").strip()

# Reject blank input before starting an evaluation on it; this mirrors the
# emptiness check the widget-based cell performs on the same two fields.
if not question or not response:
    raise ValueError("Please provide both question and LLM response.")

# No separate retrieved context or reference answer is collected, so the
# response stands in for both. Metrics that compare the answer against the
# context/reference therefore compare the text with itself.
context = response
ground_truth = response

# One-row dataset; every column holds a single-element list.
ragas_data = Dataset.from_dict({
    "question": [question],
    "contexts": [[context]],
    "answer": [response],
    "ground_truths": [[ground_truth]],
    "reference": [ground_truth]
})

# The column_map below maps every column name to itself.
results = evaluate(
    ragas_data,
    metrics=[faithfulness, context_precision, context_recall, answer_relevancy],
    column_map={
        "question": "question",
        "contexts": "contexts",
        "answer": "answer",
        "ground_truths": "ground_truths",
        "reference": "reference"
    }
)

print("\n=== AUTO-EVALUATION RESULTS ===")
print(results.scores)


Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]


=== AUTO-EVALUATION RESULTS ===
[{'faithfulness': 1.0, 'context_precision': 0.9999999999, 'context_recall': 1.0, 'answer_relevancy': np.float64(0.0)}]
