# ラガスを使用したRAGパイプラインの評価

In [1]:
from dotenv import load_dotenv

load_dotenv()

True

### データ

In [43]:
from datasets import load_dataset

fiqa_eval = load_dataset("explodinggradients/fiqa", "ragas_eval")['baseline']

# 'ground_truths' を 'ground_truth' に変更
def rename_ground_truths(example):
    example['ground_truth'] = example.pop('ground_truths')
    return example

fiqa_eval = fiqa_eval.map(rename_ground_truths)

fiqa_eval

Map: 100%|██████████| 30/30 [00:00<00:00, 2174.53 examples/s]


Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 30
})

### 指標

In [44]:
# import metrics
from ragas.metrics import faithfulness, answer_relevancy, context_precision
from ragas.metrics.critique import harmfulness
 
# metrics you chose
metrics = [faithfulness, answer_relevancy, context_precision, harmfulness]    

In [45]:
from ragas.run_config import RunConfig
from ragas.metrics.base import MetricWithLLM, MetricWithEmbeddings
 
 
# util function to init Ragas Metrics
def init_ragas_metrics(metrics, llm, embedding):
    for metric in metrics:
        if isinstance(metric, MetricWithLLM):
            metric.llm = llm
        if isinstance(metric, MetricWithEmbeddings):
            metric.embeddings = embedding
        run_config = RunConfig()
        metric.init(run_config)

In [46]:
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
 
# wrappers
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
 
llm = ChatOpenAI(model="gpt-4o-mini")
emb = OpenAIEmbeddings(model="text-embedding-3-small")
 
init_ragas_metrics(
    metrics,
    llm=LangchainLLMWrapper(llm),
    embedding=LangchainEmbeddingsWrapper(emb),
)

セットアップ

トレースでスコアを付ける

In [47]:
row = fiqa_eval[0]
row['question'], row['answer']

('How to deposit a cheque issued to an associate in my business into my business account?',
 '\nThe best way to deposit a cheque issued to an associate in your business into your business account is to open a business account with the bank. You will need a state-issued "dba" certificate from the county clerk\'s office as well as an Employer ID Number (EIN) issued by the IRS. Once you have opened the business account, you can have the associate sign the back of the cheque and deposit it into the business account.')

In [52]:
from langfuse import Langfuse
 
langfuse = Langfuse()

In [53]:
langfuse.auth_check()

True

In [54]:
async def score_with_ragas(query, chunks, answer, ground_truth):
    scores = {}
    for m in metrics:
        print(f"calculating {m.name}")
        scores[m.name] = await m.ascore(
            row={"question": query, "contexts": chunks, "answer": answer, "ground_truth": ground_truth}  # 修正済み
        )
    return scores

In [55]:
# start a new trace when you get a question
question = row['question']
trace = langfuse.trace(name = "rag")
 
# retrieve the relevant chunks
# chunks = get_similar_chunks(question)
contexts = row['contexts']
# pass it as span
trace.span(
    name = "retrieval", input={'question': question}, output={'contexts': contexts}
)
 
# use llm to generate a answer with the chunks
# answer = get_response_from_llm(question, chunks)
answer = row['answer']
trace.span(
    name = "generation", input={'question': question, 'contexts': contexts}, output={'answer': answer}
)

ground_truth = row['ground_truth']
trace.span(
    name = "generation", input={'question': question, 'contexts': contexts}, output={'ground_truth': ground_truth}  # 修正済み
)


# compute scores for the question, context, answer tuple
ragas_scores = await score_with_ragas(question, contexts, answer, ground_truth)
ragas_scores

calculating faithfulness
calculating answer_relevancy
calculating context_precision
calculating harmfulness


{'faithfulness': 0.6,
 'answer_relevancy': 0.9413758834646305,
 'context_precision': 0.9999999999,
 'harmfulness': 0}

In [56]:
# send the scores
for m in metrics:
    trace.score(name=m.name, value=ragas_scores[m.name])

バッチとしてスコアリング

In [57]:
# fiqa traces
for interaction in fiqa_eval.select(range(10, 20)):
    trace = langfuse.trace(name = "rag")
    trace.span(
        name = "retrieval",
        input={'question': question},
        output={'contexts': contexts}
    )
    trace.span(
        name = "generation",
        input={'question': question, 'contexts': contexts},
        output={'answer': answer}
    )
 
# await that Langfuse SDK has processed all events before trying to retrieve it in the next step
langfuse.flush()

In [58]:
def get_traces(name=None, limit=None, user_id=None):
    all_data = []
    page = 1
 
    while True:
        response = langfuse.client.trace.list(
            name=name, page=page, user_id=user_id
        )
        if not response.data:
            break
        page += 1
        all_data.extend(response.data)
        if len(all_data) > limit:
            break
 
    return all_data[:limit]

In [59]:
from random import sample
 
NUM_TRACES_TO_SAMPLE = 3
traces = get_traces(name='rag', limit=5)
traces_sample = sample(traces, NUM_TRACES_TO_SAMPLE)
 
len(traces_sample)

3

In [60]:
# score on a sample
from random import sample
 
evaluation_batch = {
    "question": [],
    "contexts": [],
    "answer": [],
    "trace_id": [],
}
 
for t in traces_sample:
    observations = [langfuse.client.observations.get(o) for o in t.observations]
    for o in observations:
        if o.name == 'retrieval':
            question = o.input['question']
            contexts = o.output['contexts']
        if o.name=='generation':
            answer = o.output['answer']
    evaluation_batch['question'].append(question)
    evaluation_batch['contexts'].append(contexts)
    evaluation_batch['answer'].append(answer)
    evaluation_batch['trace_id'].append(t.id)

In [61]:
# run ragas evaluate
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy
 
ds = Dataset.from_dict(evaluation_batch)
r = evaluate(ds, metrics=[faithfulness, answer_relevancy])

Evaluating: 100%|██████████| 6/6 [00:09<00:00,  1.50s/it]


In [62]:
df = r.to_pandas()
 
# add the langfuse trace_id to the result dataframe
df["trace_id"] = ds["trace_id"]
 
df.head()

Unnamed: 0,question,contexts,answer,trace_id,faithfulness,answer_relevancy
0,How to deposit a cheque issued to an associate...,[Just have the associate sign the back and the...,\nThe best way to deposit a cheque issued to a...,29d4ab60-82cb-435a-9ee5-dd992e13fa5a,0.6,0.941574
1,How to deposit a cheque issued to an associate...,[Just have the associate sign the back and the...,\nThe best way to deposit a cheque issued to a...,933e5b67-2a06-48e9-812d-3604867bf4c4,0.6,0.941582
2,How to deposit a cheque issued to an associate...,[Just have the associate sign the back and the...,\nThe best way to deposit a cheque issued to a...,6f909d17-b73a-43ff-97fe-f4dd90b01de7,0.6,0.941582


In [63]:
for _, row in df.iterrows():
    for metric_name in ["faithfulness", "answer_relevancy"]:
        langfuse.score(
            name=metric_name,
            value=row[metric_name],
            trace_id=row["trace_id"]
        )