In [74]:
import logging
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, GPT2LMHeadModel, GPT2Tokenizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [75]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("RAG_Evaluation")

In [76]:
retriever_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
generator = pipeline("text-generation", model="gpt2")
nli_model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
nli_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
fluency_model = GPT2LMHeadModel.from_pretrained("gpt2")
fluency_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")



In [77]:
knowledge_base = [
    {"id": 1, "text": "The Eiffel Tower is located in Paris."},
    {"id": 2, "text": "james cameroon directed titanic and avataar"},
    {"id": 3, "text": "Python is a popular programming language for data science."},
]

In [78]:
def retrieve_context(query):
    query_embedding = retriever_model.encode(query)
    context_scores = [
        (doc, cosine_similarity([query_embedding], [retriever_model.encode(doc["text"])])[0][0])
        for doc in knowledge_base
    ]
    sorted_context = sorted(context_scores, key=lambda x: x[1], reverse=True)
    return sorted_context[:1]

In [79]:
def generate_response(query, context):
    prompt = f"{query} Context: {' '.join([c[0]['text'] for c in context])}"
    response = generator(prompt, max_length=20)[0]["generated_text"]
    return response

In [80]:
def evaluate_relevance(query, context):
    return sum(cosine_similarity([retriever_model.encode(query)], [retriever_model.encode(doc[0]["text"])])[0][0] for doc in context) / len(context)


In [81]:
def evaluate_factual_consistency(response, context):
    entailment_score = 0
    for doc, _ in context:
        inputs = nli_tokenizer.encode_plus(f"{doc['text']}", f"{response}", return_tensors="pt")
        outputs = nli_model(**inputs)
        probs = outputs.logits.softmax(dim=1)
        entailment_score += probs[0][2].item()
    return entailment_score / len(context)


In [82]:
def evaluate_fluency(response):
    inputs = fluency_tokenizer(response, return_tensors="pt")
    loss = fluency_model(**inputs, labels=inputs["input_ids"]).loss
    perplexity = np.exp(loss.item())
    return perplexity

In [83]:
def evaluate_coverage(query, response):
    inputs = nli_tokenizer.encode_plus(query, response, return_tensors="pt")
    outputs = nli_model(**inputs)
    probs = outputs.logits.softmax(dim=1)
    coverage_score = probs[0][2].item()
    return coverage_score

In [84]:
def calculate_mrr(retrieval_ranks):
    reciprocal_ranks = [1/rank for rank in retrieval_ranks]
    mrr = sum(reciprocal_ranks) / len(retrieval_ranks)
    return mrr

In [85]:
def calculate_map(retrieval_ranks):
    precision_at_k = [(i+1) / rank for i, rank in enumerate(retrieval_ranks)]
    map_score = sum(precision_at_k) / len(retrieval_ranks)
    return map_score

In [86]:
def evaluate_retrieval_accuracy(query, context):
    query_embedding = retriever_model.encode(query)
    all_doc_scores = [(doc, cosine_similarity([query_embedding], [retriever_model.encode(doc["text"])])[0][0]) for doc in knowledge_base]
    all_doc_scores = sorted(all_doc_scores, key=lambda x: x[1], reverse=True)
    retrieval_ranks = []
    for i, (doc, score) in enumerate(all_doc_scores):
        if any(doc["id"] == c[0]["id"] for c in context):
            retrieval_ranks.append(i + 1)

    mrr = calculate_mrr(retrieval_ranks)
    map_score = calculate_map(retrieval_ranks)

    return {"MRR": mrr, "MAP": map_score}

In [87]:
def rag_evaluation(query):
    context = retrieve_context(query)
    response = generate_response(query, context)

    relevance = evaluate_relevance(query, context)
    factual_consistency = evaluate_factual_consistency(response, context)
    fluency = evaluate_fluency(response)
    coverage = evaluate_coverage(query, response)
    retrieval_accuracy = evaluate_retrieval_accuracy(query, context)
    logger.info(f"Factual Consistency: {factual_consistency}")
    logger.info(f"Fluency (Perplexity): {fluency}")
    logger.info(f"Coverage: {coverage}")
    logger.info(f"Retrieval Accuracy (MRR): {retrieval_accuracy['MRR']}")
    logger.info(f"Retrieval Accuracy (MAP): {retrieval_accuracy['MAP']}")
    return {
        "response": response,
        "relevance": relevance,
        "factual_consistency": factual_consistency,
        "fluency": fluency,
        "coverage": coverage,
        "retrieval_accuracy": retrieval_accuracy
    }

In [88]:
query = "who directed titanic?"
results = rag_evaluation(query)


print("RAG Evaluation Results:", results)
for metric, score in results.items():
    print(f"{metric.capitalize()}: {score}")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


RAG Evaluation Results: {'response': "who directed titanic? Context: james cameroon directed titanic and avataar's studio", 'relevance': 0.6143321990966797, 'factual_consistency': 0.024692179635167122, 'fluency': 881.6091098568681, 'coverage': 0.0027736832853406668, 'retrieval_accuracy': {'MRR': 1.0, 'MAP': 1.0}}
Response: who directed titanic? Context: james cameroon directed titanic and avataar's studio
Relevance: 0.6143321990966797
Factual_consistency: 0.024692179635167122
Fluency: 881.6091098568681
Coverage: 0.0027736832853406668
Retrieval_accuracy: {'MRR': 1.0, 'MAP': 1.0}
