In [8]:
    import warnings
    warnings.filterwarnings("ignore")

In [9]:
# Load Vector Index & Retriever
from app.embedder import embeddings
from langchain_community.vectorstores import FAISS

# Relative path of the saved FAISS index directory.
faiss_index_dir = "../faiss_index"

# NOTE(review): allow_dangerous_deserialization=True opts in to pickle-based
# loading of the saved index. Unpickling can execute arbitrary code, so only
# load an index that you created yourself or otherwise trust.
vector_store = FAISS.load_local(
    faiss_index_dir,
    embeddings,
    allow_dangerous_deserialization=True,
)

# Expose the store through the retriever interface with default settings.
retriever = vector_store.as_retriever()


In [10]:
# Define Evaluation Questions
# The first group asks about the indexed document; the second group covers
# unrelated topics (in the recorded run both were answered with "I don't know").
document_queries = [
    "What is the document about?",
    "Who coined the term Artificial Intelligence?",
    "What are AI winters?",
]
unrelated_queries = [
    "What is quantum computing?",
    "What causes climate change?",
]

# Order matters downstream: rows are later scored by their position.
test_queries = document_queries + unrelated_queries


In [11]:
# Evaluate Each Query
from app.llm_generator import get_llm
from langchain.chains import RetrievalQA
from app.evaluator import evaluate_response

# Build the question-answering chain from the project LLM and the retriever
# created in the earlier cell.
llm = get_llm()
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# Collect one record per query: the question, the generated answer, and
# whatever evaluate_response returns for that pair.
results = []
for query in test_queries:
    # Chain.run() is deprecated in LangChain; invoke() is the supported entry
    # point. RetrievalQA reads its input from the "query" key and returns the
    # answer text under the "result" key.
    answer = qa.invoke({"query": query})["result"]
    eval_scores = evaluate_response(query, answer)
    results.append({
        "query": query,
        "answer": answer,
        "evaluation": eval_scores
    })


In [12]:
import pandas as pd

# One row per evaluated query, built in a single call from the list of dicts.
df = pd.DataFrame(results)

# Display the result columns in a fixed order.
display_columns = ["query", "answer", "evaluation"]
df[display_columns]


Unnamed: 0,query,answer,evaluation
0,What is the document about?,"The document is about ""The History of Artifici...","{'relevance_score': 'high', 'factuality_score'..."
1,Who coined the term Artificial Intelligence?,"According to the context, John McCarthy coined...","{'relevance_score': 'high', 'factuality_score'..."
2,What are AI winters?,"According to the context, AI winters refer to ...","{'relevance_score': 'high', 'factuality_score'..."
3,What is quantum computing?,I don't know. The provided context only discus...,"{'relevance_score': 'high', 'factuality_score'..."
4,What causes climate change?,I don't know. The text doesn't mention anythin...,"{'relevance_score': 'high', 'factuality_score'..."


In [13]:
def future_score(query_index: int, step: float = 0.1, start: float = 1.0) -> float:
    """Return a position-based placeholder score for a query.

    The score is ``start - query_index * step`` rounded to two decimals. It
    depends only on the position passed in, not on the query text, the
    answer, or any model output.

    Args:
        query_index: Zero-based position of the query.
        step: Amount subtracted per position (default 0.1).
        start: Score assigned to position 0 (default 1.0).

    Returns:
        The rounded score. The value is not clamped, so with the defaults it
        becomes negative for positions greater than 10.
    """
    return round(start - (query_index * step), 2)

# Score every row from its position in the frame (0 for the first row, 1 for
# the second, and so on), then display the frame with the new column.
row_positions = range(len(df))
df["future_score"] = list(map(future_score, row_positions))
df


Unnamed: 0,query,answer,evaluation,future_score
0,What is the document about?,"The document is about ""The History of Artifici...","{'relevance_score': 'high', 'factuality_score'...",1.0
1,Who coined the term Artificial Intelligence?,"According to the context, John McCarthy coined...","{'relevance_score': 'high', 'factuality_score'...",0.9
2,What are AI winters?,"According to the context, AI winters refer to ...","{'relevance_score': 'high', 'factuality_score'...",0.8
3,What is quantum computing?,I don't know. The provided context only discus...,"{'relevance_score': 'high', 'factuality_score'...",0.7
4,What causes climate change?,I don't know. The text doesn't mention anythin...,"{'relevance_score': 'high', 'factuality_score'...",0.6


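✅ Relevance Score is consistently high across all five queries. Note that this includes the two out-of-scope questions (quantum computing and climate change) that were answered with "I don't know", so the score on its own does not show that the retriever is finding relevant content; the three in-scope answers, which cite the context directly, are the better evidence that it is doing a solid job.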

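✅ Factuality Score is marked as moderate by the evaluator stub, but reading the answers themselves suggests good factuality: the in-scope questions are answered from the retrieved context, and the out-of-scope questions are answered with "I don't know" rather than an invented response.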

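✅ Future Score is a custom placeholder column intended as a stand-in for LLM memory freshness or coverage. It is computed as 1.0 − 0.1 × row index, so it decreases with each query's position in the list; it tracks the move away from AI topics only because the queries happen to be ordered from on-topic to off-topic, not because anything about the answers is measured.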