In [1]:
import json
import os
import sys
from getpass import getpass
from typing import List, Dict

import numpy as np
import pandas as pd
from tqdm import tqdm

In [14]:
# Directory that holds the project's `llm` and `embeddings` packages.
# Set the SHL_SRC_DIR environment variable to point at a different checkout;
# the fallback is the path this notebook was originally written against.
SRC_DIR = os.environ.get(
    "SHL_SRC_DIR",
    r"C:\Users\devanshi\SHL-Assessment-Recommendation-System_Devanshi-Singh\src",
)
# Only append once so re-running this cell does not keep growing sys.path.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# NOTE(review): the saved output of this cell is
# "ImportError: attempted relative import beyond top-level package".
# Presumably one of the modules below uses a relative import that reaches
# above SRC_DIR -- confirm in the project sources before relying on this cell.
from llm.llm_recommender import LLMRecommender
from embeddings.faiss_wrapper import FaissIndex

ImportError: attempted relative import beyond top-level package

In [9]:
# Load the labelled evaluation set from the working directory.
test_df = pd.read_csv("test.csv")

# The grouping step reads the 'Query' and 'Assessments' columns; fail here with
# a readable message instead of a bare KeyError in the middle of that loop.
required_columns = {"Query", "Assessments"}
missing_columns = required_columns - set(test_df.columns)
if missing_columns:
    raise ValueError(
        f"test.csv is missing required column(s): {sorted(missing_columns)}"
    )

print(f"Loaded {len(test_df)} test samples")

Loaded 45 test samples


In [10]:
# Collect, for every distinct query, the list of assessments labelled for it.
# Queries keep the order in which they first appear in test_df.
ground_truth = {}
for query, assessment in zip(test_df['Query'], test_df['Assessments']):
    ground_truth.setdefault(query, []).append(assessment)

print(f"Found {len(ground_truth)} unique queries")

Found 7 unique queries


In [11]:
# Define evaluation metrics
def calculate_metrics(relevant: List[str], recommended: List[str], k_values: List[int]):
    """Compute Recall@K and Average Precision@K for a single query.

    Args:
        relevant: Names of the ground-truth items for the query. Repeated
            names are treated as a single relevant item.
        recommended: Ranked recommendation names, best first. A name that
            appears more than once is credited only at its first position.
        k_values: Cut-off ranks at which to evaluate.

    Returns:
        A dict ``{"recall": {k: value}, "ap": {k: value}}`` with one entry
        per ``k`` in ``k_values``. Both metrics are 0.0 when ``relevant`` is
        empty, and both are bounded by 1.0.
    """
    # Deduplicate once: gives O(1) membership tests and stops repeated
    # ground-truth rows from inflating the recall denominator.
    relevant_set = set(relevant)
    results = {"recall": {}, "ap": {}}

    for k in k_values:
        recommended_k = recommended[:k]

        # Recall@K: share of distinct relevant items present in the top K.
        found = relevant_set.intersection(recommended_k)
        recall = len(found) / len(relevant_set) if relevant_set else 0.0

        # AP@K: sum of precision values at each rank where a *new* relevant
        # item appears, normalised by the best achievable number of hits.
        precision_sum = 0.0
        hits = 0
        credited = set()
        for rank, item in enumerate(recommended_k, start=1):
            # A duplicate recommendation still occupies a rank but must not
            # count as a second hit, otherwise AP could exceed 1.
            if item in relevant_set and item not in credited:
                credited.add(item)
                hits += 1
                precision_sum += hits / rank
        max_hits = min(k, len(relevant_set))
        ap = precision_sum / max_hits if max_hits > 0 else 0.0

        results["recall"][k] = recall
        results["ap"][k] = ap

    return results

In [None]:
# Initialize recommender
# Prefer the key from the environment so it never has to be typed into the notebook.
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    print("Warning: GEMINI_API_KEY not found")
    # getpass hides the typed value; input() would echo the secret into the
    # cell output, which is saved with the notebook.
    api_key = getpass("Enter your Gemini API key: ")

try:
    index_path = input("Enter path to FAISS index: ")
    vector_index = FaissIndex.load(index_path)
    recommender = LLMRecommender(vector_index=vector_index, api_key=api_key)
    print("Recommender initialized successfully")
except Exception as e:
    print(f"Error initializing: {e}")
    # Re-raise so the cell stops with the original traceback; sys.exit() would
    # surface only a SystemExit and hide where the failure happened.
    raise

In [None]:
# Evaluation parameters
k_values = [1, 3, 5, 10]
# Per-query scores, keyed first by metric and then by cut-off K.
metrics = {"recall": {k: [] for k in k_values}, "ap": {k: [] for k in k_values}}
# Queries whose recommendation or scoring step raised; they are still scored
# as 0.0 below, but are tracked so the averages can be read in context.
failed_queries = []

# Run evaluation
print("\nEvaluating recommender system...")
for query, relevant in tqdm(ground_truth.items()):
    try:
        # Ask for as many final results as the largest K being evaluated.
        recommendations = recommender.recommend(
            job_description=query,
            top_k=20,
            rerank=True,
            final_results=max(k_values)
        )

        # Each recommendation is read with .get("name"); a missing name becomes "".
        recommended_names = [rec.get("name", "") for rec in recommendations]

        query_metrics = calculate_metrics(relevant, recommended_names, k_values)
    except Exception as e:
        print(f"Error processing query '{query[:30]}...': {e}")
        failed_queries.append(query)
        query_metrics = {
            "recall": {k: 0.0 for k in k_values},
            "ap": {k: 0.0 for k in k_values},
        }

    # Append outside the try block so every query contributes exactly one
    # value per K, even when an error occurs part-way through.
    for k in k_values:
        metrics["recall"][k].append(query_metrics["recall"][k])
        metrics["ap"][k].append(query_metrics["ap"][k])

if failed_queries:
    print(
        f"Warning: {len(failed_queries)} of {len(ground_truth)} queries failed "
        "and were scored as 0.0"
    )

# Average the per-query scores into one value per metric and cut-off K.
mean_recall_by_k = {}
map_by_k = {}
for k in k_values:
    mean_recall_by_k[k] = np.mean(metrics["recall"][k])
    map_by_k[k] = np.mean(metrics["ap"][k])
mean_metrics = {"Mean Recall": mean_recall_by_k, "MAP": map_by_k}

# Render the results as a fixed-width table, one pair of rows per K.
divider = "-" * 40
print("\n===== EVALUATION RESULTS =====")
print(f"{'Metric':<15} | {'Value':<10} | {'K Value':<7}")
print(divider)

for k in k_values:
    print(f"{'Mean Recall':<15} | {mean_metrics['Mean Recall'][k]:.4f}    | {k:<7}")
    print(f"{'MAP':<15} | {mean_metrics['MAP'][k]:.4f}    | {k:<7}")
    print(divider)

# Save results
# Snapshot of the aggregated scores plus the number of queries and the run time.
results = {
    "mean_metrics": mean_metrics,
    "queries_evaluated": len(ground_truth),
    "timestamp": pd.Timestamp.now().isoformat()
}

# json is imported in the notebook's top import cell rather than mid-cell.
# The integer K keys are written as strings, as JSON object keys must be.
with open("evaluation_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)
print("Results saved to evaluation_results.json")

# Summary
# Thresholds behind the qualitative verdicts. Naming them keeps the two MAP
# checks below in agreement (they previously used `>` in one place and `>=`
# in the other, so a MAP@3 of exactly 0.5 produced contradictory lines).
RECALL_AT_3_TARGET = 0.6
MAP_AT_3_TARGET = 0.5

recall_at_3 = mean_metrics['Mean Recall'][3]
map_at_3 = mean_metrics['MAP'][3]
map_meets_target = map_at_3 >= MAP_AT_3_TARGET

print("\nEVALUATION SUMMARY:")
print(f"- System achieved Mean Recall@3 of {recall_at_3:.4f} ({recall_at_3*100:.1f}%)")
print(f"- MAP@3 score is {map_at_3:.4f}")
print(f"- Vector search with LLM reranking shows {'+' if map_meets_target else '-'}performance")
# On ties, max() keeps the first K in k_values that reaches the top MAP.
print(f"- Best performing K value: {max(k_values, key=lambda k: mean_metrics['MAP'][k])}")

# Key observations
if recall_at_3 >= RECALL_AT_3_TARGET:
    print("+ Strong recall performance indicates good coverage of relevant assessments")
else:
    print("- Lower recall suggests need for improved candidate selection")

if map_meets_target:
    print("+ Good MAP scores show effective ranking of relevant assessments")
else:
    print("- MAP scores indicate room for improvement in reranking quality")