# Study Program Recommender System - Evaluation Notebook

This notebook provides tools to evaluate the recommendation system using standard IR ranking metrics and user-engagement metrics:
- **NDCG@k**: Normalized Discounted Cumulative Gain
- **Precision@k**: Proportion of relevant items in top-k recommendations
- **Recall@k**: Proportion of relevant items that appear in the top-k recommendations
- **CTR**: Click-Through Rate
- **Acceptance Rate**: Percentage of recommendations accepted

## Setup

In [None]:
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple
import matplotlib.pyplot as plt
import seaborn as sns
from supabase import create_client
import os
from dotenv import load_dotenv

# Let python-dotenv populate the process environment before any os.getenv call below.
load_dotenv()

# Shared plotting defaults for every figure drawn later in the notebook.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## Connect to Database

In [None]:
# Build the Supabase client from the SUPABASE_URL and SUPABASE_ANON_KEY
# environment variables. os.getenv returns None for a variable that is unset,
# and that None is passed straight through to create_client.
supabase = create_client(
    os.getenv('SUPABASE_URL'),
    os.getenv('SUPABASE_ANON_KEY')
)

# Printed as soon as create_client returns; no query has been issued in this cell.
print("Connected to Supabase successfully!")

## Evaluation Metrics Implementation

In [None]:
def dcg_at_k(relevance_scores: List[float], k: int) -> float:
    """
    Calculate Discounted Cumulative Gain at k.

    The item at 1-based rank ``i`` contributes ``relevance_i / log2(i + 1)``,
    so the first position is undiscounted and later positions count less.

    Args:
        relevance_scores: List of relevance scores in ranked order (higher is better)
        k: Number of top results to consider; values <= 0 select nothing

    Returns:
        DCG@k score as a plain float (0.0 when there is nothing to score)
    """
    # A negative k would otherwise slice from the END of the list ([:k]) and
    # silently score a truncated ranking instead of "no results".
    if k <= 0:
        return 0.0

    gains = np.asarray(relevance_scores, dtype=float)[:k]
    if gains.size == 0:
        return 0.0

    # Ranks 1..n map to log2(2)..log2(n + 1).
    discounts = np.log2(np.arange(2, gains.size + 2))
    return float(np.sum(gains / discounts))


def ndcg_at_k(predicted_relevance: List[float], ideal_relevance: List[float], k: int) -> float:
    """
    Calculate Normalized Discounted Cumulative Gain at k.

    The DCG of the predicted ordering is divided by the DCG of the best
    possible ordering of ``ideal_relevance``.

    Args:
        predicted_relevance: Relevance scores listed in the order the system ranked them
        ideal_relevance: Relevance scores defining the ideal ranking; they are
            sorted in descending order here, so any input order is accepted
        k: Number of top results to consider

    Returns:
        NDCG@k score (higher is better); 0.0 when the ideal DCG is zero
    """
    achieved = dcg_at_k(predicted_relevance, k)

    best_ordering = sorted(ideal_relevance, reverse=True)
    best_possible = dcg_at_k(best_ordering, k)

    # A zero ideal DCG means nothing can be normalised against; report 0.
    return achieved / best_possible if best_possible != 0.0 else 0.0


def precision_at_k(predicted_items: List[str], relevant_items: List[str], k: int) -> float:
    """
    Calculate Precision at k.

    Args:
        predicted_items: List of predicted item IDs in ranked order
        relevant_items: List of relevant item IDs
        k: Number of top results to consider; values <= 0 select nothing

    Returns:
        Precision@k score (0 to 1, higher is better). The denominator is
        always ``k``, even when fewer than ``k`` items were predicted.
    """
    # k == 0 would divide by zero; a negative k would slice from the end of
    # the list and produce a negative "precision".
    if k <= 0:
        return 0.0

    top_k = set(predicted_items[:k])
    hits = len(top_k & set(relevant_items))

    return hits / k


def recall_at_k(predicted_items: List[str], relevant_items: List[str], k: int) -> float:
    """
    Calculate Recall at k.

    Args:
        predicted_items: List of predicted item IDs in ranked order
        relevant_items: List of relevant item IDs (duplicates are counted once)
        k: Number of top results to consider; values <= 0 select nothing

    Returns:
        Recall@k score (0 to 1, higher is better); 0.0 when there are no
        relevant items
    """
    relevant_set = set(relevant_items)

    # No relevant items means nothing to recall; a negative k would slice
    # from the end of the list rather than select "no results".
    if not relevant_set or k <= 0:
        return 0.0

    hits = len(set(predicted_items[:k]) & relevant_set)

    # Divide by the number of DISTINCT relevant items: hits are counted on
    # sets, so a duplicated ID in relevant_items must not inflate the denominator.
    return hits / len(relevant_set)


def mean_reciprocal_rank(predicted_items_list: List[List[str]], relevant_items_list: List[List[str]]) -> float:
    """
    Calculate Mean Reciprocal Rank.

    Each query contributes ``1 / rank`` of its first relevant prediction
    (ranks start at 1), or 0 when none of its predictions is relevant.

    Args:
        predicted_items_list: One ranked list of predicted item IDs per query
        relevant_items_list: One list of relevant item IDs per query, paired
            with ``predicted_items_list`` by position

    Returns:
        MRR score (0 to 1, higher is better); 0.0 when there are no queries
    """
    per_query = []

    for ranking, truth in zip(predicted_items_list, relevant_items_list):
        truth_ids = set(truth)

        # 1-based position of the first relevant prediction, if there is one.
        first_hit = next(
            (position for position, candidate in enumerate(ranking, start=1) if candidate in truth_ids),
            None,
        )
        per_query.append(0.0 if first_hit is None else 1.0 / first_hit)

    if not per_query:
        return 0.0
    return np.mean(per_query)


# Cell-completion marker: reached only after all metric functions in this cell are defined.
print("Evaluation metrics functions loaded successfully!")

## Load Feedback Data from Database

In [None]:
def load_feedback_data() -> pd.DataFrame:
    """
    Fetch every column of the Supabase 'feedback' table.

    Uses the module-level ``supabase`` client.

    Returns:
        DataFrame built from the rows in the query response
    """
    response = supabase.table('feedback').select('*').execute()
    rows = response.data
    return pd.DataFrame(rows)


def load_recommendations_data() -> pd.DataFrame:
    """
    Fetch every column of the Supabase 'recommendations' table.

    Uses the module-level ``supabase`` client.

    Returns:
        DataFrame built from the rows in the query response
    """
    response = supabase.table('recommendations').select('*').execute()
    rows = response.data
    return pd.DataFrame(rows)


# Fetch both tables once; the cells below reuse these module-level DataFrames.
feedback_df = load_feedback_data()
recommendations_df = load_recommendations_data()

print(f"Loaded {len(feedback_df)} feedback records")
print(f"Loaded {len(recommendations_df)} recommendation records")

# Preview the first rows when feedback exists; otherwise explain how to get some.
# NOTE(review): `display` is not imported in this file — presumably the
# IPython/Jupyter built-in; confirm before running outside a notebook.
if len(feedback_df) > 0:
    print("\nFeedback data sample:")
    display(feedback_df.head())
else:
    print("\nNo feedback data available yet. Generate some recommendations and collect feedback first.")

## Calculate User Engagement Metrics

In [None]:
def calculate_engagement_metrics(feedback_df: pd.DataFrame, recommendations_df: pd.DataFrame) -> Dict:
    """
    Calculate user engagement metrics from feedback and recommendations data.

    Args:
        feedback_df: Feedback rows. The 'clicked', 'accepted' and 'rating'
            columns are each optional; a missing column contributes zero.
        recommendations_df: Recommendation rows; only the row count is used,
            as the denominator of both rates.

    Returns:
        Dictionary with the keys 'total_recommendations', 'total_clicks',
        'total_accepts', 'ctr', 'acceptance_rate', 'avg_rating' and
        'num_ratings'. 'ctr' and 'acceptance_rate' are percentages (0-100)
        of the recommendation count. Every key is present even when there
        are no recommendations, so callers can index the result
        unconditionally.
    """
    total_recommendations = len(recommendations_df)

    if total_recommendations == 0:
        return {
            'total_recommendations': 0,
            'total_clicks': 0,
            'total_accepts': 0,
            'ctr': 0.0,
            'acceptance_rate': 0.0,
            'avg_rating': 0.0,
            # Same key set as the populated result below.
            'num_ratings': 0,
        }

    has_feedback = len(feedback_df) > 0

    def _column_total(column: str):
        """Sum a feedback column, or return 0 when it is absent or there is no feedback."""
        if has_feedback and column in feedback_df.columns:
            return feedback_df[column].sum()
        return 0

    total_clicks = _column_total('clicked')
    total_accepts = _column_total('accepted')

    # total_recommendations is known to be non-zero past the guard above.
    ctr = (total_clicks / total_recommendations) * 100
    acceptance_rate = (total_accepts / total_recommendations) * 100

    # Only rows that actually carry a rating take part in the average.
    if has_feedback and 'rating' in feedback_df.columns:
        ratings = feedback_df['rating'].dropna()
    else:
        ratings = pd.Series(dtype=float)
    num_ratings = len(ratings)
    avg_rating = ratings.mean() if num_ratings > 0 else 0.0

    return {
        'total_recommendations': int(total_recommendations),
        'total_clicks': int(total_clicks),
        'total_accepts': int(total_accepts),
        'ctr': float(ctr),
        'acceptance_rate': float(acceptance_rate),
        'avg_rating': float(avg_rating),
        'num_ratings': int(num_ratings),
    }


# Compute the engagement summary once; the report cell at the end of the
# notebook reuses this module-level `metrics` dict.
metrics = calculate_engagement_metrics(feedback_df, recommendations_df)

# 'ctr' and 'acceptance_rate' are already percentages, hence the literal '%'.
print("\n=== User Engagement Metrics ===")
print(f"Total Recommendations: {metrics['total_recommendations']}")
print(f"Total Clicks: {metrics['total_clicks']}")
print(f"Total Accepts: {metrics['total_accepts']}")
print(f"Click-Through Rate: {metrics['ctr']:.2f}%")
print(f"Acceptance Rate: {metrics['acceptance_rate']:.2f}%")
print(f"Average Rating: {metrics['avg_rating']:.2f}/5.0 ({metrics['num_ratings']} ratings)")

## Evaluate Recommendation Quality

In [None]:
def evaluate_recommendations(recommendations_df: pd.DataFrame, feedback_df: pd.DataFrame, k_values: List[int] = [1, 3, 5, 10]) -> pd.DataFrame:
    """
    Evaluate recommendation quality using NDCG@k, Precision@k and Recall@k.

    Uses feedback to determine relevance (the contributions add up):
    - Accepted: 3 points
    - Clicked: 1 point
    - Rated: rating/5 * 2 points

    A missing (None/NaN) 'clicked' or 'accepted' value counts as no feedback.
    When several feedback rows share a (student_id, program_id) pair, the
    last row wins. Each student's recommendations are ranked by 'score'
    descending; students with no positively scored recommendation are
    excluded from every average. The ideal ranking used for NDCG is built
    from the recommended programs only.

    Args:
        recommendations_df: Needs 'student_id', 'program_id' and 'score' columns
        feedback_df: Needs 'student_id' and 'program_id' columns; 'clicked',
            'accepted' and 'rating' are optional
        k_values: Cut-offs to evaluate (never mutated here)

    Returns:
        DataFrame with one row per k and the columns 'k', 'ndcg',
        'precision', 'recall' and 'num_users'; an empty DataFrame when
        either input has no rows
    """
    if len(recommendations_df) == 0 or len(feedback_df) == 0:
        print("Not enough data for evaluation. Need both recommendations and feedback.")
        return pd.DataFrame()

    def _flag_set(value) -> bool:
        """Return True only for a present, truthy flag."""
        # NaN is truthy in Python, so a bare `if value:` would count a
        # missing flag as positive feedback.
        return bool(pd.notna(value) and value)

    # (student_id, program_id) -> graded relevance derived from feedback.
    # Plain dict records keep each column's own value type, unlike iterrows.
    feedback_map = {}
    for row in feedback_df.to_dict('records'):
        score = 0.0

        if _flag_set(row.get('clicked')):
            score += 1.0
        if _flag_set(row.get('accepted')):
            score += 3.0
        rating = row.get('rating')
        if pd.notna(rating):
            score += (rating / 5.0) * 2.0

        feedback_map[(row['student_id'], row['program_id'])] = score

    # Ranking and relevance lookup do not depend on k, so do them once per
    # student instead of once per (k, student) pair.
    rankings = []
    for student_id, group in recommendations_df.groupby('student_id'):
        predicted_programs = group.sort_values('score', ascending=False)['program_id'].tolist()

        relevance_scores = [
            feedback_map.get((student_id, prog_id), 0.0)
            for prog_id in predicted_programs
        ]
        relevant_programs = [
            prog_id
            for prog_id, relevance in zip(predicted_programs, relevance_scores)
            if relevance > 0
        ]

        # Without any positive feedback a student has no defined ranking quality.
        if relevant_programs:
            rankings.append((predicted_programs, relevance_scores, relevant_programs))

    results = []
    for k in k_values:
        ndcg_scores = [
            ndcg_at_k(relevance, relevance, k)
            for _, relevance, _ in rankings
        ]
        precision_scores = [
            precision_at_k(predicted, relevant, k)
            for predicted, _, relevant in rankings
        ]
        recall_scores = [
            recall_at_k(predicted, relevant, k)
            for predicted, _, relevant in rankings
        ]

        results.append({
            'k': k,
            'ndcg': np.mean(ndcg_scores) if ndcg_scores else 0.0,
            'precision': np.mean(precision_scores) if precision_scores else 0.0,
            'recall': np.mean(recall_scores) if recall_scores else 0.0,
            'num_users': len(ndcg_scores)
        })

    return pd.DataFrame(results)


# Run the evaluation with the function's default k_values; the plotting and
# report cells below reuse `evaluation_results`.
evaluation_results = evaluate_recommendations(recommendations_df, feedback_df)

# evaluate_recommendations returns an empty DataFrame when either input has no rows.
if len(evaluation_results) > 0:
    print("\n=== Recommendation Quality Metrics ===")
    display(evaluation_results)
else:
    print("\nNot enough data for evaluation. Collect more feedback first.")

## Visualize Metrics

In [None]:
if len(evaluation_results) > 0:
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # One entry per panel: (results column, y-axis label, title, line styling).
    # The first panel names no colour and so keeps matplotlib's default.
    panels = [
        ('ndcg', 'NDCG@k', 'NDCG@k - Ranking Quality', {'marker': 'o'}),
        ('precision', 'Precision@k', 'Precision@k - Relevance Rate', {'marker': 's', 'color': 'green'}),
        ('recall', 'Recall@k', 'Recall@k - Coverage', {'marker': '^', 'color': 'orange'}),
    ]

    for panel_ax, (column, y_label, title, line_style) in zip(axes, panels):
        panel_ax.plot(
            evaluation_results['k'],
            evaluation_results[column],
            linewidth=2,
            markersize=8,
            **line_style,
        )
        panel_ax.set_xlabel('k', fontsize=12)
        panel_ax.set_ylabel(y_label, fontsize=12)
        panel_ax.set_title(title, fontsize=14, fontweight='bold')
        panel_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    print("No data to visualize. Collect feedback first.")

## Engagement Over Time

In [None]:
# Daily engagement bar chart; skipped when there is no feedback or no timestamp column.
if len(feedback_df) > 0 and 'created_at' in feedback_df.columns:
    # NOTE: both assignments modify the shared feedback_df in place —
    # 'created_at' becomes datetime and a new 'date' column is added.
    feedback_df['created_at'] = pd.to_datetime(feedback_df['created_at'])
    feedback_df['date'] = feedback_df['created_at'].dt.date
    
    # Per-day totals. This indexes 'clicked', 'accepted' and 'program_id'
    # directly, so all three columns must exist in feedback_df.
    # Counting 'program_id' gives the number of feedback rows with a program_id per day.
    daily_engagement = feedback_df.groupby('date').agg({
        'clicked': 'sum',
        'accepted': 'sum',
        'program_id': 'count'
    }).rename(columns={'program_id': 'total'})
    
    fig, ax = plt.subplots(figsize=(14, 6))
    
    daily_engagement.plot(kind='bar', ax=ax)
    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.set_title('User Engagement Over Time', fontsize=14, fontweight='bold')
    # Legend labels are positional: they follow the column order of the
    # aggregation above (clicked, accepted, total).
    ax.legend(['Clicks', 'Accepts', 'Total Interactions'])
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("No temporal data available for visualization.")

## Export Evaluation Report

In [None]:
def generate_evaluation_report(metrics: Dict, evaluation_results: pd.DataFrame) -> str:
    """
    Generate a text report of evaluation results.

    Args:
        metrics: Engagement metrics; the keys 'total_recommendations', 'ctr',
            'acceptance_rate' and 'avg_rating' are read.
        evaluation_results: Per-k quality metrics with the columns 'k',
            'ndcg', 'precision' and 'recall'. When it has no rows, the
            quality section is left out of the report.

    Returns:
        The report as a single newline-joined string
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    report = [
        heavy_rule,
        "STUDY PROGRAM RECOMMENDER - EVALUATION REPORT",
        heavy_rule,
        "",
        "USER ENGAGEMENT METRICS",
        light_rule,
        f"Total Recommendations: {metrics['total_recommendations']}",
        f"Click-Through Rate: {metrics['ctr']:.2f}%",
        f"Acceptance Rate: {metrics['acceptance_rate']:.2f}%",
        f"Average Rating: {metrics['avg_rating']:.2f}/5.0",
        "",
    ]

    if len(evaluation_results) > 0:
        report.append("RECOMMENDATION QUALITY METRICS")
        report.append(light_rule)

        # Walk the columns in parallel rather than using iterrows(): a row
        # Series mixing int and float columns is upcast to float, and the
        # 'd' format code rejects floats with a ValueError.
        rows = zip(
            evaluation_results['k'],
            evaluation_results['ndcg'],
            evaluation_results['precision'],
            evaluation_results['recall'],
        )
        for k, ndcg, precision, recall in rows:
            report.append(
                f"k={int(k):2d}: NDCG={ndcg:.4f}, Precision={precision:.4f}, Recall={recall:.4f}"
            )
        report.append("")

    report.append(heavy_rule)

    return "\n".join(report)


# Build the report from the `metrics` and `evaluation_results` computed in earlier cells.
report = generate_evaluation_report(metrics, evaluation_results)
print(report)

# Relative path: the file is written to the current working directory, and
# mode 'w' replaces any existing report of the same name.
with open('evaluation_report.txt', 'w') as f:
    f.write(report)

print("\nReport saved to evaluation_report.txt")

## Summary

This notebook provides comprehensive evaluation tools for the recommendation system:

1. **NDCG@k**: Measures ranking quality, considering position and relevance
2. **Precision@k**: Measures what proportion of the top-k recommendations are relevant
3. **Recall@k**: Measures what proportion of relevant items appear in the top-k recommendations
4. **CTR & Acceptance**: Measure user engagement with recommendations

Use this notebook regularly to monitor system performance and identify areas for improvement.