# Model Evaluation and Analysis 📊

This notebook covers:
1. Loading Models and Test Data
2. Model Performance Metrics
3. A/B Testing Analysis
4. Feature Importance Analysis
5. Error Analysis
6. Recommendation Quality Assessment
7. Saving Evaluation Results

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, ndcg_score
import joblib
import torch

## 1. Load Models and Test Data

In [None]:
# Restore the three persisted models from the models directory
cf_model, cb_model, hybrid_model = (
    joblib.load(model_file)
    for model_file in (
        '../models/cf_model.joblib',
        '../models/cb_model.joblib',
        '../models/hybrid_model.joblib',
    )
)

# Read the test arrays (inputs first, then targets)
X_test, y_test = (
    np.load(array_file)
    for array_file in ('../data/X_test.npy', '../data/y_test.npy')
)

print("Models and data loaded successfully!")

## 2. Performance Metrics

In [None]:
def calculate_metrics(y_true, y_pred):
    """Score one set of predictions against the ground truth.

    Args:
        y_true: Ground-truth values.
        y_pred: Model outputs aligned with ``y_true``.

    Returns:
        dict with keys ``'precision'`` and ``'recall'`` (both computed with
        ``average='weighted'``) and ``'ndcg'``.
    """
    weighted_precision = precision_score(y_true, y_pred, average='weighted')
    weighted_recall = recall_score(y_true, y_pred, average='weighted')
    # Each vector is wrapped in a one-element list before being handed to
    # ndcg_score, i.e. the full vectors are passed as a single row.
    ranking_quality = ndcg_score([y_true], [y_pred])

    return {
        'precision': weighted_precision,
        'recall': weighted_recall,
        'ndcg': ranking_quality,
    }

# Run every model over the same test inputs
cf_predictions = cf_model.predict(X_test)
cb_predictions = cb_model.predict(X_test)
hybrid_predictions = hybrid_model.predict(X_test)

# Display names and prediction arrays, kept in matching order
models = ['Collaborative Filtering', 'Content-Based', 'Hybrid']
predictions = [cf_predictions, cb_predictions, hybrid_predictions]

# Report each model's scores to four decimal places
for position, model_name in enumerate(models):
    metrics = calculate_metrics(y_test, predictions[position])
    print(f"\n{model_name} Metrics:")
    for metric in metrics:
        print(f"{metric}: {metrics[metric]:.4f}")

## 3. A/B Testing Analysis

In [None]:
from scipy import stats

def ab_test_analysis(control_metrics, test_metrics):
    """Compare two samples with an independent two-sample t-test.

    Both inputs are flattened to 1-D float64 arrays first, so column vectors
    and low-precision dtypes (e.g. float32) are handled like plain 1-D data.

    Args:
        control_metrics: Array-like of observations for the control arm.
        test_metrics: Array-like of observations for the test arm.

    Returns:
        dict of built-in Python floats (safe to format and to JSON-serialize):
            't_statistic': statistic returned by ``scipy.stats.ttest_ind``.
            'p_value': p-value returned by the same call.
            'effect_size': ``(mean(test) - mean(control)) / std(control)``,
                where ``std`` is ``np.std`` with its default ``ddof=0``.
                ``nan`` when the control sample has zero spread, because the
                standardized difference is undefined in that case.
    """
    control = np.asarray(control_metrics, dtype=float).ravel()
    treatment = np.asarray(test_metrics, dtype=float).ravel()

    # Perform t-test
    t_stat, p_value = stats.ttest_ind(control, treatment)

    # Calculate effect size, guarding the division against a constant control
    control_std = np.std(control)
    if control_std == 0:
        effect_size = float('nan')
    else:
        effect_size = (np.mean(treatment) - np.mean(control)) / control_std

    # Cast NumPy scalars to built-in floats so callers can json.dump the result
    return {
        't_statistic': float(t_stat),
        'p_value': float(p_value),
        'effect_size': float(effect_size)
    }

# Collaborative filtering is passed as the control sample, the hybrid model as the test sample
ab_results = ab_test_analysis(
    control_metrics=cf_predictions,
    test_metrics=hybrid_predictions,
)

print("A/B Test Results (Hybrid vs CF):")
for metric in ab_results:
    print(f"{metric}: {ab_results[metric]:.4f}")

## 4. Feature Importance Analysis

In [None]:
def analyze_feature_importance(model, feature_names):
    """Rank, chart, and return a model's feature importances.

    Args:
        model: Object exposing a ``feature_importances_`` attribute.
        feature_names: One label per importance score (presumably in the
            model's own feature order; verify against how it was trained).

    Returns:
        DataFrame with 'feature' and 'importance' columns, sorted by
        importance in descending order.

    Side effects:
        Draws and shows a horizontal bar chart of the ranking.
    """
    ranked = pd.DataFrame(
        {'feature': feature_names, 'importance': model.feature_importances_}
    )
    ranked = ranked.sort_values('importance', ascending=False)

    # Bars are drawn in ranked order, most important first
    plt.figure(figsize=(10, 6))
    sns.barplot(data=ranked, x='importance', y='feature')
    plt.title('Feature Importance')
    plt.show()

    return ranked

# Rank the hybrid model's features under these display labels
feature_names = [
    'Category',
    'Brand',
    'Price',
    'Color',
    'Season',
]
importance_df = analyze_feature_importance(
    model=hybrid_model,
    feature_names=feature_names,
)
print("\nTop 5 Most Important Features:")
print(importance_df.head(5))

## 5. Error Analysis

In [None]:
def analyze_errors(y_true, y_pred, X_test):
    """Plot the absolute-error distribution and return the worst cases.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predictions aligned with ``y_true``.
        X_test: Iterable of feature rows aligned with ``y_true``; each row is
            stored unchanged in the 'features' column.

    Returns:
        DataFrame with columns 'true', 'predicted', 'error' and 'features',
        restricted to rows whose error is strictly greater than the 95th
        percentile of all errors.

    Side effects:
        Draws and shows a 50-bin histogram of the errors.
    """
    true_values = np.asarray(y_true)
    pred_values = np.asarray(y_pred)

    # Unsigned integers wrap around on subtraction (e.g. uint8 0 - 1 == 255)
    # and NumPy refuses to subtract booleans, so promote both kinds to signed
    # integers before taking the difference. Other dtypes are left untouched.
    if np.result_type(true_values.dtype, pred_values.dtype).kind in 'bu':
        true_values = true_values.astype(np.int64)
        pred_values = pred_values.astype(np.int64)

    # Calculate errors
    errors = np.abs(true_values - pred_values)

    # Create error analysis DataFrame (original inputs are stored as given)
    error_df = pd.DataFrame({
        'true': y_true,
        'predicted': y_pred,
        'error': errors,
        'features': list(X_test)
    })

    # Plot error distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(data=error_df, x='error', bins=50)
    plt.title('Error Distribution')
    plt.show()

    # Analyze high error cases: keep only rows above the 95th-percentile error
    high_errors = error_df[error_df['error'] > error_df['error'].quantile(0.95)]

    return high_errors

# Pull out the hybrid model's largest errors on the test set
high_errors = analyze_errors(
    y_true=y_test,
    y_pred=hybrid_predictions,
    X_test=X_test,
)
print("\nHigh Error Cases Analysis:")
print(high_errors.describe())

## 6. Recommendation Quality Assessment

In [None]:
def assess_recommendation_quality(model, test_cases):
    """Score and chart recommendation quality over a set of test cases.

    For every case, ``model.recommend(case)`` is scored by
    ``calculate_relevance``, ``calculate_diversity`` and ``calculate_novelty``.

    NOTE(review): those three scoring helpers are not defined in the visible
    cells of this notebook; they must already exist in the kernel namespace
    before this function is called with a non-empty ``test_cases``.

    Args:
        model: Object exposing ``recommend(case)``.
        test_cases: Iterable of inputs accepted by ``model.recommend``.

    Returns:
        dict mapping 'relevance', 'diversity' and 'novelty' to lists of
        per-case scores, in the order the cases were iterated.

    Side effects:
        Shows a single row of three box plots, one per metric.
    """
    relevance_scores = []
    diversity_scores = []
    novelty_scores = []

    for case in test_cases:
        recs = model.recommend(case)
        relevance_scores.append(calculate_relevance(recs))
        diversity_scores.append(calculate_diversity(recs))
        novelty_scores.append(calculate_novelty(recs))

    quality_metrics = {
        'relevance': relevance_scores,
        'diversity': diversity_scores,
        'novelty': novelty_scores,
    }

    # One panel per metric, left to right in dictionary order
    plt.figure(figsize=(12, 4))
    for panel, metric in enumerate(quality_metrics, start=1):
        plt.subplot(1, 3, panel)
        sns.boxplot(y=quality_metrics[metric])
        plt.title(f'{metric.capitalize()} Distribution')
    plt.tight_layout()
    plt.show()

    return quality_metrics

# Assess recommendation quality
# A seeded generator keeps the sampled cases, and therefore the reported
# summary, identical across Restart Kernel -> Run All.
QUALITY_SAMPLE_SEED = 42
N_QUALITY_CASES = 100

# Indices into X_test, drawn with replacement (an index may appear more than once)
test_cases = np.random.default_rng(QUALITY_SAMPLE_SEED).choice(
    len(X_test), size=N_QUALITY_CASES
)
quality_metrics = assess_recommendation_quality(hybrid_model, test_cases)

# Report mean ± population standard deviation for each quality metric
print("\nRecommendation Quality Summary:")
for metric, values in quality_metrics.items():
    print(f"{metric}: {np.mean(values):.4f} ± {np.std(values):.4f}")

## 7. Save Evaluation Results

In [None]:
import json
from pathlib import Path


def _to_jsonable(obj):
    """Fallback for ``json.dump``: turn NumPy scalars/arrays into built-in types."""
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Compile all evaluation results
evaluation_results = {
    'performance_metrics': {
        'cf': calculate_metrics(y_test, cf_predictions),
        'cb': calculate_metrics(y_test, cb_predictions),
        'hybrid': calculate_metrics(y_test, hybrid_predictions)
    },
    'ab_test_results': ab_results,
    'feature_importance': importance_df.to_dict(),
    'quality_metrics': quality_metrics
}

# Save results
results_path = Path('../results/evaluation_results.json')
# Create the output directory on first run instead of failing inside open()
results_path.parent.mkdir(parents=True, exist_ok=True)
with open(results_path, 'w') as f:
    # default= is only consulted for objects json cannot encode natively,
    # e.g. np.float32 / np.int64 scalars or arrays inside the metric lists
    json.dump(evaluation_results, f, indent=4, default=_to_jsonable)

print("Evaluation results saved successfully!")