In [None]:
import pandas as pd
import numpy as np
from numpy.linalg import norm

In [None]:
# Notebook-wide default for "k"; presumably the cutoff passed as `k` to the
# @k metric functions defined further down — TODO confirm against the evaluation cells.
GLOBAL_K_VALUE: int = 5

Load Data Files

In [None]:
def load_data(file_path: str) -> pd.DataFrame:
    """Read the CSV located at ``file_path`` into a DataFrame.

    The file is parsed with pandas' default ``read_csv`` options; any error
    raised by pandas (missing file, malformed CSV, ...) propagates unchanged.
    """
    frame = pd.read_csv(file_path)
    return frame

In [None]:
# Anime catalogue
anime_filtered = load_data('../data/clean/anime-dataset-filtered.csv')

# Recommender outputs under evaluation (user-based first, content-based second)
user_based, content_based = map(load_data, (
    '../data/prediction/users-recommendations.csv',
    '../data/prediction/content-recommendations.csv',
))

# 2025 user scores, used as the validation set
watch_shrunk_25 = load_data('../data/clean/users-score-shrunk-2025.csv')

Transform the Anime Dataset

In [None]:
def transform_anime_data(anime_df: pd.DataFrame) -> pd.DataFrame:
    """Add a combined ``'All Genres'`` column and index the frame by ``anime_id``.

    For every row, the comma-separated labels found in the ``'Genres'``,
    ``'Explicit Genres'``, ``'Themes'`` and ``'Demographics'`` columns are
    stripped of surrounding whitespace, de-duplicated (first occurrence wins,
    original order kept) and joined back together with ``', '``. Missing
    (NaN) cells are skipped, so a row with no labels at all yields ``''``.

    Args:
        anime_df: Frame containing ``'anime_id'`` plus the four label columns
            listed above. It is NOT modified; a new frame is returned.

    Returns:
        A new DataFrame with the extra ``'All Genres'`` column, indexed by
        ``'anime_id'``.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    label_columns = ['Genres', 'Explicit Genres', 'Themes', 'Demographics']

    def _merge_labels(cells: tuple) -> str:
        # A dict is used as an ordered set: keys keep insertion order and
        # repeated labels collapse onto the first occurrence.
        seen = {}
        for cell in cells:
            if pd.notna(cell):
                for label in cell.split(','):
                    seen[label.strip()] = None
        return ', '.join(seen)

    # Plain-tuple iteration (rather than DataFrame.apply(axis=1)) also works
    # for an empty frame and avoids building a Series object per row.
    combined = [
        _merge_labels(cells)
        for cells in anime_df[label_columns].itertuples(index=False, name=None)
    ]

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    return anime_df.assign(**{'All Genres': combined}).set_index('anime_id')

In [None]:
anime_transformed = transform_anime_data(anime_filtered)

# Alphabetically sorted vocabulary of every distinct label in 'All Genres',
# and each label's position in that vocabulary (later used for evaluation)
all_genres = sorted(set(
    label
    for combined in anime_transformed['All Genres']
    for label in combined.split(', ')
))
genre_to_idx = dict(zip(all_genres, range(len(all_genres))))

In [None]:
def vectorize_genres(genre_list: list) -> np.ndarray:
    """Encode ``genre_list`` as a 0/1 vector over the notebook's genre vocabulary.

    The result has one slot per entry of the module-level ``all_genres``; the
    slot at ``genre_to_idx[g]`` is set to 1 for every ``g`` in ``genre_list``.
    A label that is not a key of ``genre_to_idx`` raises ``KeyError``.
    """
    encoded = np.zeros(len(all_genres))
    positions = [genre_to_idx[label] for label in genre_list]
    encoded[positions] = 1
    return encoded

In [None]:
# One binary genre vector per anime, derived from its combined label string
anime_transformed['Genre Vector'] = anime_transformed['All Genres'].map(
    lambda combined: vectorize_genres(combined.split(', '))
)

Check for Missing Users

In [None]:
# Users that received recommendations but have no row in the 2025 validation data
missing_users_user_based = set(user_based['user_id']) - set(watch_shrunk_25['user_id'])
missing_users_content_based = set(content_based['user_id']) - set(watch_shrunk_25['user_id'])

print(f"Missing users in user-based recommendations: {missing_users_user_based}")
print(f"Missing users in content-based recommendations: {missing_users_content_based}")

Metric Functions

In [None]:
def recall_at_k(y_true: np.ndarray, y_scores: np.ndarray, k: int) -> float:
    """Compute recall@k for binary relevance labels.

    Recall@k is the share of all positive items that appear among the ``k``
    highest-scored items.

    Args:
        y_true: Binary relevance labels (1 = relevant, 0 = not), aligned
            position-by-position with ``y_scores``. Any array-like is accepted.
        y_scores: Predicted scores; higher means ranked earlier.
        k: Number of top-ranked items to consider. If ``k`` exceeds the number
            of items, all items are considered.

    Returns:
        Recall@k as a Python float. When ``y_true`` contains no positives the
        metric is undefined; ``0.0`` is returned instead of NaN so that one
        such user does not turn an average over users into NaN.
    """
    relevance = np.asarray(y_true)
    total_positives = relevance.sum()
    if total_positives == 0:
        # Avoid 0/0 (NaN plus a RuntimeWarning) when nothing is relevant.
        return 0.0

    # argsort is ascending; reverse it to rank by descending score.
    top_k_idx = np.argsort(np.asarray(y_scores))[::-1][:k]
    return float(relevance[top_k_idx].sum() / total_positives)

def precision_at_k(y_true: np.ndarray, y_scores: np.ndarray, k: int) -> float:
    """Compute precision@k for binary relevance labels.

    Precision@k is the number of positive items among the ``k`` highest-scored
    items, divided by ``k``. The denominator is always ``k``, even when fewer
    than ``k`` items were scored.

    Args:
        y_true: Binary relevance labels (1 = relevant, 0 = not), aligned
            position-by-position with ``y_scores``. Any array-like is accepted.
        y_scores: Predicted scores; higher means ranked earlier.
        k: Number of top-ranked items to consider; must be positive.

    Returns:
        Precision@k as a Python float.

    Raises:
        ValueError: If ``k`` is not positive. ``k == 0`` would divide by zero,
            and a negative ``k`` would slice the wrong items and produce a
            negative "precision".
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")

    relevance = np.asarray(y_true)
    # argsort is ascending; reverse it to rank by descending score.
    top_k_idx = np.argsort(np.asarray(y_scores))[::-1][:k]
    return float(relevance[top_k_idx].sum() / k)

def genre_distance(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the norm of ``vec1 - vec2`` (the Euclidean distance for 1-D vectors)."""
    delta = vec1 - vec2
    return norm(delta)

def genre_distance_at_k(
    df: pd.DataFrame,
    true_dict: dict,
    pred_dict: dict,
    k: int,
    vector_col: str = 'Genre Vector',
    distance_fn=None,
) -> float:
    """Compute the average genre distance between true and top-k predicted items.

    For every user in ``true_dict``, each of the user's true items is paired
    with each of the user's first ``k`` recommended items, the distance between
    the two items' genre vectors is computed, and the mean over all pairs of
    all users is returned.

    Args:
        df: Frame whose index contains the item ids and whose ``vector_col``
            column holds one genre vector per item.
        true_dict: Maps each user to the list of item ids the user actually
            interacted with.
        pred_dict: Maps each user to an ordered list of recommended item ids.
            It must contain every user present in ``true_dict``.
        k: Number of leading recommendations to consider per user.
        vector_col: Name of the column holding the genre vectors. Defaults to
            ``'Genre Vector'``, the column this notebook creates.
        distance_fn: Optional callable ``(vec_a, vec_b) -> float``. Defaults to
            ``genre_distance``.

    Returns:
        The mean pairwise distance as a Python float, or NaN when there is not
        a single (true item, recommended item) pair to compare.

    Raises:
        KeyError: If a user of ``true_dict`` is missing from ``pred_dict``, if
            an item id is not in ``df.index``, or if ``vector_col`` is not a
            column of ``df``.
    """
    measure = genre_distance if distance_fn is None else distance_fn
    distances = []

    for user, true_items in true_dict.items():
        recs = pred_dict[user][:k]

        true_vecs = df.loc[true_items, vector_col].values
        rec_vecs = df.loc[recs, vector_col].values

        distances.extend(measure(t, r) for t in true_vecs for r in rec_vecs)

    if not distances:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return float('nan')
    return float(np.mean(distances))

Calculate Quality Metrics

Calculate Diversity Metrics