In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Recommender System Evaluation Script

This script downloads datasets, implements basic recommender systems,
evaluates them using various metrics, and visualizes the results.
"""

import os
import time
import argparse
import zipfile
import io
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from scipy.spatial.distance import cosine
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from surprise import Dataset, Reader, SVD, KNNBasic, NormalPredictor
from surprise.model_selection import cross_validate, train_test_split as surprise_train_test_split
from surprise.accuracy import rmse as surprise_rmse, mae as surprise_mae

# Seed NumPy's global (legacy) random generator so that any code drawing from
# the module-level np.random functions is repeatable between runs.
# NOTE: objects that own a generator (e.g. RandomRecommender, which builds its
# own np.random.RandomState) and calls that pass an explicit random_state
# (e.g. the train/test split) are not controlled by this seed.
np.random.seed(42)

# =============================================================================
# Dataset Loading Functions
# =============================================================================

def download_file(url, save_path=None, timeout=30):
    """
    Download a file from a URL
    
    Parameters:
    -----------
    url : str
        URL to download from
    save_path : str, optional
        Path to save the file. If None, file is returned as bytes
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` so a stalled server
        cannot hang the script forever
        
    Returns:
    --------
    bytes or bool or None
        File content as bytes if save_path is None, True if the file was
        saved, None if the server did not answer with HTTP 200
    """
    print(f"Downloading from {url}...")
    
    # Using the response as a context manager guarantees the streamed
    # connection is released on every exit path.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Error downloading file: {response.status_code}")
            return None
        
        if not save_path:
            return response.content
        
        # Stream into a temporary sibling file and rename it at the end, so an
        # interrupted download never leaves a truncated file at save_path
        # (callers treat the mere existence of save_path as "already downloaded").
        partial_path = f"{save_path}.part"
        try:
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
            os.replace(partial_path, save_path)
        finally:
            # Only still present when the download or the rename failed
            if os.path.exists(partial_path):
                os.remove(partial_path)
    
    print(f"File saved to {save_path}")
    return True

def download_movielens(size='100k', data_dir='data'):
    """
    Download (if needed), extract and load a MovieLens dataset
    
    Parameters:
    -----------
    size : str
        Size of the dataset ('100k', '1m', '10m', '20m', or 'latest-small')
    data_dir : str
        Directory to save the dataset
        
    Returns:
    --------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        (ratings, movies). ratings always has the columns user_id, item_id,
        rating, timestamp; movies always has an item_id column.
        (None, None) is returned when the archive could not be downloaded.
        
    Raises:
    -------
    ValueError
        If size is not one of the supported dataset names
    """
    urls = {
        '100k': 'https://files.grouplens.org/datasets/movielens/ml-100k.zip',
        '1m': 'https://files.grouplens.org/datasets/movielens/ml-1m.zip',
        '10m': 'https://files.grouplens.org/datasets/movielens/ml-10m.zip',
        '20m': 'https://files.grouplens.org/datasets/movielens/ml-20m.zip',
        'latest-small': 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
    }
    
    if size not in urls:
        raise ValueError(f"Invalid size. Choose from: {', '.join(urls.keys())}")
    
    os.makedirs(data_dir, exist_ok=True)
    
    zip_path = os.path.join(data_dir, f'ml-{size}.zip')
    
    # Download if not already exists
    if not os.path.exists(zip_path):
        content = download_file(urls[size], zip_path)
        if not content:
            # Same "nothing loaded" shape as the other dataset loaders, so
            # callers can always unpack two values.
            return None, None
    
    # Extract ZIP file
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(data_dir)
    
    # The 10M archive unpacks into 'ml-10M100K'; every other archive unpacks
    # into a folder named after the archive itself.
    extracted_dir = os.path.join(data_dir, 'ml-10M100K' if size == '10m' else f'ml-{size}')
    rating_columns = ['user_id', 'item_id', 'rating', 'timestamp']
    
    if size == '100k':
        ratings = pd.read_csv(os.path.join(extracted_dir, 'u.data'), sep='\t',
                              names=rating_columns)
        
        movies = pd.read_csv(os.path.join(extracted_dir, 'u.item'), sep='|', encoding='latin-1',
                             names=['item_id', 'title', 'release_date', 'video_release_date',
                                    'IMDb_URL'] + [f'genre_{i}' for i in range(19)])
        
        # Order of the 19 genre flags in u.item (see u.genre): "unknown" is
        # the FIRST flag, not the last one.
        genre_names = ['Unknown', 'Action', 'Adventure', 'Animation',
                       'Children\'s', 'Comedy', 'Crime',
                       'Documentary', 'Drama', 'Fantasy',
                       'Film-Noir', 'Horror', 'Musical',
                       'Mystery', 'Romance', 'Sci-Fi',
                       'Thriller', 'War', 'Western']
        genre_cols = [col for col in movies.columns if 'genre_' in col]
        movies['genres'] = movies[genre_cols].apply(
            lambda flags: '/'.join(name for flag, name in zip(flags, genre_names) if flag == 1),
            axis=1)
    elif size in ('1m', '10m'):
        # '::'-separated files without a header row. A multi-character
        # separator is only supported by the python parsing engine.
        ratings = pd.read_csv(os.path.join(extracted_dir, 'ratings.dat'), sep='::',
                              header=None, names=rating_columns, engine='python')
        movies = pd.read_csv(os.path.join(extracted_dir, 'movies.dat'), sep='::',
                             header=None, names=['item_id', 'title', 'genres'],
                             encoding='latin-1', engine='python')
    else:
        # '20m' and 'latest-small': UTF-8 CSV files with a header row that
        # uses userId / movieId; rename to the column names used everywhere
        # else in this script.
        ratings = pd.read_csv(os.path.join(extracted_dir, 'ratings.csv'))
        ratings = ratings.rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
        movies = pd.read_csv(os.path.join(extracted_dir, 'movies.csv'))
        movies = movies.rename(columns={'movieId': 'item_id'})
    
    print(f"Loaded MovieLens {size} dataset")
    print(f"Ratings: {len(ratings)} | Users: {len(ratings['user_id'].unique())} | Items: {len(ratings['item_id'].unique())}")
    
    return ratings, movies

def download_lastfm(data_dir='data'):
    """
    Load the Last.fm 360K user/artist play counts from a local copy
    
    The archive is not fetched automatically: when the expected TSV file is
    missing, download instructions are printed instead.
    
    Parameters:
    -----------
    data_dir : str
        Directory that holds (or will hold) the 'lastfm-360k' folder
        
    Returns:
    --------
    tuple
        (user_artists, artists) DataFrames, or (None, None) when the file is
        missing or could not be loaded
    """
    dataset_dir = os.path.join(data_dir, 'lastfm-360k')
    
    # Make sure the base folder and the dataset folder both exist
    for folder in (data_dir, dataset_dir):
        if not os.path.exists(folder):
            os.makedirs(folder)
    
    # Last.fm 360K URL
    url = 'http://mtg.upf.edu/static/datasets/last.fm/lastfm-360K.tar.gz'
    user_artists_path = os.path.join(dataset_dir, 'usersha1-artmbid-artname-plays.tsv')
    
    # Without a local copy there is nothing to load: tell the user how to get it
    if not os.path.exists(user_artists_path):
        print("The Last.fm dataset is large and complex to extract automatically.")
        print(f"Please download from {url} and extract to {dataset_dir}")
        print("Expected file structure: usersha1-artmbid-artname-plays.tsv")
        return None, None
    
    print("Last.fm dataset already exists, loading...")
    
    try:
        column_names = ['user_id', 'artist_id', 'artist_name', 'plays']
        user_artists = pd.read_csv(user_artists_path, sep='\t', names=column_names)
        
        # Rows lacking a user, an artist name or a play count are unusable
        required = ['user_id', 'artist_name', 'plays']
        user_artists = user_artists.dropna(subset=required)
        
        # Fall back to the artist name wherever the artist ID is missing
        filled_ids = user_artists['artist_id'].fillna(user_artists['artist_name'])
        user_artists['artist_id'] = filled_ids
        
        print("Loaded Last.fm dataset")
        n_users = len(user_artists['user_id'].unique())
        n_artists = len(user_artists['artist_id'].unique())
        print(f"Records: {len(user_artists)} | Users: {n_users} | Artists: {n_artists}")
        
        # No separate metadata file exists, so derive an artist table
        artists = user_artists[['artist_id', 'artist_name']].drop_duplicates()
    except Exception as e:
        print(f"Error loading Last.fm dataset: {e}")
        return None, None
    
    return user_artists, artists

def download_amazon_reviews(category='Digital_Music', data_dir='data'):
    """
    Load an Amazon 5-core reviews file for one category from a local copy
    
    The file is not fetched automatically: when it is missing, download
    instructions are printed instead.
    
    Parameters:
    -----------
    category : str
        Product category ('Digital_Music', 'Books', 'Movies_and_TV', etc.)
    data_dir : str
        Directory that holds (or will hold) the 'amazon-<category>' folder
        
    Returns:
    --------
    tuple
        (reviews, None) where reviews has the columns user_id, item_id,
        rating, timestamp; (None, None) when the file is missing or could
        not be loaded
    """
    dataset_dir = os.path.join(data_dir, f'amazon-{category.lower()}')
    
    # Make sure the base folder and the dataset folder both exist
    for folder in (data_dir, dataset_dir):
        if not os.path.exists(folder):
            os.makedirs(folder)
    
    # The 5-core subsets are the smaller variants of the review dumps
    url = f'https://jmcauley.ucsd.edu/data/amazon/subsets/{category}_5.json.gz'
    json_path = os.path.join(dataset_dir, f'{category}_5.json.gz')
    
    # Without a local copy there is nothing to load: tell the user how to get it
    if not os.path.exists(json_path):
        print(f"The Amazon {category} dataset needs to be downloaded manually.")
        print(f"Please download from {url} and save to {json_path}")
        print("Then use the pandas read_json function with lines=True and compression='gzip'")
        return None, None
    
    print(f"Amazon {category} dataset already exists, loading...")
    
    # Map the source field names onto the column names used by this script
    column_map = {
        'reviewerID': 'user_id',
        'asin': 'item_id',
        'overall': 'rating',
        'unixReviewTime': 'timestamp'
    }
    
    try:
        # The file holds one JSON object per line
        raw_reviews = pd.read_json(json_path, lines=True, compression='gzip')
        renamed = raw_reviews.rename(columns=column_map)
        reviews = renamed[['user_id', 'item_id', 'rating', 'timestamp']]
        
        print(f"Loaded Amazon {category} dataset")
        n_users = len(reviews['user_id'].unique())
        n_items = len(reviews['item_id'].unique())
        print(f"Reviews: {len(reviews)} | Users: {n_users} | Items: {n_items}")
    except Exception as e:
        print(f"Error loading Amazon {category} dataset: {e}")
        return None, None
    
    # This format carries no separate item metadata, hence None for items
    return reviews, None

def _group_ratings_by_user(df):
    """Map each user_id in df to its list of (item_id, rating) tuples, in row order."""
    grouped = defaultdict(list)
    # Zipping plain column lists is far faster than DataFrame.iterrows() and,
    # unlike iterrows(), does not upcast IDs to the row's common dtype.
    for user, item, rating in zip(df['user_id'].tolist(),
                                  df['item_id'].tolist(),
                                  df['rating'].tolist()):
        grouped[user].append((item, rating))
    return grouped


def prepare_dataset(ratings, test_size=0.2, implicit=False, threshold=None):
    """
    Prepare dataset for training and testing
    
    Parameters:
    -----------
    ratings : pandas.DataFrame
        DataFrame containing ratings data with at least the columns user_id,
        item_id and rating. A timestamp column, when present, is used as the
        secondary sort key of the returned frames.
    test_size : float
        Proportion of data to use for testing
    implicit : bool
        Whether to convert to implicit feedback
    threshold : float, optional
        Threshold for implicit conversion (ratings at or above this become
        1.0, all others 0.0). Defaults to the median rating.
        
    Returns:
    --------
    dict
        Dictionary containing train and test data in different formats:
        train_df / test_df (original IDs plus a user_idx column),
        train_df_encoded / test_df_encoded (additionally item_idx),
        train_user_items / test_user_items (user_id -> [(item_id, rating)]),
        train_surprise, train_surprise_full, test_surprise (Surprise inputs),
        user_encoder, item_encoder, n_users, n_items
    """
    # Make a copy to avoid modifying the original
    ratings_df = ratings.copy()
    
    # If implicit feedback is requested, convert ratings to binary
    if implicit:
        if threshold is None:
            # If no threshold provided, use the median rating
            threshold = ratings_df['rating'].median()
        
        print(f"Converting to implicit feedback (threshold = {threshold})")
        ratings_df['rating'] = (ratings_df['rating'] >= threshold).astype(float)
    
    # Fit both encoders once on the complete data. The split below only
    # partitions these rows, so the encoders cover every train and test ID.
    user_encoder = LabelEncoder()
    item_encoder = LabelEncoder()
    ratings_df['user_idx'] = user_encoder.fit_transform(ratings_df['user_id'])
    item_encoder.fit(ratings_df['item_id'])
    
    # Split into train and test sets (stratify by user so that users are
    # represented proportionally on both sides)
    try:
        train_df, test_df = train_test_split(
            ratings_df, test_size=test_size, stratify=ratings_df['user_idx'], random_state=42
        )
    except ValueError:
        # Stratification fails e.g. when some user has a single rating
        print("Stratified split failed, falling back to random split")
        train_df, test_df = train_test_split(
            ratings_df, test_size=test_size, random_state=42
        )
    
    # 1. Pandas DataFrames with original IDs.
    # Not every dataset has timestamps, so only sort by them when available.
    sort_columns = ['user_id', 'timestamp'] if 'timestamp' in ratings_df.columns else ['user_id']
    train_df = train_df.sort_values(sort_columns)
    test_df = test_df.sort_values(sort_columns)
    
    # 2. Pandas DataFrames with encoded IDs (user_idx is already present)
    train_df_encoded = train_df.copy()
    test_df_encoded = test_df.copy()
    train_df_encoded['item_idx'] = item_encoder.transform(train_df['item_id'])
    test_df_encoded['item_idx'] = item_encoder.transform(test_df['item_id'])
    
    # 3. Dictionaries for easier lookups: user -> [(item, rating), ...]
    train_user_items = _group_ratings_by_user(train_df)
    test_user_items = _group_ratings_by_user(test_df)
    
    # 4. Surprise format
    reader = Reader(rating_scale=(ratings_df['rating'].min(), ratings_df['rating'].max()))
    train_surprise = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
    train_surprise_full = train_surprise.build_full_trainset()
    
    # Test data for Surprise is a plain list of (user, item, rating) tuples
    test_surprise = list(zip(test_df['user_id'].tolist(),
                             test_df['item_id'].tolist(),
                             test_df['rating'].tolist()))
    
    # Return all formats
    return {
        'train_df': train_df,
        'test_df': test_df,
        'train_df_encoded': train_df_encoded,
        'test_df_encoded': test_df_encoded,
        'train_user_items': train_user_items,
        'test_user_items': test_user_items,
        'train_surprise': train_surprise,
        'train_surprise_full': train_surprise_full,
        'test_surprise': test_surprise,
        'user_encoder': user_encoder,
        'item_encoder': item_encoder,
        'n_users': len(user_encoder.classes_),
        'n_items': len(item_encoder.classes_),
    }

# =============================================================================
# Recommender Algorithms
# =============================================================================

class PopularityRecommender:
    """
    Popularity-based recommender system that recommends the most popular items
    
    Popularity of an item is the number of rows it has in the training data.
    """
    
    def __init__(self):
        # item_id -> number of training ratings; None until fit() is called
        self.item_popularity = None
        # item IDs ordered from most to least popular; None until fit() is called
        self.items = None
    
    def fit(self, train_df):
        """
        Fit the recommender model to training data
        
        Parameters:
        -----------
        train_df : pandas.DataFrame
            DataFrame with columns: user_id, item_id, rating
            
        Returns:
        --------
        PopularityRecommender
            The fitted recommender (self)
        """
        # Calculate item popularity as the frequency of ratings
        self.item_popularity = train_df['item_id'].value_counts().to_dict()
        self.items = sorted(self.item_popularity, key=self.item_popularity.get, reverse=True)
        return self
    
    def recommend(self, user_id, train_user_items, n=10):
        """
        Recommend items for a user
        
        Parameters:
        -----------
        user_id : int or str
            User ID
        train_user_items : dict
            Dictionary mapping user_id to list of (item_id, rating) tuples of items they've rated
        n : int
            Number of recommendations to return
            
        Returns:
        --------
        list
            List of at most n recommended item IDs, most popular first
        """
        recommendations = []
        if n <= 0:
            return recommendations
        
        # A set keeps each "already rated?" check O(1)
        rated_items = {item for item, _ in train_user_items.get(user_id, [])}
        
        # Recommend the most popular items that the user hasn't rated yet
        for item in self.items:
            if item not in rated_items:
                recommendations.append(item)
                if len(recommendations) >= n:
                    break
        
        return recommendations

    def predict(self, user_ids, item_ids):
        """
        Predict scores for given user-item pairs
        
        The score ignores the user: it is the item's popularity divided by
        the popularity of the most popular item, i.e. a value in [0, 1], not
        a value on the rating scale.
        
        Parameters:
        -----------
        user_ids : array-like
            User IDs (unused, kept for a uniform recommender interface)
        item_ids : array-like
            Item IDs
            
        Returns:
        --------
        numpy.ndarray
            Relative popularity per item; 0.0 for items not seen in training
        """
        # default=0 covers a model fitted on an empty frame
        max_pop = max(self.item_popularity.values(), default=0)
        
        predictions = [
            self.item_popularity.get(item, 0) / max_pop if max_pop else 0.0
            for item in item_ids
        ]
        
        return np.array(predictions, dtype=float)

class RandomRecommender:
    """
    Random recommender system that recommends items at random
    """
    
    def __init__(self, random_state=None):
        self.random_state = random_state
        # Private generator, so results depend only on random_state and not
        # on the global NumPy seed
        self.rng = np.random.RandomState(random_state)
        # Unique item IDs seen in training; None until fit() is called
        self.items = None
    
    def fit(self, train_df):
        """
        Fit the recommender model to training data
        
        Parameters:
        -----------
        train_df : pandas.DataFrame
            DataFrame with columns: user_id, item_id, rating
            
        Returns:
        --------
        RandomRecommender
            The fitted recommender (self)
        """
        # Store the list of all items
        self.items = train_df['item_id'].unique()
        return self
    
    def recommend(self, user_id, train_user_items, n=10):
        """
        Recommend items for a user
        
        Parameters:
        -----------
        user_id : int or str
            User ID
        train_user_items : dict
            Dictionary mapping user_id to list of (item_id, rating) tuples of items they've rated
        n : int
            Number of recommendations to return
            
        Returns:
        --------
        list
            List of at most n item IDs the user has not rated, drawn without
            replacement
        """
        # A set keeps each "already rated?" check O(1)
        rated_items = {item for item, _ in train_user_items.get(user_id, [])}
        
        # Get candidate items (those not yet rated by the user)
        candidate_items = [item for item in self.items if item not in rated_items]
        
        # If there are no candidates or fewer than requested, return what we have
        if len(candidate_items) <= n:
            return candidate_items
        
        # Randomly select n items; convert to a list so both branches return
        # the same type
        return self.rng.choice(candidate_items, size=n, replace=False).tolist()

    def predict(self, user_ids, item_ids):
        """
        Predict scores for given user-item pairs
        
        Parameters:
        -----------
        user_ids : array-like
            User IDs (only the length is used)
        item_ids : array-like
            Item IDs (unused, kept for a uniform recommender interface)
            
        Returns:
        --------
        numpy.ndarray
            One uniform random value in [0, 1) per user ID
        """
        # For random recommender, we just return random predictions
        return self.rng.rand(len(user_ids))

class MatrixFactorizationRecommender:
    """
    Matrix Factorization recommender using Surprise's SVD algorithm
    """
    
    def __init__(self, n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02, random_state=None):
        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.lr_all = lr_all
        self.reg_all = reg_all
        self.random_state = random_state
        # All hyper-parameters are forwarded unchanged to Surprise's SVD
        self.model = SVD(
            n_factors=n_factors,
            n_epochs=n_epochs,
            lr_all=lr_all,
            reg_all=reg_all,
            random_state=random_state
        )
    
    def fit(self, trainset):
        """
        Fit the recommender model to training data
        
        Parameters:
        -----------
        trainset : surprise.Trainset
            Surprise Trainset object
            
        Returns:
        --------
        MatrixFactorizationRecommender
            The fitted recommender (self)
        """
        self.model.fit(trainset)
        return self
    
    def recommend(self, user_id, train_user_items, n=10):
        """
        Recommend items for a user
        
        Parameters:
        -----------
        user_id : int or str
            User ID
        train_user_items : dict
            Dictionary mapping user_id to list of (item_id, rating) tuples of items they've rated
        n : int
            Number of recommendations to return
            
        Returns:
        --------
        list
            List of top n recommended item IDs, highest estimate first
        """
        # A set keeps each "already rated?" check O(1)
        rated_items = {item for item, _ in train_user_items.get(user_id, [])}
        
        # Walk the trainset's raw item IDs in their stored order. Going through
        # a set here would make the order among equal estimates (common once
        # estimates are clipped to the rating scale) vary between runs.
        candidate_items = [
            item for item in self.model.trainset._raw2inner_id_items
            if item not in rated_items
        ]
        
        # Calculate predictions for all candidate items
        predictions = [(item, self.model.predict(user_id, item).est) for item in candidate_items]
        
        # Stable sort by predicted rating: ties keep the candidate order
        predictions.sort(key=lambda x: x[1], reverse=True)
        
        # Return top n items
        return [item for item, _ in predictions[:n]]

    def predict(self, user_ids, item_ids):
        """
        Predict ratings for given user-item pairs
        
        Parameters:
        -----------
        user_ids : array-like
            User IDs
        item_ids : array-like
            Item IDs, paired position by position with user_ids
            
        Returns:
        --------
        numpy.ndarray
            Predicted ratings
        """
        predictions = []
        for user, item in zip(user_ids, item_ids):
            try:
                pred = self.model.predict(user, item).est
                predictions.append(pred)
            except Exception:
                # Deliberate best-effort: if the model cannot produce an
                # estimate for this pair, fall back to the global mean
                predictions.append(self.model.trainset.global_mean)
        
        return np.array(predictions)

class ItemKNNRecommender:
    """
    Item-based K-Nearest Neighbors recommender using Surprise
    """
    
    def __init__(self, k=20, min_k=1, sim_options=None, random_state=None):
        if sim_options is None:
            # Default similarity options
            sim_options = {
                'name': 'cosine',
                'user_based': False  # Item-based similarity
            }
        else:
            # Work on a copy so the caller's dict is never modified, and keep
            # the recommender item-based unless the caller explicitly asks
            # otherwise (Surprise itself defaults a missing 'user_based' to True).
            sim_options = dict(sim_options)
            sim_options.setdefault('user_based', False)
        
        self.k = k
        self.min_k = min_k
        self.sim_options = sim_options
        self.random_state = random_state
        # NOTE(review): KNNBasic looks deterministic, so random_state is
        # presumably ignored by Surprise; it is kept for interface parity
        # with the other recommenders - confirm.
        self.model = KNNBasic(
            k=k,
            min_k=min_k,
            sim_options=sim_options,
            random_state=random_state
        )
    
    def fit(self, trainset):
        """
        Fit the recommender model to training data
        
        Parameters:
        -----------
        trainset : surprise.Trainset
            Surprise Trainset object
            
        Returns:
        --------
        ItemKNNRecommender
            The fitted recommender (self)
        """
        self.model.fit(trainset)
        return self
    
    def recommend(self, user_id, train_user_items, n=10):
        """
        Recommend items for a user
        
        Parameters:
        -----------
        user_id : int or str
            User ID
        train_user_items : dict
            Dictionary mapping user_id to list of (item_id, rating) tuples of items they've rated
        n : int
            Number of recommendations to return
            
        Returns:
        --------
        list
            List of top n recommended item IDs, highest estimate first
        """
        # A set keeps each "already rated?" check O(1)
        rated_items = {item for item, _ in train_user_items.get(user_id, [])}
        
        # Walk the trainset's raw item IDs in their stored order. Going through
        # a set here would make the order among equal estimates vary between
        # runs.
        candidate_items = [
            item for item in self.model.trainset._raw2inner_id_items
            if item not in rated_items
        ]
        
        # Calculate predictions for all candidate items
        predictions = [(item, self.model.predict(user_id, item).est) for item in candidate_items]
        
        # Stable sort by predicted rating: ties keep the candidate order
        predictions.sort(key=lambda x: x[1], reverse=True)
        
        # Return top n items
        return [item for item, _ in predictions[:n]]

    def predict(self, user_ids, item_ids):
        """
        Predict ratings for given user-item pairs
        
        Parameters:
        -----------
        user_ids : array-like
            User IDs
        item_ids : array-like
            Item IDs, paired position by position with user_ids
            
        Returns:
        --------
        numpy.ndarray
            Predicted ratings
        """
        predictions = []
        for user, item in zip(user_ids, item_ids):
            try:
                pred = self.model.predict(user, item).est
                predictions.append(pred)
            except Exception:
                # Deliberate best-effort: if the model cannot produce an
                # estimate for this pair, fall back to the global mean
                predictions.append(self.model.trainset.global_mean)
        
        return np.array(predictions)

# =============================================================================
# Evaluation Metrics
# =============================================================================

# Rating Prediction Metrics

def rmse(y_true, y_pred):
    """
    Root Mean Square Error between actual and predicted ratings
    
    Parameters:
    -----------
    y_true : array-like
        Actual ratings
    y_pred : array-like
        Predicted ratings, aligned with y_true
        
    Returns:
    --------
    float
        Square root of the mean squared error
    """
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)

def mae(y_true, y_pred):
    """
    Mean Absolute Error between actual and predicted ratings
    
    Parameters:
    -----------
    y_true : array-like
        Actual ratings
    y_pred : array-like
        Predicted ratings, aligned with y_true
        
    Returns:
    --------
    float
        Mean of the absolute differences
    """
    absolute_error_mean = mean_absolute_error(y_true, y_pred)
    return absolute_error_mean

# Ranking Metrics

def precision_at_k(recommended_items, relevant_items, k=10):
    """
    Calculate Precision@k
    
    When fewer than k items were recommended, precision is computed over the
    items that are available (k is reduced to the list length).
    
    Parameters:
    -----------
    recommended_items : list
        List of recommended item IDs, ordered by relevance
    relevant_items : list or set
        Set of item IDs that are actually relevant
    k : int
        Number of top recommendations to consider
        
    Returns:
    --------
    float
        Precision@k value; 0.0 when nothing was recommended or k <= 0
    """
    # Ensure k doesn't exceed the length of recommended items
    k = min(k, len(recommended_items))
    
    # Covers an empty recommendation list as well as k == 0 / negative k,
    # which would otherwise divide by zero or by a negative number
    if k <= 0:
        return 0.0
    
    # Convert relevant_items to set for O(1) lookups
    relevant_items_set = set(relevant_items)
    
    # Count relevant items in top-k recommendations
    hits = sum(1 for item in recommended_items[:k] if item in relevant_items_set)
    
    return hits / k

def recall_at_k(recommended_items, relevant_items, k=10):
    """
    Calculate Recall@k
    
    Both sides are treated as sets: a relevant item listed twice counts once
    in the denominator, and a relevant item recommended twice counts as one
    hit, so the result always lies in [0, 1].
    
    Parameters:
    -----------
    recommended_items : list
        List of recommended item IDs, ordered by relevance
    relevant_items : list or set
        Set of item IDs that are actually relevant
    k : int
        Number of top recommendations to consider
        
    Returns:
    --------
    float
        Recall@k value; 0.0 when there are no relevant items, nothing was
        recommended, or k <= 0
    """
    # Convert relevant_items to set for O(1) lookups and a duplicate-free count
    relevant_items_set = set(relevant_items)
    if not relevant_items_set:
        return 0.0
    
    # Ensure k doesn't exceed the length of recommended items
    k = min(k, len(recommended_items))
    if k <= 0:
        return 0.0
    
    # Distinct relevant items found in the top-k recommendations
    hits = len(relevant_items_set.intersection(recommended_items[:k]))
    
    return hits / len(relevant_items_set)

def f1_at_k(recommended_items, relevant_items, k=10):
    """
    F1-score@k: harmonic mean of Precision@k and Recall@k
    
    Parameters:
    -----------
    recommended_items : list
        List of recommended item IDs, ordered by relevance
    relevant_items : list or set
        Set of item IDs that are actually relevant
    k : int
        Number of top recommendations to consider
        
    Returns:
    --------
    float
        F1-score@k value; 0.0 when precision and recall are both zero
    """
    precision = precision_at_k(recommended_items, relevant_items, k)
    recall = recall_at_k(recommended_items, relevant_items, k)
    
    # The harmonic mean is undefined when both parts are zero
    total = precision + recall
    if total == 0:
        return 0.0
    
    return 2 * (precision * recall) / total

def average_precision_at_k(recommended_items, relevant_items, k=10):
    """
    Calculate Average Precision at k for a single user
    
    The precision at each rank where a relevant item appears is summed and
    divided by min(number of relevant items, k), where k has first been
    reduced to the number of recommended items.
    
    Parameters:
    -----------
    recommended_items : list
        List of recommended item IDs, ordered by relevance
    relevant_items : list or set
        Set of item IDs that are actually relevant
    k : int
        Number of top recommendations to consider
        
    Returns:
    --------
    float
        Average Precision@k value; 0.0 when there are no relevant items,
        nothing was recommended, or k <= 0
    """
    # Convert relevant_items to set for O(1) lookups
    relevant_items_set = set(relevant_items)
    if not relevant_items_set:
        return 0.0
    
    # Ensure k doesn't exceed the length of recommended items
    k = min(k, len(recommended_items))
    
    # With nothing to inspect the normaliser below would be zero
    if k <= 0:
        return 0.0
    
    hits = 0
    sum_precisions = 0.0
    
    for rank, item in enumerate(recommended_items[:k], start=1):
        if item in relevant_items_set:
            hits += 1
            # Precision at current hit position
            sum_precisions += hits / rank
    
    return sum_precisions / min(len(relevant_items_set), k)

def mean_average_precision(recommendations_by_user, relevant_items_by_user, k=10):
    """
    Mean Average Precision at k over all users
    
    Only users that appear in both mappings and have at least one relevant
    item contribute to the mean.
    
    Parameters:
    -----------
    recommendations_by_user : dict
        Dictionary mapping user IDs to their ordered list of recommended items
    relevant_items_by_user : dict
        Dictionary mapping user IDs to a list of (item_id, rating) tuples;
        only the item IDs are used
    k : int
        Number of top recommendations to consider
        
    Returns:
    --------
    float
        MAP@k value, or 0 when no user qualifies
    """
    per_user_scores = []
    
    for user_id in recommendations_by_user:
        if user_id not in relevant_items_by_user:
            continue
        
        # Drop the ratings, keep the item IDs
        relevant_ids = [item_id for item_id, _rating in relevant_items_by_user[user_id]]
        if not relevant_ids:
            continue
        
        recommended = recommendations_by_user[user_id]
        per_user_scores.append(average_precision_at_k(recommended, relevant_ids, k))
    
    if not per_user_scores:
        return 0
    return np.mean(per_user_scores)

def _discounted_cumulative_gain(relevances):
    """Return the DCG of *relevances*, given in ranked order (best rank first)."""
    # enumerate() is 0-based while the DCG discount uses log2(rank + 1) with a
    # 1-based rank, hence the "+ 2".
    return sum((2 ** rel - 1) / np.log2(position + 2)
               for position, rel in enumerate(relevances))


def ndcg_at_k(recommended_items, relevant_items_with_ratings, k=10):
    """
    Calculate normalized Discounted Cumulative Gain (nDCG) at k
    
    The gain of an item is 2**rating - 1 and items that are not listed in
    relevant_items_with_ratings have a rating of 0. The ideal DCG is always
    computed over the k highest ratings, even when fewer than k items were
    recommended, so a short recommendation list is not rewarded with an
    artificially small normaliser.
    
    Parameters:
    -----------
    recommended_items : list
        List of recommended item IDs, ordered by predicted relevance
    relevant_items_with_ratings : list of tuples
        List of (item_id, rating) tuples representing relevant items and their ratings
    k : int
        Number of top recommendations to consider
        
    Returns:
    --------
    float
        nDCG@k value; 0.0 when there are no relevant items or when the ideal
        DCG is zero (e.g. every rating is 0)
    """
    if not relevant_items_with_ratings:
        return 0.0
    
    # Map item_id -> rating for O(1) lookup of each recommended item's gain
    relevance_scores = {item: rating for item, rating in relevant_items_with_ratings}
    
    # Slicing already copes with lists shorter than k, so k itself is left
    # untouched and can be reused for the ideal ranking below.
    recommended_relevances = [relevance_scores.get(item, 0) for item in recommended_items[:k]]
    dcg = _discounted_cumulative_gain(recommended_relevances)
    
    # Ideal ranking: the k best ratings in descending order
    ideal_relevances = sorted(
        (rating for _, rating in relevant_items_with_ratings), reverse=True
    )[:k]
    idcg = _discounted_cumulative_gain(ideal_relevances)
    
    if idcg == 0:
        return 0.0
    
    return dcg / idcg

def mean_ndcg(recommendations_by_user, relevant_items_by_user, k=10):
    """
    Average the per-user nDCG@k over all users that can be scored
    
    A user is scored only when it appears in both dictionaries and its list
    of relevant items is non-empty.
    
    Parameters:
    -----------
    recommendations_by_user : dict
        Maps each user ID to that user's ordered list of recommended items
    relevant_items_by_user : dict
        Maps each user ID to a list of (item_id, rating) tuples
    k : int
        Cut-off rank forwarded to ndcg_at_k
        
    Returns:
    --------
    float
        Mean nDCG@k value, or 0 when no user can be scored
    """
    per_user_scores = []
    
    for user_id, ranked_items in recommendations_by_user.items():
        if user_id not in relevant_items_by_user:
            continue
        
        rated_items = relevant_items_by_user[user_id]
        if not rated_items:
            continue
        
        per_user_scores.append(ndcg_at_k(ranked_items, rated_items, k))
    
    if not per_user_scores:
        return 0
    return np.mean(per_user_scores)

def hit_rate_at_k(recommendations_by_user, relevant_items_by_user, k=10):
    """
    Calculate Hit Rate at k
    
    A user counts as a hit when at least one of its top-k recommendations is
    among its relevant items. Every user present in both dictionaries is
    counted in the denominator, including users whose relevant list is empty.
    
    Parameters:
    -----------
    recommendations_by_user : dict
        Dictionary mapping user IDs to their ordered list of recommended items
    relevant_items_by_user : dict
        Dictionary mapping user IDs to an iterable of (item_id, rating)
        tuples; only the item IDs are used
    k : int
        Number of top recommendations to consider
        
    Returns:
    --------
    float
        Hit Rate@k value, or 0 when no user appears in both dictionaries
    """
    hits = 0
    total_users = 0
    
    for user, recommendations in recommendations_by_user.items():
        if user not in relevant_items_by_user:
            continue
            
        total_users += 1
        # A set gives O(1) membership tests instead of scanning a list for
        # every recommended item.
        relevant_items = {item for item, _ in relevant_items_by_user[user]}
        
        if any(item in relevant_items for item in recommendations[:k]):
            hits += 1
    
    return hits / total_users if total_users > 0 else 0

# Beyond-Accuracy Metrics

def catalog_coverage_at_k(recommendations_by_user, total_items, k=10):
    """
    Calculate Catalog Coverage at k
    
    Coverage is the number of distinct items that appear in any user's top-k
    list divided by the catalogue size.
    
    Parameters:
    -----------
    recommendations_by_user : dict
        Dictionary mapping user IDs to their ordered list of recommended items
    total_items : int or set
        Total number of items (Python or NumPy integer) or a sized collection
        of all item IDs
    k : int
        Number of top recommendations to consider
        
    Returns:
    --------
    float
        Catalog Coverage@k value; 0.0 when the catalogue is empty
    """
    all_recommended = set()
    
    for recommendations in recommendations_by_user.values():
        all_recommended.update(recommendations[:k])
    
    # NumPy integer scalars are not instances of int, so accept them
    # explicitly instead of letting them fall through to len().
    if isinstance(total_items, (int, np.integer)):
        catalog_size = int(total_items)
    else:
        catalog_size = len(total_items)
    
    # An empty catalogue cannot be covered; avoid dividing by zero
    if catalog_size <= 0:
        return 0.0
    
    return len(all_recommended) / catalog_size

def diversity_at_k(recommendations_by_user, item_features, k=10):
    """
    Calculate average intra-list diversity at k
    
    For every user the cosine distance is computed between each pair of
    top-k items that have a feature vector; the user's diversity is the mean
    of those distances and the result is the mean over users. Pairs whose
    distance cannot be computed (incompatible vectors) or is undefined
    (not finite, e.g. when a vector is all zeros) are skipped, as are users
    with no usable pair.
    
    Parameters:
    -----------
    recommendations_by_user : dict
        Dictionary mapping user IDs to their ordered list of recommended items
    item_features : dict
        Dictionary mapping item IDs to their feature vectors
    k : int
        Number of top recommendations to consider
        
    Returns:
    --------
    float
        Average diversity at k, or 0 when no user has a usable pair
    """
    diversity_scores = []
    
    for recommendations in recommendations_by_user.values():
        # Keep only the items we have features for, in ranked order
        vectors = [item_features[item] for item in recommendations[:k]
                   if item in item_features]
        
        distances = []
        for idx, vec_a in enumerate(vectors):
            for vec_b in vectors[idx + 1:]:
                try:
                    # A zero vector makes the distance 0/0; silence the
                    # warning here and filter the non-finite result below.
                    with np.errstate(divide='ignore', invalid='ignore'):
                        dist = cosine(vec_a, vec_b)
                except (ValueError, TypeError, ZeroDivisionError):
                    # Mismatched lengths or non-numeric features: skip the pair
                    continue
                
                if np.isfinite(dist):
                    distances.append(dist)
        
        if distances:
            diversity_scores.append(np.mean(distances))
    
    return np.mean(diversity_scores) if diversity_scores else 0

def novelty_at_k(recommendations_by_user, item_popularity, total_users, k=10):
    """
    Compute the mean self-information ("novelty") of the top-k recommendations
    
    Each recommended item with a known popularity contributes
    -log2(popularity / total_users), or 0 when that ratio is not positive.
    Items missing from item_popularity are ignored. The per-item values are
    averaged per user and the per-user means are averaged again.
    
    Parameters:
    -----------
    recommendations_by_user : dict
        Maps each user ID to that user's ordered list of recommended items
    item_popularity : dict
        Maps item IDs to their popularity (number of ratings)
    total_users : int
        Total number of users in the system
    k : int
        Number of top recommendations to consider
        
    Returns:
    --------
    float
        Average novelty at k, or 0 when no user has a scorable item
    """
    per_user_novelty = []
    
    for ranked_items in recommendations_by_user.values():
        shortlist = ranked_items[:k]
        if not shortlist:
            continue
        
        self_information = []
        for item in shortlist:
            if item not in item_popularity:
                continue
            
            # Fraction of users that interacted with the item
            share = item_popularity[item] / total_users
            if share > 0:
                self_information.append(-np.log2(share))
            else:
                self_information.append(0)
        
        if self_information:
            per_user_novelty.append(np.mean(self_information))
    
    if not per_user_novelty:
        return 0
    return np.mean(per_user_novelty)

def evaluate_recommendations(recommendations_by_user, relevant_items_by_user, item_features=None, 
                           item_popularity=None, total_users=None, total_items=None, k_values=None):
    """
    Score a set of recommendations with every available metric at each k
    
    Precision, recall, F1, MAP, nDCG and hit rate are always reported.
    Coverage, diversity and novelty are reported only when the optional
    inputs they need are supplied.
    
    Parameters:
    -----------
    recommendations_by_user : dict
        Maps each user ID to that user's ordered list of recommended items
    relevant_items_by_user : dict
        Maps each user ID to a list of (item_id, rating) tuples
    item_features : dict, optional
        Maps item IDs to feature vectors (enables 'diversity')
    item_popularity : dict, optional
        Maps item IDs to their popularity (enables 'novelty' together with
        total_users)
    total_users : int, optional
        Total number of users (enables 'novelty' together with item_popularity)
    total_items : int or set, optional
        Total number of items or set of all item IDs (enables 'coverage')
    k_values : list, optional
        Cut-off ranks to evaluate at (default: [5, 10, 20])
        
    Returns:
    --------
    dict
        Maps each k to a dictionary of metric name -> value
    """
    if k_values is None:
        k_values = [5, 10, 20]
    
    def _average(values):
        # Mean of the collected per-user values, 0 when nothing was collected
        return np.mean(values) if values else 0
    
    results = {}
    
    for k in k_values:
        precisions, recalls, f1_scores = [], [], []
        
        # Per-user set metrics; users without any relevant item are skipped
        for user, recommendations in recommendations_by_user.items():
            if user not in relevant_items_by_user:
                continue
            
            relevant_items = [item for item, _ in relevant_items_by_user[user]]
            if not relevant_items:
                continue
            
            precisions.append(precision_at_k(recommendations, relevant_items, k))
            recalls.append(recall_at_k(recommendations, relevant_items, k))
            f1_scores.append(f1_at_k(recommendations, relevant_items, k))
        
        k_results = {
            'precision': _average(precisions),
            'recall': _average(recalls),
            'f1': _average(f1_scores),
            'map': mean_average_precision(recommendations_by_user, relevant_items_by_user, k),
            'ndcg': mean_ndcg(recommendations_by_user, relevant_items_by_user, k),
            'hit_rate': hit_rate_at_k(recommendations_by_user, relevant_items_by_user, k),
        }
        
        # Beyond-accuracy metrics, each gated on its optional inputs
        if total_items is not None:
            k_results['coverage'] = catalog_coverage_at_k(recommendations_by_user, total_items, k)
        
        if item_features is not None:
            k_results['diversity'] = diversity_at_k(recommendations_by_user, item_features, k)
        
        if item_popularity is not None and total_users is not None:
            k_results['novelty'] = novelty_at_k(recommendations_by_user, item_popularity, total_users, k)
        
        results[k] = k_results
    
    return results

def evaluate_rating_prediction(model, testset, format_type='surprise'):
    """
    Measure rating-prediction error (RMSE and MAE) of a model on a test set
    
    Parameters:
    -----------
    model : object
        For 'surprise': an object whose ``model`` attribute exposes
        ``predict(uid, iid)`` returning an object with an ``est`` attribute.
        For 'dataframe': an object whose ``predict`` accepts arrays of user
        and item IDs.
    testset : list or pandas.DataFrame
        For 'surprise': iterable of (user_id, item_id, rating) tuples.
        For 'dataframe': frame with 'user_id', 'item_id' and 'rating' columns.
    format_type : str
        Format of test set ('surprise', 'dataframe')
        
    Returns:
    --------
    dict
        Dictionary with the keys 'rmse' and 'mae'
        
    Raises:
    -------
    ValueError
        If format_type is neither 'surprise' nor 'dataframe'
    """
    if format_type not in ('surprise', 'dataframe'):
        raise ValueError(f"Unknown format type: {format_type}")
    
    if format_type == 'dataframe':
        # Vectorised path: one predict call for the whole frame
        y_true = testset['rating'].values
        y_pred = model.predict(testset['user_id'].values, testset['item_id'].values)
    else:
        # Surprise path: one prediction per (user, item, rating) triple
        y_true, y_pred = [], []
        for uid, iid, true_rating in testset:
            estimate = model.model.predict(uid, iid).est
            y_true.append(true_rating)
            y_pred.append(estimate)
    
    return {
        'rmse': rmse(y_true, y_pred),
        'mae': mae(y_true, y_pred)
    }

def run_experiment(models, dataset, k_values=None, implicit=False, threshold=None,
                   item_features=None):
    """
    Run a comprehensive experiment with multiple models and evaluation metrics
    
    For every model, recommendations are generated for each user that has
    test items, timed, and then scored with evaluate_recommendations. The
    dataset passed in is never modified: in implicit mode the binarised test
    labels are built as a separate copy, so the function can be called
    repeatedly on the same dataset.
    
    Parameters:
    -----------
    models : list of tuples
        List of (name, model) tuples; each model must provide
        ``recommend(user, train_user_items, n=...)``
    dataset : dict
        Dataset from prepare_dataset function. The keys read here are
        'train_df', 'test_df', 'train_user_items' and 'test_user_items'.
    k_values : list, optional
        List of k values to evaluate at (default: [5, 10, 20])
    implicit : bool
        Whether the data should be treated as implicit feedback
    threshold : float, optional
        Threshold for implicit conversion; defaults to the median test rating
    item_features : dict, optional
        Dictionary mapping item IDs to feature vectors. When given, it is
        forwarded to evaluate_recommendations so that diversity is reported.
        
    Returns:
    --------
    dict
        Maps each model name to a dictionary with the keys 'name',
        'recommend_time', 'ranking' and 'evaluate_time'
    """
    if k_values is None:
        k_values = [5, 10, 20]
    
    results = {}
    
    # Compute item popularity (for novelty calculation)
    item_popularity = dataset['train_df']['item_id'].value_counts().to_dict()
    total_users = len(dataset['train_df']['user_id'].unique())
    total_items = len(dataset['train_df']['item_id'].unique())
    
    # Explicit data: use the test ratings as they are.
    # Implicit data: binarise them against the threshold, on a copy so the
    # caller's dataset keeps its original ratings.
    test_user_items = dataset['test_user_items']
    if implicit:
        if threshold is None:
            threshold = dataset['test_df']['rating'].median()
        
        # NOTE(review): below-threshold items stay in the list with label 0.0.
        # Metrics that only look at item IDs still treat them as relevant —
        # confirm this is intended.
        test_user_items = {
            user: [(item, 1.0 if rating >= threshold else 0.0)
                   for item, rating in rated_items]
            for user, rated_items in test_user_items.items()
        }
    
    # Users to evaluate (those with items in the test set)
    test_users = list(test_user_items.keys())
    
    # Longest list any k needs; loop-invariant, so computed once
    n_recommendations = max(k_values, default=0)
    
    for model_name, model in models:
        print(f"\nEvaluating {model_name}...")
        model_results = {'name': model_name}
        
        recommendations = {}
        
        start_time = time.time()
        
        # Best effort: a failure for one user is reported and that user is
        # simply left out of the evaluation.
        for user in test_users:
            try:
                recs = model.recommend(user, dataset['train_user_items'], n=n_recommendations)
                recommendations[user] = recs
            except Exception as e:
                print(f"  Error generating recommendations for user {user}: {e}")
        
        recommend_time = time.time() - start_time
        model_results['recommend_time'] = recommend_time
        print(f"  Generated recommendations in {recommend_time:.2f} seconds")
        
        # Evaluate recommendations
        start_time = time.time()
        model_results['ranking'] = evaluate_recommendations(
            recommendations,
            test_user_items,
            item_features=item_features,
            item_popularity=item_popularity,
            total_users=total_users,
            total_items=total_items,
            k_values=k_values
        )
        
        evaluate_time = time.time() - start_time
        model_results['evaluate_time'] = evaluate_time
        print(f"  Evaluated recommendations in {evaluate_time:.2f} seconds")
        
        # Add to results
        results[model_name] = model_results
    
    return results

# =============================================================================
# Visualization Functions
# =============================================================================

def plot_metric_by_k(results, metric, k_values=None, title=None, figsize=(10, 6)):
    """
    Draw one line per model showing how a metric changes with k
    
    Parameters:
    -----------
    results : dict
        Results dictionary from run_experiment
    metric : str
        Name of the metric to plot; a model missing it is plotted as 0
    k_values : list, optional
        k values to include (default: the sorted k values of the first model)
    title : str, optional
        Plot title (default: '<Metric> by k')
    figsize : tuple, optional
        Figure size
    """
    if k_values is None:
        # Fall back to the k values recorded for the first model
        reference = next(iter(results.values()))
        k_values = sorted(reference['ranking'].keys())
    
    plt.figure(figsize=figsize)
    
    for label, outcome in results.items():
        series = [outcome['ranking'][k].get(metric, 0) for k in k_values]
        plt.plot(k_values, series, marker='o', label=label)
    
    plt.xlabel('k (Number of recommendations)')
    plt.ylabel(metric.capitalize())
    plt.title(title if title else f'{metric.capitalize()} by k')
    
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend()
    plt.tight_layout()
    plt.show()

def plot_metrics_comparison(results, k=10, metrics=None, title=None, figsize=(12, 8)):
    """
    Plot a bar chart comparing different metrics across models
    
    The bars are drawn on an explicitly created axes. Calling ``df.plot``
    without ``ax`` after ``plt.figure`` would make pandas open a second
    figure and leave the first one empty.
    
    Parameters:
    -----------
    results : dict
        Results dictionary from run_experiment
    k : int
        K value to use for comparison
    metrics : list, optional
        List of metrics to include (default: the metrics recorded at k for
        the first model); a model missing a metric is plotted as 0
    title : str, optional
        Plot title (default: 'Metrics Comparison (k=<k>)')
    figsize : tuple, optional
        Figure size
    """
    if metrics is None:
        # Get metrics from the first model's results
        first_model = next(iter(results.values()))
        metrics = list(first_model['ranking'][k].keys())
    
    # One row per model, one column per metric
    model_names = list(results)
    data = [
        [model_results['ranking'][k].get(metric, 0) for metric in metrics]
        for model_results in results.values()
    ]
    df = pd.DataFrame(data, index=model_names, columns=metrics)
    
    # Create the figure once and hand its axes to pandas
    _, ax = plt.subplots(figsize=figsize)
    df.plot(kind='bar', ax=ax)
    
    ax.set_xlabel('Model')
    ax.set_ylabel('Value')
    ax.set_title(title if title else f'Metrics Comparison (k={k})')
    
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.legend(title='Metric')
    plt.tight_layout()
    plt.show()

def plot_radar_chart(results, k=10, metrics=None, title=None, figsize=(10, 10)):
    """
    Draw a radar (polar) chart comparing the models on several metrics
    
    Each metric is divided by the largest value any model reached for it, so
    the best model on a metric sits at radius 1. Metrics whose maximum is not
    positive are plotted unscaled.
    
    Parameters:
    -----------
    results : dict
        Results dictionary from run_experiment
    k : int
        K value to use for comparison
    metrics : list, optional
        Metrics to include (default: the metrics recorded at k for the first
        model); a model missing a metric is treated as 0
    title : str, optional
        Plot title (default: 'Model Comparison (k=<k>)')
    figsize : tuple, optional
        Figure size
    """
    if metrics is None:
        reference = next(iter(results.values()))
        metrics = list(reference['ranking'][k].keys())
    
    def _score(outcome, metric):
        # Value of one metric for one model at the chosen k (0 if absent)
        return outcome['ranking'][k].get(metric, 0)
    
    # Per-metric scale: the best value across models
    scale = {}
    for metric in metrics:
        observed = [_score(outcome, metric) for outcome in results.values()]
        scale[metric] = max(observed) if observed else 1
    
    # One spoke per metric; repeat the first angle to close the polygon
    spoke_count = len(metrics)
    angles = np.linspace(0, 2*np.pi, spoke_count, endpoint=False).tolist()
    angles += angles[:1]
    
    fig, ax = plt.subplots(figsize=figsize, subplot_kw=dict(polar=True))
    
    for label, outcome in results.items():
        radii = []
        for metric in metrics:
            raw = _score(outcome, metric)
            radii.append(raw / scale[metric] if scale[metric] > 0 else raw)
        
        radii += radii[:1]  # Close the polygon
        
        ax.plot(angles, radii, linewidth=2, label=label)
        ax.fill(angles, radii, alpha=0.1)
    
    # The closing angle duplicates the first one, so it gets no tick
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(metrics)
    
    ax.legend(loc='upper right')
    plt.title(title if title else f'Model Comparison (k={k})')
    
    plt.tight_layout()
    plt.show()

def plot_metrics_heatmap(results, k=10, metrics=None, title=None, figsize=(12, 8)):
    """
    Draw an annotated heatmap with one row per model and one column per metric
    
    Parameters:
    -----------
    results : dict
        Results dictionary from run_experiment
    k : int
        K value to use for comparison
    metrics : list, optional
        Metrics to include (default: the metrics recorded at k for the first
        model); a model missing a metric is shown as 0
    title : str, optional
        Plot title (default: 'Metrics Comparison Heatmap (k=<k>)')
    figsize : tuple, optional
        Figure size
    """
    if metrics is None:
        reference = next(iter(results.values()))
        metrics = list(reference['ranking'][k].keys())
    
    # Build the model x metric table
    row_labels = list(results.keys())
    rows = [
        [outcome['ranking'][k].get(metric, 0) for metric in metrics]
        for outcome in results.values()
    ]
    table = pd.DataFrame(rows, index=row_labels, columns=metrics)
    
    plt.figure(figsize=figsize)
    sns.heatmap(table, annot=True, cmap='viridis', fmt='.3f', linewidths=.5)
    
    plt.title(title if title else f'Metrics Comparison Heatmap (k={k})')
        
    plt.tight_layout()
    plt.show()

def plot_results_summary(results, k=10, metrics=None):
    """
    Produce the full set of result plots in one call
    
    Draws one metric-by-k line chart per metric, followed by the bar chart,
    the radar chart and the heatmap for the chosen k.
    
    Parameters:
    -----------
    results : dict
        Results dictionary from run_experiment
    k : int
        K value to use for the bar chart, radar chart and heatmap
    metrics : list, optional
        Metrics to include (default: the metrics recorded at k for the first
        model)
    """
    if metrics is None:
        reference = next(iter(results.values()))
        metrics = list(reference['ranking'][k].keys())
    
    # One line chart per metric across all k values
    for metric in metrics:
        plot_metric_by_k(results, metric, title=f'{metric.capitalize()} by k')
    
    # Cross-metric comparisons at the chosen k: bars, radar, heatmap
    for draw in (plot_metrics_comparison, plot_radar_chart, plot_metrics_heatmap):
        draw(results, k, metrics)

# =============================================================================
# Main Function
# =============================================================================

def main(args):
    """
    Main function to run the script
    
    Downloads the requested dataset, splits it, fits the requested models,
    evaluates them with run_experiment and plots a summary of the results.
    
    Parameters:
    -----------
    args : argparse.Namespace
        Command line arguments. The attributes read here are dataset, size,
        data_dir, test_size, implicit, threshold, models, factors, epochs,
        neighbors and k_values.
        
    Raises:
    -------
    ValueError
        If args.dataset is not one of 'movielens', 'lastfm' or 'amazon'
    """
    print("\n" + "="*80)
    print(f"Recommender System Evaluation - Dataset: {args.dataset}, Size: {args.size}")
    print("="*80 + "\n")
    
    # Create data directory if it doesn't exist
    os.makedirs(args.data_dir, exist_ok=True)
    
    # Download dataset
    if args.dataset == 'movielens':
        ratings, items = download_movielens(args.size, args.data_dir)
    elif args.dataset == 'lastfm':
        ratings, items = download_lastfm(args.data_dir)
    elif args.dataset == 'amazon':
        ratings, items = download_amazon_reviews(args.size, args.data_dir)
    else:
        raise ValueError(f"Unknown dataset: {args.dataset}")
    
    if ratings is None:
        print("Failed to load dataset. Exiting.")
        return
    
    # Prepare dataset (implicit or explicit)
    dataset = prepare_dataset(ratings, args.test_size, args.implicit, args.threshold)
    
    # Create item features dict if items dataframe is available
    # NOTE(review): item_features is built here but run_experiment() is called
    # below without it, so diversity is not part of the reported metrics.
    item_features = None
    if items is not None and 'genres' in items.columns:
        # Genres are '|'-separated strings; encode each item as a binary
        # vector with one position per distinct genre.
        genres = set()
        for genre_list in items['genres']:
            genres.update(genre_list.split('|'))
        
        # Fix the genre -> position mapping once instead of re-sorting the
        # genre set for every item.
        genre_index = {genre: position for position, genre in enumerate(sorted(genres))}
        
        item_features = {}
        for _, row in items.iterrows():
            feature_vector = np.zeros(len(genre_index))
            for genre in row['genres'].split('|'):
                feature_vector[genre_index[genre]] = 1
            item_features[row['item_id']] = feature_vector
    
    # Initialize models
    models = []
    
    # Add requested models
    if 'popularity' in args.models:
        popularity_model = PopularityRecommender()
        popularity_model.fit(dataset['train_df'])
        models.append(('Popularity', popularity_model))
    
    if 'random' in args.models:
        random_model = RandomRecommender(random_state=42)
        random_model.fit(dataset['train_df'])
        models.append(('Random', random_model))
    
    if 'mf' in args.models:
        mf_model = MatrixFactorizationRecommender(
            n_factors=args.factors,
            n_epochs=args.epochs,
            random_state=42
        )
        mf_model.fit(dataset['train_surprise_full'])
        models.append(('MatrixFactorization', mf_model))
    
    if 'itemknn' in args.models:
        knn_model = ItemKNNRecommender(
            k=args.neighbors,
            random_state=42
        )
        knn_model.fit(dataset['train_surprise_full'])
        models.append(('ItemKNN', knn_model))
    
    # Run experiment
    results = run_experiment(
        models,
        dataset,
        k_values=args.k_values,
        implicit=args.implicit,
        threshold=args.threshold
    )
    
    if not results:
        print("No models were evaluated. Nothing to plot.")
        return
    
    # Summarise at k=10 when it was evaluated; otherwise fall back to the
    # first requested k so the plots never look up a k that has no results.
    summary_k = 10 if 10 in args.k_values else args.k_values[0]
    plot_results_summary(results, k=summary_k)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Recommender System Evaluation Script')
    
    # (flag, add_argument keyword arguments) pairs, registered in this order
    argument_specs = [
        # Dataset parameters
        ('--dataset', dict(type=str, default='movielens',
                           choices=['movielens', 'lastfm', 'amazon'],
                           help='Dataset to use')),
        ('--size', dict(type=str, default='100k',
                        help='Size/variant of the dataset')),
        ('--data_dir', dict(type=str, default='data',
                            help='Directory to store datasets')),
        
        # Experiment parameters
        ('--models', dict(nargs='+', default=['popularity', 'random', 'mf', 'itemknn'],
                          choices=['popularity', 'random', 'mf', 'itemknn'],
                          help='Models to evaluate')),
        ('--k_values', dict(nargs='+', type=int, default=[5, 10, 20],
                            help='K values for evaluation')),
        ('--test_size', dict(type=float, default=0.2,
                             help='Proportion of data to use for testing')),
        ('--implicit', dict(action='store_true',
                            help='Treat as implicit feedback data')),
        ('--threshold', dict(type=float, default=None,
                             help='Threshold for implicit conversion')),
        
        # Model parameters
        ('--factors', dict(type=int, default=100,
                           help='Number of factors for matrix factorization')),
        ('--epochs', dict(type=int, default=20,
                          help='Number of epochs for training')),
        ('--neighbors', dict(type=int, default=20,
                             help='Number of neighbors for KNN')),
    ]
    
    for flag, options in argument_specs:
        parser.add_argument(flag, **options)
    
    args = parser.parse_args()
    
    main(args)



ModuleNotFoundError: No module named 'surprise'