In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

### Training

In [2]:
anime_df = pd.read_csv('anime.csv')
rating_df = pd.read_csv('rating.csv')
rating_df = rating_df[rating_df['rating'] != -1]  # Remove -1 ratings (unknown)
rating_df = rating_df.drop_duplicates(['user_id', 'anime_id'])  # Remove duplicate ratings


In [3]:
class AnimeRecommender:
    def __init__(self, rating_df, anime_df):
        self.rating_df = rating_df
        self.anime_df = anime_df
        
    def preprocess_data(self):
        """Preprocess the data to improve recommendation quality"""
        # Filter users who have rated a minimum number of anime
        user_counts = self.rating_df['user_id'].value_counts()
        valid_users = user_counts[user_counts >= 5].index
        
        # Filter anime that have received a minimum number of ratings
        anime_counts = self.rating_df['anime_id'].value_counts()
        valid_anime = anime_counts[anime_counts >= 10].index
        
        # Filter the ratings dataframe
        filtered_ratings = self.rating_df[
            (self.rating_df['user_id'].isin(valid_users)) & 
            (self.rating_df['anime_id'].isin(valid_anime))
        ]
        
        # Normalize ratings per user
        user_mean = filtered_ratings.groupby('user_id')['rating'].transform('mean')
        user_std = filtered_ratings.groupby('user_id')['rating'].transform('std')
        filtered_ratings['normalized_rating'] = (filtered_ratings['rating'] - user_mean) / user_std.fillna(1)
        
        return filtered_ratings
        
    def train(self, train_data):
        """Train the recommender system using only training data"""
        self.train_df = train_data
        
        # Build matrix using normalized ratings
        self.user_anime_matrix = self.train_df.pivot(
            index='user_id',
            columns='anime_id',
            values='normalized_rating'
        )
        
        # Fill NaN with 0 after pivoting
        self.user_anime_matrix = self.user_anime_matrix.fillna(0)
        
        # Convert to sparse matrix
        self.user_anime_sparse = csr_matrix(self.user_anime_matrix.values)
        
        # Fit the nearest neighbors model with more neighbors
        self.model = NearestNeighbors(
            metric='cosine',
            algorithm='brute',
            n_neighbors=20  # Increased from 10
        )
        self.model.fit(self.user_anime_sparse)
        
        # Store user means for denormalization
        self.user_means = self.train_df.groupby('user_id')['rating'].mean()
        self.user_stds = self.train_df.groupby('user_id')['rating'].std()
    
    def predict_rating(self, user_id, anime_id):
        """Predict rating for a given user-anime pair"""
        try:
            # Find similar users
            user_index = self.user_anime_matrix.index.get_loc(user_id)
            user_vector = self.user_anime_sparse[user_index]
            distances, indices = self.model.kneighbors(user_vector.reshape(1, -1))
            
            # Convert distances to weights
            weights = 1 / (distances.flatten() + 1e-6)
            
            # Get similar users
            similar_users = self.user_anime_matrix.index[indices.flatten()]
            
            # Get ratings from similar users for the target anime
            similar_ratings = self.train_df[
                (self.train_df['user_id'].isin(similar_users)) & 
                (self.train_df['anime_id'] == anime_id)
            ]
            
            if len(similar_ratings) == 0:
                return self.user_means[user_id]
            
            # Calculate weighted average rating
            weighted_sum = 0
            weight_sum = 0
            
            for idx, rating in enumerate(similar_ratings['rating']):
                weighted_sum += rating * weights[idx]
                weight_sum += weights[idx]
            
            if weight_sum == 0:
                return self.user_means[user_id]
                
            predicted_rating = weighted_sum / weight_sum
            
            # Clip predictions to valid range
            return np.clip(predicted_rating, 1, 10)
            
        except Exception as e:
            # Return user's mean rating if prediction fails
            return self.user_means.get(user_id, 5)

def evaluate_recommender(rating_df, anime_df, sample_size=20000, test_size=0.33, random_state=42):
    # Initialize recommender
    recommender = AnimeRecommender(rating_df, anime_df)
    
    # Preprocess the data
    processed_ratings = recommender.preprocess_data()
    
    # Take a sample if specified
    if sample_size and sample_size < len(processed_ratings):
        processed_ratings = processed_ratings.sample(sample_size, random_state=random_state)
    
    # Split the data
    train_df, test_df = train_test_split(
        processed_ratings,
        test_size=test_size,
        random_state=random_state
    )
    
    # Train the recommender
    recommender.train(train_df)
    
    # Predict ratings for the test set
    print("Evaluating recommendations...")
    tqdm.pandas()
    test_df['predicted_rating'] = test_df.progress_apply(
        lambda row: recommender.predict_rating(row['user_id'], row['anime_id']),
        axis=1
    )
    
    # Calculate metrics
    mae = mean_absolute_error(test_df['rating'], test_df['predicted_rating'])
    rmse = np.sqrt(np.mean((test_df['rating'] - test_df['predicted_rating'])**2))
    
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Root Mean Square Error (RMSE): {rmse:.4f}")
    
    return recommender, mae, rmse

In [4]:
class HybridAnimeRecommender:
    def __init__(self, rating_df, anime_df):
        self.rating_df = rating_df
        self.anime_df = anime_df
        
    def preprocess_data(self):
        """Preprocess the data to improve recommendation quality"""
        # Increase minimum ratings thresholds for better reliability
        user_counts = self.rating_df['user_id'].value_counts()
        valid_users = user_counts[user_counts >= 10].index  # Increased from 5
        
        anime_counts = self.rating_df['anime_id'].value_counts()
        valid_anime = anime_counts[anime_counts >= 20].index  # Increased from 10
        
        # Filter the ratings dataframe
        filtered_ratings = self.rating_df[
            (self.rating_df['user_id'].isin(valid_users)) & 
            (self.rating_df['anime_id'].isin(valid_anime))
        ]
        
        # Enhanced normalization with outlier handling
        user_mean = filtered_ratings.groupby('user_id')['rating'].transform('mean')
        user_std = filtered_ratings.groupby('user_id')['rating'].transform('std')
        
        # Handle zero standard deviation case more gracefully
        user_std = user_std.replace(0, 1)
        
        # Clip normalized ratings to reduce impact of extreme values
        filtered_ratings['normalized_rating'] = np.clip(
            (filtered_ratings['rating'] - user_mean) / user_std,
            -3,  # Lower bound: 3 standard deviations
            3    # Upper bound: 3 standard deviations
        )
        
        return filtered_ratings
    
    def train(self, train_data):
        """Train the recommender system using only training data"""
        self.train_df = train_data
        
        # Build user-anime matrix using normalized ratings
        self.user_anime_matrix = self.train_df.pivot(
            index='user_id',
            columns='anime_id',
            values='normalized_rating'
        ).fillna(0)
        
        # Apply TruncatedSVD for dimensionality reduction
        n_components = min(50, min(self.user_anime_matrix.shape) - 1)
        self.svd = TruncatedSVD(n_components=n_components, random_state=42)
        user_anime_reduced = self.svd.fit_transform(self.user_anime_matrix)
        
        self.user_anime_sparse = csr_matrix(user_anime_reduced)
        
        self.model = NearestNeighbors(
            metric='cosine',
            algorithm='brute',
            n_neighbors=30 
        )
        self.model.fit(self.user_anime_sparse)
        
        # Store user statistics
        self.user_means = self.train_df.groupby('user_id')['rating'].mean()
        self.user_stds = self.train_df.groupby('user_id')['rating'].std().fillna(1)
        
        # Enhanced genre processing
        self.anime_df['genre'] = self.anime_df['genre'].fillna('')
        self.vectorizer = TfidfVectorizer(  # Changed from CountVectorizer
            tokenizer=lambda x: x.split(', '),
            min_df=2  # Ignore very rare genres
        )
        self.genre_matrix = self.vectorizer.fit_transform(self.anime_df['genre'])
        
        # Calculate global statistics
        self.global_mean = self.train_df['rating'].mean()
        self.global_std = self.train_df['rating'].std()
    
    def predict_rating(self, user_id, anime_id, genre_weight=0.3):  # Adjusted default weight
        """Predict rating using enhanced hybrid approach"""
        try:
            # 1. Collaborative Filtering Component
            user_index = self.user_anime_matrix.index.get_loc(user_id)
            user_vector = self.user_anime_sparse[user_index]
            distances, indices = self.model.kneighbors(user_vector.reshape(1, -1))
            
            # Enhanced distance-to-weight conversion
            weights = np.exp(-distances.flatten())  # Exponential decay
            similar_users = self.user_anime_matrix.index[indices.flatten()]
            
            similar_ratings = self.train_df[
                (self.train_df['user_id'].isin(similar_users)) & 
                (self.train_df['anime_id'] == anime_id)
            ]
            
            if len(similar_ratings) > 0:
                weighted_sum = np.sum(weights * similar_ratings['rating'].values)
                weight_sum = np.sum(weights)
                cf_rating = weighted_sum / weight_sum if weight_sum > 0 else self.user_means.get(user_id, self.global_mean)
            else:
                cf_rating = self.user_means.get(user_id, self.global_mean)
            
            # 2 Genre-Based Component
            anime_idx = self.anime_df[self.anime_df['anime_id'] == anime_id].index[0]
            user_rated_anime = self.train_df[self.train_df['user_id'] == user_id]
            
            if not user_rated_anime.empty:
                genre_similarities = []
                genre_ratings = []
                
                for _, row in user_rated_anime.iterrows():
                    rated_anime_idx = self.anime_df[self.anime_df['anime_id'] == row['anime_id']].index[0]
                    similarity = cosine_similarity(
                        self.genre_matrix[anime_idx],
                        self.genre_matrix[rated_anime_idx]
                    )[0, 0]
                    
                    
                    genre_similarities.append(similarity)
                    genre_ratings.append(row['rating'])
                
                # Use top-k most similar items only
                top_k = 10
                if len(genre_similarities) > top_k:
                    top_indices = np.argsort(genre_similarities)[-top_k:]
                    genre_similarities = np.array(genre_similarities)[top_indices]
                    genre_ratings = np.array(genre_ratings)[top_indices]
                
                genre_rating = np.average(genre_ratings, weights=genre_similarities)
            else:
                genre_rating = self.global_mean
            
            # 3. Enhanced prediction combination
            confidence_cf = len(similar_ratings) / 10  # Scale factor for CF confidence
            confidence_cb = len(genre_similarities) if 'genre_similarities' in locals() else 0
            confidence_cb = confidence_cb / 10  # Scale factor for CB confidence
            
            # Adjust weights based on confidence
            total_confidence = confidence_cf + confidence_cb
            if total_confidence > 0:
                cf_weight = (1 - genre_weight) * (confidence_cf / total_confidence)
                cb_weight = genre_weight * (confidence_cb / total_confidence)
            else:
                cf_weight = 1 - genre_weight
                cb_weight = genre_weight
            
            predicted_rating = (cf_weight * cf_rating + cb_weight * genre_rating)
            
            # Regression to the mean for low-confidence predictions
            confidence = (confidence_cf + confidence_cb) / 2
            predicted_rating = (confidence * predicted_rating + 
                              (1 - confidence) * self.global_mean)
            
            return np.clip(predicted_rating, 1, 10)
            
        except Exception as e:
            return self.user_means.get(user_id, self.global_mean)

def evaluate_hybrid_recommender(rating_df, anime_df, sample_size=20000, test_size=0.33, genre_weight=1, random_state=42):
    # Initialize recommender
    recommender = HybridAnimeRecommender(rating_df, anime_df)
    
    # Preprocess the data
    processed_ratings = recommender.preprocess_data()
    
    # Take a sample if specified
    if sample_size and sample_size < len(processed_ratings):
        processed_ratings = processed_ratings.sample(sample_size, random_state=random_state)
    
    # Split the data
    train_df, test_df = train_test_split(
        processed_ratings,
        test_size=test_size,
        random_state=random_state
    )
    
    # Train the recommender
    recommender.train(train_df)
    
    # Predict ratings for the test set
    print("Evaluating hybrid recommendations...")
    tqdm.pandas()
    test_df['predicted_rating'] = test_df.progress_apply(
        lambda row: recommender.predict_rating(
            row['user_id'], 
            row['anime_id'], 
            genre_weight=genre_weight
        ),
        axis=1
    )
    
    # Calculate metrics
    mae = mean_absolute_error(test_df['rating'], test_df['predicted_rating'])
    rmse = np.sqrt(np.mean((test_df['rating'] - test_df['predicted_rating'])**2))
    
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Root Mean Square Error (RMSE): {rmse:.4f}")
    
    return recommender, mae, rmse

# Example usage
def compare_recommenders(rating_df, anime_df, sample_size=10000):
    print("Evaluating Collaborative Filtering Recommender:")
    cf_recommender, cf_mae, cf_rmse = evaluate_recommender(rating_df, anime_df, sample_size=sample_size)
    
    print("\nEvaluating Hybrid Recommender:")
    hybrid_recommender, hybrid_mae, hybrid_rmse = evaluate_hybrid_recommender(rating_df, anime_df, sample_size=sample_size)
    
    print("\nComparison Summary:")
    print(f"{'Method':<25} {'MAE':<10} {'RMSE':<10}")
    print("-" * 45)
    print(f"{'Collaborative Filtering':<25} {cf_mae:<10.4f} {cf_rmse:<10.4f}")
    print(f"{'Hybrid Approach':<25} {hybrid_mae:<10.4f} {hybrid_rmse:<10.4f}")
    
    return cf_recommender, hybrid_recommender

In [5]:
cf_recommender, hybrid_recommender = compare_recommenders(rating_df, anime_df, sample_size=100000)


Evaluating Collaborative Filtering Recommender:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_ratings['normalized_rating'] = (filtered_ratings['rating'] - user_mean) / user_std.fillna(1)


Evaluating recommendations...


100%|██████████| 33000/33000 [00:44<00:00, 738.83it/s]


Mean Absolute Error (MAE): 1.6500
Root Mean Square Error (RMSE): 2.1711

Evaluating Hybrid Recommender:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_ratings['normalized_rating'] = np.clip(


Evaluating hybrid recommendations...


100%|██████████| 33000/33000 [03:53<00:00, 141.60it/s]

Mean Absolute Error (MAE): 1.2043
Root Mean Square Error (RMSE): 1.5479

Comparison Summary:
Method                    MAE        RMSE      
---------------------------------------------
Collaborative Filtering   1.6500     2.1711    
Hybrid Approach           1.2043     1.5479    



