In [1]:
!pip install numpy pandas scipy scikit-learn matplotlib seaborn -q

In [2]:
import os
import urllib.request
import zipfile

import shutil

# Create data directory
os.makedirs('data', exist_ok=True)

# Download MovieLens Small dataset.
# - HTTPS is used so the archive cannot be tampered with in transit.
# - The download is skipped when the archive is already on disk, which keeps
#   "Restart & Run All" cheap and offline-friendly.
# - The file is fetched under a temporary name and renamed afterwards, so an
#   interrupted download never leaves a truncated 'movielens.zip' behind.
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
archive_path = 'movielens.zip'
if os.path.exists(archive_path):
    print("Using previously downloaded MovieLens Small dataset...")
else:
    print("Downloading MovieLens Small dataset...")
    partial_path = archive_path + '.part'
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, archive_path)

# Extract (creates the 'ml-latest-small/' directory next to the notebook)
with zipfile.ZipFile(archive_path, 'r') as zip_ref:
    zip_ref.extractall()

# Copy to data folder
shutil.copy('ml-latest-small/ratings.csv', 'data/ratings.csv')
shutil.copy('ml-latest-small/movies.csv', 'data/movies.csv')

print("✓ Dataset downloaded and extracted!")
print(f"  Ratings file: {os.path.getsize('data/ratings.csv'):,} bytes")
print(f"  Movies file: {os.path.getsize('data/movies.csv'):,} bytes")

Downloading MovieLens Small dataset...
✓ Dataset downloaded and extracted!
  Ratings file: 2,483,723 bytes
  Movies file: 494,431 bytes


In [3]:
# ============ REPRODUCIBILITY ============
# Single source of truth for every seed used in the notebook.
SEED = 42

# ============ DATASET CONFIG ============
# DATA_DIR is concatenated directly with the file names, so it must keep its
# trailing slash.
DATA_DIR = "data/"
RATINGS_FILE = "ratings.csv"
MOVIES_FILE = "movies.csv"

# Split ratios (per-user fractions; must sum to 1)
TRAIN_RATIO = 0.7
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# ============ PREPROCESSING CONFIG ============
MIN_RATINGS_PER_USER = 5
MIN_RATINGS_PER_MOVIE = 2

# ============ BASELINE MODELS CONFIG ============
# Percentile of per-movie vote counts used as the prior weight in the
# popularity score.
PERCENTILE_FOR_POPULARITY = 75
# NOTE(review): not referenced by any cell visible in this notebook.
DAMPING_FACTOR = 50

# ============ COLLABORATIVE FILTERING CONFIG ============
# NOTE(review): not referenced by any cell visible in this notebook.
NUM_NEIGHBORS = 10

# ============ MATRIX FACTORIZATION CONFIG ============
NUM_LATENT_FACTORS = 15
# Derived from SEED so the two values can never drift apart.
RANDOM_STATE = SEED

# ============ HYBRID SYSTEM CONFIG ============
# Default blend weights (must sum to 1)
DEFAULT_CF_WEIGHT = 0.5
DEFAULT_CONTENT_WEIGHT = 0.25
DEFAULT_POPULARITY_WEIGHT = 0.25

# ============ EVALUATION CONFIG ============
TOP_K_VALUES = [5, 10, 20]
# Ratings at or above this value are treated as "liked".
RATING_THRESHOLD = 3.5

# ============ COLD-START CONFIG ============
# NOTE(review): neither threshold is referenced by any cell visible here.
NEW_USER_RATINGS_THRESHOLD = 5
NEW_MOVIE_RATINGS_THRESHOLD = 10

# ============ INFERENCE CONFIG ============
NUM_RECOMMENDATIONS = 10

# ============ SANITY CHECKS ============
# Fail fast on a mistyped configuration instead of producing a silently
# skewed split or blend further down the notebook.
if abs(TRAIN_RATIO + VAL_RATIO + TEST_RATIO - 1.0) > 1e-9:
    raise ValueError("TRAIN_RATIO + VAL_RATIO + TEST_RATIO must sum to 1")
if abs(DEFAULT_CF_WEIGHT + DEFAULT_CONTENT_WEIGHT + DEFAULT_POPULARITY_WEIGHT - 1.0) > 1e-9:
    raise ValueError("Default hybrid weights must sum to 1")

print("✓ Configuration loaded!")

✓ Configuration loaded!


In [4]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from typing import Tuple, Dict

class DataLoader:
    """Load, filter, index and split the MovieLens ratings data.

    Typical call order is ``load_data`` -> ``preprocess`` ->
    ``build_user_item_matrix`` -> ``train_val_test_split``; each later step
    triggers the earlier ones itself when they have not been run yet.

    Attributes:
        ratings_df: ratings frame; after ``preprocess`` it also carries the
            contiguous ``user_idx`` / ``movie_idx`` columns.
        movies_df: movies frame as read from disk.
        user_item_matrix: sparse (users x movies) rating matrix.
        train_data / val_data / test_data: per-user random splits.
        user_mapping / movie_mapping: raw id -> contiguous index.
        reverse_user_mapping / reverse_movie_mapping: contiguous index -> raw id.
    """

    def __init__(self):
        self.ratings_df = None
        self.movies_df = None
        self.user_item_matrix = None
        self.train_data = None
        self.val_data = None
        self.test_data = None
        self.user_mapping = {}
        self.movie_mapping = {}
        self.reverse_user_mapping = {}
        self.reverse_movie_mapping = {}

    def load_data(self):
        """Read the ratings and movies CSV files.

        Returns:
            Tuple ``(ratings_df, movies_df)``.
        """
        ratings_path = f"{DATA_DIR}{RATINGS_FILE}"
        movies_path = f"{DATA_DIR}{MOVIES_FILE}"

        print("Loading ratings and movies data...")
        self.ratings_df = pd.read_csv(ratings_path)
        self.movies_df = pd.read_csv(movies_path)

        print(f"  Ratings shape: {self.ratings_df.shape}")
        print(f"  Movies shape: {self.movies_df.shape}")

        return self.ratings_df, self.movies_df

    def analyze_sparsity(self) -> Dict:
        """Print and return basic statistics about the ratings currently loaded.

        Returns:
            Dict with ``n_users``, ``n_movies``, ``n_ratings``, ``sparsity``,
            ``rating_mean`` and ``rating_std``.
        """
        if self.ratings_df is None:
            self.load_data()

        n_users = self.ratings_df['userId'].nunique()
        n_movies = self.ratings_df['movieId'].nunique()
        n_ratings = len(self.ratings_df)
        # An empty frame has no cells at all; report it as fully sparse
        # instead of dividing by zero.
        n_cells = n_users * n_movies
        sparsity = 1 - (n_ratings / n_cells) if n_cells else 1.0

        stats = {
            'n_users': n_users,
            'n_movies': n_movies,
            'n_ratings': n_ratings,
            'sparsity': sparsity,
            'rating_mean': self.ratings_df['rating'].mean(),
            'rating_std': self.ratings_df['rating'].std(),
        }

        print(f"\n{'='*60}")
        print(f" DATA SPARSITY ANALYSIS")
        print(f"{'='*60}")
        print(f"Users: {stats['n_users']:,} | Movies: {stats['n_movies']:,} | Ratings: {stats['n_ratings']:,}")
        print(f"Sparsity: {stats['sparsity']:.2%} (matrix is {stats['sparsity']:.2%} empty)")
        print(f"Rating mean: {stats['rating_mean']:.2f} ± {stats['rating_std']:.2f}")

        return stats

    def preprocess(self):
        """Drop sparse users/movies and build contiguous index mappings.

        Users are filtered first, then movies (a single pass each, so a user
        can end up below ``MIN_RATINGS_PER_USER`` after the movie filter).
        Adds ``user_idx`` and ``movie_idx`` columns to ``ratings_df``.

        Returns:
            The filtered, indexed ratings frame.
        """
        if self.ratings_df is None:
            self.load_data()

        ratings = self.ratings_df

        print(f"\nFiltering users with < {MIN_RATINGS_PER_USER} ratings...")
        user_counts = ratings.groupby('userId').size()
        valid_users = user_counts[user_counts >= MIN_RATINGS_PER_USER].index
        ratings = ratings[ratings['userId'].isin(valid_users)]

        print(f"Filtering movies with < {MIN_RATINGS_PER_MOVIE} ratings...")
        movie_counts = ratings.groupby('movieId').size()
        valid_movies = movie_counts[movie_counts >= MIN_RATINGS_PER_MOVIE].index
        # Explicit copy: the index columns are assigned below, and writing to
        # a filtered slice of the loaded frame triggers SettingWithCopyWarning.
        ratings = ratings[ratings['movieId'].isin(valid_movies)].copy()

        print(f"After filtering: {ratings.shape[0]:,} ratings")

        # Create index mappings (sorted so indices are deterministic)
        unique_users = sorted(ratings['userId'].unique())
        unique_movies = sorted(ratings['movieId'].unique())

        self.user_mapping = {uid: idx for idx, uid in enumerate(unique_users)}
        self.movie_mapping = {mid: idx for idx, mid in enumerate(unique_movies)}
        self.reverse_user_mapping = {idx: uid for uid, idx in self.user_mapping.items()}
        self.reverse_movie_mapping = {idx: mid for mid, idx in self.movie_mapping.items()}

        # Add mapped columns
        ratings['user_idx'] = ratings['userId'].map(self.user_mapping)
        ratings['movie_idx'] = ratings['movieId'].map(self.movie_mapping)

        self.ratings_df = ratings
        return self.ratings_df

    def build_user_item_matrix(self):
        """Build the sparse (n_users x n_movies) matrix from ALL ratings.

        Note that the matrix covers every row of ``ratings_df``, i.e. it is
        not restricted to the training split.

        Returns:
            ``scipy.sparse.csr_matrix`` indexed by ``user_idx`` / ``movie_idx``.
        """
        if self.ratings_df is None or 'user_idx' not in self.ratings_df.columns:
            self.preprocess()

        n_users = len(self.user_mapping)
        n_movies = len(self.movie_mapping)

        self.user_item_matrix = csr_matrix(
            (self.ratings_df['rating'].values,
             (self.ratings_df['user_idx'].values, self.ratings_df['movie_idx'].values)),
            shape=(n_users, n_movies)
        )

        print(f"User-item matrix shape: {self.user_item_matrix.shape}")

        return self.user_item_matrix

    def train_val_test_split(self):
        """Randomly split each user's ratings into train / val / test.

        Users with fewer than 3 ratings go entirely to train. Train and val
        sizes are both rounded up, and test receives whatever is left, so
        test can be empty for users with few ratings.

        Returns:
            Tuple ``(train_data, val_data, test_data)`` of DataFrames.
        """
        # Same guard as build_user_item_matrix: a frame that was loaded but
        # not preprocessed has no 'user_idx' column yet.
        if self.ratings_df is None or 'user_idx' not in self.ratings_df.columns:
            self.preprocess()

        np.random.seed(SEED)

        train_data_list = []
        val_data_list = []
        test_data_list = []

        # sort=False keeps users in order of first appearance, so the random
        # stream is consumed in the same order as a scan over unique() would,
        # without re-filtering the whole frame once per user.
        for _, user_ratings in self.ratings_df.groupby('user_idx', sort=False):
            n = len(user_ratings)

            if n < 3:
                train_data_list.append(user_ratings)
                continue

            indices = np.random.permutation(n)
            train_idx = int(np.ceil(n * TRAIN_RATIO))
            val_idx = train_idx + int(np.ceil(n * VAL_RATIO))

            train_data_list.append(user_ratings.iloc[indices[:train_idx]])
            val_data_list.append(user_ratings.iloc[indices[train_idx:val_idx]])
            test_data_list.append(user_ratings.iloc[indices[val_idx:]])

        self.train_data = pd.concat(train_data_list, ignore_index=True) if train_data_list else pd.DataFrame()
        self.val_data = pd.concat(val_data_list, ignore_index=True) if val_data_list else pd.DataFrame()
        self.test_data = pd.concat(test_data_list, ignore_index=True) if test_data_list else pd.DataFrame()

        # Avoid a ZeroDivisionError in the summary when there are no ratings.
        total = len(self.ratings_df) or 1

        print(f"\nTrain/Val/Test split:")
        print(f"  Train: {len(self.train_data):,} ({len(self.train_data)/total:.1%})")
        print(f"  Val:   {len(self.val_data):,} ({len(self.val_data)/total:.1%})")
        print(f"  Test:  {len(self.test_data):,} ({len(self.test_data)/total:.1%})")

        return self.train_data, self.val_data, self.test_data

    def get_movie_metadata(self) -> Dict:
        """Return title and genre list for every movie kept by ``preprocess``.

        Returns:
            Dict keyed by ``movie_idx`` (the contiguous index, NOT the raw
            ``movieId``) with values ``{'title': str, 'genres': list[str]}``.
            Movies absent from ``movie_mapping`` are skipped; a non-string
            genres cell yields an empty genre list.
        """
        if self.movies_df is None:
            self.load_data()

        metadata = {}
        # Column-wise zip is considerably cheaper than DataFrame.iterrows().
        for movie_id, title, genres in zip(self.movies_df['movieId'],
                                           self.movies_df['title'],
                                           self.movies_df['genres']):
            if movie_id in self.movie_mapping:
                movie_idx = self.movie_mapping[movie_id]
                metadata[movie_idx] = {
                    'title': title,
                    'genres': genres.split('|') if isinstance(genres, str) else [],
                }

        return metadata

print("✓ DataLoader class defined!")

✓ DataLoader class defined!


In [5]:
from scipy.spatial.distance import cosine

class PopularityRecommender:
    """Recommend globally popular movies using a vote-weighted average rating.

    Attributes:
        data_loader: the loader passed in at construction (stored, not used
            by the methods below).
        movie_scores: dict ``movieId -> weighted score``; ``None`` until
            ``fit`` has been called.
    """

    def __init__(self, data_loader):
        self.data_loader = data_loader
        self.movie_scores = None
        # Movie ids sorted by descending score; built lazily and reused so
        # the score table is not re-sorted on every recommendation call.
        self._ranked_movie_ids = None

    def fit(self, ratings_df: pd.DataFrame):
        """Compute a weighted popularity score per movie.

        The score shrinks each movie's mean rating towards the global mean:
        ``(n * avg + m * global_mean) / (n + m)`` where ``n`` is the movie's
        vote count and ``m`` is the ``PERCENTILE_FOR_POPULARITY``-th
        percentile of all vote counts.

        Args:
            ratings_df: frame with at least ``movieId`` and ``rating`` columns.
        """
        self._ranked_movie_ids = None

        movie_stats = ratings_df.groupby('movieId').agg({
            'rating': ['mean', 'count']
        }).reset_index()
        movie_stats.columns = ['movieId', 'avg_rating', 'n_votes']

        # np.percentile cannot handle an empty array; no ratings simply
        # means there is nothing to recommend.
        if movie_stats.empty:
            self.movie_scores = {}
            return

        global_mean = ratings_df['rating'].mean()
        min_votes = np.percentile(movie_stats['n_votes'], PERCENTILE_FOR_POPULARITY)

        movie_stats['weighted_score'] = (
            (movie_stats['n_votes'] * movie_stats['avg_rating'] +
             min_votes * global_mean) /
            (movie_stats['n_votes'] + min_votes)
        )

        self.movie_scores = dict(zip(movie_stats['movieId'], movie_stats['weighted_score']))

    def _ranking(self) -> list:
        """Return movie ids by descending score, sorting only once per fit."""
        if self.movie_scores is None:
            raise RuntimeError("PopularityRecommender.fit() must be called before recommending")
        ranked = getattr(self, '_ranked_movie_ids', None)
        if ranked is None:
            by_score = sorted(self.movie_scores.items(), key=lambda x: x[1], reverse=True)
            ranked = [movie_id for movie_id, _ in by_score]
            self._ranked_movie_ids = ranked
        return ranked

    def recommend_for_user(self, user_id: int, k: int = NUM_RECOMMENDATIONS,
                          exclude_watched: pd.DataFrame = None) -> list:
        """Return the top-``k`` popular movie ids, skipping the user's watched ones.

        Args:
            user_id: raw ``userId`` used to look up the watched movies.
            k: maximum number of recommendations; ``k <= 0`` yields ``[]``.
            exclude_watched: optional frame with ``userId`` and ``movieId``
                columns; movies this user has rows for are excluded.

        Returns:
            List of raw ``movieId`` values, best first.

        Raises:
            RuntimeError: if ``fit`` has not been called.
        """
        ranked_movie_ids = self._ranking()

        # Without this guard the "len == k" stop condition below is never
        # met for k == 0 and every movie would be returned.
        if k <= 0:
            return []

        if exclude_watched is not None:
            watched_movies = set(exclude_watched[exclude_watched['userId'] == user_id]['movieId'])
        else:
            watched_movies = set()

        recommendations = []
        for movie_id in ranked_movie_ids:
            if movie_id not in watched_movies:
                recommendations.append(movie_id)
                if len(recommendations) == k:
                    break

        return recommendations

class ContentBasedRecommender:
    """Recommend movies whose genres resemble the user's highly rated movies.

    All public ids handled by this class are raw ``movieId`` values.

    Attributes:
        data_loader: source of movie metadata and the idx -> movieId mapping.
        movie_metadata: dict ``movieId -> {'title', 'genres'}`` (``None``
            until ``fit``).
        genre_vectors: dict ``movieId -> genre indicator vector`` scaled to
            sum to 1.
        genre_list / genre_to_idx: sorted genre vocabulary and its index
            (set by ``fit``).
    """

    def __init__(self, data_loader):
        self.data_loader = data_loader
        self.movie_metadata = None
        self.genre_vectors = {}

    def fit(self):
        """Build one genre vector per movie, keyed by raw ``movieId``."""
        # get_movie_metadata() is keyed by the contiguous movie_idx, while
        # every rating frame and every caller of this class speaks raw
        # movieId. Re-key here so favourites, watched sets, candidates and
        # the returned recommendations all live in the same id space.
        idx_metadata = self.data_loader.get_movie_metadata()
        idx_to_movie_id = self.data_loader.reverse_movie_mapping
        self.movie_metadata = {
            idx_to_movie_id[movie_idx]: metadata
            for movie_idx, metadata in idx_metadata.items()
            if movie_idx in idx_to_movie_id
        }

        # NOTE(review): every distinct string in the genres column is treated
        # as a genre, including any placeholder such as a "no genres" marker
        # — confirm that is intended.
        all_genres = set()
        for metadata in self.movie_metadata.values():
            all_genres.update(metadata['genres'])
        self.genre_list = sorted(list(all_genres))
        self.genre_to_idx = {g: i for i, g in enumerate(self.genre_list)}

        # Start from a clean dict so calling fit() twice cannot keep stale
        # entries from an earlier fit.
        self.genre_vectors = {}
        for movie_id, metadata in self.movie_metadata.items():
            vector = np.zeros(len(self.genre_list))
            for genre in metadata['genres']:
                if genre in self.genre_to_idx:
                    vector[self.genre_to_idx[genre]] = 1.0
            if vector.sum() > 0:
                vector = vector / vector.sum()
            self.genre_vectors[movie_id] = vector

        # Cache unit-length vectors as one matrix so a user's candidates can
        # be scored with a single matrix product instead of a Python loop
        # over every (favourite, candidate) pair.
        self._movie_ids = list(self.genre_vectors)
        self._row_of = {movie_id: row for row, movie_id in enumerate(self._movie_ids)}
        if self._movie_ids:
            matrix = np.vstack([self.genre_vectors[movie_id] for movie_id in self._movie_ids])
            norms = np.linalg.norm(matrix, axis=1, keepdims=True)
            # Movies without any genre keep an all-zero row (similarity 0).
            self._unit_vectors = np.divide(matrix, norms,
                                           out=np.zeros_like(matrix), where=norms > 0)
        else:
            self._unit_vectors = np.zeros((0, len(self.genre_list)))

    def get_similarity(self, movie_id1: int, movie_id2: int) -> float:
        """Cosine similarity of two movies' genre vectors, floored at 0.

        Returns 0.0 when either movie is unknown or has no genres.
        """
        if movie_id1 not in self.genre_vectors or movie_id2 not in self.genre_vectors:
            return 0.0

        vec1 = self.genre_vectors[movie_id1]
        vec2 = self.genre_vectors[movie_id2]

        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)
        if norm1 == 0 or norm2 == 0:
            return 0.0

        similarity = float(np.dot(vec1, vec2) / (norm1 * norm2))
        return max(0.0, similarity)

    def recommend_for_user(self, user_id: int, user_ratings_df: pd.DataFrame,
                          k: int = NUM_RECOMMENDATIONS) -> list:
        """Return up to ``k`` unwatched movies most similar to the user's favourites.

        A favourite is a movie the user rated at least ``RATING_THRESHOLD``.
        A candidate's score is its mean cosine similarity to all favourites
        (favourites unknown to the model contribute 0); only candidates with
        a positive score are returned.

        Args:
            user_id: raw ``userId``.
            user_ratings_df: frame with ``userId``, ``movieId``, ``rating``.
            k: maximum number of recommendations.

        Returns:
            List of raw ``movieId`` values, best first; ``[]`` when the user
            has no ratings or no favourites.

        Raises:
            RuntimeError: if ``fit`` has not been called.
        """
        if self.movie_metadata is None:
            raise RuntimeError("ContentBasedRecommender.fit() must be called before recommending")
        if k <= 0:
            return []

        user_watches = user_ratings_df[user_ratings_df['userId'] == user_id]
        if len(user_watches) == 0:
            return []

        favorites = user_watches[user_watches['rating'] >= RATING_THRESHOLD]['movieId'].tolist()
        if not favorites:
            return []

        watched_movies = set(user_watches['movieId'])

        favorite_rows = [self._row_of[movie_id] for movie_id in favorites
                         if movie_id in self._row_of]
        if not favorite_rows:
            return []

        # Mean of unit vectors over ALL favourites, so that
        # unit_vectors @ profile equals the mean cosine similarity.
        profile = self._unit_vectors[favorite_rows].sum(axis=0) / len(favorites)
        scores = self._unit_vectors @ profile

        recommendations = []
        # Stable sort keeps equally scored movies in metadata order.
        for row in np.argsort(-scores, kind='stable'):
            if scores[row] <= 0:
                break  # everything after this has no genre overlap at all
            movie_id = self._movie_ids[row]
            if movie_id in watched_movies:
                continue
            recommendations.append(movie_id)
            if len(recommendations) == k:
                break

        return recommendations

print("✓ Baseline models defined!")

✓ Baseline models defined!


In [6]:
from scipy.sparse.linalg import svds

class MatrixFactorization:
    """Truncated-SVD latent factor model over a sparse user-item matrix.

    Attributes:
        n_latent_factors: requested number of singular triplets.
        U, sigma, Vt: factors returned by ``svds``, reordered so the largest
            singular value comes first (``None`` until ``fit``).
        global_mean: mean of the stored ratings, added back at prediction.
    """

    def __init__(self, n_latent_factors: int = NUM_LATENT_FACTORS):
        self.n_latent_factors = n_latent_factors
        self.U = None
        self.sigma = None
        self.Vt = None
        self.global_mean = None

    def fit(self, user_item_matrix):
        """Factorise the mean-centred rating matrix with ``svds``.

        Only stored entries are centred; cells without a rating stay at 0,
        i.e. they are treated as "equal to the global mean".

        Args:
            user_item_matrix: scipy sparse matrix of ratings
                (rows = user_idx, columns = movie_idx).

        Raises:
            ValueError: if the matrix holds no ratings or is too small to
                factorise.
        """
        print(f"Fitting SVD with {self.n_latent_factors} latent factors...")

        if user_item_matrix.data.size == 0:
            raise ValueError("Cannot fit MatrixFactorization on a matrix with no ratings")

        self.global_mean = float(user_item_matrix.data.mean())
        # astype() returns a float copy, so the caller's matrix is left
        # untouched and integer-typed ratings can be centred in place.
        user_item_centered = user_item_matrix.astype(np.float64)
        user_item_centered.data -= self.global_mean

        # svds requires k to be strictly smaller than both dimensions.
        max_factors = min(user_item_centered.shape) - 1
        if max_factors < 1:
            raise ValueError(
                f"Matrix of shape {user_item_centered.shape} is too small for SVD"
            )
        n_factors = min(self.n_latent_factors, max_factors)
        if n_factors < self.n_latent_factors:
            print(f"  Requested {self.n_latent_factors} factors; using {n_factors} "
                  f"(limit for shape {user_item_centered.shape})")

        self.U, self.sigma, self.Vt = svds(
            user_item_centered,
            k=n_factors,
            random_state=RANDOM_STATE,
            which='LM'
        )

        # Reverse the factor order so the largest singular value comes first.
        self.U = self.U[:, ::-1]
        self.sigma = self.sigma[::-1]
        self.Vt = self.Vt[::-1, :]

        print(f"  U shape: {self.U.shape}")
        print(f"  Σ shape: {self.sigma.shape}")
        print(f"  V^T shape: {self.Vt.shape}")

    def predict_rating(self, user_idx: int, movie_idx: int) -> float:
        """Predict one rating, clipped to ``[0.5, 5.0]``.

        Returns ``global_mean`` when the model is unfitted or either index
        lies outside the fitted matrix.
        """
        if self.U is None:
            return self.global_mean
        if not (0 <= user_idx < self.U.shape[0] and 0 <= movie_idx < self.Vt.shape[1]):
            return self.global_mean

        # Element-wise scaling by sigma is equivalent to multiplying by
        # diag(sigma) without materialising the k x k matrix.
        interaction = (self.U[user_idx] * self.sigma) @ self.Vt[:, movie_idx]
        predicted = self.global_mean + interaction
        return float(np.clip(predicted, 0.5, 5.0))

    def recommend_for_user(self, user_idx: int, k: int = NUM_RECOMMENDATIONS,
                          watched_movies: set = None) -> list:
        """Return the ``k`` movie indices with the highest predicted rating.

        Args:
            user_idx: contiguous user index (row of the fitted matrix).
            k: maximum number of recommendations.
            watched_movies: optional set of ``movie_idx`` values to skip.

        Returns:
            List of ``movie_idx`` ints, best first; ``[]`` when the model is
            unfitted, ``k <= 0`` or ``user_idx`` is outside the fitted matrix.
        """
        if self.U is None or k <= 0:
            return []
        # Unknown users get no recommendations rather than an IndexError,
        # mirroring the out-of-range handling in predict_rating.
        if not 0 <= user_idx < self.U.shape[0]:
            return []

        predicted_ratings = self.global_mean + (self.U[user_idx] * self.sigma) @ self.Vt

        if watched_movies is None:
            watched_movies = set()

        recommendations = []
        # Stable sort keeps equally scored movies in ascending index order.
        for movie_idx in np.argsort(-predicted_ratings, kind='stable'):
            movie_idx = int(movie_idx)
            if movie_idx in watched_movies:
                continue
            recommendations.append(movie_idx)
            if len(recommendations) == k:
                break

        return recommendations

print("✓ Matrix Factorization defined!")


✓ Matrix Factorization defined!


In [7]:
class RankingEvaluator:
    """Ranking-based evaluation metrics for top-K recommendation lists.

    All metrics return 0.0 for ``k <= 0`` and look only at the first ``k``
    entries of ``predictions``.
    """

    @staticmethod
    def precision_at_k(predictions: list, ground_truth: list, k: int) -> float:
        """Precision@K: distinct relevant items in the top ``k``, divided by ``k``.

        The denominator is always ``k``, even when fewer than ``k``
        predictions are supplied.
        """
        # k <= 0 would otherwise divide by zero (or slice from the end).
        if k <= 0 or len(predictions) == 0:
            return 0.0
        predictions_k = predictions[:k]
        hits = len(set(predictions_k) & set(ground_truth))
        return hits / k

    @staticmethod
    def recall_at_k(predictions: list, ground_truth: list, k: int) -> float:
        """Recall@K: share of the relevant items that appear in the top ``k``."""
        if k <= 0 or len(ground_truth) == 0:
            return 0.0
        predictions_k = predictions[:k]
        hits = len(set(predictions_k) & set(ground_truth))
        return hits / len(ground_truth)

    @staticmethod
    def ndcg_at_k(predictions: list, ground_truth_with_scores: dict, k: int) -> float:
        """NDCG@K with the ground-truth scores used directly as gains.

        Args:
            predictions: ranked item ids, best first.
            ground_truth_with_scores: dict ``item id -> relevance``.
            k: cut-off rank.

        Returns:
            DCG of the top ``k`` predictions divided by the ideal DCG;
            0.0 when there is no ground truth or the ideal DCG is 0.
        """
        if k <= 0 or len(ground_truth_with_scores) == 0:
            return 0.0

        predictions_k = predictions[:k]
        dcg = 0.0
        credited = set()
        for i, pred_id in enumerate(predictions_k):
            # An item earns its gain only at its first position; crediting a
            # repeated item again could push the result above 1.
            if pred_id in ground_truth_with_scores and pred_id not in credited:
                credited.add(pred_id)
                relevance = ground_truth_with_scores[pred_id]
                dcg += relevance / np.log2(i + 2)

        ideal_relevances = sorted(ground_truth_with_scores.values(), reverse=True)[:k]
        idcg = 0.0
        for i, relevance in enumerate(ideal_relevances):
            idcg += relevance / np.log2(i + 2)

        if idcg == 0:
            return 0.0

        return dcg / idcg

print("✓ Evaluation metrics defined!")

✓ Evaluation metrics defined!


In [8]:
class HybridRecommender:
    """Weighted rank-fusion of popularity, content-based and SVD recommenders.

    Each component contributes a rank-based score in ``(0, 1]`` for the
    movies in its own top ``2k`` list; the scores are blended with
    ``cf_weight`` (SVD), ``content_weight`` and ``popularity_weight``.
    """

    def __init__(self, data_loader, pop_rec, content_rec, mf_rec):
        self.data_loader = data_loader
        self.pop_rec = pop_rec
        self.content_rec = content_rec
        self.mf_rec = mf_rec

        self.cf_weight = DEFAULT_CF_WEIGHT
        self.content_weight = DEFAULT_CONTENT_WEIGHT
        self.popularity_weight = DEFAULT_POPULARITY_WEIGHT

    def set_weights(self, cf_weight: float, content_weight: float, popularity_weight: float):
        """Replace the three blend weights (stored as given, not normalised)."""
        self.cf_weight = cf_weight
        self.content_weight = content_weight
        self.popularity_weight = popularity_weight

    @staticmethod
    def _rank_scores(ranked_ids, list_size: int) -> dict:
        """Map a ranked id list to linearly decaying scores: first item -> 1.0."""
        return {item_id: (list_size - position) / list_size
                for position, item_id in enumerate(ranked_ids)}

    def recommend_for_user(self, user_id: int, user_idx: int, k: int = NUM_RECOMMENDATIONS,
                          train_data = None) -> list:
        """Generate the top-``k`` hybrid recommendations for one user.

        Args:
            user_id: raw ``userId`` (used by the popularity and content models).
            user_idx: contiguous user index (used by the SVD model).
            k: number of recommendations; ``k <= 0`` yields ``[]``.
            train_data: frame with ``userId`` / ``movieId`` / ``rating`` whose
                movies are treated as already watched. When omitted, nothing
                is excluded for popularity/SVD and the content model falls
                back to ``data_loader.train_data``.

        Returns:
            List of raw ``movieId`` values, best first.
        """
        if k <= 0:
            return []

        watched_movies = set()
        if train_data is not None:
            watched_movies = set(train_data[train_data['userId'] == user_id]['movieId'])

        # Every component proposes twice as many candidates as requested.
        candidate_count = k * 2

        pop_recs = self.pop_rec.recommend_for_user(user_id, k=candidate_count,
                                                   exclude_watched=train_data)
        pop_scores = self._rank_scores(pop_recs, candidate_count)

        # Use the history the caller supplied; only fall back to the loader's
        # split when none was given, and skip the content model when there is
        # no history at all instead of failing on None.
        content_history = train_data if train_data is not None else self.data_loader.train_data
        if content_history is not None:
            content_recs = self.content_rec.recommend_for_user(user_id, content_history,
                                                               k=candidate_count)
        else:
            content_recs = []
        content_scores = self._rank_scores(content_recs, candidate_count)

        movie_mapping = self.data_loader.movie_mapping
        reverse_movie_mapping = self.data_loader.reverse_movie_mapping
        mf_watched_idx = {movie_mapping[mid] for mid in watched_movies if mid in movie_mapping}
        mf_recs = self.mf_rec.recommend_for_user(user_idx, k=candidate_count,
                                                 watched_movies=mf_watched_idx)
        # Membership test (not truthiness) so a falsy movie id is not dropped;
        # the position still counts skipped entries, keeping scores aligned
        # with the model's own ranking.
        mf_scores = {reverse_movie_mapping[idx]: (candidate_count - position) / candidate_count
                     for position, idx in enumerate(mf_recs)
                     if idx in reverse_movie_mapping}

        # Combine scores
        combined_scores = {}
        weighted_sources = (
            (mf_scores, self.cf_weight),
            (content_scores, self.content_weight),
            (pop_scores, self.popularity_weight),
        )
        for source_scores, weight in weighted_sources:
            for movie_id, score in source_scores.items():
                if movie_id in watched_movies:
                    continue  # never recommend something already watched
                combined_scores[movie_id] = combined_scores.get(movie_id, 0.0) + weight * score

        # Break score ties by movie id so results do not depend on set/dict
        # iteration order.
        top_movies = sorted(combined_scores.items(), key=lambda item: (-item[1], item[0]))[:k]
        return [movie_id for movie_id, score in top_movies]

print("✓ Hybrid Recommender defined!")

✓ Hybrid Recommender defined!


In [9]:
print("\n" + "="*70)
print(" HYBRID MOVIE RECOMMENDATION SYSTEM - GOOGLE COLAB EDITION")
print("="*70)

# 1. Load and preprocess data
print("\n" + "="*70)
print(" 1. DATA LOADING & PREPROCESSING")
print("="*70)

data_loader = DataLoader()
data_loader.load_data()
stats = data_loader.analyze_sparsity()
data_loader.preprocess()
data_loader.build_user_item_matrix()
data_loader.train_val_test_split()

# 2. Train baseline models
print("\n" + "="*70)
print(" 2. TRAINING BASELINE MODELS")
print("="*70)

print("\n[Popularity Recommender]")
pop_rec = PopularityRecommender(data_loader)
pop_rec.fit(data_loader.train_data)
# Select the column before the row: indexing the row first coerces the whole
# row to a common dtype and turns the integer userId into a float.
sample_user_id = data_loader.train_data['userId'].iloc[0]
top_popular = pop_rec.recommend_for_user(sample_user_id, k=5)
print(f"  Sample recommendations: {top_popular}\n")

print("[Content-Based Recommender]")
content_rec = ContentBasedRecommender(data_loader)
content_rec.fit()
print(f"  Genres extracted: {len(content_rec.genre_list)}\n")

# 3. Matrix Factorization
print("="*70)
print(" 3. MATRIX FACTORIZATION (SVD)")
print("="*70)

# Fit on TRAINING ratings only. data_loader.user_item_matrix is built from
# every rating (validation and test included), so fitting on it would leak
# the held-out ratings into the model and inflate the metrics below. The
# shape is kept identical so user_idx / movie_idx stay valid.
train_matrix = csr_matrix(
    (data_loader.train_data['rating'].values,
     (data_loader.train_data['user_idx'].values,
      data_loader.train_data['movie_idx'].values)),
    shape=data_loader.user_item_matrix.shape
)

mf_rec = MatrixFactorization(n_latent_factors=NUM_LATENT_FACTORS)
mf_rec.fit(train_matrix)

# 4. Hybrid System
print("\n" + "="*70)
print(" 4. HYBRID RECOMMENDATION SYSTEM")
print("="*70)

hybrid_rec = HybridRecommender(data_loader, pop_rec, content_rec, mf_rec)
print(f"Initial weights: CF={hybrid_rec.cf_weight:.2f}, Content={hybrid_rec.content_weight:.2f}, Pop={hybrid_rec.popularity_weight:.2f}\n")

# 5. Evaluation
print("="*70)
print(" 5. RANKING-BASED EVALUATION")
print("="*70)

evaluator = RankingEvaluator()

eval_k = 10          # cut-off used for every metric below
max_eval_users = 10  # the demo scores only the first few test users

# Get test users
test_users = data_loader.test_data['userId'].unique()[:max_eval_users]
all_precisions = []
all_recalls = []
all_ndcgs = []

for user_id in test_users:
    user_idx = data_loader.user_mapping.get(user_id)
    if user_idx is None:
        continue

    # Get recommendations
    recommendations = hybrid_rec.recommend_for_user(user_id, user_idx, k=eval_k,
                                                   train_data=data_loader.train_data)

    # Get ground truth. For precision/recall only held-out movies the user
    # actually liked (rating >= RATING_THRESHOLD) count as relevant;
    # otherwise recommending a movie the user rated poorly would be scored
    # as a hit. NDCG keeps every held-out rating as a graded gain.
    user_test = data_loader.test_data[data_loader.test_data['userId'] == user_id]
    liked_test = user_test[user_test['rating'] >= RATING_THRESHOLD]
    ground_truth = liked_test['movieId'].tolist()
    ground_truth_scores = dict(zip(user_test['movieId'], user_test['rating']))

    if not ground_truth:
        continue

    # Compute metrics
    prec = evaluator.precision_at_k(recommendations, ground_truth, eval_k)
    rec = evaluator.recall_at_k(recommendations, ground_truth, eval_k)
    ndcg = evaluator.ndcg_at_k(recommendations, ground_truth_scores, eval_k)

    all_precisions.append(prec)
    all_recalls.append(rec)
    all_ndcgs.append(ndcg)

print(f"\nHybrid Recommender Performance (Test Set):")
if all_precisions:
    print(f"  Precision@{eval_k}: {np.mean(all_precisions):.4f}")
    print(f"  Recall@{eval_k}:    {np.mean(all_recalls):.4f}")
    print(f"  NDCG@{eval_k}:      {np.mean(all_ndcgs):.4f}")
else:
    # np.mean([]) would print nan together with a RuntimeWarning.
    print("  No test users with relevant items could be evaluated.")

# 6. Sample Recommendations
print("\n" + "="*70)
print(" 6. SAMPLE RECOMMENDATIONS")
print("="*70)

sample_users = data_loader.test_data['userId'].unique()[:3]

# One lookup table instead of filtering movies_df once per recommended movie.
title_by_movie_id = dict(zip(data_loader.movies_df['movieId'], data_loader.movies_df['title']))

for user_id in sample_users:
    user_idx = data_loader.user_mapping.get(user_id)
    if user_idx is None:
        continue

    print(f"\nUser {user_id}:")
    recommendations = hybrid_rec.recommend_for_user(user_id, user_idx, k=5,
                                                   train_data=data_loader.train_data)

    for i, movie_id in enumerate(recommendations, 1):
        movie_title = title_by_movie_id.get(movie_id)
        if movie_title is not None:
            print(f"  {i}. {movie_title}")

# 7. Summary
print("\n" + "="*70)
print(" 7. SUMMARY & INSIGHTS")
print("="*70)

# These figures come from analyze_sparsity(), which ran BEFORE filtering.
print(f"\nDataset Statistics:")
print(f"  Users: {stats['n_users']:,}")
print(f"  Movies: {stats['n_movies']:,}")
print(f"  Ratings: {stats['n_ratings']:,}")
print(f"  Sparsity: {stats['sparsity']:.2%}")

print(f"\nSystem Features:")
print(f"  ✓ Multiple recommendation strategies")
print(f"  ✓ Ranking-based evaluation (not RMSE)")
print(f"  ✓ Hybrid ensemble approach")
# NOTE(review): no dedicated cold-start logic is visible in this notebook;
# confirm before advertising it.
print(f"  ✓ Cold-start handling")
print(f"  ✓ Matrix factorization (SVD)")

print("\n" + "="*70)
print(" ✓ DEMO COMPLETE!")
print("="*70)



 HYBRID MOVIE RECOMMENDATION SYSTEM - GOOGLE COLAB EDITION

 1. DATA LOADING & PREPROCESSING
Loading ratings and movies data...
  Ratings shape: (100836, 4)
  Movies shape: (9742, 3)

 DATA SPARSITY ANALYSIS
Users: 610 | Movies: 9,724 | Ratings: 100,836
Sparsity: 98.30% (matrix is 98.30% empty)
Rating mean: 3.50 ± 1.04

Filtering users with < 5 ratings...
Filtering movies with < 2 ratings...
After filtering: 97,390 ratings
User-item matrix shape: (610, 6278)

Train/Val/Test split:
  Train: 68,441 (70.3%)
  Val:   14,900 (15.3%)
  Test:  14,049 (14.4%)

 2. TRAINING BASELINE MODELS

[Popularity Recommender]
  Sample recommendations: [318, 858, 50, 2571, 260]

[Content-Based Recommender]
  Genres extracted: 20

 3. MATRIX FACTORIZATION (SVD)
Fitting SVD with 15 latent factors...
  U shape: (610, 15)
  Σ shape: (15,)
  V^T shape: (15, 6278)

 4. HYBRID RECOMMENDATION SYSTEM
Initial weights: CF=0.50, Content=0.25, Pop=0.25

 5. RANKING-BASED EVALUATION

Hybrid Recommender Performance (Tes