In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

# Read the three input tables (ratings, movie metadata, user tags) from CSV
# files expected in the current working directory.
ratings, movies, tags = (
    pd.read_csv(csv_name)
    for csv_name in (
        'RS-A2_A3_Filtered_Ratings.csv',
        'RS-A2_A3_movie.csv',
        'RS-A2_A3_tag.csv',
    )
)

# Banner plus basic dataset statistics. Sparsity is the fraction of
# (user, movie) pairs -- among users/movies present in `ratings` -- that
# carry no rating row.
print(
    "=" * 80,
    "HYBRID MOVIE RECOMMENDATION SYSTEM",
    "=" * 80,
    "\n1. Data Loading Complete",
    f"   - Ratings: {len(ratings)} records",
    f"   - Movies: {len(movies)} records",
    f"   - Tags: {len(tags)} records",
    f"   - Unique users: {ratings['userId'].nunique()}",
    f"   - Unique movies: {ratings['movieId'].nunique()}",
    f"   - Sparsity: {1 - (len(ratings) / (ratings['userId'].nunique() * ratings['movieId'].nunique())):.4f}",
    sep="\n",
)

# --- Content-based component -------------------------------------------------
# Collapse every tag applied to a movie into one space-separated string.
# Missing tags are dropped first: `astype(str)` would otherwise turn them into
# the literal word "nan", which TF-IDF would then treat as a real term.
tags_aggregated = (
    tags.dropna(subset=['tag'])
    .groupby('movieId')['tag']
    .apply(lambda x: ' '.join(x.astype(str)))
    .reset_index()
)
tags_aggregated.columns = ['movieId', 'user_tags']

# Left join keeps every movie; movies without tags get an empty tag string.
movies_content = movies.merge(tags_aggregated, on='movieId', how='left')
movies_content['user_tags'] = movies_content['user_tags'].fillna('')
# regex=False: '|' is the regex alternation metacharacter and the default of
# `regex` differs between pandas versions, so request a literal replacement
# explicitly. Missing genres become '' so the concatenation below never yields
# NaN (TfidfVectorizer rejects NaN documents).
movies_content['genres_clean'] = (
    movies_content['genres'].fillna('').str.replace('|', ' ', regex=False)
)
movies_content['content'] = movies_content['genres_clean'] + ' ' + movies_content['user_tags']

# One TF-IDF row per movie over unigrams and bigrams, capped at 500 features.
tfidf = TfidfVectorizer(stop_words='english', max_features=500, ngram_range=(1, 2))
tfidf_matrix = tfidf.fit_transform(movies_content['content'])
# NOTE(review): this is a dense (n_movies x n_movies) array. For the ~27k
# movies shown in the captured run that is roughly 6 GB as float64 -- confirm
# this is acceptable, or restrict the catalogue to rated movies upstream.
content_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

print("\n2. Content-Based Component Complete")
print(f"   - TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"   - Content similarity matrix: {content_similarity.shape}")

# --- Collaborative-filtering component ---------------------------------------
# Users x movies rating table (duplicate user/movie pairs are averaged by
# pivot_table's default aggregation); unrated cells are NaN, then zero-filled.
user_movie_matrix = ratings.pivot_table(values='rating', index='userId', columns='movieId')
user_movie_matrix_filled = user_movie_matrix.fillna(0)

user_movie_sparse = csr_matrix(user_movie_matrix_filled.values)

# Use at most 50 latent factors, and always fewer than the smaller dimension.
smallest_dim = min(user_movie_sparse.shape)
n_factors = min(50, smallest_dim - 1)

svd = TruncatedSVD(n_components=n_factors, random_state=42)
user_factors = svd.fit_transform(user_movie_sparse)  # (n_users, n_factors)
movie_factors = svd.components_.T                    # (n_movies, n_factors)

# Low-rank reconstruction of the rating table: one score per (user, movie).
predicted_ratings = np.dot(user_factors, movie_factors.T)

# Pairwise cosine similarities in latent-factor space.
user_similarity = cosine_similarity(user_factors)
item_similarity = cosine_similarity(movie_factors)

print(
    "\n3. Collaborative Filtering Component Complete",
    f"   - User-Movie matrix shape: {user_movie_matrix.shape}",
    f"   - SVD factors: {n_factors}",
    f"   - User similarity matrix: {user_similarity.shape}",
    f"   - Item similarity matrix: {item_similarity.shape}",
    sep="\n",
)

# Raw userId / movieId -> row / column position in the user-movie matrix.
user_id_map = dict(zip(user_movie_matrix.index, range(len(user_movie_matrix.index))))
movie_id_map = dict(zip(user_movie_matrix.columns, range(len(user_movie_matrix.columns))))
# Inverse of movie_id_map: column position -> movieId.
idx_to_movie_id = dict(zip(movie_id_map.values(), movie_id_map.keys()))

# movieId -> row position in movies_content (and therefore in content_similarity).
movie_id_to_content_idx = dict(zip(movies_content['movieId'], range(len(movies_content))))

def get_hybrid_recommendations(user_id, top_n=10, content_weight=0.3, collab_weight=0.7):
    """Recommend unrated movies by blending SVD and content-similarity scores.

    Both score vectors cover the columns of the user-movie matrix. Each is
    min-max scaled to [0, 1] and the two are combined as
    ``collab_weight * cf + content_weight * cb``.

    Args:
        user_id: Raw user ID as it appears in ``ratings['userId']``.
        top_n: Maximum number of recommendations to return.
        content_weight: Multiplier for the normalised content-based score.
        collab_weight: Multiplier for the normalised collaborative score.

    Returns:
        A DataFrame with columns ``movieId``, ``title``, ``genres`` and
        ``hybrid_score`` ordered by descending score, or a message string
        when ``user_id`` is not present in the user-movie matrix.
    """
    if user_id not in user_id_map:
        return f"User {user_id} not found in the system"

    user_idx = user_id_map[user_id]
    user_ratings = ratings[ratings['userId'] == user_id]
    rated_movie_ids = set(user_ratings['movieId'].tolist())

    # Collaborative part: this user's row of the low-rank reconstruction.
    cf_scores = predicted_ratings[user_idx]

    # Content part: for every matrix column, sum its content similarity to
    # each movie the user rated >= 4, then divide by the number of liked movies.
    liked_movies = user_ratings[user_ratings['rating'] >= 4]['movieId'].tolist()
    liked_content_idx = [
        movie_id_to_content_idx[movie_id]
        for movie_id in liked_movies
        if movie_id in movie_id_to_content_idx
    ]

    cb_scores = np.zeros(len(user_movie_matrix.columns))

    if liked_content_idx:
        # Pair each matrix column with its row in the content matrix once,
        # instead of re-deriving the pairing for every liked movie.
        matrix_pos = []
        content_pos = []
        for matrix_movie_id, matrix_idx in movie_id_map.items():
            content_idx = movie_id_to_content_idx.get(matrix_movie_id)
            if content_idx is not None:
                matrix_pos.append(matrix_idx)
                content_pos.append(content_idx)
        matrix_pos = np.array(matrix_pos, dtype=int)
        content_pos = np.array(content_pos, dtype=int)

        # Accumulate one liked movie at a time so only a single row slice is
        # materialised per step.
        similarity_sum = np.zeros(len(content_pos))
        for content_idx in liked_content_idx:
            similarity_sum += content_similarity[content_idx, content_pos]

        cb_scores[matrix_pos] = similarity_sum / len(liked_movies)

    # Put both score vectors on a common [0, 1] scale before weighting.
    scaler = MinMaxScaler()
    cf_scores_normalized = scaler.fit_transform(cf_scores.reshape(-1, 1)).flatten()
    cb_scores_normalized = scaler.fit_transform(cb_scores.reshape(-1, 1)).flatten()

    hybrid_scores = (collab_weight * cf_scores_normalized) + (content_weight * cb_scores_normalized)

    # Keep only movies the user has not rated; sorted() is stable, so ties
    # stay in matrix-column order.
    recommendations = [
        (idx_to_movie_id[matrix_idx], score)
        for matrix_idx, score in enumerate(hybrid_scores)
        if idx_to_movie_id[matrix_idx] not in rated_movie_ids
    ]
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:top_n]

    result_df = pd.DataFrame(recommendations, columns=['movieId', 'hybrid_score'])
    # Inner join: a recommended ID missing from `movies` is dropped.
    result_df = result_df.merge(movies[['movieId', 'title', 'genres']], on='movieId')

    return result_df[['movieId', 'title', 'genres', 'hybrid_score']]

def get_collaborative_recommendations(user_id, top_n=10, method='svd'):
    """Recommend unrated movies using one of three collaborative methods.

    Args:
        user_id: Raw user ID as it appears in ``ratings['userId']``.
        top_n: Maximum number of recommendations to return.
        method: ``'svd'`` uses the user's row of ``predicted_ratings``;
            ``'user_based'`` takes a similarity-weighted average of all
            users' (zero-filled) ratings; ``'item_based'`` sums the
            item-similarity rows of the user's rated movies, each weighted
            by the user's rating, divided by the number of rated movies.

    Returns:
        A DataFrame with columns ``movieId``, ``title``, ``genres`` and
        ``score`` ordered by descending score, or a message string when
        ``user_id`` is not present in the user-movie matrix.

    Raises:
        ValueError: If ``method`` is not one of the three supported names.
    """
    if user_id not in user_id_map:
        return f"User {user_id} not found in the system"

    user_idx = user_id_map[user_id]
    user_ratings = ratings[ratings['userId'] == user_id]
    rated_movie_ids = set(user_ratings['movieId'].tolist())

    if method == 'svd':
        scores = predicted_ratings[user_idx]
    elif method == 'user_based':
        user_sim = user_similarity[user_idx]
        weighted_ratings = np.dot(user_sim, user_movie_matrix_filled.values)
        # Epsilon guards against a zero denominator.
        scores = weighted_ratings / (np.abs(user_sim).sum() + 1e-8)
    elif method == 'item_based':
        user_rated_indices = [movie_id_map[mid] for mid in rated_movie_ids if mid in movie_id_map]
        # Read the user's rating row once instead of one .iloc call per movie.
        user_row = user_movie_matrix_filled.values[user_idx]
        # (k,) @ (k, n_movies): rating-weighted sum of item-similarity rows.
        scores = np.dot(user_row[user_rated_indices], item_similarity[user_rated_indices])
        scores = scores / (len(user_rated_indices) + 1e-8)
    else:
        # Previously an unknown method left `scores` unassigned and failed
        # later with an opaque UnboundLocalError.
        raise ValueError(
            f"Unknown method {method!r}; expected 'svd', 'user_based' or 'item_based'"
        )

    # Keep only movies the user has not rated; sorted() is stable, so ties
    # stay in matrix-column order.
    recommendations = [
        (idx_to_movie_id[matrix_idx], score)
        for matrix_idx, score in enumerate(scores)
        if idx_to_movie_id[matrix_idx] not in rated_movie_ids
    ]
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:top_n]

    result_df = pd.DataFrame(recommendations, columns=['movieId', 'score'])
    # Inner join: a recommended ID missing from `movies` is dropped.
    result_df = result_df.merge(movies[['movieId', 'title', 'genres']], on='movieId')

    return result_df[['movieId', 'title', 'genres', 'score']]

def get_content_recommendations(user_id, top_n=10):
    """Recommend unrated movies that are content-similar to the user's liked ones.

    A movie counts as "liked" when the user rated it >= 4. Every movie in
    ``movies_content`` is scored by its mean content similarity to the liked
    movies, and movies the user has already rated are excluded.

    Args:
        user_id: Raw user ID as it appears in ``ratings['userId']``.
        top_n: Maximum number of recommendations to return.

    Returns:
        A DataFrame with columns ``movieId``, ``title``, ``genres`` and
        ``score`` ordered by descending score, or a message string when the
        user has no ratings, no rating >= 4, or none of the liked movies
        appear in the content table.
    """
    user_ratings = ratings[ratings['userId'] == user_id]

    if len(user_ratings) == 0:
        return f"No ratings found for user {user_id}"

    liked_movies = user_ratings[user_ratings['rating'] >= 4]['movieId'].tolist()
    rated_movie_ids = set(user_ratings['movieId'].tolist())

    if not liked_movies:
        return f"User {user_id} has no highly rated movies"

    liked_indices = [movie_id_to_content_idx[mid] for mid in liked_movies if mid in movie_id_to_content_idx]

    if not liked_indices:
        return "Liked movies not found in content database"

    avg_similarity = content_similarity[liked_indices].mean(axis=0)

    # Filter and rank on arrays: looking up movies_content.iloc[i]['movieId']
    # for every catalogue row builds a Series per row and dominates runtime.
    content_movie_ids = movies_content['movieId'].to_numpy()
    unseen_pos = np.flatnonzero(~np.isin(content_movie_ids, list(rated_movie_ids)))
    # A stable sort on negated scores keeps ties in catalogue order, matching
    # sorted(..., reverse=True).
    order = np.argsort(-avg_similarity[unseen_pos], kind='stable')[:top_n]
    top_pos = unseen_pos[order]

    result_df = pd.DataFrame({
        'movieId': content_movie_ids[top_pos],
        'score': avg_similarity[top_pos],
    })
    result_df = result_df.merge(movies[['movieId', 'title', 'genres']], on='movieId')

    return result_df[['movieId', 'title', 'genres', 'score']]

# --- Demonstration -----------------------------------------------------------
# Each recommender returns either a DataFrame or a plain message string, so
# every section prints the table form when available and the message otherwise.

print("\n" + "=" * 80, "DEMONSTRATION: HYBRID RECOMMENDATIONS", "=" * 80, sep="\n")

# Demo user: the userId with the most rating rows (first entry of the
# descending value_counts index).
test_user_id = ratings['userId'].value_counts().index[0]
print(
    f"\nUser {test_user_id} - Hybrid Recommendations (30% Content + 70% Collaborative):",
    "-" * 80,
    sep="\n",
)
hybrid_recs = get_hybrid_recommendations(test_user_id, top_n=5)
print(hybrid_recs.to_string(index=False) if isinstance(hybrid_recs, pd.DataFrame) else hybrid_recs)

print("\n" + "=" * 80, "DEMONSTRATION: COLLABORATIVE FILTERING (SVD)", "=" * 80, sep="\n")
print(f"\nUser {test_user_id} - Collaborative Recommendations:", "-" * 80, sep="\n")
collab_recs = get_collaborative_recommendations(test_user_id, top_n=5, method='svd')
print(collab_recs.to_string(index=False) if isinstance(collab_recs, pd.DataFrame) else collab_recs)

print("\n" + "=" * 80, "DEMONSTRATION: CONTENT-BASED FILTERING", "=" * 80, sep="\n")
print(f"\nUser {test_user_id} - Content-Based Recommendations:", "-" * 80, sep="\n")
content_recs = get_content_recommendations(test_user_id, top_n=5)
print(content_recs.to_string(index=False) if isinstance(content_recs, pd.DataFrame) else content_recs)

print("\n" + "=" * 80, "SYSTEM READY FOR USE", "=" * 80, sep="\n")

HYBRID MOVIE RECOMMENDATION SYSTEM

1. Data Loading Complete
   - Ratings: 10000 records
   - Movies: 27278 records
   - Tags: 465564 records
   - Unique users: 266
   - Unique movies: 497
   - Sparsity: 0.9244

2. Content-Based Component Complete
   - TF-IDF matrix shape: (27278, 500)
   - Content similarity matrix: (27278, 27278)

3. Collaborative Filtering Component Complete
   - User-Movie matrix shape: (266, 497)
   - SVD factors: 50
   - User similarity matrix: (266, 266)
   - Item similarity matrix: (497, 497)

DEMONSTRATION: HYBRID RECOMMENDATIONS

User 45989 - Hybrid Recommendations (30% Content + 70% Collaborative):
--------------------------------------------------------------------------------
 movieId                          title                      genres  hybrid_score
       6                    Heat (1995)       Action|Crime|Thriller      0.412511
    1079    Fish Called Wanda, A (1988)                Comedy|Crime      0.406236
      79              Juror, The (1996)

In [2]:
"""
================================================================================
COMPREHENSIVE EXPLANATION OF HYBRID MOVIE RECOMMENDATION SYSTEM
================================================================================

OVERVIEW:
This is a hybrid recommendation system that combines two powerful approaches:
1. Collaborative Filtering (CF): Uses rating patterns from similar users
2. Content-Based Filtering (CBF): Uses movie features (genres, tags)

The hybrid approach overcomes limitations of individual methods and provides
more accurate, diverse, and robust recommendations.

================================================================================
SECTION 1: IMPORTS AND DATA LOADING (Lines 1-26)
================================================================================

Libraries used:
- pandas, numpy: Data manipulation
- TfidfVectorizer: Text feature extraction
- cosine_similarity: Similarity computation
- csr_matrix: Sparse matrix representation (memory efficient)
- TruncatedSVD: Matrix factorization for collaborative filtering
- MinMaxScaler: Normalize scores to [0,1] range

Lines 11-13: Load three CSV files
- ratings.csv: User-movie-rating triplets
- movies.csv: Movie metadata (title, genres)
- tags.csv: User-generated movie tags

Lines 15-23: Display dataset statistics
- Shows sparsity: What percentage of user-movie pairs have no ratings
- High sparsity (>99%) is common in recommendation systems
- Example: If 1% of possible ratings exist, sparsity = 0.99

================================================================================
SECTION 2: CONTENT-BASED COMPONENT (Lines 28-40)
================================================================================

Lines 28-34: Prepare content features
- Aggregates multiple tags per movie into single string
- Merges tags with movie genres
- Creates combined "content" field for each movie

Lines 36-38: TF-IDF Vectorization
- Converts text content to numerical feature vectors
- Each movie becomes a point in 500-dimensional space
- Similar content = nearby points

Line 39: Compute content similarity matrix
- Calculates cosine similarity between all movie pairs
- Result: Matrix where cell [i,j] = similarity between movie i and j
- Used by content-based filtering component

Lines 40-42: Display content component statistics

================================================================================
SECTION 3: COLLABORATIVE FILTERING COMPONENT (Lines 44-67)
================================================================================

WHAT IS COLLABORATIVE FILTERING?
Collaborative filtering makes recommendations based on patterns in user behavior.
Core assumption: Users who agreed in the past will agree in the future.

Example: If User A and User B both liked movies X, Y, Z, and User A also
liked movie W, then User B will probably like movie W too.

Lines 44-45: Create user-movie matrix
- Rows = users, Columns = movies, Values = ratings
- Most cells are empty (sparse matrix) because users rate few movies

Line 46: Fill missing values with 0
- Necessary for mathematical operations
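- 0 is intended to mean "no rating", not "bad rating"; note, however, that
  the SVD step cannot tell the two apart and treats every 0 as an actual value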

Line 48: Convert to sparse matrix format
- Efficient storage: Only stores non-zero values
- Crucial for large datasets (millions of ratings)

Lines 50-53: Singular Value Decomposition (SVD)
SVD is a matrix factorization technique that decomposes the user-movie matrix
into three matrices: U × Σ × V^T

Why use SVD?
1. Dimensionality reduction: Reduces 1000s of movies to 50 latent factors
2. Noise reduction: Filters out random variations in ratings
3. Discovers hidden patterns: Latent factors represent movie characteristics
   (e.g., factor 1 might represent "action level", factor 2 "humor")

n_factors = 50: Number of latent dimensions
- Too low: Loses important information
- Too high: Overfits to noise
- 50 is a good balance for most datasets

user_factors: Matrix representing users in latent space
movie_factors: Matrix representing movies in latent space

Line 55: Predict ratings
predicted_ratings = user_factors × movie_factors^T
- Estimates rating for every user-movie pair
- Even for movies the user hasn't rated yet

Lines 57-58: Compute similarity matrices
- user_similarity: How similar are different users?
- item_similarity: How similar are different movies?
- Used for user-based and item-based collaborative filtering

Lines 60-67: Create mapping dictionaries
- Maps between actual IDs and matrix indices
- Necessary because matrices use 0-based indexing
- Allows efficient lookup in both directions

================================================================================
SECTION 4: HYBRID RECOMMENDATION FUNCTION (Lines 69-125)
================================================================================

Lines 69-125: get_hybrid_recommendations() function

THE HYBRID APPROACH:
Combines collaborative filtering and content-based filtering scores using
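a weighted sum: hybrid_score = collab_weight × CF_score + content_weight × CB_score
(the two weights are independent parameters; the code does not force them to sum to 1)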

Default weights: 70% collaborative, 30% content

Why this combination?
1. CF captures subtle preferences that content features miss
2. CBF handles new items and provides diversity
3. Together they're more robust than either alone

STEP-BY-STEP PROCESS:

Lines 71-77: Setup
- Check if user exists
- Get user's rating history
- Identify already-rated movies (to exclude from recommendations)

Line 79: Get collaborative filtering scores
- Uses predicted ratings from SVD
- Represents what user would rate each movie

Lines 81-100: Calculate content-based scores
- Find movies user rated highly (≥4 stars)
- For each liked movie, find similar movies using content similarity
- Average similarity scores across all liked movies
- Result: High scores for movies similar to user's preferences

Why average across liked movies?
- Captures all user interests, not just one
- A movie similar to multiple liked movies scores higher
- Provides balanced recommendations

Lines 102-104: Normalize scores
- CF and CB scores on different scales
- MinMaxScaler transforms both to [0,1] range
- Ensures fair weighting in combination

Line 106: Combine scores
hybrid_scores = (0.7 × CF_normalized) + (0.3 × CB_normalized)
- Adjustable weights allow tuning system behavior
- More CF weight = more discovery of new genres
- More CB weight = safer recommendations in known genres

Lines 108-115: Generate final recommendations
- Convert scores to movie IDs
- Filter out already-rated movies
- Sort by hybrid score
- Return top N recommendations

Lines 117-120: Format output
- Create DataFrame with results
- Merge with movie metadata (title, genres)
- Return clean, readable recommendations

================================================================================
SECTION 5: COLLABORATIVE FILTERING FUNCTION (Lines 127-169)
================================================================================

Lines 127-169: get_collaborative_recommendations() function

This function provides pure collaborative filtering with three methods:

METHOD 1: SVD (Matrix Factorization)
- Default and most accurate method
- Uses latent factors to predict ratings
- Fast and scalable
- Best for: Large datasets, general recommendations

METHOD 2: User-Based Collaborative Filtering
- Finds similar users based on rating patterns
- Recommends movies those similar users liked
- Formula: rating_prediction = Σ(similarity × rating) / Σ|similarity|
  (the code divides by the sum of absolute similarities, plus a small epsilon)
- Best for: Small datasets, personalized recommendations

How it works:
1. Find users similar to target user (using cosine similarity)
2. Weight their ratings by similarity score
3. Highly similar users' ratings count more

METHOD 3: Item-Based Collaborative Filtering
- Finds similar movies based on who rated them
- Recommends movies similar to what user liked
- More stable than user-based (item similarities don't change much)
- Best for: Medium datasets, diverse recommendations

How it works:
1. For each movie user rated
2. Find similar movies (based on rating patterns)
3. Weight by user's rating of the original movie
4. Aggregate scores across all rated movies

Lines 133-135: Setup and validation

Lines 137-153: Score calculation based on method
- Different algorithm for each CF variant
- All produce rating predictions for unrated movies

Lines 155-163: Generate recommendations
- Exclude already-rated movies
- Sort by predicted rating
- Return top N movies

================================================================================
SECTION 6: CONTENT-BASED FILTERING FUNCTION (Lines 171-203)
================================================================================

Lines 171-203: get_content_recommendations() function

Pure content-based filtering using movie features (genres and tags)

ALGORITHM:
1. Identify movies user rated highly (≥4 stars)
2. Build user profile: Average the content-similarity rows of the liked movies
3. Find movies most similar to user profile
4. Rank by similarity and return top N

Lines 173-183: Setup and validation
- Get user's ratings
- Find highly-rated movies
- Map movie IDs to content matrix indices

Lines 185-190: Calculate content-based scores
- Get content similarity for each liked movie
- Average across all liked movies
- Result: Similarity to user's overall taste profile

Lines 192-199: Generate recommendations
- Convert similarities to movie IDs
- Filter out rated movies
- Sort by similarity score
- Return top N recommendations

ADVANTAGES OF CONTENT-BASED:
- No cold start for new items (movies)
- Transparent recommendations (based on genres/tags)
- Works for users with few ratings

LIMITATIONS:
- Limited by feature quality
- Tends toward over-specialization
- Doesn't capture subjective qualities

================================================================================
SECTION 9: DEMONSTRATIONS (Lines 282-351)
================================================================================

Lines 282-295: Hybrid Recommendations Demo
- Shows hybrid system in action
- Combines CF (70%) and CBF (30%)
- Displays top 5 recommendations with hybrid scores

Lines 297-308: Collaborative Filtering Demo
- Shows pure CF recommendations using SVD
- Based only on rating patterns
- Good for comparison with hybrid

Lines 310-321: Content-Based Filtering Demo
- Shows pure CBF recommendations
- Based only on movie content (genres/tags)
- Good for comparison with hybrid

Lines 323-339: Single-User Evaluation
- Compares all three methods on one user
- Shows precision, recall, F1 for each
- Helps understand method strengths/weaknesses

Lines 341-351: Cross-Validation Evaluation
- Tests all methods on 50 users
- Shows average performance
- Identifies best overall method

EXPECTED RESULTS:
- Hybrid typically outperforms individual methods
- CF usually has higher recall (finds more liked movies)
- CBF usually has higher precision (fewer mistakes)
- Hybrid balances both metrics

================================================================================
SECTION 10: SYSTEM READY MESSAGE (Lines 353-355)
================================================================================

Indicates system is initialized and ready for use

================================================================================
KEY ADVANTAGES OF HYBRID SYSTEM
================================================================================

1. OVERCOMES COLD START PROBLEMS:
   - New users: CBF can recommend based on demographics or initial preferences
   - New items: CBF can recommend based on content features
   - CF alone struggles with both

2. IMPROVED ACCURACY:
   - CF captures collaborative patterns CBF misses
   - CBF captures content patterns CF misses
   - Weighted combination leverages both

3. SERENDIPITY + RELEVANCE:
   - CBF ensures recommendations are relevant to user preferences
   - CF introduces unexpected discoveries outside comfort zone
   - Balance controlled by weights

4. ROBUSTNESS:
   - If CF data is sparse, CBF fills gaps
   - If content features are poor, CF compensates
   - System degrades gracefully with missing data

5. DIVERSITY:
   - Pure CBF tends toward over-specialization
   - Pure CF tends toward popularity bias
   - Hybrid provides diverse yet relevant recommendations

================================================================================
LIMITATIONS AND CHALLENGES
================================================================================

1. COMPLEXITY:
   - More components = more tuning required
   - Need to optimize CF parameters AND CBF parameters AND weights
   - Harder to debug and explain

2. COMPUTATIONAL COST:
   - Must compute both CF and CBF scores
   - SVD factorization expensive for large matrices
   - TF-IDF computation expensive for many documents
   """

SyntaxError: invalid syntax (ipython-input-1458824685.py, line 1)