In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

# Read the three input tables from CSV files located in the working directory.
ratings = pd.read_csv('RS-A2_A3_Filtered_Ratings.csv')
movies = pd.read_csv('RS-A2_A3_movie.csv')
tags = pd.read_csv('RS-A2_A3_tag.csv')

# Title banner, then one row-count line per loaded table.
print("=" * 80, "CONTENT-BASED MOVIE RECOMMENDATION SYSTEM", "=" * 80, sep="\n")
print(
    "\n1. Data Loading Complete",
    f"   - Ratings: {ratings.shape[0]} records",
    f"   - Movies: {movies.shape[0]} records",
    f"   - Tags: {tags.shape[0]} records",
    sep="\n",
)

# Build one text "document" per movie: its genres followed by every user tag.

# Missing tags become '' and non-string tags are stringified so that ' '.join
# never receives a NaN/float.
tags['tag'] = tags['tag'].fillna('').astype(str)

# One row per movieId holding all of that movie's tags, space-separated.
tags_aggregated = (
    tags.groupby('movieId')['tag']
    .agg(' '.join)
    .reset_index(name='user_tags')
)

# Left join keeps movies that nobody tagged; they get an empty tag string.
movies_content = movies.merge(tags_aggregated, on='movieId', how='left')
movies_content['user_tags'] = movies_content['user_tags'].fillna('')

# regex=False: '|' is a literal separator here (as a regex it means alternation),
# and being explicit removes the dependency on the pandas-version default.
# fillna(''): a movie with missing genres must not yield NaN content, which the
# TF-IDF vectorizer cannot process.
movies_content['genres_clean'] = (
    movies_content['genres'].fillna('').str.replace('|', ' ', regex=False)
)
movies_content['content'] = (
    movies_content['genres_clean'] + ' ' + movies_content['user_tags']
)

print("\n2. Data Preprocessing Complete")
print(f"   - Movies with tags: {movies_content['user_tags'].ne('').sum()}")
print("   - Created combined content features (genres + tags)")

# Vectorise each movie's combined genre/tag text with TF-IDF:
# English stop words removed, unigrams and bigrams, vocabulary capped at 500 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=500, ngram_range=(1, 2))

# One row per movie, one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(movies_content['content'])

print(
    "\n3. Feature Extraction (TF-IDF) Complete",
    f"   - TF-IDF matrix shape: {tfidf_matrix.shape}",
    f"   - Features (sample): {list(tfidf.get_feature_names_out()[:10])}",
    sep="\n",
)

# Pairwise cosine similarity between every pair of movie TF-IDF rows.
# Called with a single argument, cosine_similarity compares the matrix with itself.
# NOTE(review): the result is a full (n_movies x n_movies) array, so memory use
# grows quadratically with the size of the catalogue.
cosine_sim = cosine_similarity(tfidf_matrix)

print(
    "\n4. Similarity Matrix Computed",
    f"   - Matrix shape: {cosine_sim.shape}",
    f"   - Similarity score range: [{cosine_sim.min():.3f}, {cosine_sim.max():.3f}]",
    sep="\n",
)

def get_recommendations(movie_title, cosine_sim=cosine_sim, movies_df=movies_content, top_n=5):
    """Return the movies whose content is most similar to a given movie.

    Parameters
    ----------
    movie_title : str
        Case-insensitive substring of the title to look up. It is matched
        literally, so characters such as "(" or "+" need no escaping. When
        several titles match, the first one in ``movies_df`` is used.
    cosine_sim : 2-D array
        Square similarity matrix whose row/column order matches the row order
        of ``movies_df``.
    movies_df : pandas.DataFrame
        Movie table with at least ``movieId``, ``title`` and ``genres`` columns.
    top_n : int
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        A frame with ``movieId``, ``title``, ``genres`` and ``similarity_score``
        sorted by descending similarity, or a message string when no title
        matches ``movie_title``.
    """
    # regex=False: titles such as "Toy Story (1995)" contain regex metacharacters.
    # na=False: a missing title is simply "no match" instead of breaking the mask.
    title_mask = movies_df['title'].str.contains(
        movie_title, case=False, regex=False, na=False
    ).to_numpy(dtype=bool)

    # Work with row *positions*: cosine_sim is positional, whereas the frame's
    # index labels are not guaranteed to be 0..n-1.
    hit_positions = np.flatnonzero(title_mask)
    if hit_positions.size == 0:
        return f"Movie '{movie_title}' not found in database"
    idx = int(hit_positions[0])

    scores = np.asarray(cosine_sim[idx])

    # Stable descending sort keeps catalogue order among equally similar movies.
    order = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly. It is not always ranked first: another
    # movie with identical content ties at 1.0, and a movie with an all-zero
    # feature vector has similarity 0 even with itself.
    order = order[order != idx][:max(top_n, 0)]

    recommendations = movies_df.iloc[order][['movieId', 'title', 'genres']].copy()
    recommendations['similarity_score'] = scores[order]

    return recommendations

def get_user_recommendations(user_id, ratings_df=ratings, movies_df=movies_content,
                            cosine_sim=cosine_sim, top_n=5):
    """Recommend unseen movies similar to the ones a user rated highly.

    A movie counts as "liked" when the user rated it 4 or higher. Every movie in
    the catalogue is scored by its mean similarity to the liked movies; movies
    the user has already rated are removed before the top ``top_n`` are taken.

    Parameters
    ----------
    user_id
        Value to look up in ``ratings_df['userId']``.
    ratings_df : pandas.DataFrame
        Ratings with at least ``userId``, ``movieId`` and ``rating`` columns.
    movies_df : pandas.DataFrame
        Movie table with at least ``movieId``, ``title`` and ``genres`` columns.
    cosine_sim : 2-D array
        Square similarity matrix whose row/column order matches the row order
        of ``movies_df``.
    top_n : int
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        A frame with ``movieId``, ``title``, ``genres`` and ``similarity_score``
        sorted by descending score, or a message string when the user has no
        ratings, no rating >= 4, or no liked movie present in ``movies_df``.
    """
    user_ratings = ratings_df[ratings_df['userId'] == user_id]
    if len(user_ratings) == 0:
        return f"No ratings found for user {user_id}"

    liked_movies = user_ratings.loc[user_ratings['rating'] >= 4, 'movieId']
    if len(liked_movies) == 0:
        return f"User {user_id} has no highly rated movies (rating >= 4)"

    catalog_ids = movies_df['movieId']

    # Row *positions* of the liked movies: cosine_sim is positional, whereas the
    # frame's index labels are not guaranteed to be 0..n-1.
    liked_positions = np.flatnonzero(catalog_ids.isin(liked_movies).to_numpy())
    if liked_positions.size == 0:
        # Averaging over zero rows would produce NaN scores and arbitrary output.
        return f"None of user {user_id}'s highly rated movies exist in the movie catalog"

    # Mean similarity of every catalogue movie to the user's liked movies.
    profile_scores = np.asarray(cosine_sim[liked_positions]).mean(axis=0)

    # Vectorised "not yet rated" mask instead of a per-row lookup.
    unseen = ~catalog_ids.isin(user_ratings['movieId']).to_numpy()

    # Stable descending sort keeps catalogue order among equal scores.
    order = np.argsort(-profile_scores, kind='stable')
    order = order[unseen[order]][:max(top_n, 0)]

    recommendations = movies_df.iloc[order][['movieId', 'title', 'genres']].copy()
    recommendations['similarity_score'] = profile_scores[order]

    return recommendations

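# NOTE: evaluation is disabled. get_user_recommendations() removes every movie the
# user has already rated, so scoring its output against that same user's rated
# movies always gives 0 hits; a held-out train/test split is needed first.
# def evaluate_recommendations(test_user_id, actual_liked_movies, recommended_movies, k=5):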
#     top_k_recommendations = recommended_movies.head(k)['movieId'].tolist()

#     hits = len(set(top_k_recommendations) & set(actual_liked_movies))

#     precision = hits / k if k > 0 else 0
#     recall = hits / len(actual_liked_movies) if len(actual_liked_movies) > 0 else 0
#     f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

#     return {
#         'user_id': test_user_id,
#         'precision@k': precision,
#         'recall@k': recall,
#         'f1_score': f1_score,
#         'hits': hits,
#         'k': k
#     }

print("\n" + "=" * 80)
print("DEMONSTRATION: MOVIE-BASED RECOMMENDATIONS")
print("=" * 80)

movie_name = "Toy Story"
print(f"\nRecommendations based on '{movie_name}':")
print("-" * 80)
recommendations = get_recommendations(movie_name, top_n=5)
# get_recommendations returns a plain message string when the title is not
# found, so only call DataFrame methods on an actual DataFrame.
if isinstance(recommendations, pd.DataFrame):
    print(recommendations.to_string(index=False))
else:
    print(recommendations)

print("\n" + "=" * 80)
print("DEMONSTRATION: USER-BASED RECOMMENDATIONS")
print("=" * 80)

# Use the user of the first ratings row as the sample user.
test_user_id = ratings['userId'].iloc[0]
print(f"\nRecommendations for User {test_user_id}:")
print("-" * 80)
user_recs = get_user_recommendations(test_user_id, top_n=5)
# Same convention: a DataFrame on success, a message string otherwise.
if isinstance(user_recs, pd.DataFrame):
    print(user_recs.to_string(index=False))
else:
    print(user_recs)

# print("\n" + "=" * 80)
# print("EVALUATION")
# print("=" * 80)

# actual_liked = ratings[(ratings['userId'] == test_user_id) &
#                       (ratings['rating'] >= 4)]['movieId'].tolist()
# print(f"\nUser {test_user_id}'s actually liked movies (sample): {actual_liked[:5]}")

# if isinstance(user_recs, pd.DataFrame) and len(actual_liked) > 0:
#     eval_metrics = evaluate_recommendations(test_user_id, actual_liked, user_recs, k=5)
#     print(f"\nEvaluation Metrics (Top-5):")
#     print(f"  - Precision@5: {eval_metrics['precision@k']:.3f}")
#     print(f"  - Recall@5: {eval_metrics['recall@k']:.3f}")
#     print(f"  - F1-Score: {eval_metrics['f1_score']:.3f}")
#     print(f"  - Hits: {eval_metrics['hits']}")

# print("\n" + "=" * 80)
# print("SYSTEM READY FOR USE")
# print("=" * 80)

"""
================================================================================
COMPREHENSIVE EXPLANATION OF THE CONTENT-BASED MOVIE RECOMMENDATION SYSTEM
================================================================================

OVERVIEW:
This is a content-based recommendation system that suggests movies to users based
on movie content (genres and user-generated tags) rather than collaborative filtering
(user-user or item-item ratings). The system uses TF-IDF vectorization and cosine
similarity to find movies with similar content.

================================================================================
SECTION 1: IMPORTS AND DATA LOADING
================================================================================

Lines 1-6: Import necessary libraries
- pandas: For data manipulation and analysis
- numpy: For numerical operations
- TfidfVectorizer: Converts text to numerical feature vectors
- cosine_similarity: Measures similarity between movie feature vectors
- MinMaxScaler: For normalizing data (imported but not used in this version)
- warnings: To suppress unnecessary warnings

Lines 9-11: Load the three CSV files
- ratings.csv: Contains userId, movieId, rating, timestamp
- movies.csv: Contains movieId, title, genres
- tags.csv: Contains userId, movieId, tag, timestamp

Lines 13-19: Display loading statistics
Shows how many records were loaded from each file

================================================================================
SECTION 2: DATA PREPROCESSING
================================================================================

Lines 21-22: Aggregate tags for each movie
- Groups all tags by movieId
- Combines multiple tags into a single string per movie
- Example: If movieId 208 has tags "dark hero", "action", "thriller",
  they become "dark hero action thriller"

Lines 24-27: Merge and clean data
- Merges movies dataframe with aggregated tags using movieId
- Fills missing tags with empty strings (for movies without tags)
- Cleans genres by replacing "|" with spaces (e.g., "Action|Thriller" → "Action Thriller")
- Creates 'content' column combining genres and tags for each movie

Why combine genres and tags?
- Genres provide official categorization
- Tags provide user perspective and nuanced descriptions
- Together they create a richer content profile for each movie

Lines 29-31: Display preprocessing statistics

================================================================================
SECTION 3: FEATURE EXTRACTION USING TF-IDF
================================================================================

Lines 33-37: Initialize TF-IDF Vectorizer
TF-IDF (Term Frequency-Inverse Document Frequency) is a numerical statistic
that reflects how important a word is to a document in a collection.

Parameters explained:
- stop_words='english': Removes common words like "the", "is", "and" that
  don't provide meaningful information for recommendations

- max_features=500: Limits the vocabulary to the 500 terms with the highest
  frequency across the whole corpus. This keeps the TF-IDF matrix small and
  reduces computational cost, at the price of dropping rarer terms

- ngram_range=(1, 2): Creates features from both:
  * Unigrams (single words): "action", "comedy", "thriller"
  * Bigrams (word pairs): "dark hero", "science fiction"
  This captures both individual concepts and common phrases

Line 40: Create TF-IDF matrix
- Transforms the 'content' text of all movies into numerical vectors
- Each movie becomes a row, each feature (word/phrase) becomes a column
- Values represent the importance of each feature to each movie

How TF-IDF works:
- TF (Term Frequency): How often a term appears in a movie's content
- IDF (Inverse Document Frequency): How rare/unique the term is across all movies
- TF-IDF = TF × IDF (higher values = more distinctive features)

Example: If "space" appears in many sci-fi movies, it gets lower IDF.
         If "neo-noir" appears in few movies, it gets higher IDF.

Lines 42-44: Display feature extraction results

================================================================================
SECTION 4: SIMILARITY COMPUTATION
================================================================================

Line 46: Compute cosine similarity matrix
- Creates a matrix where each cell [i,j] represents similarity between movie i and j
- Uses cosine similarity metric

What is Cosine Similarity?
- Measures the cosine of the angle between two vectors
- For non-negative TF-IDF vectors, values range from 0 (no shared terms) to 1
  (same direction, i.e. identical term proportions)
- Formula: cos(θ) = (A·B) / (||A|| × ||B||)
- It focuses on direction, not magnitude, making it ideal for text comparison

Example:
Movie A: [0.5, 0.8, 0.0, 0.3]  (TF-IDF vector)
Movie B: [0.4, 0.7, 0.1, 0.2]  (TF-IDF vector)
Cosine similarity = 0.82 / (0.990 × 0.837) ≈ 0.99 (very similar)

Lines 48-50: Display similarity matrix statistics

================================================================================
SECTION 5: MOVIE-BASED RECOMMENDATION FUNCTION
================================================================================

Lines 52-70: get_recommendations() function
Purpose: Given a movie title, find similar movies

How it works:
1. Find the index of the input movie in the dataframe
2. Retrieve similarity scores between this movie and all others
3. Sort movies by similarity score in descending order
4. Exclude the input movie itself (it has similarity 1.0 with itself)
5. Return top N most similar movies with their similarity scores

Input parameters:
- movie_title: Name of the movie to base recommendations on
- cosine_sim: Pre-computed similarity matrix
- movies_df: Dataframe containing movie information
- top_n: Number of recommendations to return (default 5)

Output: DataFrame with columns:
- movieId: Unique identifier
- title: Movie name
- genres: Movie categories
- similarity_score: How similar to the input movie (0-1 scale)

================================================================================
SECTION 6: USER PROFILE-BASED RECOMMENDATION FUNCTION
================================================================================

Lines 72-103: get_user_recommendations() function
Purpose: Given a user ID, recommend movies based on their rating history

How it works:
1. Retrieve all ratings by the user
2. Identify movies the user liked (rating >= 4.0)
3. Find the indices of these liked movies in the movies dataframe
4. Calculate average similarity of each movie to user's liked movies
5. Sort by average similarity
6. Filter out movies the user has already rated
7. Return top N recommendations

Why average similarity?
- If a user liked movies A, B, and C, we want movies similar to ALL of them
- Taking the mean ensures recommendations balance across user preferences
- A movie very similar to one liked movie but dissimilar to others scores lower

Input parameters:
- user_id: User identifier
- ratings_df: DataFrame with user ratings
- movies_df: DataFrame with movie information
- cosine_sim: Pre-computed similarity matrix
- top_n: Number of recommendations

Output: DataFrame with recommended movies and similarity scores

================================================================================
SECTION 7: EVALUATION FUNCTION
================================================================================

Lines 105-122: evaluate_recommendations() function (currently commented out in the code above)
Purpose: Measure the quality of recommendations

Metrics calculated:

1. Precision@K = (Relevant items in top K recommendations) / K
   - "Of the movies we recommended, how many were actually good?"
   - Higher is better (max = 1.0)
   - Example: Recommended 5 movies, user liked 3 → Precision = 3/5 = 0.6

2. Recall@K = (Relevant items in top K recommendations) / (Total relevant items)
   - "Of all the movies the user would like, how many did we find?"
   - Higher is better (max = 1.0)
   - Example: User likes 10 movies total, we found 3 → Recall = 3/10 = 0.3

3. F1-Score = 2 × (Precision × Recall) / (Precision + Recall)
   - Harmonic mean balancing precision and recall
   - Useful when you want to balance both metrics
   - Higher is better (max = 1.0)

Input parameters:
- test_user_id: User being evaluated
- actual_liked_movies: List of movieIds the user actually liked
- recommended_movies: DataFrame of recommended movies
- k: Number of top recommendations to evaluate

Output: Dictionary with all metrics

================================================================================
SECTION 8: DEMONSTRATION AND TESTING
================================================================================

Lines 124-135: Movie-based recommendation demo
- Tests get_recommendations() with "Toy Story"
- Shows how the system finds similar movies based on content

Lines 137-148: User-based recommendation demo
- Tests get_user_recommendations() with a sample user
- Shows personalized recommendations based on rating history

Lines 150-165: Evaluation demo (currently commented out in the code above)
- Retrieves movies the user actually liked (rating >= 4)
- Compares recommendations against actual preferences
- Calculates and displays precision, recall, and F1-score

Lines 167-170: System ready message (currently commented out in the code above)

================================================================================
KEY ADVANTAGES OF THIS SYSTEM
================================================================================

1. No Cold Start Problem for Items:
   - New movies can be recommended immediately if they have genres/tags
   - Doesn't need rating history to recommend a movie

2. Transparency:
   - Recommendations are explainable (based on matching genres/tags)
   - Users can understand why a movie was recommended

3. Niche (Long-Tail) Coverage:
   - Can recommend niche movies that match user preferences
   - Not limited to popular items

4. Privacy-Friendly:
   - Doesn't require knowledge of other users' preferences
   - Works with a single user's data

================================================================================
LIMITATIONS OF THIS SYSTEM
================================================================================

1. Limited by Content Descriptions:
   - If genres/tags are poor, recommendations suffer
   - Can't capture subjective qualities (acting, cinematography)

2. Over-Specialization:
   - May keep recommending same type of movies
   - Lacks diversity in recommendations

3. Cold Start for Users:
   - New users with no ratings get no personalized recommendations
   - Requires some rating history for user-based recommendations

4. No Quality Assessment:
   - Doesn't distinguish between good and bad movies of same genre
   - A bad action movie is treated same as a good action movie

================================================================================
POTENTIAL IMPROVEMENTS
================================================================================

1. Hybrid Approach:
   - Combine content-based with collaborative filtering
   - Use ratings data along with content features

2. Enhanced Content Features:
   - Add director, actors, year, plot summaries
   - Use more sophisticated NLP (word embeddings, BERT)

3. Diversity Mechanism:
   - Implement MMR (Maximal Marginal Relevance) to increase variety
   - Balance similarity with diversity

4. Temporal Dynamics:
   - Consider how user preferences change over time
   - Weight recent ratings more heavily

5. Context-Aware Recommendations:
   - Consider time of day, device, mood
   - Adapt recommendations to context

================================================================================
HOW TO USE THIS SYSTEM
================================================================================

For movie-based recommendations:
>>> recs = get_recommendations("Inception", top_n=10)
>>> print(recs)

For user-based recommendations:
>>> user_recs = get_user_recommendations(user_id=12345, top_n=10)
>>> print(user_recs)

For evaluation (first un-comment evaluate_recommendations() in the code above):
>>> liked_movies = [1, 2, 3, 4, 5]  # movieIds user actually liked
>>> metrics = evaluate_recommendations(12345, liked_movies, user_recs, k=5)
>>> print(f"Precision: {metrics['precision@k']}")
>>> print(f"Recall: {metrics['recall@k']}")

================================================================================
"""

CONTENT-BASED MOVIE RECOMMENDATION SYSTEM

1. Data Loading Complete
   - Ratings: 10000 records
   - Movies: 27278 records
   - Tags: 465564 records

2. Data Preprocessing Complete
   - Movies with tags: 19545
   - Created combined content features (genres + tags)

3. Feature Extraction (TF-IDF) Complete
   - TF-IDF matrix shape: (27278, 500)
   - Features (sample): ['01', '10', '100', '11', '12', '250', '300', '300 ratings', '70mm', 'acting']

4. Similarity Matrix Computed
   - Matrix shape: (27278, 27278)
   - Similarity score range: [0.000, 1.000]

DEMONSTRATION: MOVIE-BASED RECOMMENDATIONS

Recommendations based on 'Toy Story':
--------------------------------------------------------------------------------
 movieId                 title                                      genres  similarity_score
    3114    Toy Story 2 (1999) Adventure|Animation|Children|Comedy|Fantasy          0.945150
    2355  Bug's Life, A (1998)         Adventure|Animation|Children|Comedy          0.874208


