In [2]:
# Install the libraries below before running:
#   pip install pandas numpy scikit-learn surprise   (Surprise is published on PyPI as "scikit-surprise")
#   pip install "numpy<2"
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset, SVD
from collections import defaultdict
import time

# --- Constants ---
# Input CSV paths. They are bare file names, so they are resolved relative to
# the current working directory when load_data() reads them.
# Ratings file: the code reads its userId, movieId and rating columns.
RATINGS_FILE = 'RS-A2_A3_Filtered_Ratings.csv'
# Movies file: the code reads its movieId, title and genres columns.
MOVIES_FILE = 'RS-A2_A3_movie.csv'
# Tags file: the code reads its movieId and tag columns.
TAGS_FILE = 'RS-A2_A3_tag.csv'


# Weight of the collaborative-filtering score in the hybrid score
# (alpha * CF + (1 - alpha) * CB); main() passes it as ``alpha``.
HYBRID_ALPHA = 0.7  # 70% CF, 30% CB


def load_data():
    """Read the ratings, movies and tags CSV files into DataFrames.

    The paths come from the module-level RATINGS_FILE, MOVIES_FILE and
    TAGS_FILE constants. If the ratings file carries a stray ``Unnamed: 0``
    column (a saved index), it is discarded.

    Returns:
        ``(ratings_df, movies_df, tags_df)`` on success, or
        ``(None, None, None)`` after printing a message when a file is missing.
    """
    try:
        # Files are read lazily in order; the first missing one aborts the load.
        ratings_df, movies_df, tags_df = (
            pd.read_csv(path) for path in (RATINGS_FILE, MOVIES_FILE, TAGS_FILE)
        )
    except FileNotFoundError as e:
        print(f"Error: {e}. Please make sure all CSV files are in the same directory.")
        return None, None, None

    stray_column = 'Unnamed: 0'
    if stray_column in ratings_df.columns:
        ratings_df = ratings_df.drop(columns=[stray_column])

    return ratings_df, movies_df, tags_df



def preprocess_content_data(movies_df, tags_df):
    """Build the content-similarity artefacts used for content-based scoring.

    Every movie gets a text "content" document made of its genres followed by
    all (lower-cased) tags applied to it. The documents are TF-IDF vectorised
    and compared pairwise with cosine similarity.

    Neither input frame is modified; a new movies frame is returned.

    Args:
        movies_df: DataFrame with at least ``movieId`` and ``genres`` columns.
        tags_df: DataFrame with at least ``movieId`` and ``tag`` columns.

    Returns:
        A ``(cosine_sim, indices, movies_df)`` tuple:
        * ``cosine_sim`` - square movie-by-movie similarity matrix, one
          row/column per row of the returned ``movies_df``.
        * ``indices`` - Series mapping ``movieId`` to its row position in
          ``cosine_sim`` (first occurrence wins if a movieId is repeated).
        * ``movies_df`` - copy of the input with ``genres_cleaned``,
          ``tags_content`` and ``content`` columns added.
    """
    print("Starting content pre-processing...")

    # "Action|Drama" -> "Action Drama"; the placeholder genre carries no signal.
    genres_cleaned = (
        movies_df['genres']
        .str.replace('|', ' ', regex=False)
        .fillna("")
        .str.replace('(no genres listed)', '', regex=False)
    )
    # assign() returns a new frame, so the caller's movies_df is left untouched.
    movies_df = movies_df.assign(genres_cleaned=genres_cleaned)

    # Drop missing tags first: astype(str) would otherwise turn NaN into the
    # literal token "nan" and pollute the TF-IDF vocabulary.
    tags = tags_df.dropna(subset=['tag'])
    tag_docs_df = (
        tags.assign(tag_cleaned=tags['tag'].astype(str).str.lower())
        .groupby('movieId')['tag_cleaned']
        .agg(' '.join)
        .rename('tags_content')
        .reset_index()
    )

    # The left merge keeps every movie and yields a fresh 0..n-1 index, so the
    # index values double as row positions in the similarity matrix.
    movies_df = pd.merge(movies_df, tag_docs_df, on='movieId', how='left')
    movies_df['tags_content'] = movies_df['tags_content'].fillna("")

    movies_df['content'] = movies_df['genres_cleaned'] + ' ' + movies_df['tags_content']

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movies_df['content'])

    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # De-duplicate on the movieId *index* (not on the values, which are row
    # positions and therefore always unique) so a lookup returns one scalar.
    indices = pd.Series(movies_df.index, index=movies_df['movieId'])
    indices = indices[~indices.index.duplicated(keep='first')]

    print("Content pre-processing complete.")
    return cosine_sim, indices, movies_df


def train_cf_model(ratings_df):
    """Fit a Surprise SVD model on every rating in ``ratings_df``.

    The rating scale given to Surprise is the observed minimum and maximum of
    the ``rating`` column. No hold-out split is made: the full data set is
    used for training.

    Args:
        ratings_df: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.

    Returns:
        ``(svd, min_rating, max_rating)`` - the fitted model and the detected
        rating bounds.
    """
    print("Training Collaborative Filtering (SVD) model...")

    rating_column = ratings_df['rating']
    min_rating, max_rating = rating_column.min(), rating_column.max()
    print(f"Rating scale detected: {min_rating} to {max_rating}")

    # Surprise expects the columns in (user, item, rating) order.
    dataset = Dataset.load_from_df(
        ratings_df[['userId', 'movieId', 'rating']],
        Reader(rating_scale=(min_rating, max_rating)),
    )
    full_trainset = dataset.build_full_trainset()

    # Fixed random_state keeps the learned factors reproducible between runs.
    model = SVD(n_factors=100, n_epochs=20, random_state=42, verbose=False)

    start_time = time.time()
    model.fit(full_trainset)
    end_time = time.time()

    print(f"SVD model training complete. Time taken: {end_time - start_time:.2f} seconds.")
    return model, min_rating, max_rating


def get_hybrid_recommendations(user_id, movies_df, ratings_df, indices, cosine_sim, svd, min_rating, max_rating, alpha=0.7, n_recs=10, n_candidates=100):
    """Recommend unseen movies by re-ranking CF candidates with content similarity.

    Pipeline:
      1. Collect the user's ratings and pick their "top-rated" movies
         (rating >= min(80th percentile of the user's ratings, 4.0)).
      2. Predict a CF rating for every movie the user has not rated and keep
         the ``n_candidates`` highest predictions.
      3. For each candidate, compute the mean cosine similarity to the user's
         top-rated movies (CB score), min-max normalise the CF prediction and
         blend: ``alpha * norm_cf + (1 - alpha) * cb``.
      4. Return the ``n_recs`` best candidates.

    If none of the user's top-rated movies is present in ``indices``, the
    candidates are ranked by the raw CF prediction alone.

    Args:
        user_id: Id of the user to recommend for.
        movies_df: DataFrame with ``movieId`` and ``title`` columns.
        ratings_df: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
        indices: Series mapping ``movieId`` to a row position in ``cosine_sim``.
        cosine_sim: Square movie-by-movie similarity matrix.
        svd: Fitted model exposing ``predict(user_id, movie_id)`` whose result
            has an ``est`` attribute.
        min_rating: Lower bound of the rating scale (for normalisation).
        max_rating: Upper bound of the rating scale (for normalisation).
        alpha: Weight of the CF score; ``1 - alpha`` is the CB weight.
        n_recs: Number of recommendations to return.
        n_candidates: Number of top CF predictions to re-rank.

    Returns:
        List of ``(title, movieId)`` tuples, best first. Empty when the user
        has no ratings or the ratings frame lacks a required column.
    """
    try:
        user_rows = ratings_df.loc[ratings_df['userId'] == user_id, ['movieId', 'rating']]
    except KeyError as exc:
        # A KeyError here means a column is missing, not that the user is
        # unknown (an unknown user simply yields an empty selection below).
        print(f"Error: ratings data is missing a required column ({exc}).")
        return []

    if user_rows.empty:
        print(f"User {user_id} has no ratings. Cannot generate recommendations.")
        return []

    user_ratings = user_rows['rating']
    # Set membership keeps the unseen-movie filter linear instead of scanning
    # an array of rated ids once per catalogue movie.
    rated_movies = set(user_rows['movieId'])

    # Cap the threshold at 4.0 so generous raters are not left without any
    # "liked" movies to build the content profile from.
    rating_threshold = min(np.percentile(user_ratings, 80), 4.0)
    top_rated_movies = user_rows.loc[user_ratings >= rating_threshold, 'movieId'].tolist()

    top_movie_indices = [indices[movie_id] for movie_id in top_rated_movies if movie_id in indices]

    unseen_movies = [m for m in movies_df['movieId'].unique() if m not in rated_movies]

    print(f"Predicting ratings for {len(unseen_movies)} unseen movies...")
    cf_candidates = [
        (movie_id, svd.predict(user_id, movie_id).est) for movie_id in unseen_movies
    ]
    cf_candidates.sort(key=lambda x: x[1], reverse=True)
    top_n_cf = cf_candidates[:n_candidates]

    if not top_movie_indices:
        print("User has no top-rated movies for content matching. Falling back to pure CF.")
        hybrid_recs = list(top_n_cf)  # Score is just the CF score
    else:
        print(f"Re-ranking top {n_candidates} candidates using content data...")
        # A degenerate scale (all ratings identical) would divide by zero; the
        # CF term then carries no information, so it contributes 0.
        rating_span = max_rating - min_rating
        hybrid_recs = []
        for movie_id, cf_score in top_n_cf:
            if movie_id not in indices:
                continue  # Movie has no content data

            sim_scores = cosine_sim[indices[movie_id], top_movie_indices]
            norm_cb = sim_scores.mean()  # used as-is, no rescaling applied
            norm_cf = (cf_score - min_rating) / rating_span if rating_span else 0.0

            hybrid_recs.append((movie_id, (alpha * norm_cf) + ((1 - alpha) * norm_cb)))

    hybrid_recs.sort(key=lambda x: x[1], reverse=True)

    movie_id_to_title = dict(zip(movies_df['movieId'], movies_df['title']))
    return [
        (movie_id_to_title.get(mid, "Unknown Movie"), mid)
        for mid, _ in hybrid_recs[:n_recs]
    ]


def main():
    """Run the end-to-end demo for the user with the most ratings.

    Loads the CSV data, builds the content-similarity artefacts, trains the
    SVD model, then prints that user's five highest-rated movies followed by
    their top 10 hybrid recommendations. Returns early (printing a message
    where applicable) when the data cannot be loaded or contains no users.
    """
    divider = "=" * 50

    print(divider)
    print("Loading data...")
    ratings_df, movies_df, tags_df = load_data()
    if ratings_df is None:
        return
    print(f"Loaded {len(ratings_df)} ratings, {len(movies_df)} movies, {len(tags_df)} tags.")

    # Copies are handed to the pipeline stages so the loaded frames stay pristine.
    print(divider)
    cosine_sim, indices, movies_df_processed = preprocess_content_data(movies_df.copy(), tags_df.copy())

    print(divider)
    svd, min_r, max_r = train_cf_model(ratings_df.copy())

    user_counts = ratings_df['userId'].value_counts()
    if user_counts.empty:
        print("No users found in ratings data.")
        return

    # value_counts() sorts descending, so the first label is the busiest user.
    example_user_id = user_counts.index[0]

    print(divider)
    print(f"\n--- DEMONSTRATION: User {example_user_id} ---")

    top_5 = (
        ratings_df[ratings_df['userId'] == example_user_id]
        .merge(movies_df_processed, on='movieId', how='left')
        .sort_values('rating', ascending=False)
        .head(5)
    )

    print(f"\nUser {example_user_id}'s Top 5 Rated Movies (for context):")
    for title, rating in zip(top_5['title'], top_5['rating']):
        print(f"  - {title} (Rating: {rating})")

    print("\nCalculating hybrid recommendations...")
    recommender_kwargs = dict(
        user_id=example_user_id,
        movies_df=movies_df_processed,
        ratings_df=ratings_df,
        indices=indices,
        cosine_sim=cosine_sim,
        svd=svd,
        min_rating=min_r,
        max_rating=max_r,
        alpha=HYBRID_ALPHA,
        n_recs=10,
        n_candidates=100,
    )
    start_time = time.time()
    recommendations = get_hybrid_recommendations(**recommender_kwargs)
    end_time = time.time()
    print(f"Recommendation generation took {end_time - start_time:.2f} seconds.")

    print(f"\nTop 10 Hybrid Recommendations for User {example_user_id} (Alpha={HYBRID_ALPHA}):")
    if recommendations:
        for rank, (title, mid) in enumerate(recommendations, start=1):
            print(f"  {rank:2}. {title} (MovieID: {mid})")
    else:
        print("No recommendations could be generated.")
    print(divider)

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Loading data...
Loaded 10000 ratings, 27278 movies, 465564 tags.
Starting content pre-processing...
Content pre-processing complete.
Training Collaborative Filtering (SVD) model...
Rating scale detected: 1.0 to 5.0
SVD model training complete. Time taken: 0.11 seconds.

--- DEMONSTRATION: User 45989 ---

User 45989's Top 5 Rated Movies (for context):
  - Casino (1995) (Rating: 5.0)
  - Rob Roy (1995) (Rating: 5.0)
  - Dances with Wolves (1990) (Rating: 5.0)
  - Braveheart (1995) (Rating: 5.0)
  - Clueless (1995) (Rating: 5.0)

Calculating hybrid recommendations...
Predicting ratings for 27156 unseen movies...
Re-ranking top 100 candidates using content data...
Recommendation generation took 0.24 seconds.

Top 10 Hybrid Recommendations for User 45989 (Alpha=0.7):
   1. Sense and Sensibility (1995) (MovieID: 17)
   2. Schindler's List (1993) (MovieID: 527)
   3. Piano, The (1993) (MovieID: 509)
   4. American President, The (1995) (MovieID: 11)
   5. Like Water for Chocolate (Como agua p

In [None]:
"""
Excellent question — this is a Hybrid Recommender System, which combines Collaborative Filtering (CF) and Content-Based Filtering (CBF) to generate personalized movie recommendations.
Let’s break it down carefully step by step. 👇

🔍 1. Concept Used: Hybrid Recommendation System

This code merges two fundamental recommendation approaches:

🧩 (a) Collaborative Filtering (CF)

Idea: Recommend items based on user–item interactions (ratings).

Assumption: Users who agreed in the past will agree again in the future.

Technique Used: SVD (Singular Value Decomposition) from the surprise library.

CF learns latent factors for users and items that represent abstract preferences (e.g., "likes sci-fi", "enjoys action").

🧠 (b) Content-Based Filtering (CBF)

Idea: Recommend items similar in content to what the user liked before.

Technique Used:

TF-IDF Vectorization of movie genres and tags.

Cosine Similarity to find how similar movies are.

Example: If the user liked “Toy Story”, CBF will suggest similar “Animation | Adventure | Family” movies.

⚖️ (c) Hybridization (Combination)

The code combines CF and CBF scores using a weight parameter alpha:


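HybridScore = α × CF_score + (1 − α) × CB_score

Here CF_score is the min-max-normalized predicted rating and CB_score is the mean cosine similarity between the candidate and the user's top-rated movies.

alpha = 1.0 → Pure CF

alpha = 0.0 → Pure Content-Based

Default: alpha = 0.7 → 70% Collaborative, 30% Content-based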

This approach balances between behavioral patterns and item similarity.

⚙️ 2. How the Code Works Step-by-Step
Step 1️⃣ — Load Data
ratings_df, movies_df, tags_df = load_data()


Reads 3 CSVs: ratings, movies, tags.

Removes unnecessary columns.

ratings_df: userId, movieId, rating

movies_df: movieId, title, genres

tags_df: userId, movieId, tag

Step 2️⃣ — Preprocess for Content-Based Filtering
cosine_sim, indices, movies_df = preprocess_content_data(movies_df, tags_df)


Cleans up genres (Action|Drama → Action Drama).

Aggregates all tags per movie into a single text field.

Merges genres + tags → combined content string.

TF-IDF converts text → numerical vector representation.

Cosine similarity creates an NxN similarity matrix (movie vs. movie).

🔹 Each cell (i, j) shows how similar movie i is to movie j.

Step 3️⃣ — Train Collaborative Filtering (SVD)
svd, min_r, max_r = train_cf_model(ratings_df)


Uses Surprise library’s SVD:

Learns latent features of users and movies.

Trains on entire rating dataset.

After training:

The model can predict missing ratings for unseen movies.

Step 4️⃣ — Generate Recommendations
get_hybrid_recommendations(...)

a. User Profile Creation

Finds which movies the user rated.

Takes the top-rated movies: those the user rated at or above min(the user’s 80th-percentile rating, 4.0).

Uses these movies as a content preference reference.

b. Collaborative Filtering Prediction

Predicts CF-based ratings for all unseen movies.

Keeps top 100 candidates with the highest predicted ratings.

c. Content-Based Re-ranking

For each candidate movie:

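Calculates similarity with user’s top-rated movies (from cosine matrix).

Gets mean similarity = CB score.

Min-max normalizes the CF score to [0, 1]; the CB score is used as-is, because cosine similarity between TF-IDF vectors already lies in [0, 1].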

Computes final Hybrid score using the alpha weight.

d. Return Final Top-N

Sorts by hybrid score.

Returns the top 10 movies with titles and IDs.

Step 5️⃣ — Example Execution

The main() function:

Loads data, preprocesses, trains CF, and runs one example user.

Displays:

Top 5 movies the user rated.

Top 10 hybrid recommendations.

📊 3. Metrics / Algorithms Used
Concept	Metric / Algorithm	Purpose
Content-Based	TF-IDF (Term Frequency–Inverse Document Frequency)	Convert text (genres + tags) into weighted numerical vectors
Content-Based	Cosine Similarity	Measure similarity between movies
Collaborative Filtering	SVD (Matrix Factorization)	Learn latent user-item preferences
Hybridization	Weighted Score (α × CF + (1 − α) × CB)	Combine strengths of both systems
Normalization	Min-Max Scaling of CF scores	Bring CF and CB scores to comparable scale
🧪 4. Is It Giving Correct Recommendations?

✅ Yes, mostly correct — conceptually.
Here’s why:

✔️ Strengths:

CF captures global trends — users with similar taste patterns.

CB ensures personalization — based on user’s favorite genres/tags.

Hybrid approach solves problems of:

Cold-start for new users/items.

Overspecialization (CB-only).

Lack of diversity (CF-only).

TF-IDF + cosine similarity ensures textual relevance.

⚠️ Possible Limitations:

No explicit evaluation metric (e.g., RMSE, Precision@K).

The script only demonstrates recommendations, not performance accuracy.

Unoptimized α value — fixed at 0.7.

Could be tuned via validation for best results.

Small dataset → noise in similarity scores.

TF-IDF text quality depends on tags’ richness.

Cold start for completely new users (no ratings) still unsolved.

So, while the logic is solid, the accuracy depends on:

Data quality (enough ratings and tags),

Correct preprocessing,

Proper parameter tuning (α, TF-IDF params, SVD factors).

💡 5. Summary (In Simple Words)
Step	Technique	Purpose
Data Loading	CSV ‚Üí DataFrame	Input for the system
Content-Based	TF-IDF + Cosine	Find similar movies
Collaborative	SVD	Predict unseen ratings
Hybrid	Weighted combination	Mix both approaches
Output	Sorted Top-N	Personalized movie list

Formula:

Hybrid Score = 0.7 × CF + 0.3 × CB

✅ If the data is good → recommendations are realistic and relevant.
❌ If none of the user’s top-rated movies has content data → fallback to pure CF (less personalized suggestions).

Would you like me to visualize how the hybrid combination changes recommendations (e.g., by comparing CF-only vs CB-only vs Hybrid for one user)?
It will help you see how α impacts final results.
"""