In [1]:
# Core Libraries
import pandas as pd
import numpy as np

# For Collaborative Filtering
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, cross_validate
from surprise.accuracy import rmse

# For Content-Based Filtering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Visualization (optional)
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Read the movie metadata and the user ratings from the Kaggle input directory
movies = pd.read_csv('/kaggle/input/movielens-20m-dataset/movie.csv')
ratings = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv')

# Show a header line followed by the first rows of each frame
print("Movies Dataset:", movies.head(), sep="\n")
print("\nRatings Dataset:", ratings.head(), sep="\n")


Movies Dataset:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings Dataset:
   userId  movieId  rating            timestamp
0       1        2     3.5  2005-04-02 23:53:47
1       1       29     3.5  2005-04-02 23:31:16
2       1       32     3.5  2005-04-02 23:33:39
3       1       47     3.5  2005-04-02 23:32:07
4       1       50     3.5  2005-04-02 23:29:40


In [3]:

# Keep only the columns used downstream and normalise the genre strings.
# `.assign` builds a new frame rather than writing a column into a frame that
# was itself derived from a selection, which can trigger SettingWithCopyWarning.
movies = movies[['movieId', 'title', 'genres']].assign(
    # '|' is the regex alternation metacharacter and the default of `regex`
    # differs between pandas versions, so request a literal replacement
    # explicitly. Missing genres become the token 'unknown'.
    genres=lambda frame: (
        frame['genres'].fillna('unknown').str.replace('|', ' ', regex=False)
    )
)

# Preview the updated movies dataset
print(movies.head())


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure Animation Children Comedy Fantasy  
1                   Adventure Children Fantasy  
2                               Comedy Romance  
3                         Comedy Drama Romance  
4                                       Comedy  


In [4]:
# Attach title and genres to every rating row.
# Any previously merged 'title'/'genres' columns are dropped first so that
# re-running this cell yields the same frame instead of title_x/title_y
# duplicates. `validate` makes the merge fail loudly if `movies` ever holds
# duplicate movieId values, which would otherwise silently multiply ratings.
ratings = (
    ratings
    .drop(columns=['title', 'genres'], errors='ignore')
    .merge(movies, on='movieId', how='left', validate='many_to_one')
)

# Preview the merged dataset
print("Merged Ratings Dataset:")
print(ratings.head())


Merged Ratings Dataset:
   userId  movieId  rating            timestamp  \
0       1        2     3.5  2005-04-02 23:53:47   
1       1       29     3.5  2005-04-02 23:31:16   
2       1       32     3.5  2005-04-02 23:33:39   
3       1       47     3.5  2005-04-02 23:32:07   
4       1       50     3.5  2005-04-02 23:29:40   

                                               title  \
0                                     Jumanji (1995)   
1  City of Lost Children, The (Cité des enfants p...   
2          Twelve Monkeys (a.k.a. 12 Monkeys) (1995)   
3                        Seven (a.k.a. Se7en) (1995)   
4                         Usual Suspects, The (1995)   

                                   genres  
0              Adventure Children Fantasy  
1  Adventure Drama Fantasy Mystery Sci-Fi  
2                 Mystery Sci-Fi Thriller  
3                        Mystery Thriller  
4                  Crime Mystery Thriller  


In [5]:
# Tell Surprise the lower and upper bounds of the rating scale
reader = Reader(rating_scale=(0.5, 5.0))

# Hand Surprise the three columns in the order user, item, rating
data = Dataset.load_from_df(
    df=ratings.loc[:, ['userId', 'movieId', 'rating']],
    reader=reader,
)


In [6]:
# Hold out 20% of the ratings for testing; the fixed seed keeps the split repeatable
trainset, testset = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)


In [7]:
# Initialize the SVD model.
# The split above is seeded; seeding the model's random initialisation as well
# makes the learned factors (and every prediction printed below) repeatable
# across kernel restarts.
svd = SVD(random_state=42)

# Train the model on the training set
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ebc60490520>

In [8]:
# Compute the distinct IDs once and reuse them for both the preview and the
# membership checks; each .unique() call scans the whole ratings table.
known_user_ids = ratings['userId'].unique()
known_movie_ids = ratings['movieId'].unique()

print(f"User IDs: {known_user_ids[:10]}")  # Print the first 10 user IDs
print(f"Movie IDs: {known_movie_ids[:10]}")  # Print the first 10 movie IDs

# Example to verify if specific user_id and movie_id exist.
# user_id and movie_id stay defined at notebook level for the following cell.
user_id = 1
movie_id = 2
if user_id not in known_user_ids:
    print(f"User ID {user_id} does not exist in the dataset.")
if movie_id not in known_movie_ids:
    print(f"Movie ID {movie_id} does not exist in the dataset.")


User IDs: [ 1  2  3  4  5  6  7  8  9 10]
Movie IDs: [  2  29  32  47  50 112 151 223 253 260]


In [9]:
# Report IDs that never occur in the ratings table (cold-start cases).
# Only a hint is printed here; no fallback rating is actually computed.
user_is_known = ratings['userId'].eq(user_id).any()
movie_is_known = ratings['movieId'].eq(movie_id).any()

if not user_is_known:
    print(f"User ID {user_id} is new. Use average user ratings.")
if not movie_is_known:
    print(f"Movie ID {movie_id} is new. Use average movie ratings.")


In [10]:
# Import necessary libraries
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

# NOTE: this cell repeats the dataset construction, split and training already
# performed in earlier cells; the model fitted here is the one used by every
# later cell and the one that gets pickled.

# Load data into Surprise's format
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split into training and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train the SVD model.
# Seeding the model's random initialisation (the split is already seeded) makes
# the predictions and recommendations below repeatable across kernel restarts.
svd = SVD(random_state=42)
svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ebc7fd80c40>

In [11]:
# Report how many distinct users and items ended up in the training split
print(
    f"Number of users in training set: {trainset.n_users}",
    f"Number of items (movies) in training set: {trainset.n_items}",
    sep="\n",
)

# Translate the raw IDs (user 1, movie 2) into the trainset's inner IDs
internal_uid = trainset.to_inner_uid(1)
internal_iid = trainset.to_inner_iid(2)
print(f"Internal user ID: {internal_uid}, Internal movie ID: {internal_iid}")


Number of users in training set: 138493
Number of items (movies) in training set: 25857
Internal user ID: 37910, Internal movie ID: 570


In [12]:
# Ask the fitted model for its rating estimate for raw user 1 on raw movie 2
prediction = svd.predict(uid=1, iid=2)
print(f"Predicted Rating for User 1 and Movie 2: {prediction.est}")


Predicted Rating for User 1 and Movie 2: 3.733775644347758


In [13]:
# Mean of all ratings given to movie 2, for comparison with the model's estimate
avg_rating = ratings.loc[ratings['movieId'] == 2, 'rating'].mean()
print(f"Average Rating for Movie ID 2: {avg_rating}")


Average Rating for Movie ID 2: 3.2119768016904193


In [14]:
# Mean of all ratings given by user 1, for comparison with the model's estimate
avg_user_rating = ratings.loc[ratings['userId'] == 1, 'rating'].mean()
print(f"Average Rating Given by User ID 1: {avg_user_rating}")


Average Rating Given by User ID 1: 3.742857142857143


In [15]:
def recommend_movies(user_id, n=5):
    """Return the ``n`` movies with the highest predicted rating for a user.

    Reads the notebook-level ``ratings`` and ``movies`` DataFrames and the
    fitted ``svd`` model.

    Args:
        user_id: Raw user ID as it appears in ``ratings['userId']``.
        n: Number of recommendations to return.

    Returns:
        A list of ``(title, predicted_rating)`` tuples ordered by predicted
        rating, highest first, with ratings rounded to two decimals. A movie
        whose ID has no row in ``movies`` gets a placeholder title instead of
        raising ``IndexError``.
    """
    # Every movie that has at least one rating, in first-seen order
    all_movie_ids = ratings['movieId'].unique()

    # Movies this user has already rated
    rated_movies = ratings.loc[ratings['userId'] == user_id, 'movieId'].unique()

    # Vectorised membership test instead of a linear scan per movie; the
    # boolean mask keeps the original order, so ties sort exactly as before.
    unrated_movies = all_movie_ids[~np.isin(all_movie_ids, rated_movies)]

    # Predict a rating for every unrated movie and keep the n best
    predictions = [svd.predict(user_id, movie_id) for movie_id in unrated_movies]
    top_n = sorted(predictions, key=lambda pred: pred.est, reverse=True)[:n]

    # Resolve titles only for the selected IDs; the first row wins if an ID
    # appears more than once in `movies`.
    top_ids = [pred.iid for pred in top_n]
    title_by_id = (
        movies.loc[movies['movieId'].isin(top_ids)]
        .drop_duplicates('movieId')
        .set_index('movieId')['title']
    )

    return [
        (
            title_by_id.get(pred.iid, f"Unknown title (movieId={pred.iid})"),
            round(pred.est, 2),
        )
        for pred in top_n
    ]

# Fetch and display the five best-scoring unseen movies for user 1
recommendations = recommend_movies(user_id=1, n=5)

print("Top 5 Recommendations:")
for movie_title, predicted in recommendations:
    print(f"{movie_title}: Predicted Rating {predicted}")


Top 5 Recommendations:
Prime Suspect (1991): Predicted Rating 4.52
Frozen Planet (2011): Predicted Rating 4.48
Harry Potter and the Deathly Hallows: Part 2 (2011): Predicted Rating 4.46
Harry Potter and the Deathly Hallows: Part 1 (2010): Predicted Rating 4.45
Bleak House (2005): Predicted Rating 4.43


In [16]:
import pickle

# Serialise the fitted SVD model into the Kaggle working directory
with open('/kaggle/working/svd_model.pkl', mode='wb') as model_file:
    pickle.dump(svd, model_file)

print("Model saved as svd_model.pkl")


Model saved as svd_model.pkl


In [17]:
# Export the cleaned movies table and the merged ratings table, without the index column
movies.to_csv(path_or_buf='/kaggle/working/movies.csv', index=False)
ratings.to_csv(path_or_buf='/kaggle/working/ratings.csv', index=False)
