In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error

In [3]:
# Folder that holds movies.csv and ratings.csv. Set the MOVIE_DATA_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original author's local download folder is used, so the files read
# are the same as before.
DATA_DIR = Path(os.environ.get("MOVIE_DATA_DIR", r'C:\Users\subha\Downloads'))

movies = pd.read_csv(DATA_DIR / "movies.csv")
ratings = pd.read_csv(DATA_DIR / "ratings.csv")

In [13]:
# Preview the first rows of the movies table to see its columns and value formats.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [17]:
# Summarize column dtypes and non-null counts of movies to check for missing values.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [19]:
# (number of rows, number of columns) of the movies table.
movies.shape

(9742, 3)

In [21]:
# Preview the first rows of the ratings table to see its columns and value formats.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [23]:
# Summarize column dtypes and non-null counts of ratings to check for missing values.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [25]:
# (number of rows, number of columns) of the ratings table.
ratings.shape

(100836, 4)

In [27]:
# Merge datasets
# Attach the movie columns (title, genres) to every rating row.
# how="left" keeps every rating even if its movieId has no row in `movies`.
# validate="many_to_one" makes pandas raise MergeError when `movies` contains a
# repeated movieId; without it such a repeat would silently duplicate rating
# rows and only surface later as a confusing failure when pivoting.
merged_df = ratings.merge(movies, on="movieId", how="left", validate="many_to_one")

In [29]:
# Build the user x movie rating table: one row per userId, one column per
# movieId, cell = that user's rating. Pairs with no rating come out as NaN.
user_movie_matrix = merged_df.pivot(values="rating", columns="movieId", index="userId")
# Sparse (CSR) copy handed to the factorization step; unrated cells are stored as 0.
sparse_matrix = csr_matrix(user_movie_matrix.fillna(value=0))

In [31]:
# Apply SVD
# Tunable settings, named so they can be changed in one place:
# the number of latent factors kept, and the seed passed as random_state so
# repeated runs give the same factorization.
N_LATENT_FACTORS = 20
RANDOM_SEED = 42

svd = TruncatedSVD(n_components=N_LATENT_FACTORS, random_state=RANDOM_SEED)
# One row of factor weights per row (user) of sparse_matrix.
user_factors = svd.fit_transform(sparse_matrix)
# One row per latent factor, one column per column (movie) of sparse_matrix.
movie_factors = svd.components_

In [33]:
# Predict ratings
# Multiply the user factors by the movie factors to get a dense score matrix
# whose rows and columns line up with those of user_movie_matrix.
predicted_ratings = np.matmul(user_factors, movie_factors)

In [35]:
# Compute accuracy using RMSE
actual_ratings = user_movie_matrix.fillna(0).values

# RMSE over every cell of the matrix, where unrated cells count as a true
# rating of 0. Kept unchanged for continuity, but when most cells are unrated
# this figure mostly reflects how well the zeros are reproduced, so it should
# not be read as an error in rating units.
rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
print(f"RMSE: {rmse:.4f}")

# RMSE restricted to the cells that hold a real rating: the error of the
# reconstruction on ratings users actually gave.
observed_mask = user_movie_matrix.notna().values
observed_errors = actual_ratings[observed_mask] - predicted_ratings[observed_mask]
rmse_observed = np.sqrt(np.mean(observed_errors ** 2))
print(f"RMSE (observed ratings only): {rmse_observed:.4f}")

# NOTE: both numbers are measured on the same data the SVD was fitted on, so
# they describe reconstruction quality, not accuracy on held-out ratings.

RMSE: 0.3520


In [37]:
# Recommend top movies for a given user
def recommend_movies(user_id, num_recommendations=5, exclude_rated=False):
    """Return the highest-scoring movie titles for one user, best first.

    Reads the notebook-level objects ``predicted_ratings`` (score matrix),
    ``user_movie_matrix`` (its row labels are user ids, its column labels are
    movie ids) and ``movies`` (movieId -> title lookup).

    Parameters
    ----------
    user_id
        A label from ``user_movie_matrix.index``. It is looked up by label, so
        ids do not have to be contiguous or start at 1.
    num_recommendations : int, default 5
        Maximum number of titles to return.
    exclude_rated : bool, default False
        When True, movies the user already has a rating for are skipped, so
        only unseen titles are suggested. The default keeps the original
        behavior of ranking every movie.

    Returns
    -------
    pandas.DataFrame
        A single ``title`` column, ordered from highest to lowest predicted
        score. Row labels are those of ``movies``. Movie ids with no row in
        ``movies`` are left out, so fewer rows than requested may come back.

    Raises
    ------
    KeyError
        If ``user_id`` is not a row label of ``user_movie_matrix``.
    """
    # Look the user up by label; `user_id - 1` silently picked the wrong row
    # for non-contiguous ids and wrapped around to the last user for id 0.
    try:
        user_index = user_movie_matrix.index.get_loc(user_id)
    except KeyError:
        raise KeyError(f"user_id {user_id!r} not found in user_movie_matrix") from None

    user_ratings = np.asarray(predicted_ratings[user_index], dtype=float)
    if exclude_rated:
        already_rated = user_movie_matrix.iloc[user_index].notna().to_numpy()
        # -inf guarantees rated movies sort behind every real score.
        user_ratings = np.where(already_rated, -np.inf, user_ratings)

    top_movie_indices = user_ratings.argsort()[::-1][:num_recommendations]
    if exclude_rated:
        # Drop masked entries that slipped in because too few unrated movies exist.
        top_movie_indices = top_movie_indices[np.isfinite(user_ratings[top_movie_indices])]

    ranked_movie_ids = user_movie_matrix.columns[top_movie_indices]
    # Position of each movie id in the ranking (0 = best).
    rank_by_movie = pd.Series(np.arange(len(ranked_movie_ids)), index=ranked_movie_ids)

    recommended_movies = movies[movies["movieId"].isin(ranked_movie_ids)]
    # The isin filter returns rows in `movies` order; re-sort them by rank so
    # the first row is the strongest recommendation.
    rank_order = recommended_movies["movieId"].map(rank_by_movie).to_numpy().argsort(kind="stable")
    return recommended_movies.iloc[rank_order][["title"]]

In [40]:
# Example usage
# Fetch five suggestions for the user with id 1, then print the resulting table.
top_picks = recommend_movies(user_id=1, num_recommendations=5)
print(top_picks)

                                                  title
224           Star Wars: Episode IV - A New Hope (1977)
898   Star Wars: Episode V - The Empire Strikes Back...
899                          Princess Bride, The (1987)
911   Star Wars: Episode VI - Return of the Jedi (1983)
1939                                 Matrix, The (1999)
