In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

In [3]:
# Directory holding the MovieLens "ml-latest-small" CSV files.
# Set the MOVIELENS_DIR environment variable to read from another location;
# when it is unset, the original hard-coded directory is used.
DATA_DIR = Path(os.environ.get(
    'MOVIELENS_DIR',
    '/Users/priyankamalavade/Desktop/movie_recommendation_system/ml-latest-small',
))

# One row per (userId, movieId) rating, and one row per movie with title and genres
ratings_df = pd.read_csv(DATA_DIR / 'ratings.csv')
movies_df = pd.read_csv(DATA_DIR / 'movies.csv')

In [4]:
# Preview the first rows of each raw table, each under its own caption
print("Ratings DataFrame:", ratings_df.head(), sep="\n")
print("\nMovies DataFrame:", movies_df.head(), sep="\n")

Ratings DataFrame:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

Movies DataFrame:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


# Merging the DataFrames

In [5]:
# Attach title and genres to every rating by joining on the shared 'movieId' key
# (inner join, which is the default for merge)
movie_ratings = ratings_df.merge(movies_df, how='inner', on='movieId')

# Peek at the combined table
print(movie_ratings.head())

   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  


### Creating a User-Item Matrix

**What is a user-item matrix?**

A user-item matrix is a table where:

* Rows represent the unique users.

* Columns represent the unique movies.

* The values at each intersection are the ratings that a user gave to a movie.

* If a user has not rated a specific movie, the value is typically NaN (Not a Number) or zero.



In [6]:
# Reshape the long ratings table into a wide users x titles grid.
# Cells hold the rating (mean-aggregated, the pivot_table default); unrated cells stay NaN.
user_movie_matrix = movie_ratings.pivot_table(
    values='rating',
    index='userId',
    columns='title',
    aggfunc='mean',
)

# Show the top of the grid
print(user_movie_matrix.head())

title   '71 (2014)  'Hellboy': The Seeds of Creation (2004)  \
userId                                                        
1              NaN                                      NaN   
2              NaN                                      NaN   
3              NaN                                      NaN   
4              NaN                                      NaN   
5              NaN                                      NaN   

title   'Round Midnight (1986)  'Salem's Lot (2004)  \
userId                                                
1                          NaN                  NaN   
2                          NaN                  NaN   
3                          NaN                  NaN   
4                          NaN                  NaN   
5                          NaN                  NaN   

title   'Til There Was You (1997)  'Tis the Season for Love (2015)  \
userId                                                               
1                             Na

# Finding User Similarities

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Cosine similarity cannot handle NaN, so treat "not rated" as 0
user_movie_matrix_filled = user_movie_matrix.fillna(value=0)

In [9]:
# Pairwise cosine similarity between the users' rating vectors (rows of the filled matrix)
user_similarity = cosine_similarity(user_movie_matrix_filled)

# Label both axes with user ids so similarities can be looked up by user
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_movie_matrix.index,
    columns=user_movie_matrix.index,
)

# Show the top-left corner of the similarity table
print(user_similarity_df.head())

userId       1         2         3         4         5         6         7    \
userId                                                                         
1       1.000000  0.027283  0.059720  0.194395  0.129080  0.128152  0.158744   
2       0.027283  1.000000  0.000000  0.003726  0.016614  0.025333  0.027585   
3       0.059720  0.000000  1.000000  0.002251  0.005020  0.003936  0.000000   
4       0.194395  0.003726  0.002251  1.000000  0.128659  0.088491  0.115120   
5       0.129080  0.016614  0.005020  0.128659  1.000000  0.300349  0.108342   

userId       8         9         10   ...       601       602       603  \
userId                                ...                                 
1       0.136968  0.064263  0.016875  ...  0.080554  0.164455  0.221486   
2       0.027257  0.000000  0.067445  ...  0.202671  0.016866  0.011997   
3       0.004941  0.000000  0.000000  ...  0.005048  0.004892  0.024992   
4       0.062969  0.011361  0.031163  ...  0.085938  0.128273  0

# Generating Recommendations for a Target User

In [10]:
# Recommendation settings: neighbourhood size, and the user to recommend for
num_similar_users = 10
target_user_id = 1

In [11]:
# Rank every *other* user by cosine similarity to the target user and keep the top N.
# The target user is removed by label rather than by position: after sorting, another
# user with an identical rating vector can tie at 1.0 and occupy the first slot, in
# which case slicing off position 0 would discard the wrong row.
similar_users = (
    user_similarity_df.loc[target_user_id]
    .drop(labels=target_user_id)
    .sort_values(ascending=False)
    .head(num_similar_users)
)
print(f"Top {num_similar_users} similar users to User {target_user_id}:")
print(similar_users)


Top 10 similar users to User 1:
userId
266    0.357408
313    0.351562
368    0.345127
57     0.345034
91     0.334727
469    0.330664
39     0.329782
288    0.329700
452    0.328048
45     0.327922
Name: 1, dtype: float64


In [12]:
# Titles the target user has already rated (non-NaN cells in their row);
# these are excluded from the recommendations later
movies_watched_by_user = user_movie_matrix.loc[target_user_id].dropna().index.tolist()


In [13]:
# Rows of the user-item matrix for the selected neighbours, all movie columns kept
similar_users_ratings = user_movie_matrix.loc[similar_users.index, :]


In [14]:
# Score each movie the target user has not rated with the similarity-weighted
# mean of the neighbours' ratings.
# NOTE: a movie rated by only one neighbour receives exactly that neighbour's rating,
# so single-rater titles can dominate the top of the ranking.
watched_titles = set(movies_watched_by_user)  # set lookup instead of scanning a list per movie
recommendations = {}
for movie in similar_users_ratings.columns:
    if movie in watched_titles:
        continue

    # Keep only the neighbours who actually rated this movie
    neighbour_ratings = similar_users_ratings[movie].dropna()
    if neighbour_ratings.empty:
        continue

    # Similarity of each of those neighbours to the target user, used as weights
    neighbour_weights = similar_users.loc[neighbour_ratings.index]
    weight_total = neighbour_weights.sum()
    if weight_total <= 0:
        # All weights are zero: the weighted mean is undefined, so skip the movie
        # instead of storing NaN/inf as a predicted rating
        continue

    recommendations[movie] = (neighbour_ratings * neighbour_weights).sum() / weight_total


In [15]:
# Turn the {title: predicted rating} mapping into a table ordered from best to worst
recommendations_df = (
    pd.DataFrame(list(recommendations.items()), columns=['Title', 'Predicted_Rating'])
    .sort_values(by='Predicted_Rating', ascending=False)
)


In [16]:
# Show the ten highest-scoring titles
print("\nTop 10 Movie Recommendations:", recommendations_df.head(10), sep="\n")


Top 10 Movie Recommendations:
                                                  Title  Predicted_Rating
1669                                     Yojimbo (1961)               5.0
1594                                         UHF (1989)               5.0
1620                         Waiting for Guffman (1996)               5.0
1625  Wallace & Gromit: The Best of Aardman Animatio...               5.0
1608                                Verdict, The (1982)               5.0
1575                                     Tremors (1990)               5.0
1577                     Trial, The (Procès, Le) (1962)               5.0
1587               Twin Peaks: Fire Walk with Me (1992)               5.0
1562                               Touch of Evil (1958)               5.0
1282                    River Runs Through It, A (1992)               5.0


# Evaluating the Model's Performance

* We fill missing ratings with 0 using `fillna(0)`. Real ratings range from 0.5 to 5.0, so 0 can safely act as a "not rated" placeholder, and it gives the similarity calculation a fully numeric matrix to work with.

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the ratings for testing; the fixed seed makes the split repeatable
train_df, test_df = train_test_split(movie_ratings, test_size=0.2, random_state=42)

# Build a users x titles matrix for each split, with 0 standing in for "not rated"
train_user_movie_matrix, test_user_movie_matrix = (
    split.pivot_table(index='userId', columns='title', values='rating').fillna(0)
    for split in (train_df, test_df)
)

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# User-to-user cosine similarity computed from the training ratings only
train_user_similarity = cosine_similarity(train_user_movie_matrix)

# Label both axes with the training user ids
train_user_similarity_df = pd.DataFrame(
    data=train_user_similarity,
    index=train_user_movie_matrix.index,
    columns=train_user_movie_matrix.index,
)

In [20]:
# A function to predict a single rating for a user and movie
def predict_rating(user_id, movie_title, ratings_matrix=None, similarity_df=None):
    """Predict one rating as a similarity-weighted average of other users' ratings.

    Parameters
    ----------
    user_id
        Label of the target user in ``similarity_df``.
    movie_title
        Column label of the movie in ``ratings_matrix``.
    ratings_matrix : pandas.DataFrame, optional
        Users x titles matrix in which 0 means "not rated". Defaults to the
        notebook-level ``train_user_movie_matrix``.
    similarity_df : pandas.DataFrame, optional
        User x user similarity table indexed like ``ratings_matrix``. Defaults
        to the notebook-level ``train_user_similarity_df``.

    Returns
    -------
    float
        The weighted average of the ratings given by users who rated the
        movie; their plain mean when the similarities do not sum to a positive
        value; ``nan`` when nobody rated the movie.

    Raises
    ------
    KeyError
        If ``movie_title`` or ``user_id`` is not present in the matrices.
    """
    if ratings_matrix is None:
        ratings_matrix = train_user_movie_matrix
    if similarity_df is None:
        similarity_df = train_user_similarity_df

    # Named title_ratings (not movie_ratings) so the notebook-level merged
    # DataFrame of the same name is not shadowed inside the function.
    title_ratings = ratings_matrix[movie_title]
    user_similarities = similarity_df[user_id]

    # 0 is the "not rated" placeholder, so only strictly positive cells are ratings
    rated_mask = title_ratings > 0
    if not rated_mask.any():
        return np.nan

    ratings = title_ratings[rated_mask]
    similarities = user_similarities.loc[ratings.index]

    total_similarity = similarities.sum()
    if total_similarity > 0:
        return float(np.dot(ratings, similarities) / total_similarity)

    # No usable weights: fall back to the unweighted mean of the existing ratings
    return float(ratings.mean())

# Predict every held-out rating whose user and movie both exist in the training matrix
predicted_ratings = []
actual_ratings = []

# Walk only the three needed columns; iterrows() would build a full Series per row
for user_id, movie_title, actual_rating in zip(test_df['userId'], test_df['title'], test_df['rating']):
    # Users or movies absent from the training matrix cannot be scored; skip them
    if user_id in train_user_movie_matrix.index and movie_title in train_user_movie_matrix.columns:
        predicted_ratings.append(predict_rating(user_id, movie_title))
        actual_ratings.append(actual_rating)

# Drop the pairs for which no prediction could be made (NaN), filtering both
# lists in a single pass so actuals and predictions stay aligned
scored_pairs = [
    (actual, pred)
    for actual, pred in zip(actual_ratings, predicted_ratings)
    if not np.isnan(pred)
]
if not scored_pairs:
    raise ValueError("No test ratings could be predicted; cannot compute RMSE.")
valid_actuals = [actual for actual, _ in scored_pairs]
valid_predictions = [pred for _, pred in scored_pairs]

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(valid_actuals, valid_predictions))

# Print the result
print(f"The Root Mean Square Error (RMSE) of the model is: {rmse:.4f}")

The Root Mean Square Error (RMSE) of the model is: 0.9691


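On a rating scale of 0.5 to 5.0, an RMSE of 0.9691 means that the model's predictions are typically off by about 0.97 points (RMSE weights large errors more heavily than a plain average error would). This is an indicator that the model has learned meaningful patterns from the training data and can make reasonably accurate predictions on unseen data. It's a solid baseline for a recommendation system; comparing it against a simple reference, such as always predicting each movie's mean rating, would confirm how much the similarity weighting actually adds.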

