In [30]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

In [31]:
# Load the four CSV tables from the notebook's working directory.
# NOTE(review): file and column names look like the MovieLens layout
# (links / movies / ratings / tags) -- confirm the dataset source and version.
df_links    = pd.read_csv('./links.csv')
df_movies   = pd.read_csv('./movies.csv')
df_ratings  = pd.read_csv('./ratings.csv')
df_tags     = pd.read_csv('./tags.csv')

In [32]:
# Preview the first rows of the movieId -> imdbId / tmdbId link table
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [33]:
# NOTE: DataFrame.size is the total number of cells (rows * columns), not the row count;
# use .shape or len(df) when the number of rows is what is wanted.
df_links.size

29226

In [34]:
# Preview the first rows of the movie catalogue (movieId, title, pipe-separated genres)
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [35]:
# NOTE: DataFrame.size is the total number of cells (rows * columns), not the row count;
# use .shape or len(df) when the number of rows is what is wanted.
df_movies.size

29226

In [36]:
# Preview the first rows of the long-format ratings table (one row per userId/movieId rating)
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [37]:
# NOTE: DataFrame.size is the total number of cells (rows * columns), not the row count;
# use .shape or len(df) when the number of rows is what is wanted.
df_ratings.size

403344

In [38]:
# Preview the first rows of the free-text tag table (one row per userId/movieId tag)
df_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [39]:
# NOTE: DataFrame.size is the total number of cells (rows * columns), not the row count;
# use .shape or len(df) when the number of rows is what is wanted.
df_tags.size

14732

In [40]:
# Reshape the long ratings table into a dense userId x movieId matrix.
# pivot_table averages duplicate (userId, movieId) pairs by default; movies a user
# has not rated come out as NaN and are replaced with 0.
user_item_matrix = (
    df_ratings
    .pivot_table(values='rating', index='userId', columns='movieId')
    .fillna(0)
)
user_item_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# Item-item cosine similarity: every movie is described by its column of user ratings,
# so the matrix is transposed to put movies on the rows before comparing them.
# The result is a dense (n_movies x n_movies) array -- roughly 0.75 GB for 9,724 movies,
# assuming float64.
item_similarity = cosine_similarity(user_item_matrix.transpose())
item_similarity

array([[1.        , 0.41056206, 0.2969169 , ..., 0.        , 0.        ,
        0.        ],
       [0.41056206, 1.        , 0.28243799, ..., 0.        , 0.        ,
        0.        ],
       [0.2969169 , 0.28243799, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [43]:
# Wrap the raw similarity array in a DataFrame labelled by movieId on both axes,
# so similarities can be looked up by movie id rather than by position.
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=user_item_matrix.columns,
    columns=user_item_matrix.columns,
)

In [44]:
# Keep the top-left 5x5 corner of the similarity table and the (users, movies)
# dimensions of the rating matrix for a quick sanity check
item_similarity_df_sample = item_similarity_df.iloc[0:5, 0:5]
user_item_matrix_shape = user_item_matrix.shape

In [45]:
# Show both values at once; as a tuple they are rendered with their plain-text repr
(user_item_matrix_shape, item_similarity_df_sample)


((610, 9724),
 movieId         1         2         3         4         5
 movieId                                                  
 1        1.000000  0.410562  0.296917  0.035573  0.308762
 2        0.410562  1.000000  0.282438  0.106415  0.287795
 3        0.296917  0.282438  1.000000  0.092406  0.417802
 4        0.035573  0.106415  0.092406  1.000000  0.188376
 5        0.308762  0.287795  0.417802  0.188376  1.000000)

In [46]:
# Define the recommendation function
def recommend_movies(user_id, user_item_matrix, item_similarity_df, top_n=5, movies_df=None):
    """Recommend unseen movies for a user with item-based collaborative filtering.

    Each candidate movie is scored by the sum of its similarities to every movie
    the user has rated; the ``top_n`` highest-scoring movies the user has not
    rated yet are returned.

    Parameters
    ----------
    user_id
        Row label of the user in ``user_item_matrix``.
    user_item_matrix : pandas.DataFrame
        Users on the rows, movie ids on the columns; a value > 0 means "rated".
    item_similarity_df : pandas.DataFrame
        Square movie-by-movie similarity table labelled by movie id on both axes.
    top_n : int, default 5
        Maximum number of recommendations to return.
    movies_df : pandas.DataFrame, optional
        Movie metadata with a ``'movieId'`` column. When omitted, the
        notebook-level ``df_movies`` frame is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies_df`` for the recommended movies, ordered from the
        highest to the lowest similarity score, keeping their original index
        labels. Empty when the user has no rating above zero.

    Raises
    ------
    KeyError
        If ``user_id`` is not a row label of ``user_item_matrix``.
    """
    if movies_df is None:
        # Backward-compatible fallback to the frame loaded at the top of the notebook.
        movies_df = df_movies

    # Movies the user has rated (the matrix stores 0 for "not rated").
    user_ratings = user_item_matrix.loc[user_id]
    rated_movies = user_ratings[user_ratings > 0].index.tolist()

    if not rated_movies:
        # Nothing to base a recommendation on: every score would be 0 and the
        # "top" movies would be arbitrary, so return no recommendations.
        return movies_df.iloc[0:0]

    # Aggregate similarity of every movie to the user's rated movies.
    sim_scores = item_similarity_df[rated_movies].sum(axis=1)

    # Never recommend something the user has already rated.
    sim_scores = sim_scores.drop(index=rated_movies)

    # Best-scoring candidates, best first.
    recommended_movie_ids = sim_scores.sort_values(ascending=False).head(top_n).index

    # `isin` returns rows in movies_df order, which would discard the ranking;
    # reorder the matched rows by each movie's position in the ranked id list.
    recommended_movies = movies_df[movies_df['movieId'].isin(recommended_movie_ids)]
    rank_of = {movie_id: position for position, movie_id in enumerate(recommended_movie_ids)}
    row_order = recommended_movies['movieId'].map(rank_of).to_numpy().argsort(kind='stable')

    return recommended_movies.iloc[row_order]

In [48]:
# Example: top five recommendations for user 2
user_id_example = 2
recommended_movies_example = recommend_movies(
    user_id=user_id_example,
    user_item_matrix=user_item_matrix,
    item_similarity_df=item_similarity_df,
    top_n=5,
)
recommended_movies_example

Unnamed: 0,movieId,title,genres
2226,2959,Fight Club (1999),Action|Crime|Drama|Thriller
5917,33794,Batman Begins (2005),Action|Crime|IMAX
6743,59315,Iron Man (2008),Action|Adventure|Sci-Fi
7043,69122,"Hangover, The (2009)",Comedy|Crime
7413,80463,"Social Network, The (2010)",Drama
