In [1]:
# Importing the necessary libraries
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
from fuzzywuzzy import fuzz, process
import os

In [2]:
# Loading the dataset
file_path = "dataset/"
movie_ratings = pd.read_csv(file_path + "ratings.csv")
movies = pd.read_csv(file_path + "movies.csv")

In [3]:
# Creating a newId for every movie and merging datasets
movies["newId"] = range(1, movies["movieId"].nunique() + 1)
movie_ratings["timestamp"] = movie_ratings["timestamp"].apply(lambda x: datetime.utcfromtimestamp(x).strftime("%Y-%m-%d"))
movie_ratings = movie_ratings.merge(movies, how="left", on="movieId")
movie_ratings["movieId"] = movie_ratings["newId"]
movies["movieId"] = movies["newId"]
movie_ratings.drop(["newId"], axis=1, inplace=True)
movies.drop(["newId"], axis=1, inplace=True)

  movie_ratings["timestamp"] = movie_ratings["timestamp"].apply(lambda x: datetime.utcfromtimestamp(x).strftime("%Y-%m-%d"))


In [4]:
# Train-test split

file_path = "dataset/"
if not os.path.isfile(file_path + "TrainData.pkl"):
    movie_ratings.iloc[:int(movie_ratings.shape[0] * 0.80)].to_pickle(file_path + "TrainData.pkl")
Train_Data = pd.read_pickle(file_path + "TrainData.pkl")

if not os.path.isfile(file_path + "TestData.pkl"):
    movie_ratings.iloc[int(movie_ratings.shape[0] * 0.80):].to_pickle(file_path + "TestData.pkl")
Test_Data = pd.read_pickle(file_path + "TestData.pkl")

In [5]:
# User-item sparse matrix creation
if not os.path.isfile(file_path + "TrainUISparseData.npz"):
    TrainUISparseData = sparse.csr_matrix((Train_Data.rating, (Train_Data.userId, Train_Data.movieId)))
    sparse.save_npz(file_path + "TrainUISparseData.npz", TrainUISparseData)
else:
    TrainUISparseData = sparse.load_npz(file_path + "TrainUISparseData.npz")

In [6]:
# Movie-Movie similarity computation
if not os.path.isfile(file_path + "m_m_similarity.npz"):
    m_m_similarity = cosine_similarity(TrainUISparseData.T, dense_output=False)
    sparse.save_npz(file_path + "m_m_similarity.npz", m_m_similarity)
else:
    m_m_similarity = sparse.load_npz(file_path + "m_m_similarity.npz")

In [7]:
# Movie recommendation function
def recommend(movie_name):
    movie_list_in_training = Train_Data.drop_duplicates(subset=["title"], keep="first")[["movieId", "title"]].reset_index(drop=True)
    
    # Use fuzzy matching to find the closest match to the entered movie name
    matches = process.extract(movie_name, movie_list_in_training["title"], scorer=fuzz.partial_ratio)
    
    if len(matches) == 0:
        return "No Match Found"
    
    # Get the movie ID of the best match
    movie_id = movie_list_in_training.iloc[matches[0][2]]["movieId"]
    
    # Get the top 10 similar movies
    similar_movie_id_list = np.argsort(-m_m_similarity[movie_id].toarray().ravel())[0:11]  # 10 similar movies + 1 (original movie)
    
    # Get the details of the similar movies
    sm_df = movie_list_in_training[movie_list_in_training["movieId"].isin(similar_movie_id_list)]
    
    # Assign order to the similar movies
    sm_df["order"] = sm_df.apply(lambda x: list(similar_movie_id_list).index(x["movieId"]), axis=1)
    
    return sm_df.sort_values("order")


In [8]:
recommend("harry potter")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sm_df["order"] = sm_df.apply(lambda x: list(similar_movie_id_list).index(x["movieId"]), axis=1)


Unnamed: 0,movieId,title,order
126,4801,Harry Potter and the Sorcerer's Stone (a.k.a. ...,0
140,5718,Harry Potter and the Chamber of Secrets (2002),1
168,7770,Harry Potter and the Prisoner of Azkaban (2004),2
1279,10601,Harry Potter and the Goblet of Fire (2005),3
803,5253,Spider-Man (2002),4
131,4898,"Lord of the Rings: The Fellowship of the Ring,...",5
119,4212,Shrek (2001),6
148,6430,Pirates of the Caribbean: The Curse of the Bla...,7
1123,4791,"Monsters, Inc. (2001)",8
142,5854,"Lord of the Rings: The Two Towers, The (2002)",9


In [9]:
import pickle
with open(file_path + 'similarity.pkl', 'wb') as file:
    pickle.dump(m_m_similarity, file)