Downloading Dataset

In [23]:
!rm /kaggle/working/ml-25m.zip
!wget https://files.grouplens.org/datasets/movielens/ml-25m.zip

--2024-05-12 16:46:15--  https://files.grouplens.org/datasets/movielens/ml-25m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 261978986 (250M) [application/zip]
Saving to: 'ml-25m.zip'


2024-05-12 16:46:24 (28.8 MB/s) - 'ml-25m.zip' saved [261978986/261978986]



Unzipping the dataset

In [24]:
!rm -rf /kaggle/working/ml-25m
!unzip /kaggle/working/ml-25m.zip

Archive:  /kaggle/working/ml-25m.zip
   creating: ml-25m/
  inflating: ml-25m/tags.csv         
  inflating: ml-25m/links.csv        
  inflating: ml-25m/README.txt       
  inflating: ml-25m/ratings.csv      
  inflating: ml-25m/genome-tags.csv  
  inflating: ml-25m/genome-scores.csv  
  inflating: ml-25m/movies.csv       


In [25]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Loading movies.csv into pandas and creating a dictionary of movie ids and its corresponding names

In [26]:
movies = pd.read_csv('/kaggle/working/ml-25m/movies.csv')
movies = movies.dropna()
movieList = movies['movieId'].values.tolist()
idNameMap = dict(zip(movies['movieId'],movies['title']))

vectorising the combination of movie names and corresponding genres

In [27]:
movies['description'] = movies['title'] + ' ' + movies['genres']
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
movie_matrix = tfidf_vectorizer.fit_transform(movies['description'])

In [28]:
!pip install scikit-surprise



In [47]:
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import random 

reading movies with ratings between 0.5 and 5 which are rated by a user

In [30]:
reader = Reader(rating_scale=(0.5, 5.0))
ratings = pd.read_csv('/kaggle/working/ml-25m/ratings.csv')
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

finding U,V in the feedback matrix A through SVD so that the final rating will become the corresponding dot product of U,V

In [31]:
model = SVD()
trainset = data.build_full_trainset()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7c63b862a620>

Generating movie recommendations purely based on users history

In [32]:
def content_based(user):
    user_prefs = ratings.groupby('userId').get_group(user)['movieId'].values.tolist()
    user_data = [idNameMap[user_pref] for user_pref in user_prefs]
    user_description = ' '.join(user_data)
    user_vector = tfidf_vectorizer.transform([user_description])
    cos_score = cosine_similarity(user_vector, movie_matrix)
    movie_indices = cos_score.argsort()[0][::-1]
    watched_movies = set(user_data)
    recommended_movies = []
    for idx in movie_indices:
        title = idNameMap[movieList[idx-1]]
        if title not in watched_movies:
            recommended_movies.append(title)
        if len(recommended_movies) == 10:
            break
    return recommended_movies

Generating recommended movies based on users history and users watch history who are similar to the current user

In [33]:
def collaborative_filter_based(user):
    seen_movies = ratings.groupby('userId').get_group(user)['movieId'].values.tolist()
    unseen_movies = [m for m in movieList if m not in seen_movies]
    pred_rates = [model.predict(user, movie_id) for movie_id in unseen_movies]
    top_predictions = sorted(pred_rates, key=lambda x: x.est, reverse=True)[:10]
    recommended_movie_ids = [pred.iid for pred in top_predictions]
    recommended_movies = [idNameMap[idx] for idx in recommended_movie_ids]
    return recommended_movies

Checking the Working of both recommendation Systems

In [34]:
recommended_movies_filter = collaborative_filter_based(1)
recommended_movies_content = content_based(1)
print(recommended_movies_filter)
print(recommended_movies_content)

['Power of Nightmares, The: The Rise of the Politics of Fear (2004)', 'Black Mirror', 'Great Beauty, The (Grande Bellezza, La) (2013)', "It's Such a Beautiful Day (2011)", 'Century of the Self, The (2002)', 'Usual Suspects, The (1995)', 'American Beauty (1999)', 'Jeanne Dielman, 23 Quai du Commerce, 1080 Bruxelles (1975)', 'The Work of Director Chris Cunningham (2003)', 'I, Claudius (1976)']
["I'm with Lucy (2002)", 'Dr. Jekyll and Mr. Hyde (1941)', 'Jack and Jill (2011)', 'What a Mess! (1995)', '7 Years (2006)', 'Ready to Wear (Pret-A-Porter) (1994)', 'How Not to Make a Movie (2013)', 'A Trip (2012)', 'National Treasure (2004)', "Disney's Very Merry Christmas Sing Along Songs (1988)"]


Evaluation Metrics using min square error loss for both recommenders tfidf vectors

In [51]:
user_ids = ratings['userId'].values.tolist()
user_ids = random.choices(list(set(user_ids)),k = 100)
content_based_loss = 0
filter_based_loss = 0
users = ratings.groupby('userId')
for user in tqdm(user_ids):
  user_prefs = users.get_group(user)['movieId'].values.tolist()
  user_data = [idNameMap[user_pref] for user_pref in user_prefs]
  user_description = ' '.join(user_data)
  user_vector = tfidf_vectorizer.transform([user_description]).toarray()
  filter_based = collaborative_filter_based(user)
  filter_desc = ' '.join(filter_based)
  filter_vector = tfidf_vectorizer.transform([filter_desc]).toarray()
  con_based = content_based(user)
  con_desc = ' '.join(con_based)
  con_vector = tfidf_vectorizer.transform([con_desc]).toarray()
  content_based_loss += np.linalg.norm(user_vector - con_vector)
  filter_based_loss += np.linalg.norm(user_vector - filter_vector)


100%|██████████| 100/100 [07:05<00:00,  4.26s/it]


In [52]:
filter_based_loss = filter_based_loss/ len(user_ids)
content_based_loss = content_based_loss/len(user_ids)
print(f"Mean Square Error loss for content based system :{content_based_loss}")
print(f"Mean Square Error loss for collaborative based system :{filter_based_loss}")

Mean Square Error loss for content based system :1.2547805400435257
Mean Square Error loss for collaborative based system :1.367729111134473
