# Recommendation System for MoveLens Dataset using SVD

In [1]:
# Import libraries
import numpy as np
import pandas as pd

# To load the 'ratings' and 'movies' dataset after uploading them to Jupyter notebook

In [2]:
# Reading ratings file

ratings = pd.read_csv('ratings.csv', usecols=['userId','movieId','rating','timestamp'])

In [3]:
# Reading movies file

movies = pd.read_csv('movies.csv', usecols=['movieId','title','genres'])

In [4]:
# Print first five rows of movies datset

movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Print first five rows of ratings datset

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


# To find the unique number of users and movies in the 'ratings' dataset

In [6]:

n_users = ratings.userId.unique().shape[0]

n_movies = ratings.movieId.unique().shape[0]

print(f'Number of users = {n_users} and Number of movies = {n_movies}')

Number of users = 7120 and Number of movies = 14026


# To create a rating matrix for the 'ratings' dataset

In [7]:
Ratings = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
Ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,129350,129354,129428,129707,130052,130073,130219,130462,130490,130642
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# To install the scikit-surprise library for implementing SVD

### Run the following command in the Anaconda Prompt to install surprise package

In [8]:
# !conda install -c conda-forge scikit-surprise

In [11]:
# ! pip install scikit-surprise

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml): started
  Building wheel for scikit-surprise (pyproject.toml): finished with status 'error'
Failed to build scikit-surprise


  error: subprocess-exited-with-error
  
  Building wheel for scikit-surprise (pyproject.toml) did not run successfully.
  exit code: 1
  
  [115 lines of output]
  running bdist_wheel
  running build
  running build_py
  creating build\lib.win-amd64-cpython-39\surprise
  copying surprise\accuracy.py -> build\lib.win-amd64-cpython-39\surprise
  copying surprise\builtin_datasets.py -> build\lib.win-amd64-cpython-39\surprise
  copying surprise\dataset.py -> build\lib.win-amd64-cpython-39\surprise
  copying surprise\dump.py -> build\lib.win-amd64-cpython-39\surprise
  copying surprise\reader.py -> build\lib.win-amd64-cpython-39\surprise
  copying surprise\trainset.py -> build\lib.win-amd64-cpython-39\surprise
  copying surprise\utils.py -> build\lib.win-amd64-cpython-39\surprise
  copying surprise\__init__.py -> build\lib.win-amd64-cpython-39\surprise
  copying surprise\__main__.py -> build\lib.win-amd64-cpython-39\surprise
  creating build\lib.win-amd64-cpython-39\surprise\model_selectio

In [12]:
# Import libraries from Surprise package
from surprise import Reader, Dataset, SVD # There is also an algorithm  version SVD++ 
from surprise.model_selection import cross_validate

# Load Reader library
reader = Reader()

# Load ratings dataset with Dataset library
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Use the SVD algorithm.
svd = SVD()

# Compute the RMSE of the SVD algorithm.
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8437  0.8464  0.8446  0.8449  0.0011  
MAE (testset)     0.6461  0.6486  0.6465  0.6471  0.0011  
Fit time          9.85    11.21   10.16   10.41   0.58    
Test time         3.86    4.26    3.62    3.91    0.26    


{'test_rmse': array([0.8436695 , 0.84637293, 0.84459622]),
 'test_mae': array([0.64607972, 0.64858462, 0.64654286]),
 'fit_time': (9.846702814102173, 11.212627649307251, 10.16105604171753),
 'test_time': (3.8649182319641113, 4.2562339305877686, 3.615189790725708)}

In [13]:
# Print the head of ratings dataset
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


# To find all the movies rated as more than 4 stars by user with userId = 1

In [14]:
ratings_1 = ratings[(ratings['userId'] == 1) & (ratings['rating'] >= 4)]
ratings_1 = ratings_1.set_index('movieId')
ratings_1 = ratings_1.join(movies)['title']
ratings_1.head(10)

movieId
151                                 Batman Forever (1995)
223                                      Dream Man (1995)
253                                         Junior (1994)
260                              Ladybird Ladybird (1994)
293                                   Pulp Fiction (1994)
296                                         Priest (1994)
318     Strawberry and Chocolate (Fresa y chocolate) (...
541                                          Harem (1985)
1036                                          Jude (1996)
1079                                       Top Gun (1986)
Name: title, dtype: object

# Train an SVD to predict ratings for user with userId = 1

In [15]:
# Create a shallow copy for the movies dataset
user_1 = movies.copy()

#Reset the index for user_1 dataset
user_1 = user_1.reset_index()



# getting full dataset
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)


#create a training set for svd
trainset = data.build_full_trainset()
svd.fit(trainset)

#Predict the ratings for user1
user_1['Estimate_Score'] = user_1['movieId'].apply(lambda x: svd.predict(1, x).est)

#Drop extra columns from the user1 data frame
user_1 = user_1.drop(['movieId','genres','index'], axis = 1)

# Sort predicted ratings for user1 in descending order
user_1 = user_1.sort_values('Estimate_Score', ascending=False)

#Print top 10 recommendations
print(user_1.head(10))

                                                  title  Estimate_Score
5853      Lord of the Rings: The Two Towers, The (2002)        4.657694
4897  Lord of the Rings: The Fellowship of the Ring,...        4.655342
7041  Lord of the Rings: The Return of the King, The...        4.643417
3003                                     Matewan (1987)        4.492067
4912                 Witness for the Prosecution (1957)        4.483882
1049                            Murder, My Sweet (1944)        4.467392
6172          Day for Night (La Nuit Américaine) (1973)        4.464789
6501                                  Umberto D. (1952)        4.461589
9497               Sea Inside, The (Mar adentro) (2004)        4.452890
8937                    Decalogue, The (Dekalog) (1989)        4.450270
