# Install Package

In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[K     |████████████████████████████████| 771 kB 5.5 MB/s eta 0:00:01
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=3367027 sha256=135636584a41b316571987786bce128689108af11f4e614ae629ca45c741c23b
  Stored in directory: /home/team12/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


# Import Packages

In [1]:
from surprise import Dataset
import numpy as np
import pandas as pd
from surprise import Reader
from surprise import SVD
from surprise.model_selection import GridSearchCV

# Dataset

In [2]:
df = pd.read_csv("/home/rwilson2/data/ratings.csv")[["user_id","movie","rating"]]

In [3]:
df.info()
print("rating range:",set(df["rating"]))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 907085 entries, 0 to 907084
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   user_id  907085 non-null  int64 
 1   movie    907085 non-null  object
 2   rating   907085 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 20.8+ MB
rating range: {1, 2, 3, 4, 5}


# Train SVD model

In [5]:
reader = Reader(rating_scale=(1, 5))
# Loads Pandas dataframe
data = Dataset.load_from_df(df[["user_id", "movie", "rating"]], reader)
# Train
trainingSet = data.build_full_trainset()
print("Created training set")

"""
lr_all is the learning rate for all parameters (how much the parameters are adjusted in each iteration)
reg_all is the regularization term for all parameters, which is a penalty term added to prevent overfitting.
"""
param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}

# Get the best params using GridSearchCV
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data)
best_params = gs.best_params["rmse"]

# Extract and train model with best params
svd_algo = SVD(n_epochs=best_params['n_epochs'],
               lr_all=best_params['lr_all'],
               reg_all=best_params['reg_all'])
svd_algo.fit(trainingSet)
print("Training SVD model done")

Created training set
Training SVD model done


In [20]:
# Predict EX
prediction = svd_algo.predict('523475', "prison+break+the+final+break+2009")
print (prediction.est)

3.552702378320729


In [79]:
user_ids = df["user_id"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}

movie_ids = df["movie"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}

df["user"] = df["user_id"].map(user2user_encoded)
df["movie_id"] = df["movie"].map(movie2movie_encoded)

num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)
df["rating"] = df["rating"].values.astype(np.float32)
# min and max ratings will be used to normalize the ratings later
min_rating = min(df["rating"])
max_rating = max(df["rating"])

print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)

Number of users: 516020, Number of Movies: 26749, Min rating: 1.0, Max rating: 5.0


# Get the recommendation for a sample user

In [22]:
# Let us get a sample user and see the top recommendations.
sample_user = df.user_id.sample(1).iloc[0]
movies_watched_by_user = df[df.user_id == sample_user]
for i in movies_watched_by_user.index:
    print(str(sample_user) + " watched: " + str(movies_watched_by_user["movie"][i]) + "(movie_id: "+ str(movies_watched_by_user["movie_id"][i]) +")" )


911436 watched: the+shawshank+redemption+1994(movie_id: 21)
911436 watched: shadow+of+the+thin+man+1941(movie_id: 4674)
911436 watched: girls+gone+dead+2012(movie_id: 11339)


In [23]:
movies_watched_by_user

Unnamed: 0,user_id,movie,rating,user,movie_id
630776,911436,the+shawshank+redemption+1994,4.0,421313,21
698900,911436,shadow+of+the+thin+man+1941,4.0,421313,4674
900171,911436,girls+gone+dead+2012,4.0,421313,11339


In [24]:
# Get movie_id of movies_not_watched
movies_not_watched = df[~df["movie_id"].isin(movies_watched_by_user.movie_id.values)]["movie_id"]
movies_not_watched = list(set(movies_not_watched).intersection(set(movie2movie_encoded.values())))
# Get user_id of the sample user
user_encoder = user2user_encoded.get(sample_user)
# Let the model predict the rating of the movies in movies_not_watched
ratings = np.array([svd_algo.predict(sample_user,movie_encoded2movie[i]).est for i in movies_not_watched])
# Get the top 20 movies
top_ratings_indices = ratings.argsort()[-20:][::-1]
recommended_movies = [(movie_encoded2movie.get(movies_not_watched[x]),ratings[x]) for x in top_ratings_indices]


In [28]:
print("Recommended items for user", sample_user, ":")
print("\n")
for movie, score in recommended_movies:
    print("\t", movie, ":", float(round(score)))

Recommended items for user 911436 :


	 louis+c.k.+oh+my+god+2013 : 4.0
	 rear+window+1954 : 4.0
	 jackie+brown+1997 : 4.0
	 the+empire+strikes+back+1980 : 4.0
	 the+civil+war+1990 : 4.0
	 the+godfather+1972 : 4.0
	 kill+bill+vol.+2+2004 : 4.0
	 django+unchained+2012 : 4.0
	 prime+suspect+1991 : 4.0
	 star+wars+1977 : 4.0
	 the+great+escape+1963 : 4.0
	 the+godfather+part+ii+1974 : 4.0
	 the+lord+of+the+rings+the+two+towers+2002 : 4.0
	 the+usual+suspects+1995 : 4.0
	 one+man+band+2005 : 4.0
	 death+proof+2007 : 4.0
	 pulp+fiction+1994 : 4.0
	 casablanca+1942 : 4.0
	 the+wrong+trousers+1993 : 4.0
	 the+intouchables+2011 : 4.0


# Save and load recommendation algo

In [7]:
from surprise import dump

# Save the algorithm to disk
dump.dump('SVD_model.dump', algo=svd_algo)

# Load and use (for deployment team)

In [18]:
from surprise import dump

# Save the algorithm to disk
_, algo = dump.load('SVD_model.dump')

In [29]:
# Get the remapped user ID
df = pd.read_csv("/home/rwilson2/data/ratings.csv")[["user_id","movie","rating"]]
test_user = df.user_id.sample(1).iloc[0]

user_ids = df["user_id"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}

movie_ids = df["movie"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}

df["user"] = df["user_id"].map(user2user_encoded)
df["movie_id"] = df["movie"].map(movie2movie_encoded)

df["rating"] = df["rating"].values.astype(np.float32)
movies_watched_by_user = df[df.user_id == test_user]

# Get movie_id of movies_not_watched
movies_not_watched = df[~df["movie_id"].isin(movies_watched_by_user.movie_id.values)]["movie_id"]
movies_not_watched = list(set(movies_not_watched).intersection(set(movie2movie_encoded.values())))
# Get user_id of the sample user
user_encoder = user2user_encoded.get(test_user)
# Let the model predict the rating of the movies in movies_not_watched
ratings = np.array([algo.predict(test_user,movie_encoded2movie[i]).est for i in movies_not_watched])
# Get the top 20 movies
top_ratings_indices = ratings.argsort()[-20:][::-1]
recommended_movies = [(movie_encoded2movie.get(movies_not_watched[x]),ratings[x]) for x in top_ratings_indices]

print("Recommended items for user", test_user, ":")
print("\n")
for movie, score in recommended_movies:
    print("\t", movie, ":", float(round(score)))

Recommended items for user 427553 :


	 the+shawshank+redemption+1994 : 4.0
	 rear+window+1954 : 4.0
	 louis+c.k.+oh+my+god+2013 : 4.0
	 the+godfather+1972 : 4.0
	 the+empire+strikes+back+1980 : 4.0
	 the+civil+war+1990 : 4.0
	 the+century+of+the+self+2002 : 4.0
	 le+trou+1960 : 4.0
	 fargo+1996 : 4.0
	 the+intouchables+2011 : 4.0
	 casablanca+1942 : 4.0
	 star+wars+1977 : 4.0
	 the+usual+suspects+1995 : 4.0
	 the+revolution+will+not+be+televised+2003 : 4.0
	 the+lord+of+the+rings+the+two+towers+2002 : 4.0
	 prime+suspect+1991 : 4.0
	 to+kill+a+mockingbird+1962 : 4.0
	 for+the+birds+2000 : 4.0
	 the+lord+of+the+rings+the+fellowship+of+the+ring+2001 : 4.0
	 bill+hicks+revelations+1993 : 4.0


# Reference

https://github.com/ckaestne/seai/blob/S2020/recitations/06_Collaborative_Filtering.ipynb