# Recommender System
Movie recommendation: an SVD model is trained on user ratings and used to produce top-N movie recommendations per user.

In [1]:
import pandas as pd
from surprise import SVD
from surprise import KNNBaseline
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from collections import defaultdict

In [2]:
movie_file = '../data/movies_recommender.csv'
ratings_file = '../data/ratings_recommender.csv'

# Movie catalogue: load it, then report its size, schema and a three-row sample.
movies = pd.read_csv(movie_file)
print('Movie data shape: ', movies.shape)
print('Movie data columns: ', movies.columns)
print('Movies head: \n', movies.head(3))

# User ratings: same summary; the leading newlines separate it from the movie summary.
ratings = pd.read_csv(ratings_file)
print('\n\nRating data shape: ', ratings.shape)
print('Rating data columns: ', ratings.columns)
print('Ratings head: \n', ratings.head(3))

Movie data shape:  (9125, 3)
Movie data columns:  Index(['movieId', 'title', 'genres'], dtype='object')
Movies head: 
    movieId                    title  \
0        1         Toy Story (1995)   
1        2           Jumanji (1995)   
2        3  Grumpier Old Men (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  


Rating data shape:  (100004, 4)
Rating data columns:  Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')
Ratings head: 
    userId  movieId  rating   timestamp
0       1       31     2.5  1260759144
1       1     1029     3.0  1260759179
2       1     1061     3.0  1260759182


### Data Merge

In [3]:
# Attach each rating to its movie's metadata; the inner join keeps only
# movieIds that appear in both tables.
df = movies.merge(ratings, how='inner', on='movieId')
print('Merged dataframe shape: ', df.shape)
df.head()

Merged dataframe shape:  (100004, 6)


Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,3.0,851866703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,9,4.0,938629179
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,13,5.0,1331380058
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.0,997938310
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,3.0,855190091


In [4]:
# Summary statistics of the merged frame, transposed so each described column is a row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
movieId,100004.0,12548.66,26369.2,1.0,1028.0,2406.5,5418.0,163949.0
userId,100004.0,347.0113,195.1638,1.0,182.0,367.0,520.0,671.0
rating,100004.0,3.543608,1.058064,0.5,3.0,4.0,4.0,5.0
timestamp,100004.0,1129639000.0,191685800.0,789652009.0,965847824.0,1110422000.0,1296192000.0,1476641000.0


In [5]:
# Get the min and max rating.
# Select the column as a Series so .min()/.max() return scalars directly. The
# previous `df[['rating']].min()[0]` did an integer lookup on a Series whose
# index is the label 'rating'; pandas has deprecated that positional fallback.
rating_min = df['rating'].min()
rating_max = df['rating'].max()
print('Min rating', rating_min)
print('Max rating', rating_max)

# Give the Reader the observed rating range, then build the dataset from the
# userId / movieId / rating columns (passed in that order).
reader = Reader(rating_scale=(rating_min, rating_max))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
print('Data shape: ', data.df.shape)
data.df.head(5)

Min rating 0.5
Max rating 5.0
Data shape:  (100004, 3)


Unnamed: 0,userId,movieId,rating
0,7,1,3.0
1,9,1,4.0
2,13,1,5.0
3,15,1,2.0
4,19,1,3.0


### Build the model

In [6]:
# Split: hold out 25% of the ratings as a test set (seeded with random_state=0)
trainSet, testSet = train_test_split(data, test_size=.25, random_state=0)

# Model training: fit an SVD model (also seeded) on the training portion only
algo = SVD(random_state=0)
algo.fit(trainSet)

# Predictions: run the fitted model over the held-out test set;
# `predictions` feeds both the error metrics and GetTopN below
predictions = algo.test(testSet)

In [7]:
def MAE(predictions):
    """Return the mean absolute error of `predictions`, computed by `accuracy.mae` with verbose output off."""
    mean_abs_error = accuracy.mae(predictions, verbose=False)
    return mean_abs_error
def RMSE(predictions):
    """Return the root mean squared error of `predictions`, computed by `accuracy.rmse` with verbose output off."""
    root_mean_sq_error = accuracy.rmse(predictions, verbose=False)
    return root_mean_sq_error
    
# Report both error metrics for the test-set predictions (RMSE first, then MAE).
rmse_score = RMSE(predictions)
print("RMSE: ", rmse_score)
mae_score = MAE(predictions)
print("MAE: ", mae_score)

RMSE:  0.8997274890435606
MAE:  0.6939386166639191


In [9]:
def GetTopN(predictions, n=10, minimumRating=4.0):
    """Group predictions by user and keep each user's best-scoring movies.

    Args:
        predictions: iterable of 5-item records unpacked as
            (user id, movie id, actual rating, estimated rating, extra).
            User and movie ids are converted with ``int()``.
        n: maximum number of (movie id, estimated rating) pairs kept per user.
        minimumRating: predictions whose estimated rating is below this value
            are ignored.

    Returns:
        A ``defaultdict(list)`` mapping each int user id to a list of
        ``(movie id, estimated rating)`` tuples sorted by estimated rating,
        highest first, truncated to ``n`` entries. Users with no prediction at
        or above ``minimumRating`` get no entry.
    """
    perUser = defaultdict(list)
    for userID, movieID, _actual, estimatedRating, _extra in predictions:
        if estimatedRating >= minimumRating:
            perUser[int(userID)].append((int(movieID), estimatedRating))

    # Rank each user's candidates. Iterating over a snapshot of the keys means
    # the dict is never reassigned while it is being iterated.
    for uid in list(perUser):
        ranked = sorted(perUser[uid], key=lambda pair: pair[1], reverse=True)
        perUser[uid] = ranked[:n]

    return perUser

In [10]:
topN = GetTopN(predictions)

# Preview the recommendations of the first user in topN only.
for first_user, first_recs in list(topN.items())[:1]:
    print('user_id: ', first_user)
    for rec_movie, rec_score in first_recs:
        print('\tmovie_id:', rec_movie, '\tpredicted rating:', rec_score)

user_id:  47
	movie_id: 50 	predicted rating: 4.505059417965697
	movie_id: 296 	predicted rating: 4.381856639343456
	movie_id: 457 	predicted rating: 4.334846228008285
	movie_id: 161 	predicted rating: 4.317298945333891
	movie_id: 32 	predicted rating: 4.266492050615421
	movie_id: 349 	predicted rating: 4.2124639445161005
	movie_id: 588 	predicted rating: 4.025027753810338


### Check one user

In [11]:
# User whose ratings and recommendations are inspected in the following cells
user_id = 47

In [12]:
# Movies this user rated 5.0 (or higher), without the timestamp column
is_selected_user = df['userId'] == user_id
is_top_rated = df['rating'] >= 5.0
df_user = df[is_selected_user & is_top_rated].drop(columns='timestamp')
print('Watched Movies + Rating 5.0')
df_user.head(10)

Watched Movies + Rating 5.0


Unnamed: 0,movieId,title,genres,userId,rating
12,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,47,5.0
252,2,Jumanji (1995),Adventure|Children|Fantasy,47,5.0
3814,110,Braveheart (1995),Action|Drama|War,47,5.0
5089,161,Crimson Tide (1995),Drama|Thriller|War,47,5.0
5571,173,Judge Dredd (1995),Action|Crime|Sci-Fi,47,5.0
8431,292,Outbreak (1995),Action|Drama|Sci-Fi|Thriller,47,5.0
9325,316,Stargate (1994),Action|Adventure|Sci-Fi,47,5.0
13831,457,"Fugitive, The (1993)",Thriller,47,5.0
18034,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,47,5.0


In [14]:
# Get all recommendations for this user.
# Index the catalogue by movieId once, so each recommended id is a label lookup
# and title/genres are read by column *name*. The previous version scanned the
# whole table twice per movie and read the fields by position from `.values`,
# which silently returns the wrong field if the CSV column order changes.
# drop_duplicates keeps the first row per movieId, matching the old first-match lookup.
movie_lookup = movies.drop_duplicates(subset='movieId').set_index('movieId')
recommended_ids = [movie_id for movie_id, _ in topN[user_id]]
recommended = movie_lookup.loc[recommended_ids]  # rows follow the ranking order of topN

movie_list = recommended['title'].tolist()
genre_list = recommended['genres'].tolist()
reco_df = pd.DataFrame({'Title': movie_list, 'Genre': genre_list})
print('Recommended Movies')
reco_df.head(10)

Recommended Movies


Unnamed: 0,Title,Genre
0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
1,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
2,"Fugitive, The (1993)",Thriller
3,Crimson Tide (1995),Drama|Thriller|War
4,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
5,Clear and Present Danger (1994),Action|Crime|Drama|Thriller
6,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
