# Recommender systems 

## Collaborative movie recommender

In [196]:
# load Movie lens data
import pandas as pd

# Read ratings
ratings = pd.read_csv('data/ratings.csv', sep=',', header=0, encoding='utf-8', names=['user_id','movie_id','rating','timestamp'])

# Read movies file
movies = pd.read_csv('data/movies.csv', sep=',', header=0, encoding='utf-8', names=['movie_id', 'title', 'genres'])
movies['genres'] = movies['genres'].str.split('|')

tags = pd.read_csv('data/tags.csv', sep=',', header=0, encoding='utf-8', names=['user_id','movie_id','tag','timestamp'])

In [197]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [198]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [199]:
tags.head()

Unnamed: 0,user_id,movie_id,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [200]:
ratings['rating'].describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [202]:
# Merge ratings and movies
movie_ratings = pd.merge(movies, ratings)
movie_ratings.head()

Unnamed: 0,movie_id,title,genres,user_id,rating,timestamp
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1,4.0,964982703
1,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",5,4.0,847434962
2,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",7,4.5,1106635946
3,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",15,2.5,1510577970
4,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",17,4.5,1305696483


### Factorization approach (SVD)

In [105]:
from surprise import SVD, KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise import accuracy, evaluate
from surprise.model_selection import train_test_split

# ratings from 0.5 to 5
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(movie_ratings[['user_id', 'movie_id', 'rating']],reader)

algo = SVD()

evaluate(algo, data, measures=['RMSE'], verbose=True)

trainset, testset = train_test_split(data, test_size=.20)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

bsl_options = {'method': 'sgd','n_epochs': 20,}
sim_options = {'name': 'pearson_baseline'}
algo = KNNBasic(bsl_options=bsl_options, sim_options=sim_options)
evaluate(algo, data, measures=['RMSE'], verbose=True)

Evaluating RMSE of algorithm SVD.

------------
Fold 1
RMSE: 0.8780
------------
Fold 2
RMSE: 0.8777
------------
Fold 3
RMSE: 0.8763
------------
Fold 4
RMSE: 0.8763
------------
Fold 5
RMSE: 0.8597
------------
------------
Mean RMSE: 0.8736
------------
------------
RMSE: 0.8857
Evaluating RMSE of algorithm KNNBasic.

------------
Fold 1
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9716
------------
Fold 2
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9764
------------
Fold 3
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9836
------------
Fold 4
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9811
------------
Fold 5
Estimating biases using sgd...
Computing the pearson_base

CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.9716343520054675,
                             0.9764033931170569,
                             0.9835534528025053,
                             0.9811018480853848,
                             0.9722808897604961]})

In [118]:
# Better RMSE
algo = SVD()
algo.fit(trainset)

<surprise.trainset.Trainset object at 0x117a0a7f0>


In [210]:
# then can do predictions according to that, UserId, MovieId and get a possible rating back for the movie
algo.predict(1,23)

Prediction(uid=1, iid=23, r_ui=None, est=4.010624326602049, details={'was_impossible': False})

## Content-based movie recommender

In [212]:
movies['genres'] = movies['genres'].fillna("").astype('str')
print(movies['genres'].head())

0    ['Adventure', 'Animation', 'Children', 'Comedy...
1                 ['Adventure', 'Children', 'Fantasy']
2                                ['Comedy', 'Romance']
3                       ['Comedy', 'Drama', 'Romance']
4                                           ['Comedy']
Name: genres, dtype: object


In [213]:
# put genres into tf-idf and vectorize it
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 3),min_df=0)
tfidf_matrix = tf.fit_transform(movies['genres'])

In [214]:
# calculate cosine similarity
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [215]:
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

# get movie recommendations according to cosine similarity
def recommendations(title):
    index = indices[title]
    sim_scores = list(enumerate(cosine_sim[index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [216]:
recommendations('Shrek the Third (2007)').head(20)

1706                                          Antz (1998)
2355                                   Toy Story 2 (1999)
2809       Adventures of Rocky and Bullwinkle, The (2000)
3000                     Emperor's New Groove, The (2000)
3568                                Monsters, Inc. (2001)
6194                                     Wild, The (2006)
6486                               Shrek the Third (2007)
6948                       Tale of Despereaux, The (2008)
7760    Asterix and the Vikings (Astérix et les Viking...
8219                                         Turbo (2013)
8927                             The Good Dinosaur (2015)
9430                                         Moana (2016)
5490    Twelve Tasks of Asterix, The (Les douze travau...
6448           TMNT (Teenage Mutant Ninja Turtles) (2007)
8357                                The Lego Movie (2014)
7184                                 Partly Cloudy (2009)
7917                                        Presto (2008)
8273          

In [217]:
# put genres into tf-idf and vectorize it
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 1),min_df=0)
tfidf_matrix = tf.fit_transform(movies['genres'])

### New user recommendations

In [166]:
def recommend_high_rated_movies():
    movies_aggregated = movie_ratings.groupby(["title"]).agg({'rating': ['mean', 'count']}).reset_index()
    movies_aggregated.columns = ["title", "mean_rating", "rating_count"]
    return movies_aggregated.sort_values(by=["rating_count","mean_rating"],ascending=False).head(10)

recommend_high_rated_movies()

Unnamed: 0,title,mean_rating,rating_count
3158,Forrest Gump (1994),4.164134,329
7593,"Shawshank Redemption, The (1994)",4.429022,317
6865,Pulp Fiction (1994),4.197068,307
7680,"Silence of the Lambs, The (1991)",4.16129,279
5512,"Matrix, The (1999)",4.192446,278
8001,Star Wars: Episode IV - A New Hope (1977),4.231076,251
4662,Jurassic Park (1993),3.75,238
1337,Braveheart (1995),4.031646,237
8363,Terminator 2: Judgment Day (1991),3.970982,224
7421,Schindler's List (1993),4.225,220


In [206]:
def get_random_Recommendations():
    return movie_ratings.sample(n=5)

get_random_Recommendations()

Unnamed: 0,movie_id,title,genres,user_id,rating,timestamp
41473,2281,Monument Ave. (1998),"[Action, Crime, Drama]",599,2.5,1498517222
4162,150,Apollo 13 (1995),"[Adventure, Drama, IMAX]",327,4.5,1234788741
72953,6936,Elf (2003),"[Children, Comedy, Fantasy]",308,2.0,1421375240
79970,33679,Mr. & Mrs. Smith (2005),"[Action, Adventure, Comedy, Romance]",318,3.0,1261343280
29304,1289,Koyaanisqatsi (a.k.a. Koyaanisqatsi: Life Out ...,[Documentary],50,3.0,1514239232
