# Recommender systems 

## Collaborative movie recommender

In [1]:
# load Movie lens data
import pandas as pd

# All three files are read the same way: comma-separated, UTF-8, with the
# file's own header row (row 0) replaced by the column names used below.
def _read_movielens(path, columns):
    """Load one CSV file, substituting `columns` for its header row."""
    return pd.read_csv(path, sep=',', header=0, encoding='utf-8', names=columns)

# Ratings table
ratings = _read_movielens('data/ratings.csv', ['user_id', 'movie_id', 'rating', 'timestamp'])

# Movies table; the '|'-separated genre string becomes a list of genre names
movies = _read_movielens('data/movies.csv', ['movie_id', 'title', 'genres'])
movies['genres'] = movies['genres'].str.split('|')

# Tags table
tags = _read_movielens('data/tags.csv', ['user_id', 'movie_id', 'tag', 'timestamp'])

In [2]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Preview the first rows of the movies table (genres are lists after the split at load time).
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [4]:
# Preview the first rows of the tags table.
tags.head()

Unnamed: 0,user_id,movie_id,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [8]:
# Summary statistics of the rating values (count, mean, std, min/max and quartiles).
ratings['rating'].describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

In [6]:
# Attach movie metadata (title, genres) to every rating row.
# `movie_id` is the only column the two tables share, so it is the join key;
# the inner join keeps only movies that have at least one rating.
movie_ratings = pd.merge(movies, ratings, how='inner', on='movie_id')
movie_ratings.head()

Unnamed: 0,movie_id,title,genres,user_id,rating,timestamp
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1,4.0,964982703
1,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",5,4.0,847434962
2,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",7,4.5,1106635946
3,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",15,2.5,1510577970
4,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",17,4.5,1305696483


### Factorization approach (SVD), with a KNN baseline for comparison

In [7]:
from surprise import SVD, KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split, cross_validate

# Ratings in this dataset run from 0.5 to 5, which Surprise needs to know.
reader = Reader(rating_scale=(0.5, 5))
rating_columns = ['user_id', 'movie_id', 'rating']
data = Dataset.load_from_df(movie_ratings[rating_columns], reader)

# Settings for the neighbourhood model: baselines fitted with SGD, and a
# Pearson-baseline similarity measure.
bsl_options = {'method': 'sgd', 'n_epochs': 20}
sim_options = {'name': 'pearson_baseline'}

# Cross-validated RMSE of the matrix-factorisation model ...
algo = SVD()
cross_validate(algo, data, measures=['RMSE'], verbose=True)

# ... and of the KNN model, for comparison.
algo = KNNBasic(bsl_options=bsl_options, sim_options=sim_options)
cross_validate(algo, data, measures=['RMSE'], verbose=True)

KeyboardInterrupt: 

In [None]:
# Hold-out evaluation of SVD: fit on 80% of the ratings and report RMSE on the
# remaining 20%.
# Both the split and SVD's initialisation are random; seeding them makes the
# reported RMSE reproducible across kernel restarts.
RANDOM_SEED = 42
trainset, testset = train_test_split(data, test_size=.20, random_state=RANDOM_SEED)
algo = SVD(random_state=RANDOM_SEED)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

In [None]:
# With the fitted model we can ask for the estimated rating of a single
# (user id, movie id) pair -- here user 1 and movie 23.
algo.predict(1,23)

## Content-based movie recommender

In [None]:
# `genres` holds a list of genre names per movie (split on '|' at load time).
# The TF-IDF vectoriser needs plain text, so join each list with spaces.
# Casting the lists with astype('str') would instead embed Python list syntax
# (brackets, quotes, commas) in the text and rely on the tokenizer to strip it.
def _genres_to_text(value):
    """Return one movie's genres as space-separated text ('' when missing)."""
    if isinstance(value, (list, tuple)):
        return ' '.join(value)
    if isinstance(value, str):
        # Already converted (cell re-run): leave untouched so the cell is idempotent.
        return value
    return ''

movies['genres'] = movies['genres'].apply(_genres_to_text)
print(movies['genres'].head())

In [None]:
# Vectorise the genre text with TF-IDF over word 1- to 3-grams.
from sklearn.feature_extraction.text import TfidfVectorizer
# min_df=1 keeps every term that occurs in at least one movie. An integer
# min_df of 0 selects the same vocabulary but is rejected by scikit-learn's
# parameter validation in recent releases.
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 3),min_df=1)
tfidf_matrix = tf.fit_transform(movies['genres'])

In [None]:
# Pairwise movie-to-movie similarity: dot products between all rows of the
# TF-IDF matrix. When only one matrix is given, linear_kernel compares it
# with itself.
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix)

In [None]:
# Lookup tables used by the recommender:
#   titles  -- movie titles in row order
#   indices -- maps a title back to its row label in `movies`
titles = movies['title']
indices = pd.Series(movies.index, index=titles)

# get movie recommendations according to cosine similarity
def recommendations(title, n=20):
    """Return the titles of the movies most similar to `title`.

    Similarity is read from the precomputed `cosine_sim` matrix.

    Parameters
    ----------
    title : str
        Exact movie title, as it appears in `indices`.
    n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of up to `n` movies, most similar first. The queried movie
        itself is never included.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    index = indices[title]
    # A title that occurs more than once yields a Series of positions;
    # fall back to the first occurrence so the lookup stays scalar.
    if isinstance(index, pd.Series):
        index = index.iloc[0]

    # Drop the queried movie explicitly. Slicing off the first sorted entry is
    # not reliable: movies with identical genres also score 1.0 and may sort
    # ahead of it.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[index])
        if position != index
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in sim_scores[:n]]
    return titles.iloc[movie_indices]

In [None]:
# Example query: up to 20 titles whose genres are most similar to "Shrek the Third (2007)".
recommendations('Shrek the Third (2007)').head(20)

In [None]:
# Unigram-only variant of the genre TF-IDF features (single words, no n-grams).
# NOTE(review): `cosine_sim` is not recomputed after this in the cells shown,
# so `recommendations` keeps using the similarity matrix built earlier.
from sklearn.feature_extraction.text import TfidfVectorizer
# min_df=1 keeps every term that occurs in at least one movie. An integer
# min_df of 0 selects the same vocabulary but is rejected by scikit-learn's
# parameter validation in recent releases.
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 1),min_df=1)
tfidf_matrix = tf.fit_transform(movies['genres'])

### New user recommendations

In [None]:
def recommend_high_rated_movies(n=10):
    """Return the `n` most-rated titles for users with no rating history.

    Ratings in `movie_ratings` are aggregated per title. Titles are ranked by
    number of ratings first and by mean rating second (both descending), so
    the result favours popular movies rather than the highest averages.

    Parameters
    ----------
    n : int, default 10
        Number of titles to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``mean_rating`` and ``rating_count``.
    """
    movies_aggregated = (
        movie_ratings
        .groupby("title")
        .agg(mean_rating=("rating", "mean"), rating_count=("rating", "count"))
        .reset_index()
    )
    ranked = movies_aggregated.sort_values(
        by=["rating_count", "mean_rating"], ascending=False
    )
    return ranked.head(n)

# Show the ten titles with the most ratings.
recommend_high_rated_movies()

In [None]:
def get_random_Recommendations(n=5, random_state=None):
    """Return `n` randomly chosen rows of `movie_ratings`.

    Parameters
    ----------
    n : int, default 5
        Number of rows to draw; capped at the number of rows available.
    random_state : int or None, default None
        Seed passed to ``DataFrame.sample``; set it for a reproducible draw.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, with the same columns as `movie_ratings`.
    """
    # NOTE(review): sampling is over rating rows, not distinct movies, so the
    # same movie can appear more than once -- confirm this is intended.
    sample_size = min(n, len(movie_ratings))
    return movie_ratings.sample(n=sample_size, random_state=random_state)

# Draw five random rows from the merged ratings table.
get_random_Recommendations()