In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the MovieLens data

You can download the file ml_latest.zip [here](https://grouplens.org/datasets/movielens/) and then unzip into the data/ directory.

In [6]:
!ls ml-latest/

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [7]:
# Read dataframes
df_movies = pd.read_csv('ml-latest/movies.csv')
df_links = pd.read_csv('ml-latest/links.csv')
df_ratings = pd.read_csv('ml-latest/ratings.csv')
df_genome_tags = pd.read_csv('ml-latest/genome-tags.csv')
df_genome_scores = pd.read_csv('ml-latest/genome-scores.csv')

# Merge scores and tags
df_movie_tags_in_text = pd.merge(df_genome_scores, df_genome_tags, on='tagId')[['movieId', 'tag', 'relevance']]

# Only keep tags with relevance higher than 0.3
df_movie_tags = df_genome_scores[df_genome_scores.relevance > 0.3][['movieId', 'tagId']]

## Which is movie with Id 1?

In [8]:
df_movies[df_movies.movieId == 1]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [10]:
df_movies.shape

(58098, 3)

In [12]:
df_movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## Let's have a look at a few of the tags Toy Story

In [13]:
df_movie_tags[df_movie_tags['movieId'] == 1].merge(df_genome_tags, on='tagId').sample(10)

Unnamed: 0,movieId,tagId,tag
69,1,412,friendship
163,1,992,suprisingly clever
0,1,11,3d
135,1,785,pixar
117,1,669,morality
63,1,378,fantasy world
62,1,377,fantasy
122,1,742,original
128,1,752,oscar (best effects - visual effects)
71,1,415,fun movie


In [15]:
df_movie_tags.shape

(1471046, 2)

In [16]:
df_movie_tags.head(5)

Unnamed: 0,movieId,tagId
10,1,11
18,1,19
20,1,21
21,1,22
28,1,29


# Encode features

In [22]:
df_tags_to_movies = pd.merge(df_movie_tags, df_genome_tags, on='tagId', how='left')[['movieId', 'tagId']]
df_tags_to_movies['tagId'] = df_tags_to_movies.tagId.astype(str)

In [25]:
def _concatenate_tags_of_movie(tags):
    tags_as_str = ' '.join(set(tags))
    return tags_as_str

In [26]:
df_tags_per_movie = df_tags_to_movies.groupby('movieId')['tagId'].agg(_concatenate_tags_of_movie)
df_tags_per_movie.name = 'movie_tags'
df_tags_per_movie = df_tags_per_movie.reset_index()

In [27]:
df_tags_per_movie[df_tags_per_movie['movieId'] == 1]

Unnamed: 0,movieId,movie_tags
0,1,465 1093 11 1062 867 449 505 310 554 1071 536 ...


In [28]:
df_avg_ratings  = df_ratings.groupby('movieId')['rating'].agg(['mean', 'median', 'size'])
df_avg_ratings.columns = ['rating_mean', 'rating_median', 'num_ratingsdf_tags_per_movie']
df_avg_ratings = df_avg_ratings.reset_index()

In [38]:
df_avg_ratings.head(5)

Unnamed: 0,movieId,rating_mean,rating_median,num_ratingsdf_tags_per_movie
0,1,3.886649,4.0,68469
1,2,3.246583,3.0,27143
2,3,3.173981,3.0,15585
3,4,2.87454,3.0,2989
4,5,3.077291,3.0,15474


In [29]:
df_movies_with_ratings = pd.merge(df_movies, df_avg_ratings, how='left', on='movieId')

In [39]:
df_movies_with_ratings.head(5)

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.886649,4.0,68469.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.246583,3.0,27143.0
2,3,Grumpier Old Men (1995),Comedy|Romance,3.173981,3.0,15585.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.87454,3.0,2989.0
4,5,Father of the Bride Part II (1995),Comedy,3.077291,3.0,15474.0


In [30]:
df_data = pd.merge(df_movies_with_ratings, df_tags_per_movie, how='left', on='movieId')

In [31]:
df_data_with_tags = df_data[~df_data.movie_tags.isnull()].reset_index(drop=True)

In [35]:
df_data_with_tags.shape

(13176, 7)

In [37]:
df_data_with_tags.head(5)

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.886649,4.0,68469.0,465 1093 11 1062 867 449 505 310 554 1071 536 ...
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.246583,3.0,27143.0,890 1093 867 584 554 694 776 468 464 719 663 9...
2,3,Grumpier Old Men (1995),Comedy|Romance,3.173981,3.0,15585.0,403 397 465 543 412 1050 901 1116 867 505 1071...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.87454,3.0,2989.0,613 97 313 412 1116 900 28 1062 100 188 1070 9...
4,5,Father of the Bride Part II (1995),Comedy,3.077291,3.0,15474.0,849 524 138 1040 313 1116 900 901 278 867 694 ...


# TF-IDF vectors

In [32]:
tf_idf = TfidfVectorizer()

In [33]:
df_movies_tf_idf_described = tf_idf.fit_transform(df_data_with_tags.movie_tags)

In [44]:
m2m = cosine_similarity(df_movies_tf_idf_described)

In [45]:
df_tfidf_m2m = pd.DataFrame(cosine_similarity(df_movies_tf_idf_described))

In [46]:
index_to_movie_id = df_data_with_tags['movieId']

In [47]:
df_tfidf_m2m.columns = [str(index_to_movie_id[int(col)]) for col in df_tfidf_m2m.columns]

In [48]:
df_tfidf_m2m.index = [index_to_movie_id[idx] for idx in df_tfidf_m2m.index]

In [49]:
df_tfidf_m2m.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,184987,184997,185029,185135,185425,185435,185585,186587,187593,187595
1,1.0,0.359995,0.140584,0.163904,0.197146,0.267026,0.240104,0.233925,0.075557,0.223134,...,0.231415,0.323718,0.449159,0.415062,0.115754,0.126076,0.154487,0.238729,0.376306,0.339774
2,0.359995,1.0,0.116658,0.123059,0.119013,0.090835,0.215883,0.221415,0.167558,0.22194,...,0.309822,0.231912,0.207119,0.253158,0.151519,0.122537,0.183769,0.244879,0.20468,0.287186
3,0.140584,0.116658,1.0,0.192486,0.407801,0.090215,0.246536,0.151995,0.077091,0.142224,...,0.118169,0.198064,0.173156,0.146563,0.090056,0.131115,0.226738,0.147758,0.202069,0.204408
4,0.163904,0.123059,0.192486,1.0,0.278716,0.07574,0.334642,0.200485,0.049504,0.079378,...,0.151011,0.195374,0.211978,0.181477,0.214305,0.168674,0.164492,0.162263,0.170269,0.199378
5,0.197146,0.119013,0.407801,0.278716,1.0,0.085531,0.309019,0.151632,0.067623,0.109039,...,0.14701,0.264331,0.18241,0.163857,0.117392,0.133729,0.15714,0.148886,0.199741,0.16228


# Most similar movies to Toy Story

In [52]:
df_tfidf_m2m.iloc[0].sort_values(ascending=False)[:10]

1        1.000000
3114     0.785583
4886     0.773566
78499    0.747440
2355     0.728273
6377     0.705095
68954    0.668650
8961     0.660274
50872    0.653690
4306     0.652549
Name: 1, dtype: float64

In [53]:
df_data_with_tags[df_data_with_tags.movieId == 3114]

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
2809,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,3.809977,4.0,29820.0,1062 867 505 310 121 554 536 1071 776 170 468 ...


In [54]:
df_data_with_tags[df_data_with_tags.movieId == 4886]

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
4406,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,3.853349,4.0,37112.0,1093 11 1062 867 505 310 121 554 536 886 1071 ...


## Cosine similarity for Terminator 2

The costine similarity between Toy Story 2 and Terminator 2

In [55]:
df_data_with_tags[df_data_with_tags.title.str.contains('Terminator 2')]

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
561,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,3.941501,4.0,64258.0,890 221 465 1093 1085 1062 482 449 867 891 143...


In [56]:
df_tfidf_m2m.iloc[1][555]

0.022083210802081206

## Build user profile for user #1

In [57]:
df_user_ratings = df_ratings[df_ratings.userId == 1]

In [58]:
df_user_data_with_tags = df_data_with_tags.reset_index().merge(df_user_ratings, on='movieId')

In [59]:
df_user_data_with_tags[['title', 'rating']]

Unnamed: 0,title,rating
0,Three Colors: Blue (Trois couleurs: Bleu) (1993),3.5
1,Kalifornia (1993),3.5
2,Weekend at Bernie's (1989),1.5
3,Better Off Dead... (1985),4.5
4,Waiting for Guffman (1996),4.5
5,Event Horizon (1997),2.5
6,Spawn (1997),1.5
7,Weird Science (1985),4.5
8,¡Three Amigos! (1986),4.0
9,Stigmata (1999),3.0


In [60]:
df_user_data_with_tags['weight'] = df_user_data_with_tags['rating']/5.

In [61]:
user_profile = np.dot(df_movies_tf_idf_described[df_user_data_with_tags['index'].values].toarray().T, df_user_data_with_tags['weight'].values)

In [62]:
C = cosine_similarity(atleast_2d(user_profile), df_movies_tf_idf_described)

In [63]:
R = argsort(C)[:, ::-1]

In [64]:
recommendations = [i for i in R[0] if i not in df_user_data_with_tags['index'].values]

In [65]:
df_data_with_tags['title'][recommendations].head(10)

2240                                Office Space (1999)
1162                                    Heathers (1989)
1351                         Grosse Pointe Blank (1997)
9912                                  Zombieland (2009)
6108             Battle Royale (Batoru rowaiaru) (2000)
7052                           Shaun of the Dead (2004)
13157                               Isle of Dogs (2018)
9159                                   In Bruges (2008)
12136                     Beastie Boys: Sabotage (1994)
5400     Monty Python Live at the Hollywood Bowl (1982)
Name: title, dtype: object