The task is to recommend movies to a user based on the movies that he/she provides as input.

Movies will be given by title.

In [2]:
import numpy as np
import pandas as pd

Start by inspecting our dataset

In [3]:
# Load the links table from data/links.csv
links_df = pd.read_csv('data/links.csv')
# Preview the first rows to see which columns are available
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
# Load the movies table from data/movies.csv
movies_df = pd.read_csv('data/movies.csv')
# Preview the first rows; `genres` is a single '|'-separated string per movie
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Load the per-user ratings table from data/ratings.csv
ratings_df = pd.read_csv('data/ratings.csv')
# Preview the first rows to see which columns are available
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Load the user-supplied tags table from data/tags.csv
tags_df = pd.read_csv('data/tags.csv')
# Preview the first rows to see which columns are available
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


Let's assume that person $A$ likes the movies $M_0, M_1, ..., M_i$,
and that there is a person $B$ who also likes one or more of these movies; let's call those shared movies $M_j$.

This would mean that $A$ and $B$ have at least one movie that they both liked; therefore, the other movies liked by either $A$ or $B$ (the rest of $M_i$ and the other favourites of $B$) are likely to be liked by both of them.

------------------

In [11]:
# Join the ratings onto the movie metadata using the shared `movieId` column,
# so each rating row also carries the movie's title and genres
df = pd.merge(movies_df, ratings_df, on='movieId')
df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545
100833,193585,Flint (2017),Drama,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021


In [12]:
M_j = 'Andrew Dice Clay: Dice Rules (1991)' # Title as input, now it's just one movie
N_USERS = 5            # how many of the movie's highest raters to consult
N_MOVIES_PER_USER = 5  # how many top-rated movies to take from each of those users
recommended_movies = []

# Find every rating of the input movie, best rating first
movie_db = df[df['title'] == M_j]\
            .sort_values(by='rating', ascending=False)

# Fail early with a clear message: without this check an unknown (or never
# rated) title silently produces no recommendations, and any later lookup of
# the title's row fails with an opaque out-of-bounds IndexError.
if movie_db.empty:
    raise ValueError('No ratings found for the title {!r}'.format(M_j))

# Walk through the users who gave this movie its highest ratings
for user in movie_db.iloc[:N_USERS]['userId'].values:

    # Everything this user has rated
    rated_movies = df[df['userId'] == user]

    # This user's highest-rated movies, excluding the input movie itself
    rated_movies = rated_movies[rated_movies['title'] != M_j]\
                    .sort_values(by='rating', ascending=False)\
                    .iloc[:N_MOVIES_PER_USER]

    # Add these to the recommendations
    recommended_movies.extend(list(rated_movies['title'].values))

# Several users may share a favourite: keep each title once (np.unique also
# sorts the titles alphabetically)
recommended_movies = np.unique(recommended_movies)

for movie in recommended_movies:
    print(movie)

Best in Show (2000)
Django Unchained (2012)
Pay It Forward (2000)
Superbad (2007)
Whiplash (2014)


Now weight each recommended movie by how similar its genres are to those of the input movie.

In [13]:
# Genres of the input movie; every recommendation is compared against these
gmovie_genres = df.loc[df['title'] == M_j, 'genres'].iloc[0].split('|')
scores = {}  # {title: number of genres shared with the input movie}

for candidate in recommended_movies:
    # The genre string is repeated on every rating row of a title, so the
    # first matching row is enough
    candidate_genres = df.loc[df['title'] == candidate, 'genres'].iloc[0].split('|')

    # Count how many of the input movie's genres the candidate also has
    scores[candidate] = sum(1 for genre in gmovie_genres if genre in candidate_genres)

# Ascending sort, then flipped: the highest score comes first (ties end up in
# the reverse of their original order)
recommended_movies = list(reversed(sorted(scores, key=scores.get)))

The recommendations are now weighted

In [14]:
# Print the re-ranked titles, highest genre-overlap score first
for movie in recommended_movies:
    print(movie)

Superbad (2007)
Best in Show (2000)
Whiplash (2014)
Pay It Forward (2000)
Django Unchained (2012)


For implementation, see the `rmovie.py` file