### Imports

In [1]:
from math import sqrt
import pandas as pd
from os import path

### Read and Create Database

In [2]:
DATA_PATH = path.dirname(path.realpath("__file__"))
DATA = path.join(DATA_PATH, 'data')

In [3]:
def read_data(data_path):
    data_ratings = pd.read_csv(data_path + r'\ratings.csv')
    data_movies = pd.read_csv(data_path + r'\movies.csv')
    data = pd.merge(left = data_ratings, right = data_movies, on = 'movieId')
    data.drop(['timestamp', 'genres'], axis = 1, inplace = True)
    return data

data = read_data(DATA)

In [4]:
data.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,1,3,4.0,Grumpier Old Men (1995)
2,1,6,4.0,Heat (1995)
3,1,47,5.0,Seven (a.k.a. Se7en) (1995)
4,1,50,5.0,"Usual Suspects, The (1995)"


We can generate two ```dict``` database structures:

1. The first contains the users as a primary key, followed by each movie that the given user saw and its rating. 
The first one will be used to compute the User-Based approach.
2. The second contains the movie as primary key, followed by each user that saw the movie and its rating. 
The second one will be used to compute the Item-Based approach.

_Note: We could also work with a MultiIndex Pandas DataFrame._

In [5]:
def generate_dicts(data):
    users_dict = data.groupby('userId')[['title','rating']].apply(lambda x: dict(x.values.tolist())).to_dict()
    movies_dict = data.groupby('title')[['userId','rating']].apply(lambda x: dict(x.values.tolist())).to_dict()
    return users_dict, movies_dict

users_dict, movies_dict = generate_dicts(data)

In [18]:
# Users dict containing each user, the movies rated, and the rate given
dict(list(users_dict.items())[1:3])

{2: {'Shawshank Redemption, The (1994)': 3.0,
  'Tommy Boy (1995)': 4.0,
  'Good Will Hunting (1997)': 4.5,
  'Gladiator (2000)': 4.0,
  'Kill Bill: Vol. 1 (2003)': 4.0,
  'Collateral (2004)': 3.5,
  'Talladega Nights: The Ballad of Ricky Bobby (2006)': 4.0,
  'Departed, The (2006)': 4.0,
  'Dark Knight, The (2008)': 4.5,
  'Step Brothers (2008)': 5.0,
  'Inglourious Basterds (2009)': 4.5,
  'Zombieland (2009)': 3.0,
  'Shutter Island (2010)': 4.0,
  'Exit Through the Gift Shop (2010)': 3.0,
  'Inception (2010)': 4.0,
  'Town, The (2010)': 4.5,
  'Inside Job (2010)': 5.0,
  'Louis C.K.: Hilarious (2010)': 4.0,
  'Warrior (2011)': 5.0,
  'Dark Knight Rises, The (2012)': 3.5,
  'Girl with the Dragon Tattoo, The (2011)': 2.5,
  'Django Unchained (2012)': 3.5,
  'Wolf of Wall Street, The (2013)': 5.0,
  'Interstellar (2014)': 3.0,
  'Whiplash (2014)': 4.0,
  'The Drop (2014)': 2.0,
  'Ex Machina (2015)': 3.5,
  'Mad Max: Fury Road (2015)': 5.0,
  'The Jinx: The Life and Deaths of Robert Du

In [21]:
# Movies dict containing each movie, the userId and the given rating
dict(list(movies_dict.items())[2001:2004])

{'Cowboy Bebop: The Movie (Cowboy Bebop: Tengoku no Tobira) (2001)': {91.0: 4.5,
  167.0: 5.0,
  184.0: 4.0,
  187.0: 3.5,
  219.0: 3.5,
  232.0: 2.5,
  260.0: 3.5,
  298.0: 4.5,
  320.0: 3.5,
  371.0: 4.5,
  387.0: 3.5,
  414.0: 4.0,
  428.0: 3.5,
  438.0: 4.5,
  477.0: 4.5,
  483.0: 4.5,
  522.0: 4.0,
  580.0: 2.0,
  599.0: 4.0,
  610.0: 4.5},
 'Cowboy Way, The (1994)': {217.0: 3.0,
  234.0: 3.0,
  294.0: 3.0,
  368.0: 2.0,
  414.0: 2.0,
  555.0: 4.0,
  599.0: 2.0},
 'Cowboys & Aliens (2011)': {246.0: 4.0,
  249.0: 3.0,
  292.0: 3.0,
  298.0: 2.0,
  339.0: 3.0,
  380.0: 4.0,
  448.0: 1.5,
  534.0: 3.5,
  610.0: 3.0}}

### Search for similar users

* Compute Euclidean Distance
* Use the Euclidian Distance to search for the most similar users

\begin{equation}
    D(x, y) = \sqrt{\sum_{i}^{n} (x_{i} - y_{i})^2}
\end{equation}

In [23]:
def euclidian_distance(data, user1, user2):
    # Stores the similarity of the movies watched by both users
    similarity = {}
    # Checks which movies the users saw 
    for movie in data[user1]:
        if movie in data[user2]:
            similarity[movie] = 1
    # If the users didn't watch any common movie, returns zero
    if len(similarity) == 0:
        return 0
    # Calculates the similarity of the movies watched by both users based on the Euclidean Distance
    summation = sum([pow(data[user1][movie] - data[user2][movie], 2)
               for movie in data[user1] if movie in data[user2]]) 
    return 1 / (1 + sqrt(summation))

In [24]:
def get_similar(data, user):
    # Returns the similarity of one given user compared with all other users stored in the database
    similarity = [(euclidian_distance(data, user, other_user), other_user) 
                  for other_user in data if other_user != user]
    similarity.sort()
    similarity.reverse()
    return similarity[:10]

In [27]:
# Computes the similarity of the user 1 and 2
print(f"Euclidian Distance (similarity) between the users 1 and 2: {euclidian_distance(users_dict, 1, 2)}")

Euclidian Distance (similarity) between the users 1 and 2: 0.4142135623730951


In [28]:
# Returns the score of similarity and the 10 most similar users compared to the user 2
sim_user2 = get_similar(users_dict, 2)
pd.DataFrame(sim_user2, columns = ['Similarity', 'userId'])

Unnamed: 0,Similarity,userId
0,1.0,538
1,1.0,533
2,1.0,496
3,1.0,468
4,1.0,452
5,1.0,427
6,1.0,379
7,1.0,361
8,1.0,359
9,1.0,333


### Recommender System: User-Based Collaborative Filtering

* Identifies recommended movies based on the similarity with other users

In [30]:
def get_recommended_movies_ub(data, user):
    """
    This function computes the User-Based approach to recommend a movie.
    It takes as input a database and an user, and tries to predict what movies will be recommended based on the ratings of the other users.
    """
    # Sum of the ratings given by each user multiplied by the similarity with the user
    total = {}
    # Sum of the similarity with a user given that he rated the movie
    similarity_sum = {}

    for other_users in data:
        if other_users == user:
            continue
        # Calculate the similarity of the user with the other users in the database
        similarity = euclidian_distance(data, user, other_users)
        if similarity <= 0:
            continue
        # Checks each movie watched by the others that the user didn't watch
        for movie in data[other_users]:
            if movie not in data[user]:
                total.setdefault(movie, 0)
                total[movie] += data[other_users][movie] * similarity
                similarity_sum.setdefault(movie, 0)
                similarity_sum[movie] += similarity        
    # Returns the predicted rate for the movies
    rankings = [(score / similarity_sum[movie], movie) for movie, score in total.items()]
    rankings.sort()
    rankings.reverse()
    return rankings[:10]

In [31]:
# Returns the top 10 most recommended movies for the user 10 using the User-Based approach
movies_user10_ub = get_recommended_movies_ub(users_dict, 10)
pd.DataFrame(movies_user10_ub, columns = ['Similarity', 'Movie Recommendations'])

Unnamed: 0,Similarity,Movie Recommendations
0,5.0,King of Hearts (1966)
1,5.0,Come and See (Idi i smotri) (1985)
2,5.0,Zeitgeist: Moving Forward (2011)
3,5.0,Wow! A Talking Fish! (1983)
4,5.0,World of Glory (1991)
5,5.0,Wonder Woman (2009)
6,5.0,Won't You Be My Neighbor? (2018)
7,5.0,"Woman Under the Influence, A (1974)"
8,5.0,"Woman Is a Woman, A (femme est une femme, Une)..."
9,5.0,Winter in Prostokvashino (1984)


### Recommender System: Item-Based Collaborative Filtering

* Identifies recommended movies based on the similarity of the movies
* **Precompute the most similar movies and store these values** 
* The calculations are done in advance, in order to return assessments faster
* Weighted average of similar movies

In [32]:
def get_similar_movies(base):
    # Returns the similarity of one given movie with all the other movies stored in the database
    result = {}
    for movie in base:
        ratings = get_similar(base, movie)
        result[movie] = ratings
    return result

In [33]:
# Computes the similarity between the movies
similar_movies = get_similar_movies(movies_dict)

In [35]:
# Movies similar to each other
dict(list(similar_movies.items())[1:3])

{"'Hellboy': The Seeds of Creation (2004)": [(1.0, 'Watchmen (2009)'),
  (1.0, 'Usual Suspects, The (1995)'),
  (1.0, 'Toy Story 3 (2010)'),
  (1.0, 'Toy Story (1995)'),
  (1.0, 'Throne of Blood (Kumonosu jô) (1957)'),
  (1.0, 'Three Kings (1999)'),
  (1.0, "There's Something About Mary (1998)"),
  (1.0, 'There Will Be Blood (2007)'),
  (1.0, 'Terminator, The (1984)'),
  (1.0, 'Taxi Driver (1976)')],
 "'Round Midnight (1986)": [(1.0, 'Zodiac (2007)'),
  (1.0, 'X2: X-Men United (2003)'),
  (1.0, 'X-Men: First Class (2011)'),
  (1.0, 'X-Men (2000)'),
  (1.0, 'Whole Nine Yards, The (2000)'),
  (1.0, 'Unstoppable (2010)'),
  (1.0, 'Underworld: Awakening (2012)'),
  (1.0, 'Traffic (2000)'),
  (1.0, 'Thor (2011)'),
  (1.0, 'The Hunger Games (2012)')]}

In [36]:
def get_recommended_movies_ib(data, movies_similarity, user):
    """
    This function computes the Item-Based approach to recommend a movie.
    It takes as input a database, the similarity score of the movies (already pre-computed) and an user, and tries to predict what movies will be recommended.
    """
    # Get the ratings that the user (specified as a parameter) gave to the movies watched by him/her    
    user_ratings = data[user]

    ratings = {}
    total_similarity = {}

    # Get the movies in the database that the user (specified as a parameter) attributed a rating
    for (movie, rating) in user_ratings.items():
        # Calculates the similarity of the movies watched by the user with the other movies in the database
        for (similarity, movie2) in movies_similarity[movie]:
            # Do not calculate the similiraty of a movie with itself
            if movie2 in user_ratings:
                continue
            # Attribute a rate to the movies that the user didn't watch
            ratings.setdefault(movie2, 0)
            # Multiplies the similarity of a given movie with the ratings given by the user
            ratings[movie2] += similarity * rating 
            # Calculates the similarity of all the movies
            total_similarity.setdefault(movie2, 0)
            total_similarity[movie2] += similarity
    
    # Returns the predicted rating for the movies, in order to recommend a movie
    rankings = [(score / total_similarity[movie], movie) for movie, score in ratings.items()]
    rankings.sort()
    rankings.reverse()
    return rankings[:10]

In [39]:
# Returns the top 10 most recommended movies for the user 10 using the Item-Based approach
movies_user10_ib = get_recommended_movies_ib(users_dict, similar_movies, 10.0)
pd.DataFrame(movies_user10_ib, columns = ['Similarity', 'Movie Recommendations'])

Unnamed: 0,Similarity,Movie Recommendations
0,5.0,Zookeeper (2011)
1,5.0,Winter in Prostokvashino (1984)
2,5.0,Winnie the Pooh and the Day of Concern (1972)
3,5.0,Winnie the Pooh Goes Visiting (1971)
4,5.0,Winnie Pooh (1969)
5,5.0,"Wings, Legs and Tails (1986)"
6,5.0,Wild Wild West (1999)
7,5.0,Wild Hogs (2007)
8,5.0,Wanted (2008)
9,5.0,"Wake Up, Ron Burgundy (2004)"
