In [6]:
from collections import defaultdict
from datetime import datetime
from math import sqrt

import numpy as np
import pandas as pd

%matplotlib inline

# Read data

In [8]:
def read_ratings_df():
    date_parser = lambda time_in_secs: datetime.utcfromtimestamp(float(time_in_secs))
    return pd.read_csv('ml-latest-small/ratings.csv', parse_dates=['timestamp'], date_parser=date_parser)


class MovieData(object):
    def __init__(self):
        self.ratings_df = read_ratings_df()
        self.ratings = defaultdict(dict)
        self.init_ratings()

    def init_ratings(self):
        for _, row in self.ratings_df.iterrows():
            self.ratings[row['userId']][row['movieId']] = row

    def get_movies(self, user_id):
        return set(self.ratings[user_id].keys())

    def get_shared_ratings(self, user1_id, user2_id):
        movies1 = self.get_movies(user1_id)
        movies2 = self.get_movies(user2_id)

        shared_movies = movies1 & movies2

        ratings = {}

        for movie_id in shared_movies:
            ratings[movie_id] = (
                self.ratings[user1_id][movie_id]['rating'],
                self.ratings[user2_id][movie_id]['rating'],
            )

        return ratings

    @staticmethod
    def shared_ratings_to_np_arrays(shared_ratings):
        return np.array(shared_ratings.values()).T

    def get_euclidean_distance(self, user1_id, user2_id):
        shared_ratings = self.get_shared_ratings(user1_id, user2_id)

        if len(shared_ratings) == 0:
            return 0

        ratings = self.shared_ratings_to_np_arrays(shared_ratings)

        ratings1 = ratings[0]
        ratings2 = ratings[1]

        sum_of_squares = np.power(ratings1 - ratings2, 2).sum()

        return 1 / (1 + sqrt(sum_of_squares))

    def get_manhattan_distance(self, user1_id, user2_id):
        shared_ratings = self.get_shared_ratings(user1_id, user2_id)

        if len(shared_ratings) == 0:
            return 0

        ratings = self.shared_ratings_to_np_arrays(shared_ratings)

        ratings1 = ratings[0]
        ratings2 = ratings[1]

        manhattan_sum = np.abs(ratings1 - ratings2).sum()

        return 1 / (1 + manhattan_sum)

    def get_pearson_correlation(self, user1_id, user2_id):
        shared_ratings = self.get_shared_ratings(user1_id, user2_id)

        num_ratings = len(shared_ratings)

        if num_ratings == 0:
            return 0

        ratings = self.shared_ratings_to_np_arrays(shared_ratings)

        ratings1 = ratings[0]
        ratings2 = ratings[1]

        mean1 = ratings1.mean()
        mean2 = ratings2.mean()

        std1 = ratings1.std()
        std2 = ratings2.std()

        if std1 == 0 or std2 == 0:
            return 0

        std_scores_1 = (ratings1 - mean1) / std1
        std_scores_2 = (ratings2 - mean2) / std2

        # numerically stable calculation of the Pearson correlation coefficient

        return abs((std_scores_1 * std_scores_2).sum() / (num_ratings - 1))

        
movie_data = MovieData()

# Explore shared ratings

In [9]:
def explore_shared_ratings(movie_data):
    unique_user_ids = movie_data.ratings_df['userId'].unique()

    n_pairs = 30
    samples = np.random.choice(unique_user_ids, size=(n_pairs, 2))

    for index, sample in enumerate(samples):
        user1_id = sample[0]
        user2_id = sample[1]

        num_movies_1 = len(movie_data.get_movies(user1_id))
        num_movies_2 = len(movie_data.get_movies(user2_id))

        num_shared_ratings = len(movie_data.get_shared_ratings(user1_id, user2_id))

        print 'pair %2d, user1 movies: %4d, user2 movies: %4d, shared movies: %3d' % (
            index + 1, num_movies_1, num_movies_2, num_shared_ratings)

        
explore_shared_ratings(movie_data)

pair  1, user1 movies:  114, user2 movies:   73, shared movies:  18
pair  2, user1 movies:   92, user2 movies:  129, shared movies:  11
pair  3, user1 movies:   33, user2 movies:   34, shared movies:   1
pair  4, user1 movies:   26, user2 movies:   29, shared movies:   1
pair  5, user1 movies:  132, user2 movies:   39, shared movies:   9
pair  6, user1 movies:  231, user2 movies:   38, shared movies:   4
pair  7, user1 movies:   32, user2 movies:   42, shared movies:   0
pair  8, user1 movies:  437, user2 movies:   51, shared movies:  14
pair  9, user1 movies:   63, user2 movies:  217, shared movies:  11
pair 10, user1 movies:   20, user2 movies:  123, shared movies:   0
pair 11, user1 movies:  187, user2 movies:  183, shared movies:  50
pair 12, user1 movies:   27, user2 movies:   53, shared movies:   1
pair 13, user1 movies:   26, user2 movies:  400, shared movies:   3
pair 14, user1 movies:   45, user2 movies:   20, shared movies:   1
pair 15, user1 movies:  192, user2 movies:   45,

# Explore distances

We are looking at 30 random user pairs. We can notice how small on average is the intersection of the movies they rated (compared to the their total number of ratings).
It's not unusual to see zero intersection or just a couple of movies.

We could build a histogram of the distribution of number of shared movies if we generate a lot of random pairs.

In [5]:
def explore_distances(movie_data):
    unique_user_ids = movie_data.ratings_df['userId'].unique()

    n_pairs = 30
    samples = np.random.choice(unique_user_ids, size=(n_pairs, 2))

    for index, sample in enumerate(samples):
        user1_id = sample[0]
        user2_id = sample[1]

        num_shared_ratings = len(movie_data.get_shared_ratings(user1_id, user2_id))

        euclidean_distance = movie_data.get_euclidean_distance(user1_id, user2_id)
        manhattan_distance = movie_data.get_manhattan_distance(user1_id, user2_id)
        pearson_correlation = movie_data.get_pearson_correlation(user1_id, user2_id)

        print 'pair %2d, shared movies: %3d, euclidean: %.3f, manhattan: %.3f, pearson: %.3f' % (
            index + 1, num_shared_ratings, euclidean_distance, manhattan_distance, pearson_correlation)

        
explore_distances(movie_data)

pair  1, shared movies:   3, euclidean: 0.250, manhattan: 0.167, pearson: 0.416
pair  2, shared movies:   1, euclidean: 0.500, manhattan: 0.500, pearson: 0.000
pair  3, shared movies:   3, euclidean: 0.309, manhattan: 0.250, pearson: 0.750
pair  4, shared movies:   2, euclidean: 0.500, manhattan: 0.500, pearson: 0.000
pair  5, shared movies:  12, euclidean: 0.179, manhattan: 0.071, pearson: 0.397
pair  6, shared movies:  25, euclidean: 0.149, manhattan: 0.038, pearson: 0.454
pair  7, shared movies:   4, euclidean: 0.366, manhattan: 0.250, pearson: 1.333
pair  8, shared movies:  27, euclidean: 0.112, manhattan: 0.029, pearson: 0.100
pair  9, shared movies:   2, euclidean: 0.500, manhattan: 0.500, pearson: 0.000
pair 10, shared movies:   5, euclidean: 0.309, manhattan: 0.250, pearson: 0.000
pair 11, shared movies:  10, euclidean: 0.209, manhattan: 0.095, pearson: 0.045
pair 12, shared movies:   3, euclidean: 0.357, manhattan: 0.286, pearson: 0.000
pair 13, shared movies:  23, euclidean: 

Various distances (euclidean, manhattan, pearson correlation).

Other possible distances: Tantimoto, cosine.

Jaccard distance is not really applicable in this case since we have a range of ratings.