In [1]:
import math
import random
import numpy as np
import pandas as pd

from operator import itemgetter
from collections import defaultdict
from sklearn.model_selection import cross_validate as cv
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
# preset

dataset_path = 'ml-100k'
test_size = 0.2
model_type = 'Random'

In [3]:
df = pd.read_csv('ml-data/ratings.csv', sep=',')
movie_titles = pd.read_csv('ml-data/movies.csv')

df = pd.merge(df, movie_titles, on='movieId')

In [4]:
n_users = df.userId.unique().shape[0]
n_items = df.movieId.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))

Number of users = 610 | Number of movies = 9724


In [5]:
# split test train

train, test = defaultdict(dict), defaultdict(dict)
trainset_len = 0
testset_len = 0
for _, row in df.iterrows():
    user, movie, rate = row['userId'], row['movieId'], row['rating']
    
    if random.random() <= test_size:
        test[user][movie] = int(rate)
        testset_len += 1
    else:
        train[user][movie] = int(rate)
        trainset_len += 1
        
print('train set size = %s' % trainset_len)
print('test set size = %s\n' % testset_len)

train set size = 80611
test set size = 20225



In [6]:
def calculate_movie_popular(trainset):
    movie_popular = defaultdict(int)

    for user, movies in trainset.items():
        for movie in movies:
            # count item popularity
            movie_popular[movie] += 1
            
    movie_count = len(movie_popular)
    print('total movie number = %d' % movie_count)
    return movie_popular, movie_count

In [7]:
def create_model(trainset, n_rec_movie):

    movie_popular, movie_count = calculate_movie_popular(trainset)
    total_movies = list(movie_popular.keys())
    movie_popular_sort = sorted(movie_popular.items(), key=itemgetter(1), reverse=True)
    
    return movie_popular, movie_count, movie_popular_sort

In [8]:
def recommend(trainset, user, n_rec_movie):

    N = n_rec_movie
    
    # Recommend N most popular movies for the user.
    predict_movies = list()
    watched_movies = trainset[user]
    for movie, _ in movie_popular_sort:
        if len(predict_movies) < N and movie not in watched_movies:
            predict_movies.append(movie)
    return predict_movies

In [9]:
def score_test(trainset, testset, n_rec_movie):
    
    N = n_rec_movie
    
    # keep track
    hit = 0
    rec_count = 0
    test_count = 0
    
    # varables for coverage
    all_rec_movies = set()
    
    # varables for popularity
    popular_sum = 0
    
    for i, user in enumerate(trainset):
        test_movies = testset.get(user, {})
        rec_movies = recommend(trainset, user, N)  # type:list
        for movie in rec_movies:
            if movie in test_movies:
                hit += 1
            popular_sum += math.log(1 + movie_popular[movie])
            # log steps and times.
        rec_count += N
        test_count += len(test_movies)
        
    precision = hit / (1.0 * rec_count)
    recall = hit / (1.0 * test_count)
    coverage = len(all_rec_movies) / (1.0 * movie_count)
    popularity = popular_sum / (1.0 * rec_count)
    
    print('\nprecision = {}\nrecall = {}\ncoverage = {}\npopularity = {}\n'.format(precision, recall, coverage, popularity))

In [10]:
n_rec_movie = 10
user = 666

movie_popular, movie_count, movie_popular_sort = create_model(train, n_rec_movie)
print("recommend for userid = %s:" % user)
print(recommend(train, user, n_rec_movie))
score_test(train, test, n_rec_movie)

total movie number = 8982
recommend for userid = 666:
[356, 318, 296, 2571, 593, 260, 480, 110, 589, 527]

precision = 0.15777414075286417
recall = 0.04766378244746601
coverage = 0.0
popularity = 5.265997565747565

