In [1]:
import io
import pandas as pd
from collections import defaultdict
from surprise import Dataset
from surprise import NormalPredictor
from surprise import SVD
from surprise import KNNWithMeans
from surprise import get_dataset_dir
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

In [2]:
# Чтение данных из подготовленных файлов и объявление констант.
n = 5
threshold = 3.52
user = '10'
measure = ['RMSE']
verbose = True
test_rmse = 'test_rmse'

data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.25)

In [3]:
# Define algorithms
k = 30
algos = {
    'NP': NormalPredictor(),
    'KNN_cos': KNNWithMeans(k = k, sim_options = { 'name': 'cosine' }),
    'KNN_MSD': KNNWithMeans(k = k),
    'KNN_pearson': KNNWithMeans(k = k, sim_options = { 'name': 'pearson' }),
    'svd' : SVD()
}

In [4]:
# Cross validation
crv = {}
for algo in algos:
    crv[algo] = cross_validate(algos[algo], data, measures = measure, verbose = verbose)

Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5214  1.5200  1.5198  1.5218  1.5093  1.5184  0.0046  
Fit time          0.15    0.15    0.15    0.16    0.15    0.15    0.00    
Test time         0.24    0.16    0.20    0.20    0.16    0.19    0.03    
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9569  0.9505  0.9598  0.9686  0.9552  0.9582  0.0060  
Fit time          0.87    0.88    0.88    0.91    1

In [5]:
RSMA = {}
for cv in crv:
    RSMA[cv] = round(crv[cv][test_rmse].mean(), 3)
RSMA

{'NP': 1.518,
 'KNN_cos': 0.958,
 'KNN_MSD': 0.952,
 'KNN_pearson': 0.952,
 'svd': 0.937}

In [6]:
# Best algorithm
best_algorithm_name = min(RSMA.items(), key=lambda x: x[1])[0]
best_algorithm_name

'svd'

In [7]:
best_algorithm = algos[best_algorithm_name]

In [8]:
# Calculate precision@k and recall@k
def precision_recall_at_k(predictions, k = 10, threshold = 3.5):
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    return precisions, recalls

In [9]:
best_algorithm.fit(trainset)
predictions = best_algorithm.test(testset)
precisions, recalls = precision_recall_at_k(predictions, k = 5, threshold = threshold)
precisionak = round(sum(precision for precision in precisions.values()) / len(precisions), 3)
recallak = round(sum(recall for recall in recalls.values()) / len(recalls), 3)

In [10]:
precisionak

0.739

In [11]:
recallak

0.361

In [12]:
def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

In [13]:
def num_of_top_n(top_n, user):
    for uid, user_ratings in top_n.items():
        if uid == user: 
            films = [iid for (iid, _) in user_ratings]
            rate = user_ratings
            break
    return films, rate

In [14]:
testset = trainset.build_anti_testset()
predictions = best_algorithm.test(testset)
top_n = get_top_n(predictions, n)
films, rate = num_of_top_n(top_n, user)
rate=dict(rate)

films

['318', '483', '199', '408', '357']

In [15]:
def info(films):
    data_path = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    info = {}
    data = pd.read_csv(data_path, sep='|',encoding='ISO-8859-1', header = None) 
    for film in films:
        row = data.iloc[int(film)]
        info[film] = (row[1], row[2])
    return info

In [16]:
films_info = info(films)
films_info

{'318': ('Everyone Says I Love You (1996)', '06-Dec-1996'),
 '483': ('Maltese Falcon, The (1941)', '01-Jan-1941'),
 '199': ('Shining, The (1980)', '01-Jan-1980'),
 '408': ('Jack (1996)', '07-Aug-1996'),
 '357': ('Spawn (1997)', '01-Aug-1997')}

In [17]:
file = open('2_2.txt', 'w')
file.write('User '+ user + '\n')
for film in films_info:
    file.write(str(film) + ' ' + str(films_info[film]) + ' ' + str(round(rate[film],3)) + '\n')
file.close()

In [18]:
print('User '+ user)
for film in films_info:
    print(str(film) + ' ' + str(films_info[film]) + ' ' + str(round(rate[film],3)))

User 10
318 ('Everyone Says I Love You (1996)', '06-Dec-1996') 5
483 ('Maltese Falcon, The (1941)', '01-Jan-1941') 4.904
199 ('Shining, The (1980)', '01-Jan-1980') 4.826
408 ('Jack (1996)', '07-Aug-1996') 4.783
357 ('Spawn (1997)', '01-Aug-1997') 4.771
