<center><img src="img/logo_hse_black.jpg"></center>

<h1><center>Методы машинного обучения</center></h1>
<h3><center> Шестаков Андрей (<a href="mailto:avshestakov@hse.ru">avshestakov@hse.ru</a>)</center></h3>
<hr>
<h2><center>Введение в рекомендательные системы</center></h2>


In [30]:
%matplotlib inline

In [31]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 6)

# Подготовка данных

1. Загрузите рейтинги (user_ratedmovies.dat) и описание фильмов (movies.dat)
2. С помощью LabelEncoder перекодируйте идентификаторы фильмов и юзеров в обоих датафреймах


In [32]:
df_ratings = pd.read_csv('./data/user_ratedmovies.dat', sep='\t')

In [33]:
df_movies = pd.read_csv('./data/movies.dat', sep='\t', encoding='latin1')

In [34]:
from sklearn.preprocessing import LabelEncoder

In [35]:
enc_users = LabelEncoder()
enc_movies = LabelEncoder()

In [36]:
enc_users.fit(df_ratings.userID.values)

LabelEncoder()

In [37]:
enc_movies.fit(np.r_[
    df_movies.loc[:, 'id'].values,
    df_ratings.loc[:, 'movieID'].values
])

LabelEncoder()

In [38]:
df_ratings.loc[:, 'userID'] = enc_users.transform(df_ratings.userID.values)
df_ratings.loc[:, 'movieID'] = enc_movies.transform(df_ratings.movieID.values)

In [39]:
df_movies.loc[:, 'id'] = enc_movies.transform(df_movies.loc[:, 'id'].values)

# Сжатое представление фильмов

1. С помощью from scipy.sparse.coo_matrix составьте разреженную матрицу рейтингов
2. С помощью scipy.sparse.linalg.svds получите латентное описание фильмов и пользователей
3. Для каждого фильма найдите 10 ближайших соседей в этих признаках по косинусной мере

In [40]:
from scipy.sparse import coo_matrix

In [43]:
R = coo_matrix((df_ratings.rating.values, 
                (df_ratings.userID.values, df_ratings.movieID.values)), 
              )

In [44]:
R

<2113x10197 sparse matrix of type '<class 'numpy.float64'>'
	with 855598 stored elements in COOrdinate format>

In [45]:
2113*10197

21546261

In [46]:
from scipy.sparse.linalg import svds

In [47]:
u, s, v_t = svds(R, k=20)

In [48]:
u.shape

(2113, 20)

In [49]:
v_t.shape

(20, 10197)

In [50]:
from sklearn.neighbors import NearestNeighbors

In [51]:
v = v_t.T

In [52]:
v.shape

(10197, 20)

In [53]:
nn = NearestNeighbors(n_neighbors=10)

In [54]:
nn.fit(v)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=10, p=2, radius=1.0)

In [55]:
idx = nn.kneighbors(v, n_neighbors=10, return_distance=False)

In [57]:
idx[:10]

array([[   0, 2873,  565,  351,   33, 6010, 4007, 2138, 4576,  572],
       [   1, 1848,  733,  354,  563,  639, 1532, 2078,   18,  149],
       [   2, 3181,  414,    4,  678,  265,  402,  497,  853, 2991],
       [   3,  967, 2052,  611, 1581, 4223,  267,  113, 1697, 4357],
       [   4,  177, 6554, 4988, 1689, 5623,  414,  402, 7118, 1853],
       [   5, 1330, 2062, 2271,  413, 1852,   15,  972, 2770,  455],
       [   6,  225,   10, 3007,  326,  213,  197,  159,  772, 2642],
       [   7, 4440, 6247,  247, 4761, 6184, 3132,  538, 5266,  583],
       [   8, 1444, 1303,  904,  917,  663, 1258, 4310, 4779,  445],
       [   9, 1548, 2842, 1447, 3343,  466,   92, 1246, 5533,  731]])

In [58]:
df_movies = df_movies.sort_values('id')

In [62]:
movie_titles = df_movies.title.values

In [67]:
df_nn_movies = pd.DataFrame(movie_titles[idx], columns=['movie']+
                            ['nn_{}'.format(i) for i in range(1,10)])

In [80]:
movie_name = 'Godfather'
idx = df_nn_movies.movie.str.contains(movie_name, case=False)
df_nn_movies.loc[idx]

Unnamed: 0,movie,nn_1,nn_2,nn_3,nn_4,nn_5,nn_6,nn_7,nn_8,nn_9
787,The Godfather,The Godfather: Part II,Goodfellas,Taxi Driver,Apocalypse Now,One Flew Over the Cuckoo's Nest,Schindler's List,Casablanca,Chinatown,Raging Bull
1109,The Godfather: Part II,The Godfather,Goodfellas,Taxi Driver,Raging Bull,Apocalypse Now,Chinatown,Butch Cassidy and the Sundance Kid,The Godfather: Part III,Casablanca
1817,The Godfather: Part III,Once Upon a Time in America,Casino,Murder on the Orient Express,Saturday Night Fever,The Specialist,In the Name of the Father,The Deer Hunter,8MM,Rebel Without a Cause
7593,Tokyo Godfathers,Perfect Blue,Sennen joyû,Neko no ongaeshi,Six-String Samurai,Jin-Rô,Fong Sai Yuk,Dai-bosatsu tôge,The Mists of Avalon,Dreams
8048,The Godfather,Snitch,Kinsey,"Follow Me, Boys!",4 for Texas,"Walker, Texas Ranger",Sword of the Valiant: The Legend of Sir Gawain...,The Loves of Carmen,Arabian Nights,The Trojan Women


# User-based CF

In [None]:
filepath = './data/user_ratedmovies_train.dat'
idx = df_rates.datetime < q
df_rates.loc[idx].to_csv(filepath, sep='\t', columns=['userID', 'movieID', 'rating'], index=None)

filepath = './data/user_ratedmovies_test.dat'
df_rates.loc[~idx].to_csv(filepath, sep='\t', columns=['userID', 'movieID', 'rating'], index=None)

In [None]:
!pip install surprise

In [None]:
from surprise import Dataset, Reader

In [None]:
filepaths = [('./data/user_ratedmovies_train.dat', './data/user_ratedmovies_test.dat')]
reader = Reader(line_format='user item rating', sep='\t', skip_lines=1)
data = Dataset.load_from_folds(filepaths, reader=reader)

In [None]:
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore
from surprise.accuracy import rmse
from surprise import dump

Описание алгоритмов, основанных на CF - [туть](http://surprise.readthedocs.io/en/stable/knn_inspired.html)

In [None]:
sim_options = {'name': 'pearson',
               'user_based': True,
               'min_support': 5,
               }

In [None]:
dumpfile = './alg.dump'

In [None]:
algo = KNNWithMeans(k=50, min_k=5, sim_options=sim_options)                                                       

for trainset, testset in data.folds(): 
    algo.train(trainset)                             
    predictions = algo.test(testset)
    rmse(predictions)
    
    dump.dump(dumpfile, predictions, algo)

In [None]:
df_predictions = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details']) 

In [None]:
df_predictions.head()

In [None]:
algo.predict('1198', '5732')

In [None]:
anti_train = trainset.build_anti_testset()

In [None]:
user_id = '1897'
one_user = filter(lambda r: r[0] == user_id, anti_train)

In [None]:
from collections import defaultdict

def get_top_n(predictions, n=10):

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [None]:
anti_train_predictions = algo.test(one_user, verbose=0)

In [None]:
top_n = get_top_n(anti_train_predictions, n=10)

for uid, user_ratings in top_n.items():
    for (iid, _) in user_ratings:
        print(df_movies.loc[iid, 'title'])

In [None]:
top_n