<center><img src="images/header.png"></center>

<h1><center>Алгоритмы интеллектуальной обработки больших объемов данных</center></h1>
<hr>
<h2><center>Введение в рекомендательные системы</center></h2>

In [16]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

%matplotlib inline

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 6)

# Подготовка данных

1. Скачайте [данные](https://cloud.mail.ru/public/CSjR/mPctRVc2u) о рейтингах и фильмах
2. Загрузите рейтинги (user_ratedmovies.dat) и описание фильмов (movies.dat)
3. С помощью LabelEncoder перекодируйте идентификаторы фильмов и юзеров в обоих датафреймах


In [17]:
df_ratings = pd.read_csv('data/user_ratedmovies.dat', sep='\t')

In [18]:
df_ratings.head()

Unnamed: 0,userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second
0,75,3,1.0,29,10,2006,23,17,16
1,75,32,4.5,29,10,2006,23,23,44
2,75,110,4.0,29,10,2006,23,30,8
3,75,160,2.0,29,10,2006,23,16,52
4,75,163,4.0,29,10,2006,23,29,30


In [19]:
df_ratings.shape

(855598, 9)

In [20]:
df_ratings.userID.min(), df_ratings.userID.max()

(75, 71534)

In [21]:
df_ratings.movieID.min(), df_ratings.movieID.max()

(1, 65133)

In [22]:
df_ratings.userID.nunique(), df_ratings.movieID.nunique()

(2113, 10109)

In [23]:
df_movies = pd.read_csv('data/movies.dat', sep='\t')

In [24]:
df_movies.head()

Unnamed: 0,id,title,imdbID,spanishTitle,imdbPictureURL,year,rtID,rtAllCriticsRating,rtAllCriticsNumReviews,rtAllCriticsNumFresh,...,rtAllCriticsScore,rtTopCriticsRating,rtTopCriticsNumReviews,rtTopCriticsNumFresh,rtTopCriticsNumRotten,rtTopCriticsScore,rtAudienceRating,rtAudienceNumRatings,rtAudienceScore,rtPictureURL
0,1,Toy story,114709,Toy story (juguetes),http://ia.media-imdb.com/images/M/MV5BMTMwNDU0...,1995,toy_story,9.0,73,73,...,100,8.5,17,17,0,100,3.7,102338,81,http://content7.flixster.com/movie/10/93/63/10...
1,2,Jumanji,113497,Jumanji,http://ia.media-imdb.com/images/M/MV5BMzM5NjE1...,1995,1068044-jumanji,5.6,28,13,...,46,5.8,5,2,3,40,3.2,44587,61,http://content8.flixster.com/movie/56/79/73/56...
2,3,Grumpy Old Men,107050,Dos viejos gru�ones,http://ia.media-imdb.com/images/M/MV5BMTI5MTgy...,1993,grumpy_old_men,5.9,36,24,...,66,7.0,6,5,1,83,3.2,10489,66,http://content6.flixster.com/movie/25/60/25602...
3,4,Waiting to Exhale,114885,Esperando un respiro,http://ia.media-imdb.com/images/M/MV5BMTczMTMy...,1995,waiting_to_exhale,5.6,25,14,...,56,5.5,11,5,6,45,3.3,5666,79,http://content9.flixster.com/movie/10/94/17/10...
4,5,Father of the Bride Part II,113041,Vuelve el padre de la novia (Ahora tambi�n abu...,http://ia.media-imdb.com/images/M/MV5BMTg1NDc2...,1995,father_of_the_bride_part_ii,5.3,19,9,...,47,5.4,5,1,4,20,3.0,13761,64,http://content8.flixster.com/movie/25/54/25542...


In [25]:
from sklearn.preprocessing import LabelEncoder

In [26]:
enc_user = LabelEncoder()
enc_mov = LabelEncoder()

In [27]:
enc_user.fit(df_ratings.userID.values)
enc_mov.fit(df_ratings.movieID.values)

LabelEncoder()

In [28]:
idx = df_movies.loc[:, 'id'].isin(df_ratings.loc[:, 'movieID'])
df_movies = df_movies.loc[idx, :]

In [29]:
df_ratings.loc[:, 'userID'] = \
            enc_user.transform(df_ratings.loc[:, 'userID'])
    
df_ratings.loc[:, 'movieID'] = \
            enc_mov.transform(df_ratings.loc[:, 'movieID'])
df_movies.loc[:, 'id'] = \
            enc_mov.transform(df_movies.loc[:, 'id'])

In [32]:
from scipy.sparse import coo_matrix

In [33]:
n_movies = df_ratings.movieID.nunique()
n_users = df_ratings.userID.nunique()

In [34]:
R = coo_matrix(
    (df_ratings.rating.values, 
     (df_ratings.userID.values, df_ratings.movieID.values)), 
    shape=(n_users, n_movies))

In [37]:
R.nnz / (1.*n_movies*n_users)

0.040055491685820954

# Сжатое представление фильмов

1. С помощью from scipy.sparse.coo_matrix составьте разреженную матрицу рейтингов
2. С помощью scipy.sparse.linalg.svds получите латентное описание фильмов и пользователей
3. Для каждого фильма найдите 10 ближайших соседей в этих признаках по косинусной мере

In [39]:
from scipy.sparse.linalg import svds

In [40]:
u, s, vt = svds(R, k=10)

In [41]:
u.shape

(2113, 10)

In [42]:
s.shape

(10,)

In [43]:
vt.shape

(10, 10109)

In [44]:
V = vt.T

In [45]:
V.shape

(10109, 10)

In [46]:
from sklearn.neighbors import NearestNeighbors

In [47]:
nn = NearestNeighbors()
nn.fit(V)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [48]:
ind = nn.kneighbors(V, n_neighbors=11, return_distance=False)

In [50]:
ind.shape

(10109, 11)

In [49]:
ind

array([[    0,   565,  2858, ...,  3983,  2394,  4998],
       [    1,  1839,   149, ...,   144,   690,   354],
       [    2,     4,   414, ...,   929,   505,  1899],
       ...,
       [10106,   775,  5181, ...,  3859,  9497,  8552],
       [10107, 10083,  5051, ...,  9495,  7486,  5521],
       [10108,  2683,  7447, ...,  3246,  8338,  6491]])

In [52]:
df_movie_title = df_movies.loc[:, ['id', 'title']].set_index('id')

In [53]:
df_movie_title.head()

Unnamed: 0_level_0,title
id,Unnamed: 1_level_1
0,Toy story
1,Jumanji
2,Grumpy Old Men
3,Waiting to Exhale
4,Father of the Bride Part II


In [55]:
titles = df_movie_title.title.values

In [58]:
df_nn_titles = pd.DataFrame(titles[ind])

In [60]:
df_nn_titles.columns = \
['title'] + ['nn_{}'.format(i) for i in range(1,11)]

In [65]:
idx = df_movies.title.str.contains(u'Dark Knight')
df_movies.loc[idx, 'imdbPictureURL']

9811    http://ia.media-imdb.com/images/M/MV5BMTMxNTMw...
Name: imdbPictureURL, dtype: object

In [70]:
idx = df_nn_titles.title.str.contains(u'Pulp')
df_nn_titles.loc[idx]

Unnamed: 0,title,nn_1,nn_2,nn_3,nn_4,nn_5,nn_6,nn_7,nn_8,nn_9,nn_10
285,Pulp Fiction,Fight Club,Memento,American Beauty,Shichinin no samurai,Reservoir Dogs,Kill Bill: Vol. 2,Kill Bill: Vol. 2,The Silence of the Lambs,The Usual Suspects,The Shawshank Redemption


# User-based CF

* Разбейте данные на обучение и контроль в пропорции 80/20
* Реализуйте функцию расчета попарных схожестей между пользователями
* Реализуйте метод расчета рекомендаций на основе $K$ наиболее похожих пользователей. Постройте график зафисимости ошибки MAE от $K$ (5-25)
* Выполните нормализацию рейтингов с помощью вычитания средней оценки $\bar{R}_u$ каждого из пользователей и повторите предыдущий 2 шага. В этом случае предсказание выполняется следующим образом
$$ \hat{R}_{ui} = \bar{R}_u + \frac{\sum_{v \in N(u)} s_{uv} (R_{vi} - \bar{R}_v)}{\sum_{v \in N(u)}| s_{uv}|} $$
* Перейдите к Item-Based подходу и повторите предыдущие шаги

In [72]:
from scipy.spatial.distance import cosine
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from scipy.spatial.distance import correlation
from sklearn.metrics import pairwise_distances

In [75]:
df_ratings = df_ratings.rename(columns = 
                               {'date_day': 'day',
                                'date_month': 'month',
                                'date_year': 'year'})

In [78]:
df_ratings.loc[:, 'datetime'] = pd.to_datetime(df_ratings.loc[:, ['year', 'month', 'day']])

In [79]:
df_ratings.head()

Unnamed: 0,userID,movieID,rating,day,month,year,date_hour,date_minute,date_second,datetime
0,0,2,1.0,29,10,2006,23,17,16,2006-10-29
1,0,31,4.5,29,10,2006,23,23,44,2006-10-29
2,0,105,4.0,29,10,2006,23,30,8,2006-10-29
3,0,151,2.0,29,10,2006,23,16,52,2006-10-29
4,0,154,4.0,29,10,2006,23,29,30,2006-10-29


In [82]:
df_ratings_train = df_ratings[df_ratings.datetime < "2007-10-25"]
df_ratings_test = df_ratings[df_ratings.datetime >= "2007-10-25"]

In [83]:
df_ratings_train.shape

(684056, 10)

In [84]:
df_ratings_test.shape

(171542, 10)

In [108]:
def similarity(u, v):
    idx = (u != 0) & (v != 0)
    if any(idx):
        return -cosine(u[idx], v[idx])+2
    else:
        return 0

In [116]:
R_train = coo_matrix(
    (df_ratings_train.rating.values, 
     (df_ratings_train.userID.values, df_ratings_train.movieID.values)), 
    shape=(n_users, n_movies))

In [110]:
%%time
S = pdist(R_train.toarray(), metric=similarity)

CPU times: user 5min 18s, sys: 3 s, total: 5min 21s
Wall time: 5min 30s


In [111]:
Sim = squareform(S)

In [115]:
df_ratings_test.head()

Unnamed: 0,userID,movieID,rating,day,month,year,date_hour,date_minute,date_second,datetime
925,5,164,1.0,30,7,2008,18,59,11,2008-07-30
936,5,354,3.0,30,7,2008,18,33,3,2008-07-30
937,5,355,3.0,30,7,2008,18,38,38,2008-07-30
938,5,367,2.5,27,8,2008,4,26,37,2008-08-27
939,5,461,4.5,16,4,2008,19,58,20,2008-04-16


In [117]:
R_train = R_train.toarray()

In [119]:
predicted_ratings = []

for i, r in tqdm_notebook(df_ratings_test.iterrows()):
    watched_users = np.where(R_train[:, int(r['movieID'])])[0]
    sim = Sim[int(r['userID']), watched_users]
    sorted_idx = np.argsort(sim)
    for k in range(1, 11):
        
        ratings = R_train[watched_users[sorted_idx[-k:]], int(r['movieID'])]
        
        sim_k = sim[sorted_idx[-k:]]
        
        prediction = ratings.dot(sim_k)/(sim_k.sum()) 

        predicted_ratings.append({'userID': r['userID'],
                                  'movieID': r['movieID'],
                                  'prediction': prediction,
                                  'k': k})






In [120]:
df_predicted_ratings = pd.DataFrame(predicted_ratings)

In [121]:
df_predicted_ratings.head()

Unnamed: 0,k,movieID,prediction,userID
0,1,164,1.5,5
1,2,164,2.244195,5
2,3,164,2.328959,5
3,4,164,2.495809,5
4,5,164,2.596115,5


In [123]:
df_ratings_test.head()

Unnamed: 0,userID,movieID,rating,day,month,year,date_hour,date_minute,date_second,datetime
925,5,164,1.0,30,7,2008,18,59,11,2008-07-30
936,5,354,3.0,30,7,2008,18,33,3,2008-07-30
937,5,355,3.0,30,7,2008,18,38,38,2008-07-30
938,5,367,2.5,27,8,2008,4,26,37,2008-08-27
939,5,461,4.5,16,4,2008,19,58,20,2008-04-16


In [124]:
df_predicted_ratings = \
    df_predicted_ratings.join(df_ratings_test.loc[:, ['movieID', 'userID', 'rating']].set_index(['movieID', 'userID']),
                             on=['movieID', 'userID'])

In [126]:
df_predicted_ratings.loc[:, 'error'] = abs(df_predicted_ratings.prediction - df_predicted_ratings.rating)

In [129]:
df_predicted_ratings.groupby('k').error.mean()

k
1     0.774269
2     0.696769
3     0.669107
4     0.655702
5     0.647875
6     0.642787
7     0.639143
8     0.636306
9     0.634577
10    0.633103
Name: error, dtype: float64