# Домашняя работа
### Тема: Гибридные рекомендательные системы

In [1]:
!pip install surprise



In [2]:
from surprise import SVD, SVDpp, Dataset, accuracy, Reader
from surprise.model_selection import train_test_split

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np

In [3]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [4]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [5]:
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [6]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [7]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [8]:
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [9]:
%%time
algo = SVD(n_factors=20, n_epochs=20)
algo.fit(trainset)

CPU times: user 2.45 s, sys: 1.61 ms, total: 2.45 s
Wall time: 2.46 s


In [10]:
test_pred = algo.test(testset)

In [11]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8692


0.8692098416091573

In [12]:
algo.predict(uid=2.0, iid='Mortal Kombat (1995)').est

2.443750972949918

In [13]:
current_user_id = 2.0
user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()

scores = []
titles = []

for movie in movies_with_ratings.title.unique():
    if movie in user_movies:
        continue
        
    scores.append(algo.predict(uid=current_user_id, iid=movie).est)
    titles.append(movie)

In [14]:
scores[:10]

[3.993322623304302,
 3.4960700040534136,
 3.153153481986765,
 3.0001090784253814,
 3.0602218162604826,
 3.921739239986222,
 3.317827146275872,
 3.1358827669198623,
 3.174364436568982,
 3.5582748609497683]

In [15]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [16]:
movie_genres = [change_string(g) for g in movies.genres.values]

Проверка, как будут выводиться жанры

In [17]:
movie_genres[0]

'Adventure Animation Children Comedy Fantasy'

In [18]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

neigh = NearestNeighbors(n_neighbors=20, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [19]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [20]:
movies_with_ratings.sort_values('timestamp', inplace=True)

In [21]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [22]:
title_genres = {}

for index, row in tqdm_notebook(movies.iterrows()):
    title_genres[row.title] = row.genres

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




**NearestNeighbors**

In [23]:
def recommend_for_user(user_id):
    current_user_id = user_id
    df_user = movies_with_ratings.pivot_table(index='userId', columns='title', values='rating', aggfunc='count')
    
    # сосздаём фичи для knn
    df_user.fillna(0, inplace=True)
    neigh = NearestNeighbors(n_neighbors=2)
    neigh.fit(df_user) 
    NearestNeighbors(n_neighbors=2) # берём топ 2 максимально похожим
    num = neigh.kneighbors(df_user[df_user.index==user_id])[1][0][1] # первым будет он сам, поэтому берём второго
    film = df_user[df_user.index==num].T     
    film = film[film[num]!=0]
    movies_to_score = list(film.reset_index()['title']) # и теперь оцениваем только его фильмы

    scores = []
    titles = []

    for movie in movies_to_score:
        if movie in user_movies:
            continue

        scores.append(algo.predict(uid=current_user_id, iid=movie).est)
        titles.append(movie)
        
    
    best_indexes = np.argsort(scores)[-10:]
    for i in reversed(best_indexes):
        print(titles[i], scores[i])

Рекомендации для пользователя

In [24]:
recommend_for_user(555)

Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001) 4.313364694777221
Forrest Gump (1994) 4.262876200293267
Maltese Falcon, The (1941) 4.236907740891987
Princess Bride, The (1987) 4.222048479899918
Sound of Music, The (1965) 4.221574591412012
Harry Potter and the Prisoner of Azkaban (2004) 4.158600197247589
Donnie Darko (2001) 4.149017127519765
Fight Club (1999) 4.144437461482381
Star Wars: Episode IV - A New Hope (1977) 4.141548014873391
Eternal Sunshine of the Spotless Mind (2004) 4.13482710982901
