# Домашняя работа
## Тема: Рекомендации на основе содержания

1. Использовать dataset MovieLens
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:    
    + TF-IDF на тегах и жанрах    
    + Средние оценки (+ median, variance, etc.) пользователя и фильма
3. Оценить RMSE на тестовой выборке

In [1]:
import pandas as pd
import numpy as np
import scipy
from tqdm import tqdm_notebook
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from math import sqrt

%matplotlib inline

In [2]:
links = pd.read_csv('ml-latest-small/links.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')

**Добавляем информацию по фильмам**

In [3]:
movies_with_rating = ratings.join(movies.set_index('movieId'), on='movieId')

**Берем только те фильмы у которых есть теги**

In [4]:
movies_with_tags = tags.movieId.unique()
movies_with_rating_tags = movies_with_rating[movies_with_rating.movieId.isin(movies_with_tags)]

**Фичи**

In [5]:
movies_agg = movies_with_rating_tags.groupby(by='movieId').agg(['mean', 'count', 'var']).rating.reset_index()

In [6]:
movies_agg.head()

Unnamed: 0,movieId,mean,count,var
0,1,3.92093,215,0.69699
1,2,3.431818,110,0.777419
2,3,3.259615,52,1.112651
3,5,3.071429,49,0.822917
4,7,3.185185,54,0.955625


In [7]:
movies_agg.fillna(0, inplace = True)

In [8]:
movies_with_rating_agg = movies_agg.merge(movies, on='movieId', how='left')

In [9]:
movies_with_rating_agg.head()

Unnamed: 0,movieId,mean,count,var,title,genres
0,1,3.92093,215,0.69699,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,3.431818,110,0.777419,Jumanji (1995),Adventure|Children|Fantasy
2,3,3.259615,52,1.112651,Grumpier Old Men (1995),Comedy|Romance
3,5,3.071429,49,0.822917,Father of the Bride Part II (1995),Comedy
4,7,3.185185,54,0.955625,Sabrina (1995),Comedy|Romance


In [10]:
grouped_tags = tags.groupby(by='movieId')

film_tags = {}
for key, value in grouped_tags.groups.items():
    film_tags[key] = tags.loc[value.values].tag.tolist()

In [11]:
movies_with_rating_agg['tags'] = movies_with_rating_agg.apply(lambda x: film_tags[x.movieId], axis=1)

In [12]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [13]:
movies_with_rating_agg['genres'] = [change_string(g) for g in movies_with_rating_agg.genres.values]

In [14]:
movies_with_rating_agg['tags'] = movies_with_rating_agg.apply(lambda x: ' '.join(x.tags), axis=1)

**Теги и жанры вместе**

In [15]:
movies_with_rating_agg['genres_tags'] = movies_with_rating_agg.apply(lambda x: x.genres + ' ' + x.tags, axis=1)

In [16]:
movies_with_rating_agg.drop(columns=['genres', 'tags'], inplace=True)

In [17]:
movies_with_rating_agg.head()

Unnamed: 0,movieId,mean,count,var,title,genres_tags
0,1,3.92093,215,0.69699,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
1,2,3.431818,110,0.777419,Jumanji (1995),Adventure Children Fantasy fantasy magic board...
2,3,3.259615,52,1.112651,Grumpier Old Men (1995),Comedy Romance moldy old
3,5,3.071429,49,0.822917,Father of the Bride Part II (1995),Comedy pregnancy remake
4,7,3.185185,54,0.955625,Sabrina (1995),Comedy Romance remake


In [18]:
genres_tags = movies_with_rating_agg.genres_tags.tolist()

In [19]:
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(genres_tags)

In [20]:
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [21]:
X_tfidf = X_tfidf.toarray()

In [22]:
df_X_tfidf = pd.DataFrame(X_tfidf, index=movies_with_rating_agg.movieId)

In [23]:
df = movies_with_rating_agg.merge(df_X_tfidf, on='movieId')

**Рекомендации для пользовтелей**

Сделаем рекомендации для самого первого пользователя

In [24]:
first_user_ratings = ratings[(ratings.userId == 1) & ratings.movieId.isin(movies_with_tags)]

In [25]:
df = df.set_index('movieId')

In [26]:
first_user_ratings = first_user_ratings.join(df, on = 'movieId')

In [27]:
first_user_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,mean,count,var,title,genres_tags,0,...,1736,1737,1738,1739,1740,1741,1742,1743,1744,1745
0,1,1,4.0,964982703,3.92093,215,0.69699,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,4.0,964981247,3.259615,52,1.112651,Grumpier Old Men (1995),Comedy Romance moldy old,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,47,5.0,964983815,3.975369,203,0.850875,Seven (a.k.a. Se7en) (1995),Mystery Thriller mystery twist ending serial k...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,50,5.0,964982931,4.237745,204,0.641475,"Usual Suspects, The (1995)",Crime Mystery Thriller mindfuck suspense thril...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1,101,5.0,964980868,3.782609,23,1.086957,Bottle Rocket (1996),Adventure Comedy Crime Romance crime off-beat ...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
X = first_user_ratings.drop(columns=['userId', 'rating', 'timestamp', 'title','genres_tags']).set_index('movieId')
y = first_user_ratings.loc[:, first_user_ratings.columns.isin(['movieId', 'rating'])].set_index('movieId')

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [30]:
model = Lasso().fit(X_train, y_train)

In [31]:
y_test_predict = model.predict(X_test)

**Оцениваем RMSE**

In [32]:
rmse = sqrt(mean_squared_error(y_test, y_test_predict))
print('RMSE: ', rmse)

RMSE:  0.7313830078979006


**Убираем фильмы, которые не оценивал пользователь**

In [33]:
df_for_reco = df.iloc[~df.index.isin(first_user_ratings.movieId.unique())]

In [34]:
df_for_reco.drop(columns=['title','genres_tags'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [35]:
df_for_reco['predicted_score'] = model.predict(df_for_reco)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [36]:
reco_TOP10_for_user = df_for_reco[['mean', 'predicted_score']].sort_values('predicted_score', ascending=False)[:10]

**Предсказанные оценки**

In [37]:
reco_TOP10_for_user.sort_values('mean',ascending = False)

Unnamed: 0_level_0,mean,predicted_score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
318,4.429022,4.482077
858,4.289062,4.437007
7153,4.118919,4.434483
4993,4.106061,4.43917
5952,4.021277,4.435565
589,3.970982,4.448545
2762,3.893855,4.43232
150,3.845771,4.440252
588,3.79235,4.433762
380,3.497191,4.431959


**Сортируем по средней оценке фильма**

In [38]:
reco_TOP10_for_user.merge(movies.set_index('movieId'), on='movieId').sort_values('mean',ascending = False)

Unnamed: 0_level_0,mean,predicted_score,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
318,4.429022,4.482077,"Shawshank Redemption, The (1994)",Crime|Drama
858,4.289062,4.437007,"Godfather, The (1972)",Crime|Drama
7153,4.118919,4.434483,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
4993,4.106061,4.43917,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
5952,4.021277,4.435565,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
589,3.970982,4.448545,Terminator 2: Judgment Day (1991),Action|Sci-Fi
2762,3.893855,4.43232,"Sixth Sense, The (1999)",Drama|Horror|Mystery
150,3.845771,4.440252,Apollo 13 (1995),Adventure|Drama|IMAX
588,3.79235,4.433762,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
380,3.497191,4.431959,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller
