Использовать dataset MovieLens

Построить рекомендации (регрессия, предсказываем оценку) на фичах:

TF-IDF на тегах и жанрах

Средние оценки (+ median, variance, etc.) пользователя и фильма

Оценить RMSE на тестовой выборке


In [43]:
import pandas as pd
import numpy as np
import scipy

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso

In [45]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [46]:
links.head(1)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0


In [47]:
movies.head(1)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [48]:
ratings.head(1)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703


In [49]:
tags.head(1)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994


In [50]:
# добавим к рейтингам инфо по фильмам
movies_with_rating = ratings.join(movies.set_index('movieId'), on='movieId')

In [51]:
# отберем только фильмы рейтингом, по которым есть теги
movies_with_tags = tags.movieId.unique()
movies_with_rating_tags = movies_with_rating[movies_with_rating.movieId.isin(movies_with_tags)]

In [52]:
# создадим агрегированные фичи
movies_agg = movies_with_rating_tags.groupby(by='movieId').agg(['mean', 'count', 'var']).rating.reset_index()

In [76]:
movies_agg[movies_agg['var']==0] 

Unnamed: 0,movieId,mean,count,var
32,96,3.5,1,0.0
71,279,3.0,2,0.0
97,363,4.0,2,0.0
117,488,3.0,1,0.0
154,638,3.0,1,0.0
...,...,...,...,...
1532,156605,4.5,1,0.0
1534,158966,4.5,5,0.0
1541,170945,3.5,1,0.0
1549,183611,4.0,1,0.0


In [54]:
# var при 1 оценки NaN, заменим на 0
movies_agg.fillna(0, inplace = True)

In [77]:
movies_with_rating_agg = movies_agg.merge(movies, on='movieId', how='left')

In [78]:
movies_with_rating_agg.head()

Unnamed: 0,movieId,mean,count,var,title,genres
0,1,3.92093,215,0.69699,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,3.431818,110,0.777419,Jumanji (1995),Adventure|Children|Fantasy
2,3,3.259615,52,1.112651,Grumpier Old Men (1995),Comedy|Romance
3,5,3.071429,49,0.822917,Father of the Bride Part II (1995),Comedy
4,7,3.185185,54,0.955625,Sabrina (1995),Comedy|Romance


In [79]:
grouped_tags = tags.groupby(by='movieId')

film_tags = {}
for key, value in grouped_tags.groups.items():
    film_tags[key] = tags.loc[value.values].tag.tolist()

In [80]:
movies_with_rating_agg['tags'] = movies_with_rating_agg.apply(lambda x: film_tags[x.movieId], axis=1)

In [81]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [82]:
movies_with_rating_agg['genres'] = [change_string(g) for g in movies_with_rating_agg.genres.values]

In [83]:
movies_with_rating_agg['tags'] = movies_with_rating_agg.apply(lambda x: ' '.join(x.tags), axis=1)

In [85]:
# соберем вместе жанры и теги
movies_with_rating_agg['genres_tags'] = movies_with_rating_agg.apply(lambda x: x.genres + ' ' + x.tags, axis=1)

In [86]:
# уберем лишнее
movies_with_rating_agg.drop(columns=['genres', 'tags'], inplace=True)

In [87]:
genres_tags = movies_with_rating_agg.genres_tags.tolist()

In [88]:
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(genres_tags)

In [89]:
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [90]:
X_tfidf = X_tfidf.toarray()

In [91]:
df_X_tfidf = pd.DataFrame(X_tfidf, index=movies_with_rating_agg.movieId)

In [92]:
df = movies_with_rating_agg.merge(df_X_tfidf, on='movieId')

In [93]:
df

Unnamed: 0,movieId,mean,count,var,title,genres_tags,0,1,2,3,...,1736,1737,1738,1739,1740,1741,1742,1743,1744,1745
0,1,3.920930,215,0.696990,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,3.431818,110,0.777419,Jumanji (1995),Adventure Children Fantasy fantasy magic board...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,3.259615,52,1.112651,Grumpier Old Men (1995),Comedy Romance moldy old,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,3.071429,49,0.822917,Father of the Bride Part II (1995),Comedy pregnancy remake,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,3.185185,54,0.955625,Sabrina (1995),Comedy Romance remake,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1549,183611,4.000000,1,0.000000,Game Night (2018),Action Comedy Crime Horror Comedy funny Rachel...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1550,184471,2.500000,4,1.833333,Tomb Raider (2018),Action Adventure Fantasy adventure Alicia Vika...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1551,187593,3.875000,12,1.505682,Deadpool 2 (2018),Action Comedy SciFi Josh Brolin Ryan Reynolds ...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1552,187595,3.900000,5,0.550000,Solo: A Star Wars Story (2018),Action Adventure Children SciFi Emilia Clarke ...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [99]:
# отберем TOP 10 юзеров, которые оценивали фильмы с тегами
ratings[ratings['movieId'].isin(movies_with_tags)].groupby('userId').count()['movieId'].sort_values()[-10:]

userId
480     399
274     413
387     421
288     422
606     447
448     448
68      501
599     685
414     827
474    1198
Name: movieId, dtype: int64

In [100]:
# построим датасет для юзера 606
user606_ratings = ratings[(ratings.userId == 606) & ratings.movieId.isin(movies_with_tags)]

In [101]:
df = df.set_index('movieId')

In [102]:
user606_ratings = user606_ratings.join(df, on = 'movieId')

In [103]:
user606_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,mean,count,var,title,genres_tags,0,...,1736,1737,1738,1739,1740,1741,1742,1743,1744,1745
97364,606,1,2.5,1349082950,3.92093,215,0.69699,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97365,606,7,2.5,1171754710,3.185185,54,0.955625,Sabrina (1995),Comedy Romance remake,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97366,606,11,2.5,1174349629,3.671429,70,0.810766,"American President, The (1995)",Comedy Drama Romance politics president,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97368,606,17,4.0,1171838026,3.776119,67,1.312754,Sense and Sensibility (1995),Drama Romance Jane Austen,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97371,606,28,3.5,1173049970,4.227273,11,0.618182,Persuasion (1995),Drama Romance In Netflix queue Jane Austen,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [104]:
X = user606_ratings.drop(columns=['userId', 'rating', 'timestamp', 'title','genres_tags']).set_index('movieId')
y = user606_ratings.loc[:, user606_ratings.columns.isin(['movieId', 'rating'])].set_index('movieId')

In [105]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [106]:
model = Lasso().fit(X_train, y_train)

In [107]:
y_test_predict = model.predict(X_test)

In [108]:
# посчитаем RMSE для модели
from sklearn.metrics import mean_squared_error
from math import sqrt

rmse = sqrt(mean_squared_error(y_test, y_test_predict))

print('RMSE: ', rmse)

RMSE:  0.6065301363459219


In [113]:
# отберем фильмы, которые юзер 606 не оценивал
df_for_reco = df.iloc[~df.index.isin(user606_ratings.movieId.unique())]

In [114]:
df_for_reco.drop(columns=['title','genres_tags'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [115]:
df_for_reco['predicted_score'] = model.predict(df_for_reco)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [117]:
reco_TOP10_for_user = df_for_reco[['mean', 'predicted_score']].sort_values('predicted_score', ascending=False)[:10]

In [118]:
# предсказанные оценки оказались очень близки к друг другу, для финальной сортировки будем использовать среднюю оценку фильма
reco_TOP10_for_user.sort_values('mean',ascending = False)

Unnamed: 0_level_0,mean,predicted_score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
58559,4.238255,3.853497
608,4.116022,3.866572
79132,4.066434,3.851045
457,3.992105,3.87025
364,3.94186,3.862895
150,3.845771,3.874745
588,3.79235,3.86739
595,3.770548,3.852271
377,3.52924,3.862486
380,3.497191,3.865347


In [120]:
reco_TOP10_for_user.merge(movies.set_index('movieId'), on='movieId').sort_values('mean',ascending = False)

Unnamed: 0_level_0,mean,predicted_score,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
58559,4.238255,3.853497,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
608,4.116022,3.866572,Fargo (1996),Comedy|Crime|Drama|Thriller
79132,4.066434,3.851045,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
457,3.992105,3.87025,"Fugitive, The (1993)",Thriller
364,3.94186,3.862895,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
150,3.845771,3.874745,Apollo 13 (1995),Adventure|Drama|IMAX
588,3.79235,3.86739,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
595,3.770548,3.852271,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
377,3.52924,3.862486,Speed (1994),Action|Romance|Thriller
380,3.497191,3.865347,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller
