### Домашнее задание по теме «Рекомендации на основе содержания»


- Использовать dataset MovieLens
- Построить рекомендации (регрессия, предсказываем оценку) на фичах:
    - TF-IDF на тегах и жанрах
    - Средние оценки (+ median, variance, etc.) пользователя и фильма
- Оценить RMSE на тестовой выборке

In [47]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

%matplotlib inline

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [4]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [28]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


Подготовим общий датасет с тегами и жанрами

In [36]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId').reset_index()[['movieId', 'title', 'genres', 'tag']]
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,fantasy
4,2,Jumanji (1995),Adventure|Children|Fantasy,magic board game


In [37]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [38]:
movies_with_tags.loc[:, 'genres'] = movies_with_tags['genres'].apply(change_string)
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,fun
3,2,Jumanji (1995),Adventure Children Fantasy,fantasy
4,2,Jumanji (1995),Adventure Children Fantasy,magic board game


In [52]:
tfidf_vectorizer_g = TfidfVectorizer()
genres_tfidf = tfidf_vectorizer_g.fit_transform(movies_with_tags['genres'])

In [54]:
genres_tfidf.toarray().shape

(11853, 20)

In [55]:
movies_with_tags.shape

(11853, 4)

In [185]:
voc = tfidf_vectorizer_g.vocabulary_
col_names = list({k: v for k, v in sorted(voc.items(), key=lambda item: item[1])}.keys()) 
# добавим метку _g, чтобы в дальнейшем названия колонок не пересекались с колонками, полученными из тегов
col_names = [i+'_g' for i in col_names] 
col_names

['action_g',
 'adventure_g',
 'animation_g',
 'children_g',
 'comedy_g',
 'crime_g',
 'documentary_g',
 'drama_g',
 'fantasy_g',
 'filmnoir_g',
 'horror_g',
 'imax_g',
 'musical_g',
 'mystery_g',
 'nogenreslisted_g',
 'romance_g',
 'scifi_g',
 'thriller_g',
 'war_g',
 'western_g']

In [186]:
df_tfidf_g = pd.DataFrame(genres_tfidf.toarray(), columns=col_names)
df_tfidf_g.head()

Unnamed: 0,action_g,adventure_g,animation_g,children_g,comedy_g,crime_g,documentary_g,drama_g,fantasy_g,filmnoir_g,horror_g,imax_g,musical_g,mystery_g,nogenreslisted_g,romance_g,scifi_g,thriller_g,war_g,western_g
0,0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,0.481791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,0.481791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,0.481791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,0.592472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,0.592472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Подготовим данные по тегам

In [79]:
tags_freq = movies_with_tags.groupby('tag')[['movieId']].count().reset_index()
tags_freq.rename(columns={"movieId": "frequency"}, inplace=True)
tags_freq

Unnamed: 0,tag,frequency
0,"""artsy""",1
1,06 Oscar Nominated Best Movie - Animation,3
2,1900s,1
3,1920s,2
4,1950s,2
...,...,...
1584,wry,1
1585,younger men,1
1586,zither,1
1587,zoe kazan,1


In [83]:
sorted(tags_freq.frequency.unique())

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 36,
 131]

Будем считать теги редкими, если их количество во всем датасете не превышает, допустим, 7. Заменим их на тег other 

In [91]:
tags_freq[tags_freq.frequency<=7]

Unnamed: 0,tag,frequency
0,"""artsy""",1
1,06 Oscar Nominated Best Movie - Animation,3
2,1900s,1
3,1920s,2
4,1950s,2
...,...,...
1584,wry,1
1585,younger men,1
1586,zither,1
1587,zoe kazan,1


In [99]:
movies_with_tags[movies_with_tags.tag.isna()]

Unnamed: 0,movieId,title,genres,tag
9,4,Waiting to Exhale (1995),Comedy Drama Romance,
12,6,Heat (1995),Action Crime Thriller,
14,8,Tom and Huck (1995),Adventure Children,
15,9,Sudden Death (1995),Action,
16,10,GoldenEye (1995),Action Adventure Thriller,
...,...,...,...,...
11848,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy,
11849,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy,
11850,193585,Flint (2017),Drama,
11851,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation,


Такие теги составляют существенную долю от всех, но нам и нужно заметно сократить количество уникальных тегов, чтобы в результате преобразования TFIDF не получить супер разреженную матрицу

In [93]:
rare_tags = list(tags_freq[tags_freq.frequency<=7].tag.unique())

In [94]:
movies_with_tags.loc[movies_with_tags['tag'].isin(rare_tags), 'tag'] = 'other'
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,other
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,other
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,other
3,2,Jumanji (1995),Adventure Children Fantasy,other
4,2,Jumanji (1995),Adventure Children Fantasy,other


Так же заменим отсутствующие теги записью-меткой "NO TAG HERE", чтобы позднее после склеивания результатов работы tfidf и исхлжного датасета мы не получили множество строк с nan значениями, там где тегов изначально не было (а таких строк достаточно много)

In [174]:
movies_with_tags.loc[movies_with_tags['tag'].isna(), 'tag'] = 'NO_TAG_HERE'
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,other
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,other
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,other
3,2,Jumanji (1995),Adventure Children Fantasy,other
4,2,Jumanji (1995),Adventure Children Fantasy,other


In [175]:
movies_with_tags[movies_with_tags['tag'].isna()]

Unnamed: 0,movieId,title,genres,tag


In [176]:
movies_with_tags.tag.unique().shape

(84,)

In [177]:
# обучим на not null тегах
tfidf_vectorizer_t = TfidfVectorizer()
tags_tfidf = tfidf_vectorizer_t.fit_transform(movies_with_tags.tag)

In [178]:
voc_t = tfidf_vectorizer_t.vocabulary_
col_names = list({k: v for k, v in sorted(voc_t.items(), key=lambda item: item[1])}.keys()) 
df_tfidf_t = pd.DataFrame(tags_tfidf.toarray(), columns=col_names)
df_tfidf_t.head()

Unnamed: 0,250,action,adolescence,adultery,adventure,aliens,animation,anime,appealing,arts,...,thought,time,top,travel,twist,vietnam,violence,visually,war,world
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Теперь можно собрать общий датасет с данными по жанрам и тегам фильмов

In [179]:
print(movies_with_tags.shape, df_tfidf_g.shape, df_tfidf_t.shape) 

(11853, 4) (11853, 20) (11853, 102)


In [224]:
df_all = pd.concat([movies_with_tags[['movieId', 'title']], df_tfidf_g, df_tfidf_t], axis=1)
df_all.head()

Unnamed: 0,movieId,title,action_g,adventure_g,animation_g,children_g,comedy_g,crime_g,documentary_g,drama_g,...,thought,time,top,travel,twist,vietnam,violence,visually,war,world
0,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [225]:
df_all.shape

(11853, 124)

Теперь нужно добавить средние оценки (и другие показатели) по фильмам 

In [226]:
movies_avg_ratings = ratings.groupby('movieId').agg({'rating':['mean', 'median', 'min', 'max']})
movies_avg_ratings.columns = ['_'.join(col) for col in movies_avg_ratings.columns.values]
movies_avg_ratings = movies_avg_ratings.reset_index()
movies_avg_ratings.head()

Unnamed: 0,movieId,rating_mean,rating_median,rating_min,rating_max
0,1,3.92093,4.0,0.5,5.0
1,2,3.431818,3.5,0.5,5.0
2,3,3.259615,3.0,0.5,5.0
3,4,2.357143,3.0,1.0,3.0
4,5,3.071429,3.0,0.5,5.0


In [227]:
df_all = df_all.join(movies_avg_ratings.set_index('movieId'), on='movieId')
df_all.head()

Unnamed: 0,movieId,title,action_g,adventure_g,animation_g,children_g,comedy_g,crime_g,documentary_g,drama_g,...,twist,vietnam,violence,visually,war,world,rating_mean,rating_median,rating_min,rating_max
0,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.5,5.0
1,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.5,5.0
2,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.5,5.0
3,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.431818,3.5,0.5,5.0
4,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.431818,3.5,0.5,5.0


 В случае с системой рекомендаци на основе содержания для каждого отдельного пользователя строится своя модель. Поэтому чтобы у нас было больше данных для обучения, посмотрим, какие пользователи оставляли больше всего оценок и тегов к фильмам

In [16]:
a = tags.groupby('userId').movieId.count().sort_values(ascending=False)[:10].keys() 

In [17]:
b = ratings.groupby('userId').movieId.count().sort_values(ascending=False)[:10].keys() 

In [18]:
set(a)&set(b)

{474, 599}

In [22]:
tags.groupby('userId').movieId.count().sort_values(ascending=False)[:10]

userId
474    1507
567     432
62      370
599     323
477     280
424     273
537     100
125      48
357      45
318      41
Name: movieId, dtype: int64

In [24]:
ratings.groupby('userId').movieId.count().sort_values(ascending=False)[:10]

userId
414    2698
599    2478
474    2108
448    1864
274    1346
610    1302
68     1260
380    1218
606    1115
288    1055
Name: movieId, dtype: int64

Возьмем пользователя 474, у него больше всего оставленных тегов и достаточно много оценок.


In [219]:
# отбираем фильмы, которым выставлял оценки юзер 474
movies_474 = ratings[ratings.userId==474][['movieId', 'rating']]
movies_474.head()

Unnamed: 0,movieId,rating
73092,1,4.0
73093,2,3.0
73094,5,1.5
73095,6,3.0
73096,7,3.0


In [228]:
# соединив через inner join получим как раз те фильмы, которые смотрел пользователь 474   
df_train = df_all.join(movies_474.set_index('movieId'), on='movieId', how='inner')
df_train.head(10)

Unnamed: 0,movieId,title,action_g,adventure_g,animation_g,children_g,comedy_g,crime_g,documentary_g,drama_g,...,vietnam,violence,visually,war,world,rating_mean,rating_median,rating_min,rating_max,rating
0,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.5,5.0,4.0
1,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.5,5.0,4.0
2,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.5,5.0,4.0
3,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.431818,3.5,0.5,5.0,3.0
4,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.431818,3.5,0.5,5.0,3.0
5,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.431818,3.5,0.5,5.0,3.0
6,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.431818,3.5,0.5,5.0,3.0
10,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.071429,3.0,0.5,5.0,1.5
11,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.071429,3.0,0.5,5.0,1.5
12,6,Heat (1995),0.561593,0.0,0.0,0.0,0.0,0.627743,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.946078,4.0,1.0,5.0,3.0


In [229]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3365 entries, 0 to 8568
Columns: 129 entries, movieId to rating
dtypes: float64(127), int64(1), object(1)
memory usage: 3.3+ MB


In [230]:
# уберем дубликаты строк
df_train = df_train.drop_duplicates()
df_train.head()

Unnamed: 0,movieId,title,action_g,adventure_g,animation_g,children_g,comedy_g,crime_g,documentary_g,drama_g,...,vietnam,violence,visually,war,world,rating_mean,rating_median,rating_min,rating_max,rating
0,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.5,5.0,4.0
3,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.431818,3.5,0.5,5.0,3.0
10,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.071429,3.0,0.5,5.0,1.5
11,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.071429,3.0,0.5,5.0,1.5
12,6,Heat (1995),0.561593,0.0,0.0,0.0,0.0,0.627743,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.946078,4.0,1.0,5.0,3.0


In [239]:
X, y = df_train.drop(['movieId', 'title', 'rating'], axis=1), df_train['rating']

In [235]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [257]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=146)

In [258]:
model = LinearRegression()

In [259]:
model.fit(x_train, y_train)

LinearRegression()

In [262]:
y_pred = model.predict(x_test)
print(f'RMSE: {np.sqrt(mean_squared_error(y_pred, y_test))}')

RMSE: 0.6263863142717975


In [268]:
print('Train score:', model.score(x_train, y_train))
print('Test score:', model.score(x_test, y_test))

Train score: 0.5173064286092788
Test score: 0.4488540922012586


Как видим, результат даже на трейне не очень высок.

In [265]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe_model = make_pipeline(StandardScaler(),LinearRegression())

In [266]:
pipe_model.fit(x_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

In [267]:
print('Train score:', pipe_model.score(x_train, y_train))
print('Test score:', pipe_model.score(x_test, y_test))

Train score: 0.5118137009005821
Test score: 0.43937406630038744


Стандартизация не принесла улучшения. Можно попробовать отнести к редким тегам те теги, которые встречаются в датасете большее количество раз. Тогда в финальном наборе данных будет меньше столбцов, и на менее разреженной матрице результат должен быть получше 

In [269]:
tags_freq[tags_freq.frequency<=10]

Unnamed: 0,tag,frequency
0,"""artsy""",1
1,06 Oscar Nominated Best Movie - Animation,3
2,1900s,1
3,1920s,2
4,1950s,2
...,...,...
1584,wry,1
1585,younger men,1
1586,zither,1
1587,zoe kazan,1


In [270]:
rare_tags = list(tags_freq[tags_freq.frequency<=10].tag.unique())

In [271]:
movies_with_tags.loc[movies_with_tags['tag'].isin(rare_tags), 'tag'] = 'other'
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,other
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,other
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,other
3,2,Jumanji (1995),Adventure Children Fantasy,other
4,2,Jumanji (1995),Adventure Children Fantasy,other


In [272]:
movies_with_tags.loc[movies_with_tags['tag'].isna(), 'tag'] = 'NO_TAG_HERE'
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,other
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,other
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,other
3,2,Jumanji (1995),Adventure Children Fantasy,other
4,2,Jumanji (1995),Adventure Children Fantasy,other


In [273]:
movies_with_tags[movies_with_tags['tag'].isna()]

Unnamed: 0,movieId,title,genres,tag


In [274]:
movies_with_tags.tag.unique().shape

(48,)

In [275]:
# обучим на not null тегах
tfidf_vectorizer_t = TfidfVectorizer()
tags_tfidf = tfidf_vectorizer_t.fit_transform(movies_with_tags.tag)

In [276]:
voc_t = tfidf_vectorizer_t.vocabulary_
col_names = list({k: v for k, v in sorted(voc_t.items(), key=lambda item: item[1])}.keys()) 
df_tfidf_t = pd.DataFrame(tags_tfidf.toarray(), columns=col_names)
df_tfidf_t.head()

Unnamed: 0,250,action,adolescence,adultery,aliens,anime,appealing,atmospheric,black,book,...,stephen,superhero,surreal,suspense,thought,time,top,travel,twist,visually
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Теперь можно собрать общий датасет с данными по жанрам и тегам фильмов

In [277]:
print(movies_with_tags.shape, df_tfidf_g.shape, df_tfidf_t.shape) 

(11853, 4) (11853, 20) (11853, 60)


In [278]:
df_all = pd.concat([movies_with_tags[['movieId', 'title']], df_tfidf_g, df_tfidf_t], axis=1)
df_all.head()

Unnamed: 0,movieId,title,action_g,adventure_g,animation_g,children_g,comedy_g,crime_g,documentary_g,drama_g,...,stephen,superhero,surreal,suspense,thought,time,top,travel,twist,visually
0,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [279]:
df_all.shape

(11853, 82)

Теперь нужно добавить средние оценки (и другие показатели) по фильмам 

In [280]:
movies_avg_ratings = ratings.groupby('movieId').agg({'rating':['mean', 'median', 'min', 'max']})
movies_avg_ratings.columns = ['_'.join(col) for col in movies_avg_ratings.columns.values]
movies_avg_ratings = movies_avg_ratings.reset_index()
movies_avg_ratings.head()

Unnamed: 0,movieId,rating_mean,rating_median,rating_min,rating_max
0,1,3.92093,4.0,0.5,5.0
1,2,3.431818,3.5,0.5,5.0
2,3,3.259615,3.0,0.5,5.0
3,4,2.357143,3.0,1.0,3.0
4,5,3.071429,3.0,0.5,5.0


In [281]:
df_all = df_all.join(movies_avg_ratings.set_index('movieId'), on='movieId')
df_all.head()

Unnamed: 0,movieId,title,action_g,adventure_g,animation_g,children_g,comedy_g,crime_g,documentary_g,drama_g,...,thought,time,top,travel,twist,visually,rating_mean,rating_median,rating_min,rating_max
0,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.5,5.0
1,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.5,5.0
2,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.5,5.0
3,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.431818,3.5,0.5,5.0
4,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.431818,3.5,0.5,5.0


In [282]:
# отбираем фильмы, которым выставлял оценки юзер 474
movies_474 = ratings[ratings.userId==474][['movieId', 'rating']]
movies_474.head()

Unnamed: 0,movieId,rating
73092,1,4.0
73093,2,3.0
73094,5,1.5
73095,6,3.0
73096,7,3.0


In [283]:
# соединив через inner join получим как раз те фильмы, которые смотрел пользователь 474   
df_train = df_all.join(movies_474.set_index('movieId'), on='movieId', how='inner')
df_train.head(10)

Unnamed: 0,movieId,title,action_g,adventure_g,animation_g,children_g,comedy_g,crime_g,documentary_g,drama_g,...,time,top,travel,twist,visually,rating_mean,rating_median,rating_min,rating_max,rating
0,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.5,5.0,4.0
1,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.5,5.0,4.0
2,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.5,5.0,4.0
3,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.431818,3.5,0.5,5.0,3.0
4,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.431818,3.5,0.5,5.0,3.0
5,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.431818,3.5,0.5,5.0,3.0
6,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.431818,3.5,0.5,5.0,3.0
10,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.071429,3.0,0.5,5.0,1.5
11,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.071429,3.0,0.5,5.0,1.5
12,6,Heat (1995),0.561593,0.0,0.0,0.0,0.0,0.627743,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.946078,4.0,1.0,5.0,3.0


In [284]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3365 entries, 0 to 8568
Data columns (total 87 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   movieId           3365 non-null   int64  
 1   title             3365 non-null   object 
 2   action_g          3365 non-null   float64
 3   adventure_g       3365 non-null   float64
 4   animation_g       3365 non-null   float64
 5   children_g        3365 non-null   float64
 6   comedy_g          3365 non-null   float64
 7   crime_g           3365 non-null   float64
 8   documentary_g     3365 non-null   float64
 9   drama_g           3365 non-null   float64
 10  fantasy_g         3365 non-null   float64
 11  filmnoir_g        3365 non-null   float64
 12  horror_g          3365 non-null   float64
 13  imax_g            3365 non-null   float64
 14  musical_g         3365 non-null   float64
 15  mystery_g         3365 non-null   float64
 16  nogenreslisted_g  3365 non-null   float64


In [285]:
# уберем дубликаты строк
df_train = df_train.drop_duplicates()
df_train.head()

Unnamed: 0,movieId,title,action_g,adventure_g,animation_g,children_g,comedy_g,crime_g,documentary_g,drama_g,...,time,top,travel,twist,visually,rating_mean,rating_median,rating_min,rating_max,rating
0,1,Toy Story (1995),0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.5,5.0,4.0
3,2,Jumanji (1995),0.0,0.498401,0.0,0.632908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.431818,3.5,0.5,5.0,3.0
10,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.071429,3.0,0.5,5.0,1.5
12,6,Heat (1995),0.561593,0.0,0.0,0.0,0.0,0.627743,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.946078,4.0,1.0,5.0,3.0
13,7,Sabrina (1995),0.0,0.0,0.0,0.0,0.573134,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.185185,3.0,1.0,5.0,3.0


In [286]:
X, y = df_train.drop(['movieId', 'title', 'rating'], axis=1), df_train['rating']

In [287]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=146)

In [288]:
model = LinearRegression()

In [289]:
model.fit(x_train, y_train)

LinearRegression()

In [290]:
y_pred = model.predict(x_test)
print(f'RMSE: {np.sqrt(mean_squared_error(y_pred, y_test))}')

RMSE: 0.5743221468060559


In [291]:
print('Train score:', model.score(x_train, y_train))
print('Test score:', model.score(x_test, y_test))

Train score: 0.5048273133738266
Test score: 0.4588141835229239


Результат немного улучшился: RMSE уменьшился, score подрос