1. Использовать dataset MovieLens
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
* TF-IDF на тегах и жанрах
* Средние оценки (+ median, variance, etc.) пользователя и фильма
3. Оценить RMSE на тестовой выборке

In [143]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [144]:
# Load the four MovieLens tables from CSV files in the working directory.
# The unpacking order must match the order of the file names below.
links, movies, ratings, tags = (
    pd.read_csv(csv_name)
    for csv_name in ('links.csv', 'movies.csv', 'ratings.csv', 'tags.csv')
)

In [145]:
# Preview the first rows of `links` (columns: movieId, imdbId, tmdbId).
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [146]:
# Column dtypes and null counts for `links`. In the recorded output tmdbId
# has 9734 non-null values out of 9742 rows and is stored as float64.
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [147]:
# Preview the first rows of `movies`; `genres` is a single '|'-separated string.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [148]:
# Column dtypes and null counts for `movies` (no missing values in the recorded output).
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [149]:
# Preview the first rows of `ratings` (userId, movieId, rating, timestamp).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [150]:
# Column dtypes and null counts for `ratings` (100836 rows, no missing values
# in the recorded output).
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [151]:
# Preview the first rows of `tags`: free-text tags attached to (userId, movieId) pairs.
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [152]:
# Column dtypes and null counts for `tags` (3683 rows, no missing values in the
# recorded output).
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [153]:
# Average rating given by each user and received by each movie.
# Each result is a Series indexed by the id and named after the feature,
# so it can be merged onto `ratings` by that id.
user_mean = ratings.groupby('userId')['rating'].agg('mean').rename('user_mean')
movie_mean = ratings.groupby('movieId')['rating'].agg('mean').rename('movie_mean')

In [154]:
# Median rating per user and per movie, as named Series indexed by the id.
user_median = ratings.groupby('userId')['rating'].agg('median').rename('user_median')
movie_median = ratings.groupby('movieId')['rating'].agg('median').rename('movie_median')

In [155]:
# Spread of ratings per user and per movie.
# NOTE(review): despite the `*_variance` names these hold the standard
# deviation (`std`), not the variance. The names are kept as they are.
# Groups with a single rating yield NaN (visible for movies with
# movie_count == 1 in the merged frame).
user_variance = ratings.groupby('userId')['rating'].agg('std').rename('user_variance')
movie_variance = ratings.groupby('movieId')['rating'].agg('std').rename('movie_variance')

In [156]:
# Number of ratings given by each user and received by each movie.
user_count = ratings.groupby('userId')['rating'].agg('count').rename('user_count')
movie_count = ratings.groupby('movieId')['rating'].agg('count').rename('movie_count')

In [157]:
# Attach the per-user and per-movie statistics to every rating row.
# Each aggregate is a named Series indexed by its id, so merging on that id
# adds one column per step; the merge order fixes the column order.
# NOTE(review): the aggregates are computed from every row of `ratings`,
# including the row's own rating. If a train/test split is done later,
# confirm that these statistics are derived from the training part only.
rating_plus = (
    ratings
    .merge(user_mean, on='userId')
    .merge(movie_mean, on='movieId')
    .merge(user_median, on='userId')
    .merge(movie_median, on='movieId')
    .merge(user_variance, on='userId')
    .merge(movie_variance, on='movieId')
    .merge(user_count, on='userId')
    .merge(movie_count, on='movieId')
)
rating_plus

Unnamed: 0,userId,movieId,rating,timestamp,user_mean,movie_mean,user_median,movie_median,user_variance,movie_variance,user_count,movie_count
0,1,1,4.0,964982703,4.366379,3.92093,5.00,4.0,0.800048,0.834859,232,215
1,5,1,4.0,847434962,3.636364,3.92093,4.00,4.0,0.990441,0.834859,44,215
2,7,1,4.5,1106635946,3.230263,3.92093,3.50,4.0,1.329594,0.834859,152,215
3,15,1,2.5,1510577970,3.448148,3.92093,3.50,4.0,1.133404,0.834859,135,215
4,17,1,4.5,1305696483,4.209524,3.92093,4.00,4.0,0.508490,0.834859,105,215
...,...,...,...,...,...,...,...,...,...,...,...,...
100831,306,175199,4.0,1518380703,3.316964,4.00000,3.25,4.0,0.729048,,112,1
100832,306,183295,3.5,1518327334,3.316964,3.50000,3.25,3.5,0.729048,,112,1
100833,578,6751,2.5,1300990921,3.962963,2.50000,4.50,2.5,1.117397,,27,1
100834,578,56389,4.0,1300996756,3.962963,4.00000,4.50,4.0,1.117397,,27,1


In [158]:
# Check for missing values after the merges. In the recorded output only
# movie_variance is incomplete (97390 of 100836 non-null); the NaN rows shown
# in the preview all have movie_count == 1.
rating_plus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   userId          100836 non-null  int64  
 1   movieId         100836 non-null  int64  
 2   rating          100836 non-null  float64
 3   timestamp       100836 non-null  int64  
 4   user_mean       100836 non-null  float64
 5   movie_mean      100836 non-null  float64
 6   user_median     100836 non-null  float64
 7   movie_median    100836 non-null  float64
 8   user_variance   100836 non-null  float64
 9   movie_variance  97390 non-null   float64
 10  user_count      100836 non-null  int64  
 11  movie_count     100836 non-null  int64  
dtypes: float64(7), int64(5)
memory usage: 10.0 MB


In [159]:
# Replace missing statistics with 0. The NaNs visible in the merged frame
# are spread values of movies that have a single rating.
# The frame is rebound instead of mutated with `inplace=True`, so the object
# displayed by earlier cells is not silently changed and re-running this
# cell stays idempotent.
rating_plus = rating_plus.fillna(0)
rating_plus

Unnamed: 0,userId,movieId,rating,timestamp,user_mean,movie_mean,user_median,movie_median,user_variance,movie_variance,user_count,movie_count
0,1,1,4.0,964982703,4.366379,3.92093,5.00,4.0,0.800048,0.834859,232,215
1,5,1,4.0,847434962,3.636364,3.92093,4.00,4.0,0.990441,0.834859,44,215
2,7,1,4.5,1106635946,3.230263,3.92093,3.50,4.0,1.329594,0.834859,152,215
3,15,1,2.5,1510577970,3.448148,3.92093,3.50,4.0,1.133404,0.834859,135,215
4,17,1,4.5,1305696483,4.209524,3.92093,4.00,4.0,0.508490,0.834859,105,215
...,...,...,...,...,...,...,...,...,...,...,...,...
100831,306,175199,4.0,1518380703,3.316964,4.00000,3.25,4.0,0.729048,0.000000,112,1
100832,306,183295,3.5,1518327334,3.316964,3.50000,3.25,3.5,0.729048,0.000000,112,1
100833,578,6751,2.5,1300990921,3.962963,2.50000,4.50,2.5,1.117397,0.000000,27,1
100834,578,56389,4.0,1300996756,3.962963,4.00000,4.50,4.0,1.117397,0.000000,27,1


In [160]:
def change_string(s):
    """Turn a '|'-separated genre string into a space-separated one.

    Hyphens are removed so that a hyphenated genre stays a single word
    (e.g. 'Sci-Fi' becomes 'SciFi'), and the placeholder
    '(no genres listed)' is removed entirely.

    Args:
        s: genre string such as 'Adventure|Children|Fantasy'.

    Returns:
        The cleaned string with each '|' replaced by a single space.
    """
    without_hyphens = s.replace('-', '')
    without_placeholder = without_hyphens.replace('(no genres listed)', '')
    return without_placeholder.replace('|', ' ')

In [161]:
# One cleaned, space-separated genre string per row of `movies`, in row order.
movie_genres = list(map(change_string, movies.genres.values))

In [162]:
# Bag-of-words counts over the cleaned genre strings, using CountVectorizer
# with its default settings. `X_counts` has one row per movie.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(movie_genres)
# Print the non-zero entries of the first ten rows. Each iterated item is a
# single-row sparse matrix, so entries print as (0, column_index) <tab> count.
for i in X_counts[:10]:
    print(i)
    print('-' * 40)

  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 8)	1
----------------------------------------
  (0, 1)	1
  (0, 3)	1
  (0, 8)	1
----------------------------------------
  (0, 4)	1
  (0, 14)	1
----------------------------------------
  (0, 4)	1
  (0, 14)	1
  (0, 7)	1
----------------------------------------
  (0, 4)	1
----------------------------------------
  (0, 0)	1
  (0, 5)	1
  (0, 16)	1
----------------------------------------
  (0, 4)	1
  (0, 14)	1
----------------------------------------
  (0, 1)	1
  (0, 3)	1
----------------------------------------
  (0, 0)	1
----------------------------------------
  (0, 1)	1
  (0, 0)	1
  (0, 16)	1
----------------------------------------


In [163]:
# Vocabulary learned from the genre strings, in column order. The recorded
# output has 19 lowercase tokens; hyphenated genres appear joined
# ('scifi', 'filmnoir') because change_string strips '-'.
count_vect.get_feature_names_out()

array(['action', 'adventure', 'animation', 'children', 'comedy', 'crime',
       'documentary', 'drama', 'fantasy', 'filmnoir', 'horror', 'imax',
       'musical', 'mystery', 'romance', 'scifi', 'thriller', 'war',
       'western'], dtype=object)

In [164]:
# Genre tokens in column order of `X_counts`, used as DataFrame column labels.
# Uses the vectorizer's own API rather than re-sorting `vocabulary_` by index
# by hand; `.tolist()` keeps `columns` a plain list of str.
columns = count_vect.get_feature_names_out().tolist()

In [165]:
# Re-weight the raw genre counts with TF-IDF: learn the IDF weights from
# `X_counts`, then apply them to the same matrix.
tfidf_transformer = TfidfTransformer().fit(X_counts)
X_tfidf = tfidf_transformer.transform(X_counts)

In [166]:
# Print the non-zero TF-IDF weights of the first ten movies, one single-row
# sparse matrix at a time. In the recorded output a single-genre row has
# weight 1.0, which is consistent with per-row normalisation.
for i in X_tfidf[:10]:
    print(i)
    print('-' * 40)

  (0, 8)	0.482990142708577
  (0, 4)	0.26758647689140014
  (0, 3)	0.5048454681396087
  (0, 2)	0.5162254711770092
  (0, 1)	0.41684567364693936
----------------------------------------
  (0, 8)	0.5936619434123594
  (0, 3)	0.620525172745643
  (0, 1)	0.5123612074824268
----------------------------------------
  (0, 14)	0.8210088907493954
  (0, 4)	0.5709154064399099
----------------------------------------
  (0, 14)	0.726240982959826
  (0, 7)	0.46640480307738325
  (0, 4)	0.5050154397005037
----------------------------------------
  (0, 4)	1.0
----------------------------------------
  (0, 16)	0.5420423542868653
  (0, 5)	0.6359470441562756
  (0, 0)	0.5493281743985542
----------------------------------------
  (0, 14)	0.8210088907493954
  (0, 4)	0.5709154064399099
----------------------------------------
  (0, 3)	0.7711121633813997
  (0, 1)	0.6366993258087036
----------------------------------------
  (0, 0)	1.0
----------------------------------------
  (0, 16)	0.5457299419583337
  (0, 1)	0.6

In [167]:
# Shape and storage check: the recorded output is a 9742x19 float64 matrix
# in Compressed Sparse Row format.
X_tfidf

<9742x19 sparse matrix of type '<class 'numpy.float64'>'
	with 22050 stored elements in Compressed Sparse Row format>

In [168]:
# The transformer only reports generic names ('x0' ... 'x18' in the recorded
# output), so the genre labels have to come from `count_vect` instead.
tfidf_transformer.get_feature_names_out()

array(['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10',
       'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18'],
      dtype=object)

In [170]:
# Dense TF-IDF genre matrix as a DataFrame: one row per movie (in the row
# order of `movies`), one column per genre token.
new_x = pd.DataFrame(
    X_tfidf.toarray(),
    columns=columns,
)
new_x

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,romance,scifi,thriller,war,western
0,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.000000,0.482990,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,0.000000,0.593662,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,0.466405,0.000000,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.000000,0.575034,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9738,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.000000,0.638968,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9739,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9740,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [172]:
# Put movieId next to the TF-IDF genre columns so they can be joined to ratings.
# The concat aligns on the index: `new_x` was built without an explicit index
# and `movies` shows a RangeIndex in its info() output, so rows pair up by position.
# NOTE(review): `movies_genres` (this frame) differs by one letter from
# `movie_genres` (the list of cleaned genre strings); they are easy to confuse.
movies_genres = pd.concat((movies.movieId, new_x), axis='columns')
movies_genres

Unnamed: 0,movieId,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,romance,scifi,thriller,war,western
0,1,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.000000,0.482990,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,2,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,0.000000,0.593662,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,3,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,0.466405,0.000000,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.000000,0.575034,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9738,193583,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.000000,0.638968,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9739,193585,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9740,193587,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [175]:
# Distinct tag values. The recorded output contains case variants of one tag
# ('heroic bloodshed' / 'Heroic Bloodshed') as separate entries.
# NOTE(review): presumably tags should be case-normalised before vectorizing; confirm.
tags['tag'].unique()

array(['funny', 'Highly quotable', 'will ferrell', ..., 'gun fu',
       'heroic bloodshed', 'Heroic Bloodshed'], dtype=object)