1. Использовать dataset MovieLens
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
* TF-IDF на тегах и жанрах
* Средние оценки (+ median, variance, etc.) пользователя и фильма
3. Оценить RMSE на тестовой выборке

In [88]:
import numpy as np
import pandas as pd
import sklearn

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the four MovieLens tables from CSV files in the working directory.
links, movies, ratings, tags = (
    pd.read_csv(csv_name)
    for csv_name in ('links.csv', 'movies.csv', 'ratings.csv', 'tags.csv')
)

In [3]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [9]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [10]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


Сначала получаю статистические показатели (среднее значение, медиану, стандартное отклонение, количество оценок) по пользователям и фильмам с помощью таблицы ratings. Все полученные данные добавляю к таблице ratings

In [11]:
user_mean = ratings.groupby(ratings.userId)['rating'].mean().rename('user_mean')
movie_mean = ratings.groupby(ratings.movieId)['rating'].mean().rename('movie_mean')

In [12]:
user_median = ratings.groupby(ratings.userId)['rating'].median().rename('user_median')
movie_median = ratings.groupby(ratings.movieId)['rating'].median().rename('movie_median')

In [13]:
# NOTE: despite the "_variance" suffix, these series hold the per-group
# standard deviation of ratings (`.std()`), not the variance.
# Groups with a single rating yield NaN (visible in the merged table below).
user_variance = ratings.groupby(ratings.userId)['rating'].std().rename('user_variance')
movie_variance = ratings.groupby(ratings.movieId)['rating'].std().rename('movie_variance')

In [14]:
user_count = ratings.groupby(ratings.userId)['rating'].count().rename('user_count')
movie_count = ratings.groupby(ratings.movieId)['rating'].count().rename('movie_count')

In [15]:
# Attach every per-user and per-movie statistic to each rating row by
# chaining merges on userId / movieId.
# NOTE(review): the statistics are computed over ALL ratings, including the
# row's own rating and rows that later land in the test split, so the target
# leaks into these features and the reported RMSE is likely optimistic.
rating_plus = ratings.merge(user_mean, on='userId').merge(movie_mean, on='movieId').merge(
                            user_median, on='userId').merge(movie_median, on='movieId').merge(
                            user_variance, on='userId').merge(movie_variance, on='movieId').merge(
                            user_count, on='userId').merge(movie_count, on='movieId')
rating_plus

Unnamed: 0,userId,movieId,rating,timestamp,user_mean,movie_mean,user_median,movie_median,user_variance,movie_variance,user_count,movie_count
0,1,1,4.0,964982703,4.366379,3.92093,5.00,4.0,0.800048,0.834859,232,215
1,5,1,4.0,847434962,3.636364,3.92093,4.00,4.0,0.990441,0.834859,44,215
2,7,1,4.5,1106635946,3.230263,3.92093,3.50,4.0,1.329594,0.834859,152,215
3,15,1,2.5,1510577970,3.448148,3.92093,3.50,4.0,1.133404,0.834859,135,215
4,17,1,4.5,1305696483,4.209524,3.92093,4.00,4.0,0.508490,0.834859,105,215
...,...,...,...,...,...,...,...,...,...,...,...,...
100831,306,175199,4.0,1518380703,3.316964,4.00000,3.25,4.0,0.729048,,112,1
100832,306,183295,3.5,1518327334,3.316964,3.50000,3.25,3.5,0.729048,,112,1
100833,578,6751,2.5,1300990921,3.962963,2.50000,4.50,2.5,1.117397,,27,1
100834,578,56389,4.0,1300996756,3.962963,4.00000,4.50,4.0,1.117397,,27,1


In [16]:
rating_plus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   userId          100836 non-null  int64  
 1   movieId         100836 non-null  int64  
 2   rating          100836 non-null  float64
 3   timestamp       100836 non-null  int64  
 4   user_mean       100836 non-null  float64
 5   movie_mean      100836 non-null  float64
 6   user_median     100836 non-null  float64
 7   movie_median    100836 non-null  float64
 8   user_variance   100836 non-null  float64
 9   movie_variance  97390 non-null   float64
 10  user_count      100836 non-null  int64  
 11  movie_count     100836 non-null  int64  
dtypes: float64(7), int64(5)
memory usage: 10.0 MB


In [17]:
# movie_variance is NaN for movies that were rated only once; treat that as
# zero spread so the column can be used as a numeric feature.
rating_plus.fillna(0, inplace=True)
rating_plus

Unnamed: 0,userId,movieId,rating,timestamp,user_mean,movie_mean,user_median,movie_median,user_variance,movie_variance,user_count,movie_count
0,1,1,4.0,964982703,4.366379,3.92093,5.00,4.0,0.800048,0.834859,232,215
1,5,1,4.0,847434962,3.636364,3.92093,4.00,4.0,0.990441,0.834859,44,215
2,7,1,4.5,1106635946,3.230263,3.92093,3.50,4.0,1.329594,0.834859,152,215
3,15,1,2.5,1510577970,3.448148,3.92093,3.50,4.0,1.133404,0.834859,135,215
4,17,1,4.5,1305696483,4.209524,3.92093,4.00,4.0,0.508490,0.834859,105,215
...,...,...,...,...,...,...,...,...,...,...,...,...
100831,306,175199,4.0,1518380703,3.316964,4.00000,3.25,4.0,0.729048,0.000000,112,1
100832,306,183295,3.5,1518327334,3.316964,3.50000,3.25,3.5,0.729048,0.000000,112,1
100833,578,6751,2.5,1300990921,3.962963,2.50000,4.50,2.5,1.117397,0.000000,27,1
100834,578,56389,4.0,1300996756,3.962963,4.00000,4.50,4.0,1.117397,0.000000,27,1


Теперь нужно получить TF-IDF по жанрам. Убираю лишние элементы, разделяю строчки на отдельные слова, получаю список обозначений жанров и отдаю его в CountVectorizer

In [18]:
def change_string(s):
    """Turn a pipe-separated genre string into space-separated tokens.

    Hyphens are removed so that e.g. 'Sci-Fi' stays a single token, and the
    placeholder '(no genres listed)', which marks a movie without genres,
    is stripped out.
    """
    cleaned = s.replace('-', '')
    cleaned = cleaned.replace('(no genres listed)', '')
    return cleaned.replace('|', ' ')

In [19]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [20]:
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(movie_genres)
for i in X_counts[:10]:
    print(i)
    print('-' * 40)

  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 8)	1
----------------------------------------
  (0, 1)	1
  (0, 3)	1
  (0, 8)	1
----------------------------------------
  (0, 4)	1
  (0, 14)	1
----------------------------------------
  (0, 4)	1
  (0, 14)	1
  (0, 7)	1
----------------------------------------
  (0, 4)	1
----------------------------------------
  (0, 0)	1
  (0, 5)	1
  (0, 16)	1
----------------------------------------
  (0, 4)	1
  (0, 14)	1
----------------------------------------
  (0, 1)	1
  (0, 3)	1
----------------------------------------
  (0, 0)	1
----------------------------------------
  (0, 1)	1
  (0, 0)	1
  (0, 16)	1
----------------------------------------


In [21]:
count_vect.get_feature_names_out()

array(['action', 'adventure', 'animation', 'children', 'comedy', 'crime',
       'documentary', 'drama', 'fantasy', 'filmnoir', 'horror', 'imax',
       'musical', 'mystery', 'romance', 'scifi', 'thriller', 'war',
       'western'], dtype=object)

In [22]:
# The genre vocabulary is small, so the terms themselves can be used as
# column names when the matrix is turned into a DataFrame.
# The feature names are returned in column-index order.
columns = list(count_vect.get_feature_names_out())

In [23]:
# Pass the raw genre counts to TfidfTransformer to obtain TF-IDF weights
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [24]:
for i in X_tfidf[:10]:
    print(i)
    print('-' * 40)

  (0, 8)	0.482990142708577
  (0, 4)	0.26758647689140014
  (0, 3)	0.5048454681396087
  (0, 2)	0.5162254711770092
  (0, 1)	0.41684567364693936
----------------------------------------
  (0, 8)	0.5936619434123594
  (0, 3)	0.620525172745643
  (0, 1)	0.5123612074824268
----------------------------------------
  (0, 14)	0.8210088907493954
  (0, 4)	0.5709154064399099
----------------------------------------
  (0, 14)	0.726240982959826
  (0, 7)	0.46640480307738325
  (0, 4)	0.5050154397005037
----------------------------------------
  (0, 4)	1.0
----------------------------------------
  (0, 16)	0.5420423542868653
  (0, 5)	0.6359470441562756
  (0, 0)	0.5493281743985542
----------------------------------------
  (0, 14)	0.8210088907493954
  (0, 4)	0.5709154064399099
----------------------------------------
  (0, 3)	0.7711121633813997
  (0, 1)	0.6366993258087036
----------------------------------------
  (0, 0)	1.0
----------------------------------------
  (0, 16)	0.5457299419583337
  (0, 1)	0.6

In [25]:
X_tfidf

<9742x19 sparse matrix of type '<class 'numpy.float64'>'
	with 22050 stored elements in Compressed Sparse Row format>

In [26]:
tfidf_transformer.get_feature_names_out()

array(['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10',
       'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18'],
      dtype=object)

In [27]:
# Convert the sparse TF-IDF matrix into a dense DataFrame, one column per genre
new_x = pd.DataFrame(data=X_tfidf.toarray(),columns=columns)
new_x

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,romance,scifi,thriller,war,western
0,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.000000,0.482990,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,0.000000,0.593662,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,0.466405,0.000000,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.000000,0.575034,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9738,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.000000,0.638968,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9739,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9740,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [28]:
# Put the movieId column next to the genre features to restore the link
# between the new features and the movies.
# concat(axis=1) aligns rows by index; both frames use the default 0..n-1
# index and keep the row order of `movies`.
movies_genres = pd.concat([movies['movieId'], new_x], axis=1)
movies_genres

Unnamed: 0,movieId,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,romance,scifi,thriller,war,western
0,1,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.000000,0.482990,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,2,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,0.000000,0.593662,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,3,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,0.466405,0.000000,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.000000,0.575034,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9738,193583,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.000000,0.638968,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9739,193585,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9740,193587,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


Для тегов обработка должна быть сложнее. Это уже небольшие тексты: в них могут быть, например, ненужные слова (предлоги, союзы, местоимения), и в них явно больше знаков препинания.

Поскольку тегов немного, я их просмотрела на предмет ненужных слов и их список передала в CountVectorizer как стоплист.

Выяснилось, что один человек может ставить одному фильму несколько тегов. Такие теги я сливала в один, чтобы в дальнейшем разреженные строки были хотя бы немножко менее разреженными.

In [29]:
for tag in sorted(tags['tag'].unique()):
    print(tag)

"artsy"
06 Oscar Nominated Best Movie - Animation
1900s
1920s
1950s
1960s
1970s
1980s
1990s
2001-like
2D animation
70mm
80's
AIDs
AS Byatt
AWESOME
Aardman
Academy award (Best Supporting Actress)
Action
Adam Sandler
Adrien Brody
Adventure
Afghanistan
Africa
Agatha Christie
Al Pacino
Alcatraz
Alfred Hitchcock
Alicia Vikander
Amazing Cinematography
American Indians
American propaganda
Amish
Amtrak
Amy Adams
Andrew Lloyd Weber
Andy Garcia
Andy Kaufman
Andy Samberg
Angelina Jolie
Animal movie
Animation
Anne Boleyn
Anne Hathaway
Anthony Hopkins
Arnold Schwarzenegger
Arthur C. Clarke
Arthur Miller
Astaire and Rogers
Atmospheric
Atomic bomb
Audrey Tautou
Australia
BEST PICTURE
Backwards. memory
Bad story
Bad writing
Batman
Beatles
Beautiful
Bechdel Test:Fail
Beethoven
Ben Affleck
Ben Kingsley
Ben Stiller
Bette Davis
Bible
Big Brothers
Bill Murray
Bittersweet
Black comedy
Borg
Boston
Boxing story
Brad Pitt
British
British gangster
Brittany Murphy
Broadway
Brooch
Bruce Willis
Bugs Bunny
Butler
C

In [30]:
def big_tag(df, row):
    """Join all tags that one user gave to one movie into a single string.

    df  -- frame with userId, movieId and tag columns
    row -- a row whose userId / movieId select the tags to merge

    Returns the matching tags separated by single spaces, in frame order.
    """
    same_user = df.userId == row.userId
    same_movie = df.movieId == row.movieId
    return ' '.join(df.loc[same_user & same_movie, 'tag'])

# Merge all tags a user gave to one movie into one space-separated string.
# A grouped transform does this in a single pass and keeps the original row
# order inside each group; the previous row-wise apply re-scanned the whole
# frame for every row, which is quadratic in the number of tags.
tags['big_tag'] = tags.groupby(['userId', 'movieId'])['tag'].transform(' '.join)
# Rows of the same (user, movie) pair are now identical in these three
# columns, so drop_duplicates leaves exactly one row per pair.
tags_big = tags[['userId', 'movieId', 'big_tag']].drop_duplicates().reset_index()
tags_big

Unnamed: 0,index,userId,movieId,big_tag
0,0,2,60756,funny Highly quotable will ferrell
1,3,2,89774,Boxing story MMA Tom Hardy
2,6,2,106782,drugs Leonardo DiCaprio Martin Scorsese
3,9,7,48516,way too long
4,10,18,431,Al Pacino gangster mafia
...,...,...,...,...
1770,3677,606,6107,World War II
1771,3678,606,7382,for katie
1772,3679,606,7936,austere
1773,3680,610,3265,gun fu heroic bloodshed


In [31]:
# Translation table for change_tags: '-', '.', '/' and ':' become spaces,
# while '&', '!', '(' and ')' are deleted.
_TAG_PUNCTUATION = str.maketrans('-./:', '    ', '&!()')


def change_tags(tag):
    """Strip punctuation from a tag string before it is vectorized.

    Separator-like characters ('-', '.', '/', ':') are replaced with a space
    so the words around them stay separate tokens; '&', '!', '(' and ')' are
    removed without leaving a gap.

    tag -- the tag text (str)

    Returns the cleaned string. A single translate pass replaces the former
    chain of replace() calls and the redundant ''.join over a string.
    """
    return tag.translate(_TAG_PUNCTUATION)

In [32]:
tag_words = [change_tags(tag) for tag in tags_big.big_tag.values]

In [33]:
count_vect = CountVectorizer(stop_words=['am', 'is', 'was', 'i', 'me', 'my', 'you', 'your', 'it', 'this', 'a', 'the',
                                        'about', 'at', 'from', 'for', 'in', 'of', 'on', 'to', 'with', 'without',
                                        'and', 'as', 'but', 'or'])
X_tags = count_vect.fit_transform(tag_words)
print(len(count_vect.get_feature_names_out()))

for word in count_vect.get_feature_names_out()[150:160]:
    print(word)

1719
ben
bending
bernard
berry
besson
best
bette
better
bible
biblical


In [34]:
tfidf_transformer = TfidfTransformer()
X_tfidf_tags = tfidf_transformer.fit_transform(X_tags)

In [36]:
# Dense frame of tag TF-IDF weights; its columns are labelled 0..n_terms-1.
# No dropna() here: the TF-IDF matrix has no missing values, and removing
# rows would break the positional alignment with tags_big when the two are
# concatenated side by side.
new_tags = pd.DataFrame(data=X_tfidf_tags.toarray())

In [37]:
tags_words = pd.concat([tags_big[['userId', 'movieId']], new_tags], axis=1)
tags_words

Unnamed: 0,userId,movieId,0,1,2,3,4,5,6,7,...,1709,1710,1711,1712,1713,1714,1715,1716,1717,1718
0,2,60756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,89774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,106782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7,48516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,18,431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1770,606,6107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1771,606,7382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1772,606,7936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1773,610,3265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Genre features are joined per movie; tag features exist only for
# (user, movie) pairs that were tagged, hence the left join.
final_dataset = rating_plus.merge(movies_genres, on='movieId').merge(
    tags_words, how='left', on=['userId', 'movieId'])
# Ratings without tags get NaN in every tag column; 0 means "no tag words".
# After fillna(0) nothing is left for dropna() to remove, so it is omitted.
final_dataset.fillna(0, inplace=True)
final_dataset

Unnamed: 0,userId,movieId,rating,timestamp,user_mean,movie_mean,user_median,movie_median,user_variance,movie_variance,...,1709,1710,1711,1712,1713,1714,1715,1716,1717,1718
0,1,1,4.0,964982703,4.366379,3.92093,5.00,4.0,0.800048,0.834859,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,1,4.0,847434962,3.636364,3.92093,4.00,4.0,0.990441,0.834859,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,1,4.5,1106635946,3.230263,3.92093,3.50,4.0,1.329594,0.834859,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,15,1,2.5,1510577970,3.448148,3.92093,3.50,4.0,1.133404,0.834859,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17,1,4.5,1305696483,4.209524,3.92093,4.00,4.0,0.508490,0.834859,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,306,175199,4.0,1518380703,3.316964,4.00000,3.25,4.0,0.729048,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100832,306,183295,3.5,1518327334,3.316964,3.50000,3.25,3.5,0.729048,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100833,578,6751,2.5,1300990921,3.962963,2.50000,4.50,2.5,1.117397,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100834,578,56389,4.0,1300996756,3.962963,4.00000,4.50,4.0,1.117397,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Features are everything except the identifiers, the timestamp and the
# target itself. Dropping by name (rather than slicing by position) keeps
# the target out of X even if the column order changes upstream.
X = final_dataset.drop(columns=['userId', 'movieId', 'rating', 'timestamp'])
# Tag columns are labelled with ints while the other columns use strings;
# scikit-learn >= 1.2 raises a TypeError on mixed column-name types, so make
# every label a string.
X.columns = X.columns.astype(str)
y = final_dataset['rating']

Прежде чем передавать то, что получилось, в регрессоры, я применила метод PCA, чтобы уменьшить количество признаков.

In [47]:
# Project the feature matrix onto 175 principal components.
# NOTE(review): PCA is fitted on every row before the train/test split that
# follows, so test rows influence the projection; fitting on the training
# part only and calling transform() on the test part would avoid that.
pca = PCA(n_components = 175)
X_reduced = pca.fit_transform(X)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=42)

In [51]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [52]:
# Training RMSE of the linear model. Arguments go in (y_true, y_pred) order,
# and the root is taken explicitly because the `squared` flag has been
# removed from recent scikit-learn releases.
np.sqrt(mean_squared_error(y_train, regressor.predict(X_train)))

0.8036278499364371

In [53]:
# Test RMSE of the linear model. Arguments go in (y_true, y_pred) order,
# and the root is taken explicitly because the `squared` flag has been
# removed from recent scikit-learn releases.
np.sqrt(mean_squared_error(y_test, regressor.predict(X_test)))

0.8082699386415821

In [92]:
# Seed the forest so its scores are reproducible between runs, in line with
# the seeded train/test split.
forest = RandomForestRegressor(max_depth=6, random_state=42)
forest.fit(X_train, y_train)

RandomForestRegressor(max_depth=6)

In [93]:
# Training RMSE of the random forest. Arguments go in (y_true, y_pred) order,
# and the root is taken explicitly because the `squared` flag has been
# removed from recent scikit-learn releases.
np.sqrt(mean_squared_error(y_train, forest.predict(X_train)))

0.8069411023318525

In [94]:
# Test RMSE of the random forest. Arguments go in (y_true, y_pred) order,
# and the root is taken explicitly because the `squared` flag has been
# removed from recent scikit-learn releases.
np.sqrt(mean_squared_error(y_test, forest.predict(X_test)))

0.8196625503142391

In [98]:
neighbors = KNeighborsRegressor(n_neighbors=10)
neighbors.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=10)

In [99]:
# Training RMSE of the k-NN model. Arguments go in (y_true, y_pred) order,
# and the root is taken explicitly because the `squared` flag has been
# removed from recent scikit-learn releases.
np.sqrt(mean_squared_error(y_train, neighbors.predict(X_train)))

0.8065087285630725

In [101]:
# Test RMSE of the k-NN model. Arguments go in (y_true, y_pred) order,
# and the root is taken explicitly because the `squared` flag has been
# removed from recent scikit-learn releases.
np.sqrt(mean_squared_error(y_test, neighbors.predict(X_test)))

0.8967973919064314

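Все регрессоры показали довольно средние результаты (RMSE на тестовой выборке — от 0.81 до 0.90). Меньше всех переобучилась линейная регрессия: разница между RMSE на обучающей и тестовой выборках у неё минимальна (0.804 против 0.808).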