<a href="https://colab.research.google.com/github/solobala/RMSL_9/blob/main/RMSL9_DZ1_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Домашнее задание по теме «Рекомендации на основе содержания»
1. Использовать датасет MovieLens.
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:

  *   TF-IDF на тегах и жанрах;
  *   средние оценки (+ median, variance и т. д.) пользователя и фильма.


3. Оценить RMSE на тестовой выборке.

# 0. Загрузка данных и импорт библиотек

In [46]:
import numpy as np
import pandas as pd
import scipy
from scipy import sparse
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.base import RegressorMixin
from sklearn.metrics import mean_squared_error

#1. Пользовательские функции

In [2]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|')) # Убираем лищние пробелы и тире и разбиваем строку на отдельные слова

In [3]:
def make_dataset() -> pd.DataFrame:
  # load dataset
  !wget  "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"   # Качаем архив выбранного датасета
  # unzip data from zip
  !unzip ml-latest-small.zip
  # read tables
  tags = pd.read_csv('/content/ml-latest-small/tags.csv')
  movies = pd.read_csv('/content/ml-latest-small/movies.csv')
  ratings = pd.read_csv('/content/ml-latest-small/ratings.csv')
  ratings.drop(columns=['timestamp'], inplace=True)
  tags.drop(columns=['timestamp'], inplace=True)
  # join tables
  df = ratings.join(movies.set_index('movieId'), on='movieId', how='left')
  # drop some columns
  df = pd.merge(df, tags, left_on=['userId','movieId'], right_on = ['userId','movieId'], how='left')
  return df

In [4]:
def clean(df: pd.DataFrame) -> pd.DataFrame:
  # fill nan
  df['tag'] = df['tag'].fillna('no_tag')
  return df

In [82]:
def build_features(df: pd.DataFrame) -> tuple():
  # feature engineering
  df['genres'] = df['genres'].apply(lambda x: change_string(x))
  df['tag'] = df['tag'].apply(lambda x: change_string(x))
  y = df['rating'].to_numpy()

  tfidf_transformer = TfidfVectorizer()
  X_tfidf_1 = tfidf_transformer.fit_transform(df['genres'])
  X_tfidf_2 = tfidf_transformer.fit_transform(df['tag'])
  X_transform = scipy.sparse.hstack([X_tfidf_1, X_tfidf_2 ])
  # feature engineering
  ddf = df[['userId', 'movieId', 'rating']]
  mean_user_rating = ddf.groupby(by='userId')[['rating']].mean()
  mean_movie_rating = ddf.groupby(by='movieId')[['rating']].mean()
  median_user_rating = ddf.groupby(by='userId')[['rating']].median()
  median_movie_rating = ddf.groupby(by='movieId')[['rating']].median()
  var_user_rating = ddf.groupby(by='userId')[['rating']].var()
  var_movie_rating = ddf.groupby(by='movieId')[['rating']].var()
  std_user_rating = ddf.groupby(by='userId')[['rating']].std()
  std_movie_rating = ddf.groupby(by='movieId')[['rating']].std()
  ddf = ddf.join(mean_user_rating.reset_index('userId'), on='userId',lsuffix='', rsuffix='_user_mean', how='left')
  ddf = ddf.join(mean_movie_rating.reset_index('movieId'), on='movieId', lsuffix='', rsuffix='_movie_mean', how='left')
  mean_rating_user = ddf['rating_user_mean'].mean()
  mean_rating_movie = ddf['rating_movie_mean'].mean()
  ddf['rating_user_mean'] = ddf['rating_user_mean'].fillna(mean_rating_user)
  ddf['rating_movie_mean'] = ddf['rating_movie_mean'].fillna(mean_rating_movie)
  ddf = ddf.join(median_user_rating.reset_index('userId'), on='userId',lsuffix='', rsuffix='_user_median', how='left')
  ddf = ddf.join(median_movie_rating.reset_index('movieId'), on='movieId', lsuffix='', rsuffix='_movie_median', how='left')
  median_rating_user = ddf['rating_user_median'].mean()
  median_rating_movie = ddf['rating_movie_median'].mean()
  ddf['rating_user_median'] = ddf['rating_user_median'].fillna(median_rating_user)
  ddf['rating_movie_median'] = ddf['rating_movie_median'].fillna(median_rating_movie)
  ddf = ddf.join(var_user_rating.reset_index('userId'), on='userId',lsuffix='', rsuffix='_user_var', how='left')
  ddf = ddf.join(var_movie_rating.reset_index('movieId'), on='movieId', lsuffix='', rsuffix='_movie_var', how='left')
  var_rating_user = ddf['rating_user_var'].mean()
  var_rating_movie = ddf['rating_movie_var'].mean()
  ddf['rating_user_var'] = ddf['rating_user_var'].fillna(var_rating_user)
  ddf['rating_movie_var'] = ddf['rating_movie_var'].fillna(var_rating_movie)
  ddf = ddf.join(std_user_rating.reset_index('userId'), on='userId',lsuffix='', rsuffix='_user_std', how='left')
  ddf = ddf.join(std_movie_rating.reset_index('movieId'), on='movieId', lsuffix='', rsuffix='_movie_std', how='left')
  std_rating_user = ddf['rating_user_std'].mean()
  std_rating_movie = ddf['rating_movie_std'].mean()
  ddf['rating_user_std'] = ddf['rating_user_std'].fillna(std_rating_user)
  ddf['rating_movie_std'] = ddf['rating_movie_std'].fillna(std_rating_movie)
  ddf.drop(columns=['userId', 'movieId', 'rating', 'userId_user_mean','movieId_movie_mean', 'userId_user_median', 'movieId_movie_median',	'userId_user_var', 'movieId_movie_var', 'userId_user_std', 'movieId_movie_std'], inplace=True)
  X = ddf
  # scaling
  X['rating_user_mean'] = (X['rating_user_mean']-X['rating_user_mean'].mean())/(X['rating_user_mean'].max() - X['rating_user_mean'].min())
  X['rating_movie_mean'] = (X['rating_movie_mean'] - X['rating_movie_mean'].mean())/(X['rating_movie_mean'].max() - X['rating_movie_mean'].min())
  X['rating_user_median'] = (X['rating_user_median'] - X['rating_user_median'].mean())/(X['rating_user_median'].max() - X['rating_user_median'].min())
  X['rating_movie_median'] = (X['rating_movie_median'] - X['rating_movie_median'].mean())/(X['rating_movie_median'].max() - X['rating_movie_median'].min())
  X['rating_user_var'] = (X['rating_user_var'] - X['rating_user_var'].mean())/(X['rating_user_var'].max() - X['rating_user_var'].min())
  X['rating_movie_var'] = (X['rating_movie_var'] - X['rating_movie_var'].mean())/(X['rating_movie_var'].max() - X['rating_movie_var'].min())
  X1 = sparse.csr_matrix(X)
  X = scipy.sparse.hstack([X_transform, X1])
  return X_transform, y

In [7]:
def train_model(X: scipy.sparse._csr.csr_matrix, y: np.ndarray) -> RegressorMixin:
  # fit model
  lr = LinearRegression()
  lr.fit(X_train, y_train)
  return lr

In [8]:
def predict_model(model: RegressorMixin,
                  X_test: scipy.sparse._csr.csr_matrix ) -> np.ndarray:
  # make predictions
  y_pred = model.predict(X_test)
  return y_pred

In [9]:
def evaluate_model(y_test, y_pred) -> None:
  # model evaluation with rmse
  rmse = np.sqrt(mean_squared_error(y_test, y_pred))
  print("RMSE: ", rmse)


#2. Построить рекомендации (регрессия, предсказываем оценку) на фичах TF-IDF - на тегах и жанрах; на фичах - средние оценки (+ median, variance и т. д.) пользователя и фильма

In [15]:
df = make_dataset()

--2023-07-13 17:20:03--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip.1’


2023-07-13 17:20:04 (4.56 MB/s) - ‘ml-latest-small.zip.1’ saved [978202/978202]

Archive:  ml-latest-small.zip
replace ml-latest-small/links.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace ml-latest-small/tags.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace ml-latest-small/ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace ml-latest-small/README.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace ml-latest-small/movies.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [50]:
df = clean(df)

In [None]:
X_transform, y = build_features(df)


In [84]:
X_train, X_test, y_train, y_test =\
  train_test_split(X_transform, y, test_size=0.2, random_state=42)
model = train_model(X_train, y_train)
y_pred = predict_model(model, X_test)
evaluate_model(y_test, y_pred)

RMSE:  0.9998417366050728
