<a href="https://colab.research.google.com/github/solobala/RMSL_9/blob/main/RMSL9_DZ1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Домашнее задание по теме «Рекомендации на основе содержания»
1. Использовать датасет MovieLens.
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:

  *   TF-IDF на тегах и жанрах;
  *   средние оценки (+ median, variance и т. д.) пользователя и фильма.


3. Оценить RMSE на тестовой выборке.

# 0. Загрузка данных и импорт библиотек

In [None]:
import numpy as np
import pandas as pd
import scipy
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.base import RegressorMixin
from sklearn.metrics import mean_squared_error

#1. Пользовательские функции

In [None]:
def change_string(s: str) -> str:
    # Убираем лищние пробелы и тире и разбиваем строку на отдельные слова, затем сохраняем в строку через пробел
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [None]:
def make_dataset() -> pd.DataFrame:
  # load dataset
  !wget  "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"   # Качаем архив выбранного датасета
  # unzip data from zip
  !unzip ml-latest-small.zip
  # read tables
  tags = pd.read_csv('/content/ml-latest-small/tags.csv')
  movies = pd.read_csv('/content/ml-latest-small/movies.csv')
  ratings = pd.read_csv('/content/ml-latest-small/ratings.csv')
  ratings.drop(columns=['timestamp'], inplace=True)
  tags.drop(columns=['timestamp'], inplace=True)
  # join tables
  df = ratings.join(movies.set_index('movieId'), on='movieId', how='left')
  # drop some columns
  df = pd.merge(df, tags, left_on=['userId','movieId'], right_on = ['userId','movieId'], how='left')
  return df

In [None]:
def clean(df: pd.DataFrame) -> pd.DataFrame:
  # fill nan
  df['tag'] = df['tag'].fillna('no_tag')
  return df

In [None]:
def build_features(df: pd.DataFrame) -> tuple():
  # feature engineering признаков, связанных с item

  # Убираем лищние пробелы и тире, разбиваем строку жанров на отдельные слова и снова сохраняем в строку через пробел
  df['genres'] = df['genres'].apply(lambda x: change_string(x))

  # Убираем лищние пробелы и тире, разбиваем строку тэгов на отдельные слова и снова сохраняем в строку через пробел
  df['tag'] = df['tag'].apply(lambda x: change_string(x))

  # рейтинги преобразуем в numpy
  y = df['rating'].to_numpy()

  # ВЫбираем признаки - жанр и тэг
  X = df[['genres', 'tag']]

  # трансформируем в векторы, конкатенируем
  count_vect = CountVectorizer()
  X_counts_1 = count_vect.fit_transform(X['genres'])
  X_counts_2 = count_vect.fit_transform(X['tag'])

  tfidf_transformer = TfidfTransformer()
  X_tfidf_1 = tfidf_transformer.fit_transform(X_counts_1)
  X_tfidf_2 = tfidf_transformer.fit_transform(X_counts_2)

  X_transform = scipy.sparse.hstack([X_tfidf_1, X_tfidf_2 ])

  return X_transform, y

In [None]:
def b_f(ddf: pd.DataFrame)-> tuple():
  # feature engineering Признаков, связанных с user

  mean_user_rating = ddf.groupby(by='userId')[['rating']].mean()
  mean_movie_rating = ddf.groupby(by='movieId')[['rating']].mean()
  ddf = ddf.join(mean_user_rating.reset_index('userId'), on='userId',lsuffix='', rsuffix='_user', how='left')
  ddf = ddf.join(mean_movie_rating.reset_index('movieId'), on='movieId', lsuffix='', rsuffix='_movie', how='left')

  # Среднее значение рейтинга для юзера
  rating_user = ddf['rating_user'].mean()
  # Среднее значение рейтинга для итема
  rating_movie = ddf['rating_movie'].mean()

  # заполняем пропуски
  ddf['rating_user'] = ddf['rating_user'].fillna(rating_user)
  ddf['rating_movie'] = ddf['rating_movie'].fillna(rating_movie)
  y = ddf['rating']
  X = ddf.drop(columns=['userId', 'movieId', 'rating', 'userId_user', 'movieId_movie'])
  # scaling
  X['rating_user'] = (X['rating_user']-X['rating_user'].mean())/(X['rating_user'].max() - X['rating_user'].min())
  X['rating_movie'] = (X['rating_movie'] - X['rating_movie'].mean())/(X['rating_movie'].max() - X['rating_movie'].min())
  return X,y

In [None]:
def train_model(X: scipy.sparse._csr.csr_matrix, y: np.ndarray) -> RegressorMixin:
  # fit model - линейная регрессия
  lr = LinearRegression()
  lr.fit(X_train, y_train)
  return lr

In [None]:
def predict_model(model: RegressorMixin,
                  X_test: scipy.sparse._csr.csr_matrix ) -> np.ndarray:
  # make predictions
  y_pred = model.predict(X_test)
  return y_pred

In [None]:
def evaluate_model(y_test, y_pred) -> None:
  # model evaluation with rmse
  rmse = np.sqrt(mean_squared_error(y_test, y_pred))
  print("RMSE: ", rmse)


#2. Построить рекомендации (регрессия, предсказываем оценку) на фичах TF-IDF - на тегах и жанрах;

In [None]:
df = make_dataset()

--2023-06-25 22:04:25--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2023-06-25 22:04:26 (1.53 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [None]:
df = clean(df)
X_transform, y = build_features(df)
X_train, X_test, y_train, y_test =\
  train_test_split(X_transform, y, test_size=0.2, random_state=42)
model = train_model(X_train, y_train)
y_pred = predict_model(model, X_test)
evaluate_model(y_test, y_pred)

RMSE:  1.0182096860235028


# 3. Построить рекомендации (регрессия, предсказываем оценку) на фичах - средние оценки (+ median, variance и т. д.) пользователя и фильма

In [None]:
ddf = df[['userId', 'movieId', 'rating']]
X, y = b_f(ddf)
X_train, X_test, y_train, y_test =\
  train_test_split(X, y, test_size=0.2, random_state=42)
model = train_model(X_train, y_train)
y_pred = predict_model(model, X_test)
evaluate_model(y_test, y_pred)

RMSE:  1.0428823839999444
