<a href="https://colab.research.google.com/github/solobala/RMSL_9/blob/main/RMSL9_DZ3_latest_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Задание к теме «Гибридные рекомендательные системы»

Преподаватель: Наталья Баданина, Юлия Пономарева, Егор Шишковец
Что нужно делать?

Датасет ml-latest.
Вспомнить подходы, которые мы разбирали.
Выбрать понравившийся подход к гибридным системам.
Написать свою.
Материалы вы найдёте здесь.

Планирую: построить гибридную рекомендательную систему на основе:
1. CBRS (item features, user features) - регрессионная модель прогнозирования рейтинга для конкретного пользователя

2. Collaborative user-based filtering с использованием Surprise( отобрать n последних фильмов из числа тех, которым пользователь поставил высокие оценки. Получить их вектора. С помощью KNN отобрать из непросмотренных m ближайших соседей.

3. Полученную выборку передать в модель CBRS  и из нее уже отобрать все фильмы с рейтингом выше установленного порога)

4. Результат передать на уточнение в SVD (получить список рекомендаций из k фильмов)


# 0. Загрузка данных и импорт библиотек

In [3]:
import numpy as np
import pandas as pd
import scipy
from scipy import sparse
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from tqdm import tqdm
import pickle
from sklearn.neighbors import NearestNeighbors
from sklearn.base import RegressorMixin
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings("ignore")

#1. Пользовательские функции

In [4]:
!unzip ml-latest-small.zip

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [5]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|')) # Убираем лищние пробелы и тире и разбиваем строку на отдельные слова


In [6]:
def make_dataset() -> pd.DataFrame:
  # load dataset
  !wget  "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"   # Качаем архив выбранного датасета
  # unzip data from zip
  # !unzip ml-latest-small.zip
  # read tables
  tags = pd.read_csv('/content/ml-latest-small/tags.csv')
  movies = pd.read_csv('/content/ml-latest-small/movies.csv')
  ratings = pd.read_csv('/content/ml-latest-small/ratings.csv')
  # ratings.drop(columns=['timestamp'], inplace=True)
  tags.drop(columns=['timestamp'], inplace=True)
  # join tables
  df = movies.join(ratings.set_index('movieId'), on='movieId', how='right')
  # drop some columns
  df1 = pd.merge(df, tags, left_on=['userId','movieId'], right_on = ['userId','movieId'], how='left')
  return df1

In [7]:
def clean(df: pd.DataFrame) -> pd.DataFrame:
  # fill nan
  df['tag'] = df['tag'].fillna('no_tag')
  return df


In [8]:
def b_f_by_user(df: pd.DataFrame, userId: int, is_train: bool) -> tuple():
  tfidf_transformer2 = TfidfVectorizer()
  df = df[df['userId']==userId]
  df['genres'] = df['genres'].apply(lambda x: change_string(x))
  if df[(df['tag']!='no_tag')].shape[0] != 0:
    df['tag'] = df['tag'].apply(lambda x: change_string(x))
  ddf = df[['movieId', 'rating','timestamp']]
  y = df['rating'].to_numpy()

  if is_train:
    tfidf_transformer1 = TfidfVectorizer()
    tfidf_transformer2 = TfidfVectorizer()
    X_tfidf_1 = tfidf_transformer1.fit_transform(df['genres'])
    X_tfidf_2 = tfidf_transformer2.fit_transform(df['tag'])
    with open('trr1.pkl', 'wb') as f:
        pickle.dump(tfidf_transformer1, f)
    with open('trr2.pkl', 'wb') as f:
        pickle.dump(tfidf_transformer2, f)
  else:
     with open('trr1.pkl', 'rb') as f:
        tfidf_transformer1 = pickle.load(f)
     with open('trr2.pkl', 'rb') as f:
        tfidf_transformer2 = pickle.load(f)
     X_tfidf_1 = tfidf_transformer1.transform(df['genres'])
     X_tfidf_2 = tfidf_transformer2.transform(df['tag'])


  tfidf_transformer1 = TfidfVectorizer()
  X_tfidf_1 = tfidf_transformer1.fit_transform(df['genres'])
  # Проверяем, что у пользователя есть тэги
  X_transform = scipy.sparse.hstack([X_tfidf_1, X_tfidf_2 ])
  X1 = sparse.csr_matrix(X_transform)

  return ddf, X1, y, tfidf_transformer1, tfidf_transformer2

In [9]:
def build_features(df: pd.DataFrame, is_train: bool) -> tuple():
  # feature engineering
  df['genres'] = df['genres'].apply(lambda x: change_string(x))
  df['tag'] = df['tag'].apply(lambda x: change_string(x))
  y = df['rating'].to_numpy()
  if is_train:
    tfidf_transformer1 = TfidfVectorizer()
    tfidf_transformer2 = TfidfVectorizer()
    X_tfidf_1 = tfidf_transformer1.fit_transform(df['genres'])
    X_tfidf_2 = tfidf_transformer2.fit_transform(df['tag'])
    with open('tr1.pkl', 'wb') as f:
            pickle.dump(tfidf_transformer1, f)
    with open('tr2.pkl', 'wb') as f:
            pickle.dump(tfidf_transformer2, f)
  else:
     with open('tr1.pkl', 'rb') as f:
        tfidf_transformer1 = pickle.load(f)
     with open('tr2.pkl', 'rb') as f:
        tfidf_transformer2 = pickle.load(f)
     X_tfidf_1 = tfidf_transformer1.transform(df['genres'])
     X_tfidf_2 = tfidf_transformer2.transform(df['tag'])
  X_transform = scipy.sparse.hstack([X_tfidf_1, X_tfidf_2 ])
  # feature engineering
  ddf = df[['userId', 'movieId', 'rating']]
  mean_user_rating = ddf.groupby(by='userId')[['rating']].mean()
  mean_movie_rating = ddf.groupby(by='movieId')[['rating']].mean()
  median_user_rating = ddf.groupby(by='userId')[['rating']].median()
  median_movie_rating = ddf.groupby(by='movieId')[['rating']].median()
  var_user_rating = ddf.groupby(by='userId')[['rating']].var()
  var_movie_rating = ddf.groupby(by='movieId')[['rating']].var()
  std_user_rating = ddf.groupby(by='userId')[['rating']].std()
  std_movie_rating = ddf.groupby(by='movieId')[['rating']].std()
  ddf = ddf.join(mean_user_rating.reset_index('userId'), on='userId',lsuffix='', rsuffix='_user_mean', how='left')
  ddf = ddf.join(mean_movie_rating.reset_index('movieId'), on='movieId', lsuffix='', rsuffix='_movie_mean', how='left')
  mean_rating_user = ddf['rating_user_mean'].mean()
  mean_rating_movie = ddf['rating_movie_mean'].mean()
  ddf['rating_user_mean'] = ddf['rating_user_mean'].fillna(mean_rating_user)
  ddf['rating_movie_mean'] = ddf['rating_movie_mean'].fillna(mean_rating_movie)
  ddf = ddf.join(median_user_rating.reset_index('userId'), on='userId',lsuffix='', rsuffix='_user_median', how='left')
  ddf = ddf.join(median_movie_rating.reset_index('movieId'), on='movieId', lsuffix='', rsuffix='_movie_median', how='left')
  median_rating_user = ddf['rating_user_median'].mean()
  median_rating_movie = ddf['rating_movie_median'].mean()
  ddf['rating_user_median'] = ddf['rating_user_median'].fillna(median_rating_user)
  ddf['rating_movie_median'] = ddf['rating_movie_median'].fillna(median_rating_movie)
  ddf = ddf.join(var_user_rating.reset_index('userId'), on='userId',lsuffix='', rsuffix='_user_var', how='left')
  ddf = ddf.join(var_movie_rating.reset_index('movieId'), on='movieId', lsuffix='', rsuffix='_movie_var', how='left')
  var_rating_user = ddf['rating_user_var'].mean()
  var_rating_movie = ddf['rating_movie_var'].mean()
  ddf['rating_user_var'] = ddf['rating_user_var'].fillna(var_rating_user)
  ddf['rating_movie_var'] = ddf['rating_movie_var'].fillna(var_rating_movie)
  ddf = ddf.join(std_user_rating.reset_index('userId'), on='userId',lsuffix='', rsuffix='_user_std', how='left')
  ddf = ddf.join(std_movie_rating.reset_index('movieId'), on='movieId', lsuffix='', rsuffix='_movie_std', how='left')
  std_rating_user = ddf['rating_user_std'].mean()
  std_rating_movie = ddf['rating_movie_std'].mean()
  ddf['rating_user_std'] = ddf['rating_user_std'].fillna(std_rating_user)
  ddf['rating_movie_std'] = ddf['rating_movie_std'].fillna(std_rating_movie)
  ddf.drop(columns=[ 'userId_user_mean','movieId_movie_mean', 'userId_user_median', 'movieId_movie_median',	'userId_user_var', 'movieId_movie_var', 'userId_user_std', 'movieId_movie_std'], inplace=True)
  X = ddf.copy(deep=True)
  X.drop(columns=['userId', 'movieId', 'rating'], inplace=True)
  # scaling
  X['rating_user_mean'] = (X['rating_user_mean']-X['rating_user_mean'].mean())/(X['rating_user_mean'].max() - X['rating_user_mean'].min())
  X['rating_movie_mean'] = (X['rating_movie_mean'] - X['rating_movie_mean'].mean())/(X['rating_movie_mean'].max() - X['rating_movie_mean'].min())
  X['rating_user_median'] = (X['rating_user_median'] - X['rating_user_median'].mean())/(X['rating_user_median'].max() - X['rating_user_median'].min())
  X['rating_movie_median'] = (X['rating_movie_median'] - X['rating_movie_median'].mean())/(X['rating_movie_median'].max() - X['rating_movie_median'].min())
  X['rating_user_var'] = (X['rating_user_var'] - X['rating_user_var'].mean())/(X['rating_user_var'].max() - X['rating_user_var'].min())
  X['rating_movie_var'] = (X['rating_movie_var'] - X['rating_movie_var'].mean())/(X['rating_movie_var'].max() - X['rating_movie_var'].min())
  X1 = sparse.csr_matrix(X)
  X = scipy.sparse.hstack([X_transform, X1])
  return ddf, X, y, tfidf_transformer1, tfidf_transformer2

In [18]:
def train_model(X_train: scipy.sparse._csr.csr_matrix, y_train: np.ndarray) -> RegressorMixin:
  # fit model
  lr = LinearRegression()
  lr.fit(X_train, y_train)
  return lr

In [10]:
def predict_model(model: RegressorMixin,
                  X_test: scipy.sparse._csr.csr_matrix ) -> np.ndarray:
  # make predictions
  y_pred = model.predict(X_test)
  return y_pred

In [11]:
def evaluate_model(y_test, y_pred) -> None:
  # model evaluation with rmse
  rmse = np.sqrt(mean_squared_error(y_test, y_pred))
  print("RMSE: ", rmse)
  return rmse

#2. Построить рекомендации (регрессия, предсказываем оценку) на фичах TF-IDF - на тегах и жанрах; на фичах - средние оценки (+ median, variance и т. д.) пользователя и фильма

In [12]:
movies = pd.read_csv('/content/ml-latest-small/movies.csv')
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [13]:
tags = pd.read_csv('/content/ml-latest-small/tags.csv')
tags.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [14]:
ratings = pd.read_csv('/content/ml-latest-small/ratings.csv')
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [None]:
all_users = ratings['userId'].unique()
all_users

In [19]:
def cbrs_all() -> tuple:
  # CBRS для среднего пользователя
  df = make_dataset()
  df = clean(df)
  df, X, y, _, _ = build_features(df,True)
  X_train, X_test, y_train, y_test =\
    train_test_split(X, y, test_size=0.2, random_state=42)
  model = train_model(X_train, y_train)
  y_pred = predict_model(model, X_test)
  whole_rmse = evaluate_model(y_test, y_pred)
  return df, model, whole_rmse

In [20]:
df, whole_model, whole_rmse = cbrs_all()

--2023-07-15 08:32:50--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip.2’


2023-07-15 08:32:50 (2.67 MB/s) - ‘ml-latest-small.zip.2’ saved [978202/978202]

RMSE:  1.0162363777340064


In [21]:
def cbrs_private(userId: int) -> tuple:
  # CBRS для конкретного пользователя
  df = make_dataset()
  df = clean(df)
  dff, X, y, _, _ = b_f_by_user(df, userId, True)
  X_train, X_test, y_train, y_test =\
    train_test_split(X, y, test_size=0.2, random_state=42)
  model = train_model(X_train, y_train)
  y_pred = predict_model(model, X_test)
  private_rmse = evaluate_model(y_test, y_pred)
  return dff, model, private_rmse

In [22]:
userId = 1
dff, private_model, private_rmse = cbrs_private(userId)

--2023-07-15 08:33:00--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip.3’


2023-07-15 08:33:01 (2.69 MB/s) - ‘ml-latest-small.zip.3’ saved [978202/978202]

RMSE:  0.7302236978647451


In [23]:
print(whole_rmse, private_rmse)

1.0162363777340064 0.7302236978647451


Если пользователь новый - используем модель для среднего пользователя.
Если пользователь старый:
Если RMSE для конкретного пользователя < RMSE для среднего пользователя - используем модель для конкретного пользователя, иначе используем модель для среднего пользователя

#3 Коллаборативная фильтрация item-to-item

In [24]:
# Строим словарь, ключи - названия фильмов, значения - векторы (рейтинги всех пользователей по фильму)
movies_with_ratings = movies.merge(ratings, on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)
num_users = movies_with_ratings.userId.unique().shape[0]
movie_vector = {} # Строим вектор для фильма длиной 610 по к-ву пользователей

for movie, group in tqdm(movies_with_ratings.groupby('title')):
    movie_vector[movie] = np.zeros(num_users) # заполняем нулями

    for i in range(len(group.userId.values)):  # по каждому из пользователей ставивших рейтинг фильму
        u = group.userId.values[i] # Это идентификатор пользователя
        r = group.rating.values[i] # это рейтинг, который он поставил
        movie_vector[movie][int(u - 1)] = r # u-1, т.к это нумерация столбцов вектора, которая начинаетися с нуля, а пользоватили нумеруются с 1

100%|██████████| 9719/9719 [00:02<00:00, 3907.49it/s]


* отобрать n последних фильмов из числа тех, которым пользователь поставил высокие оценки.
* Получить их вектора.
* С помощью KNN отобрать из непросмотренных фильмов m ближайших соседей.

In [25]:
df.head()

Unnamed: 0,userId,movieId,rating,rating_user_mean,rating_movie_mean,rating_user_median,rating_movie_median,rating_user_var,rating_movie_var,rating_user_std,rating_movie_std
0,1,1,4.0,4.128571,3.441964,4.0,3.5,0.696218,0.769124,0.834397,0.876997
1,1,3,4.0,4.128571,2.357143,4.0,3.0,0.696218,0.72619,0.834397,0.852168
2,1,6,4.0,4.128571,3.185185,4.0,3.0,0.696218,0.955625,0.834397,0.977561
3,1,47,5.0,4.128571,3.482759,4.0,3.5,0.696218,0.740764,0.834397,0.860676
4,1,50,5.0,4.128571,4.0,4.0,4.0,0.696218,0.889498,0.834397,0.863867


In [26]:
df=make_dataset()
df=clean(df)
df['genres'] = df['genres'].apply(lambda x: change_string(x))
df['tag'] = df['tag'].apply(lambda x: change_string(x))


--2023-07-15 08:33:28--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip.4’


2023-07-15 08:33:29 (2.69 MB/s) - ‘ml-latest-small.zip.4’ saved [978202/978202]



In [27]:
# К-во последних фильмов
n = 10
# выбираем n последних фильмов пользователя с рейтингом 5 и сохраняем их идентификаторы
last_movies = df[(df['userId']==userId) & (df['rating']==5) ].sort_values(by='timestamp', ascending=False)[:10]
last_ids = last_movies['movieId'].to_list()
last_titles = last_movies['title'].to_list()
# находим непросмотренные фильмы для выбранного пользователя

df1 = df[(df['userId']!=userId)]
df1.shape

(102445, 7)

In [28]:
last_titles  # Названия 10 последних фильмов, которые понравились пользователю

['Tombstone (1993)',
 'Canadian Bacon (1995)',
 'Messenger: The Story of Joan of Arc, The (1999)',
 'Pink Floyd: The Wall (1982)',
 'Good Morning, Vietnam (1987)',
 'Rob Roy (1995)',
 'Henry V (1989)',
 'M*A*S*H (a.k.a. MASH) (1970)',
 "Schindler's List (1993)",
 'Green Mile, The (1999)']

In [30]:
# Получим векторы для любимых фильмов пользователя
last_movie_vectors = []
for title in last_titles:
  last_movie_vectors.append(movie_vector[title])

In [31]:
# Для каждого из любимых фильмов найдем 50 ближайших соседей
from scipy.spatial.distance import cosine
all_titles = []
for title in last_titles:
  distances = []
  titles = []

  for key in tqdm(movie_vector.keys()):
      if key == title:
          continue

      titles.append(key)
      distances.append(cosine(movie_vector[title], movie_vector[key]))
  best_indexes = np.argsort(distances)[:50] # А это идентификаторы соответствующих фильмов
  best_movies = [(titles[i], distances[i]) for i in best_indexes]
  for m in best_movies:
      all_titles.append(m)
print(len(list(set(all_titles))))
for m in set(all_titles):
    print(m)

100%|██████████| 9719/9719 [00:00<00:00, 23647.41it/s]
100%|██████████| 9719/9719 [00:00<00:00, 14239.06it/s]
100%|██████████| 9719/9719 [00:00<00:00, 25232.02it/s]
100%|██████████| 9719/9719 [00:00<00:00, 14582.22it/s]
100%|██████████| 9719/9719 [00:00<00:00, 15981.67it/s]
100%|██████████| 9719/9719 [00:00<00:00, 20816.56it/s]
100%|██████████| 9719/9719 [00:00<00:00, 25372.24it/s]
100%|██████████| 9719/9719 [00:00<00:00, 22406.31it/s]
100%|██████████| 9719/9719 [00:00<00:00, 25055.76it/s]
100%|██████████| 9719/9719 [00:00<00:00, 25300.83it/s]


500
('Love and Death (1975)', 0.6161047391405763)
('Godfather, The (1972)', 0.4883623365958737)
('Action Jackson (1988)', 0.5384963686614612)
('Spaceballs (1987)', 0.6185390149482496)
('Johnny Dangerously (1984)', 0.6601265845531314)
('Dangerous Liaisons (1988)', 0.5999534540648228)
('Client, The (1994)', 0.6094699342464945)
('Sex and Lucia (Lucía y el sexo) (2001)', 0.5150436126781104)
('Moscow on the Hudson (1984)', 0.6171275766686588)
('I Am a Fugitive from a Chain Gang (1932)', 0.5895783440883031)
('Batman (1989)', 0.5363823716555465)
('Angel Eyes (2001)', 0.5903860730836266)
('Good Will Hunting (1997)', 0.5416942635349471)
('Kill Bill: Vol. 2 (2004)', 0.5685187839835413)
('Platoon (1986)', 0.6279799565388255)
('Clear and Present Danger (1994)', 0.5781548831933938)
('Fugitive, The (1993)', 0.5624122691887394)
('Legends of the Fall (1994)', 0.6100773774415201)
('Saving Private Ryan (1998)', 0.44746659158782554)
('Grumpy Old Men (1993)', 0.5320629202013392)
('Hoodlum (1997)', 0.49631

# 4.  Surprise и скрытые факторы: user_based

1.  В прошломы разделе выбирали фильмы на основании item-based features
2.  В этом попробуем использовать user-based подход и коллаборативную фильтрацию



In [32]:
%pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m768.0/772.0 kB[0m [31m30.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3097777 sha256=d309ab703052a244fecd682d6c1581a48993a1fcd2093ef0e3ab3c2315715c35
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise


In [33]:
from surprise import KNNWithMeans, KNNBasic, SVD
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from operator import itemgetter

In [34]:
def generate_recommendation(uid, model, dataset, thresh=4.8, amount=500):
    all_titles = list(dataset['iid'].values) # Все названия
    users_seen_titles = dataset[dataset['uid'] == uid]['iid']  # ТЕ. что наш пользователь видел
    titles = np.array(list(set(all_titles) - set(users_seen_titles))) # ТО. что он не видел

    np.random.shuffle(titles)

    rec_list = []
    for title in titles:
        review_prediction = model.predict(uid=uid, iid=title) # Прогноз полный
        rating = review_prediction.est # рейтинг из прогноза

        if rating >= thresh:
            rec_list.append((title, round(rating, 2)))
    return sorted(rec_list, key=itemgetter(1))[-500:]

In [35]:
movies_with_ratings = movies.merge(ratings, on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})
reader = Reader(rating_scale=(ratings.rating.min(), ratings.rating.max()))
data = Dataset.load_from_df(dataset, reader)

In [36]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
algo = KNNWithMeans(k=50, sim_options={
    'name': 'cosine',
    'user_based': True  # compute  similarities between users
})
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7c25f2f4fe20>

In [37]:
test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8959


0.8959492188641812

In [38]:
knn_list = generate_recommendation(1, algo, dataset, thresh=4.8)

Таким образом, есть список фильмов, полученный с помощью item-to-item коллаборативной фильтрации и список фильмов, полученный с помощью KNNWithMeans User-based. Эти списки не совпадают, но  частично пересекаются. Задача -выбрать из обоих самые походящие фильмы. Попробуем использовать объединенный список на вход для svd

In [39]:
full_list =[]
for el in all_titles:
  full_list.append(el[0])
for el in knn_list:
  full_list.append(el[0])

In [40]:
duplicates = set()
repeated_elements = set()
for item in full_list:
    if item in duplicates:
        repeated_elements.add(item)
    else:
        duplicates.add(item)

In [42]:
repeated_elements # Это элементы из обоих списков

{'African Queen, The (1951)',
 'Aladdin (1992)',
 'Alien (1979)',
 'American Beauty (1999)',
 'American History X (1998)',
 'American President, The (1995)',
 'Apocalypse Now (1979)',
 'Apollo 13 (1995)',
 'Auto Focus (2002)',
 'Back to the Future (1985)',
 'Beautiful Mind, A (2001)',
 'Best Men (1997)',
 'Better Off Dead... (1985)',
 'Big Lebowski, The (1998)',
 'Born on the Fourth of July (1989)',
 'Braveheart (1995)',
 'Casablanca (1942)',
 "City Slickers II: The Legend of Curly's Gold (1994)",
 'Clear and Present Danger (1994)',
 'Client, The (1994)',
 'Cliffhanger (1993)',
 'Clockwork Orange, A (1971)',
 'Crimson Tide (1995)',
 'Crocodile Dundee (1986)',
 'Dancer Upstairs, The (2002)',
 'Dances with Wolves (1990)',
 'Dark Blue World (Tmavomodrý svet) (2001)',
 "Dead Men Don't Wear Plaid (1982)",
 'Dead Poets Society (1989)',
 'Dirty Dozen, The (1967)',
 'E.T. the Extra-Terrestrial (1982)',
 'Erik the Viking (1989)',
 'Excalibur (1981)',
 'Fight Club (1999)',
 'Firm, The (1993)',
 

In [43]:
len(repeated_elements)

94

Из 500 фильмов, отобранных обоими алгоритмами, совпали 92. Теперь можно пропустить общий список через модель пользователя и отобрать фильмы с предполагаемо самыми высокими рейтингами

# 4.  Использование результатов item-to-item (3) и user-based коллаборативной фильтрации(4) в модели (2)


In [46]:
df=make_dataset()
df=clean(df)
df['genres'] = df['genres'].apply(lambda x: change_string(x))
df['tag'] = df['tag'].apply(lambda x: change_string(x))
with open('trr1.pkl', 'rb') as f:
  tfidf_transformer1 = pickle.load(f)
with open('trr2.pkl', 'rb') as f:
  tfidf_transformer2 = pickle.load(f)
recommendations = list()
for title in list(repeated_elements):
  dff = df[df['title']==title]

  x1 = tfidf_transformer1.transform(dff['genres'])
  x2 = tfidf_transformer2.transform(dff['tag'])
  X = scipy.sparse.hstack([x1, x2])
  y_pred = private_model.predict(X)
  if y_pred[0]>4.8:
    recommendations.append((title,y_pred[0]))
    print((title,y_pred[0]))


--2023-07-15 08:42:14--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip.2’


2023-07-15 08:42:14 (2.65 MB/s) - ‘ml-latest-small.zip.2’ saved [978202/978202]

('Apollo 13 (1995)', 4.887520306503109)
('Godfather, The (1972)', 4.811783278011128)
('Hope and Glory (1987)', 5.022502363425985)
('Shawshank Redemption, The (1994)', 4.811783278011128)
('Max (2002)', 5.022502363425985)
('American History X (1998)', 4.811783278011128)
('Shape of Things, The (2003)', 5.022502363425985)
('Hoodlum (1997)', 5.523272607141486)
('Regarding Henry (1991)', 5.022502363425985)
('Dead Poets Society (1989)', 5.022502363425985)
('Safety of Objects, The (2001)', 5.022502363425985)
('Five Easy Pieces (1970)', 5.0225023

In [45]:
len(recommendations)

36

Получили список из 36 фильмов, из которых надо оставить 5. Поищем скрытые факторы

In [49]:
indices=[]
for item in recommendations:
  indices.append(movies[movies['title']==item[0]].movieId.values[0])

In [50]:
df=make_dataset()
df=clean(df)

--2023-07-15 08:43:26--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip.4’


2023-07-15 08:43:26 (2.67 MB/s) - ‘ml-latest-small.zip.4’ saved [978202/978202]



In [51]:
data = df.loc[df.movieId.isin(indices), ['title', 'userId', 'rating']]
data = data[data['userId']!=1]

In [52]:
dataset = pd.DataFrame({
    'uid': data.userId,
    'iid': data.title,
    'rating': data.rating
})

In [62]:
dataset.rating.min(), dataset.rating.max()

(0.5, 5.0)

In [63]:
reader = Reader(rating_scale=(dataset.rating.min(), dataset.rating.max()))
data = Dataset.load_from_df(dataset, reader)

In [64]:
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [75]:
algo = SVD(n_factors=20, n_epochs=20)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7c25bc5a0a90>

In [76]:
test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.7221


0.7220670546448171

In [77]:
svd_list=[]
for item in recommendations:
  svd_list.append((item[0], algo.predict(uid=1.0, iid=item[0]).est))

In [78]:
for item in svd_list:
  if item[1]>4.2:
    print(item)

('Godfather, The (1972)', 4.250880287219493)
('Hope and Glory (1987)', 4.2006081302954215)
('Shawshank Redemption, The (1994)', 4.400411855505626)
('American History X (1998)', 4.2041794190255155)
('Five Easy Pieces (1970)', 4.283367998265415)
