<a href="https://colab.research.google.com/github/styxx216/ML/blob/main/rec_sis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Выполнение задания

**1) Выберите метрику и обоснуйте этот выбор**

Средняя абсолютная ошибка (MAE) — это статистическая метрика, измеряющая отклонение предсказанного рейтинга от фактической оценки, которую пользователь поставил фильму. Чем ниже значение MAE, тем точнее механизм рекомендаций. Эта метрика показывает, насколько точны наши прогнозы пользовательских оценок и, как следствие, наши рекомендации.

**2) Способ разбиения на тестовую и обучающую выборку**

Можно использовать случайное разбиение. Используем соотношение: 80% - обучающая выборка и 20%  - тестовая выборка.

**3) Сходимость обучения и настройка важных гиперпараметров модели.**

Предлагается проанализировать сходимость обучения в зависимости
от числа латентных факторов в разложении матрицы, размера батча и числа эпох.

**4) Выбрать лучшую модель**

Лучшая модель выбирается по результатам работы моделей на тестовой выборке.


# Задание

In [44]:
!pip install lightfm



In [45]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
from lightfm.datasets import fetch_movielens
from lightfm import LightFM

#model selection
import keras
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,roc_auc_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense , merge



from keras.layers.merge import dot
from keras.models import Model


# specifically for deeplearning.
from keras.layers import Dropout, Flatten,Activation,Input,Embedding
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
import tensorflow as tf
import random as rn
from IPython.display import SVG

from tensorflow.keras.optimizers import Adam

In [46]:
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all
# up front. Memory growth is requested for every visible GPU; when no GPU is
# present the loop simply does nothing.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
  try:
    tf.config.experimental.set_memory_growth(gpu_device, True)
  except (ValueError, RuntimeError) as err:
    # Invalid device or cannot modify virtual devices once initialized.
    # Best-effort: report the problem and continue instead of hiding it.
    print(f'Could not enable memory growth for {gpu_device}: {err}')

In [47]:
# Locations of the MovieLens CSV files; all of them live in one directory,
# so each path is built from the shared prefix.
_MOVIELENS_DIR = '/content/drive/MyDrive/datasets/MovieLens'

PATH_GENOME_SCORES = f'{_MOVIELENS_DIR}/genome_scores.csv'
PATH_GENOME_TAGS = f'{_MOVIELENS_DIR}/genome_tags.csv'
PATH_LINK = f'{_MOVIELENS_DIR}/link.csv'
PATH_MOVIE = f'{_MOVIELENS_DIR}/movie.csv'
PATH_RATING = f'{_MOVIELENS_DIR}/rating.csv'
PATH_TAG = f'{_MOVIELENS_DIR}/tag.csv'

In [48]:
# Fraction of the ratings reserved for the test set.
# Deliberately NOT named `train_test_split`: that name is the scikit-learn
# function imported at the top of the notebook, and rebinding it to a float
# would make the function unusable for the rest of the session.
TEST_FRACTION = 0.2

In [49]:
# Read the four MovieLens tables used by this notebook into DataFrames.
# The files are read one after another in the order listed below.
genome_scores, tags_df, genome_tags, ratings = (
    pd.read_csv(csv_path)
    for csv_path in (
        PATH_GENOME_SCORES,
        PATH_TAG,
        PATH_GENOME_TAGS,
        PATH_RATING,
    )
)

In [50]:
# Display the loaded ratings table as the cell output.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


# SVD-like алгоритм


In [51]:
# Replace the raw user and movie identifiers with their integer category
# codes, one column at a time.
for id_column in ('userId', 'movieId'):
    ratings[id_column] = ratings[id_column].astype('category').cat.codes.values

In [52]:
# Re-index users and movies by order of first appearance, then split the
# ratings into train / test subsets at random.
users = ratings.userId.unique()
movies = ratings.movieId.unique()

# id -> position lookup tables (position = order of first appearance).
userid2idx = {o: i for i, o in enumerate(users)}
movieid2idx = {o: i for i, o in enumerate(movies)}

# Dict-based `.map` does the same lookup as the former row-wise
# `.apply(lambda ...)`, without calling a Python lambda for every rating.
ratings['userId'] = ratings['userId'].map(userid2idx)
ratings['movieId'] = ratings['movieId'].map(movieid2idx)

# Seeded generator so that the split (and every number derived from it) is
# reproducible after "Restart & Run All".
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8  # share of ratings used for training; the rest is test
split_rng = np.random.default_rng(SPLIT_SEED)
split = split_rng.random(len(ratings)) < TRAIN_FRACTION
train = ratings[split]
test = ratings[~split]
print(train.shape , test.shape)

(16000481, 4) (3999782, 4)


In [53]:
# Sizes of the two embedding tables and the number of latent factors.
# `index` holds the distinct user ids and `columns` the distinct movie ids,
# so the user count comes from `index` and the movie count from `columns`
# (the two counts used to be assigned the other way round).
index = sorted(ratings['userId'].unique())
columns = sorted(ratings['movieId'].unique())
n_users = len(index)
n_movies = len(columns)
k = 64  # number of latent factors per user / movie

# Every id is used as a row number of an embedding table, so it has to be
# strictly smaller than the corresponding table size.
assert ratings['userId'].max() < n_users
assert ratings['movieId'].max() < n_movies

In [54]:
# User branch of the model: one integer user id per sample goes through an
# embedding table with `n_users` rows and `k` columns, and the result is
# flattened into the user vector consumed by the dot product.
user_input = Input(name='user_input', shape=(1,), dtype='int64')
user_embedding = Embedding(
    input_dim=n_users,
    output_dim=k,
    name='user_embedding',
)(user_input)
user_vec = Flatten(name='FlattenUsers')(user_embedding)

In [55]:
# Movie branch of the model: one integer movie id per sample goes through an
# embedding table with `n_movies` rows and `k` columns, and the result is
# flattened into the movie vector consumed by the dot product.
movie_input = Input(name='movie_input', shape=(1,), dtype='int64')
movie_embedding = Embedding(
    input_dim=n_movies,
    output_dim=k,
    name='movie_embedding',
)(movie_input)
movie_vec = Flatten(name='FlattenMovies')(movie_embedding)

In [56]:
# The model output is the dot product of the user vector and the movie
# vector; the two id inputs are the model inputs.
sim = dot(
    [user_vec, movie_vec],
    axes=1,
    name='Simalarity-Dot-Product',
)
model = keras.models.Model(
    inputs=[user_input, movie_input],
    outputs=sim,
)

In [57]:
# Train with Adam on the MSE loss.
# * `learning_rate` replaces the deprecated `lr` argument, which made Keras
#   emit a warning on every run.
# * MAE is tracked as a metric because it is the quality measure chosen for
#   this assignment; it is reported per epoch for both train and validation.
model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss='mse',
    metrics=['mae'],
)

  super(Adam, self).__init__(name, **kwargs)


In [58]:
# Training hyperparameters for the fit call.
# NOTE(review): 62 is an unusual batch size — confirm it is intentional.
batch_size, epochs = 62, 10

# Size of the training subset, for reference.
print(train.shape)

(16000481, 4)


In [None]:
# Fit the model on the training ratings; the held-out ratings are passed as
# validation data so their loss is reported after every epoch.
History = model.fit(
    x=[train.userId, train.movieId],
    y=train.rating,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([test.userId, test.movieId], test.rating),
    verbose=1,
)

Epoch 1/10
Epoch 2/10

# LightFM


In [None]:
# Load the MovieLens data through LightFM's dataset helper.
# `fetch_movielens` is imported directly at the top of the notebook; the bare
# name `lightfm` is never imported, so `lightfm.datasets.fetch_movielens`
# would fail with a NameError.
data = fetch_movielens(min_rating=4.0)

# Print the training and testing interaction matrices.
print(repr(data['train']))
print(repr(data['test']))

# Train a LightFM model with the WARP loss.
# NOTE(review): this rebinds `model`, which previously referred to the Keras
# model built above — confirm that nothing later needs the Keras model.
model = LightFM(loss='warp')
model.fit(data['train'], epochs=10, num_threads=2)