In [6]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error

#from tensorflow.keras.layers import Input, Embedding, Dense, Reshape, Concatenate, Dropout, Activation, Lambda
#from keras.models import Model
#from keras.regularizers import l2

In [8]:
ratings = pd.read_csv('datasets/movie-ratings/ratings.csv', index_col=0)
movies = pd.read_csv('datasets/movie-ratings/movies.csv', index_col=0)

ratings_and_genres = ratings.join(movies, on='movie_id', how='left', lsuffix='_ratings', rsuffix='_movies')
ratings_and_genres = ratings_and_genres[['user_id', 'movie_id_ratings', 'rating', 'timestamp', 'year']]
ratings_and_genres.rename(columns={'movie_id_ratings': 'movie_id'}, inplace=True)

ratings_and_genres['date_time'] = pd.to_datetime(ratings_and_genres.timestamp, unit='s')
ratings_and_genres['rating_year'] = pd.DatetimeIndex(ratings_and_genres['date_time']).year

In [25]:
movie_genres = pd.read_csv('datasets/movie-ratings/te/movies.csv',sep=',')
movie_genres

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [26]:
movie_genres = movie_genres[['movieId', 'genres']]
movie['genres'] = movie_genres['genres'].replace(',','-', inplace=True)

In [6]:
movie_genres = pd.read_csv('../datasets/movie-ratings/movie_genres.csv', index_col=0)

movies_with_genre_ids = movie_genres.groupby('movie_id').aggregate(lambda x: tuple(x))
movies_with_genre_ids.reset_index()
movies_with_genre_ids.rename(columns={'genre_id': 'genre_ids'})

movies_ratings_genres = ratings_and_genres.join(movies_with_genre_ids, on='movie_id', 
                                                how='left', rsuffix='_genres')

movies_ratings_genres['genre_id'].fillna('', inplace=True) # empty tuple not supported, using a string instead

In [7]:
movies_ratings_genres['user'] = LabelEncoder().fit_transform(movies_ratings_genres['user_id'].values)
movies_ratings_genres['movie'] = LabelEncoder().fit_transform(movies_ratings_genres['movie_id'].values)

train_data = movies_ratings_genres[movies_ratings_genres.rating_year < 2018].copy(deep=True)
train_ratings = movies_ratings_genres[movies_ratings_genres.rating_year < 2018].rating

test_data = movies_ratings_genres[movies_ratings_genres.rating_year > 2017].copy(deep=True)
test_ratings = movies_ratings_genres[movies_ratings_genres.rating_year > 2017].rating

min_rating = np.min(train_ratings)
max_rating = np.max(train_ratings)

num_users = movies_ratings_genres['user_id'].nunique()
num_movies = movies_ratings_genres['movie_id'].nunique()
num_genres = movie_genres['genre_id'].nunique()

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

k_hot_encoder = MultiLabelBinarizer().fit(movies_ratings_genres['genre_id'])

train_genres = k_hot_encoder.transform(train_data['genre_id'])
test_genres = k_hot_encoder.transform(test_data['genre_id'])

train_year_values = train_data['year'].values.reshape(-1, 1)
train_rating_year_values = train_data['rating_year'].values.reshape(-1, 1)

test_year_values = test_data['year'].values.reshape(-1, 1)
test_rating_year_values = test_data['rating_year'].values.reshape(-1, 1)

year_imputer = SimpleImputer(strategy='mean').fit(train_year_values)
rating_year_imputer = SimpleImputer(strategy='mean').fit(train_rating_year_values)

year_normalizer = StandardScaler().fit(train_year_values)
rating_year_normalizer = StandardScaler().fit(train_rating_year_values)

train_data['normalised_year'] = year_normalizer.transform(year_imputer.transform(train_year_values))
train_data['normalised_rating_year'] = rating_year_normalizer.transform(
    rating_year_imputer.transform(train_rating_year_values))

test_data['normalised_year'] = year_normalizer.transform(year_imputer.transform(test_year_values))
test_data['normalised_rating_year'] = rating_year_normalizer.transform(
    rating_year_imputer.transform(test_rating_year_values))

X_train = [train_data['user'], train_data['movie'], train_genres, 
           train_data[['normalised_year', 'normalised_rating_year']]]
X_test = [test_data['user'], test_data['movie'], test_genres,
         test_data[['normalised_year', 'normalised_rating_year']]]

In [9]:
n_factors = 20

user_input = Input(shape=(1,))
user_embedding = Embedding(num_users, n_factors,
                           embeddings_initializer='he_normal',
                           embeddings_regularizer=l2(1e-6))(user_input)
user_embedding = Reshape((n_factors,))(user_embedding)

item_input = Input(shape=(1,))
item_embedding = Embedding(num_movies, n_factors,
                           embeddings_initializer='he_normal',
                           embeddings_regularizer=l2(1e-6))(item_input)
item_embedding = Reshape((n_factors,))(item_embedding)

genre_input = Input(shape=(num_genres,))

genres = Dense(10, kernel_initializer='he_normal')(genre_input)
genres = Activation('relu')(genres)
genres = Dropout(0.1)(genres)

years_input = Input(shape=(2,))

years = Dense(1, kernel_initializer='he_normal')(years_input)
years = Activation('relu')(years)
years = Dropout(0.1)(years)

network = Concatenate()([user_embedding, item_embedding, genres, years])
network = Dropout(0.05)(network)

network = Dense(10, kernel_initializer='he_normal')(network)
network = Activation('relu')(network)
network = Dropout(0.5)(network)

network = Dense(1, kernel_initializer='he_normal')(network)
network = Activation('sigmoid')(network)
network = Lambda(lambda x: x * (max_rating - min_rating) + min_rating)(network)

model = Model([user_input, item_input, genre_input, years_input], network)
model.compile('adam', 'mean_squared_error')


model.fit(X_train, train_ratings, epochs=25, verbose=1)

Epoch 1/25
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: invalid syntax (tmpu9f3qahb.py, line 48)
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: invalid syntax (tmpu9f3qahb.py, line 48)
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f6970582220>

In [10]:
from sklearn.metrics import mean_squared_error

mean_squared_error(train_ratings, model.predict(X_train))

0.5682278496360015

In [11]:
mean_squared_error(test_ratings, model.predict(X_test))

1.0908551537754905