## Deep learning hybrid recommendation system

In this approach, we will combine 1) the tf-idf representation of movies from content-based recommendation system, and 2) the user/movie embeddings from collaborative-filtering recommendation system, as a concatenated input to predict ratings.

In [1]:
# imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm

import tensorflow as tf
import keras

from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model

from scipy.sparse import vstack

Using TensorFlow backend.


In [2]:
# remove unnecessary TF logs
import logging
tf.get_logger().setLevel(logging.ERROR)

In [3]:
# check keras and TF version used
print('TF Version:', tf.__version__)
print('Keras Version:', keras.__version__)

TF Version: 1.15.0
Keras Version: 2.2.5


## Load datasets

In [58]:
movies_df = pd.read_csv('./datasets/ml-latest-small/movies.csv')
movies_df.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [59]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [60]:
ratings_df = pd.read_csv('./datasets/ml-latest-small/ratings.csv')
ratings_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [61]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [62]:
tags_df = pd.read_csv('./datasets/ml-latest-small/tags.csv')
tags_df.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


## Process data

In [63]:
# filter out rarely rated movies and rarely rating users
min_movie_ratings = 10
min_user_ratings = 10

filter_movies = (ratings_df['movieId'].value_counts() > min_movie_ratings)
filter_movies = filter_movies[filter_movies].index.tolist()

filter_users = (ratings_df['userId'].value_counts() > min_user_ratings)
filter_users = filter_users[filter_users].index.tolist()

# the get filtered data
mask = (ratings_df['movieId'].isin(filter_movies)) & (ratings_df['userId'].isin(filter_users))
ratings_df_filtered = ratings_df[mask]
del filter_movies, filter_users
ratings_df_filtered.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Create user/movie mappings

In [64]:
# copy ratings_df_filtered

df_filtered_cp = ratings_df_filtered.copy(deep=True)

In [65]:
# user, movie mappings
user_id_mapping = {id:i for i,id in enumerate(df_filtered_cp['userId'].unique())}
movie_id_mapping = {id:i for i,id in enumerate(df_filtered_cp['movieId'].unique())}

In [66]:
# map old ids to integer values
df_filtered_cp['userId'] = df_filtered_cp['userId'].map(user_id_mapping)
df_filtered_cp['movieId'] = df_filtered_cp['movieId'].map(movie_id_mapping)

## Movie metadata 

In [67]:
# remove the parenthes in movie titles
movies_df['title'] = movies_df.title.str.replace('([\(\)])', '')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story 1995,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji 1995,Adventure|Children|Fantasy
2,3,Grumpier Old Men 1995,Comedy|Romance
3,4,Waiting to Exhale 1995,Comedy|Drama|Romance
4,5,Father of the Bride Part II 1995,Comedy


In [68]:
# remove the pipe separator in genres
movies_df['genres'] = movies_df.genres.str.replace('(\|)', ' ')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story 1995,Adventure Animation Children Comedy Fantasy
1,2,Jumanji 1995,Adventure Children Fantasy
2,3,Grumpier Old Men 1995,Comedy Romance
3,4,Waiting to Exhale 1995,Comedy Drama Romance
4,5,Father of the Bride Part II 1995,Comedy


In [69]:
# merge all tags for each movie
movie_tags = tags_df[['movieId', 'tag']].groupby('movieId').agg(lambda x: set(x.tolist()))
movie_tags['tag'] = movie_tags['tag'].apply(lambda x: ' '.join(str(s) for s in x))
movie_tags.head()

Unnamed: 0_level_0,tag
movieId,Unnamed: 1_level_1
1,pixar fun
2,Robin Williams game magic board game fantasy
3,old moldy
5,remake pregnancy
7,remake


In [70]:
# join movies_df and movie_tags
movies_df = movies_df.set_index('movieId').join(movie_tags)
movies_df.head()

Unnamed: 0_level_0,title,genres,tag
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story 1995,Adventure Animation Children Comedy Fantasy,pixar fun
2,Jumanji 1995,Adventure Children Fantasy,Robin Williams game magic board game fantasy
3,Grumpier Old Men 1995,Comedy Romance,old moldy
4,Waiting to Exhale 1995,Comedy Drama Romance,
5,Father of the Bride Part II 1995,Comedy,remake pregnancy


In [71]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9742 entries, 1 to 193609
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   9742 non-null   object
 1   genres  9742 non-null   object
 2   tag     1572 non-null   object
dtypes: object(3)
memory usage: 624.4+ KB


In [72]:
# there are some nan values in tags, let's replace them with ' '
movies_df['tag'] = movies_df['tag'].fillna('')
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9742 entries, 1 to 193609
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   9742 non-null   object
 1   genres  9742 non-null   object
 2   tag     9742 non-null   object
dtypes: object(3)
memory usage: 624.4+ KB


In [73]:
movies_df['meta_info'] = movies_df['title'] + ' ' + movies_df['genres'] + ' ' + movies_df['tag']
movies_df.iloc[0, 3]

'Toy Story 1995 Adventure Animation Children Comedy Fantasy pixar fun'

In [74]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9742 entries, 1 to 193609
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      9742 non-null   object
 1   genres     9742 non-null   object
 2   tag        9742 non-null   object
 3   meta_info  9742 non-null   object
dtypes: object(4)
memory usage: 700.5+ KB


## Hybrid dataset

In [77]:
df_hybrid = (df_filtered_cp.set_index('movieId')
            .join(movies_df)
            .dropna()
            .drop(['title', 'genres', 'tag', 'meta_info'], axis=1)
            .reset_index().rename({'index': 'movieId'}, axis=1))
df_hybrid.head()

Unnamed: 0,movieId,userId,rating,timestamp
0,1,0,4.0,964981247
1,1,5,5.0,845554296
2,1,18,3.0,965707636
3,1,31,3.0,856736172
4,1,41,4.0,996221045


In [78]:
# set aside a small port of ratings_df for testing purpose
n = 10000

rng = np.random.default_rng(42)
permuted_indices = rng.permutation(df_hybrid.shape[0])

df_train = df_hybrid.iloc[permuted_indices[:-n],:]
df_test = df_hybrid.iloc[permuted_indices[-n:],:]
print(df_train.shape)
print(df_test.shape)

(50886, 4)
(10000, 4)


## Create TF-IDF vectors for train and test datasets

In [79]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_hybrid = tfidf.fit_transform(movies_df['meta_info'])

In [80]:
# map from movie_ids to indices in tfidf-matrix
movie_idx_mapping = {id:i for i,id in enumerate(movies_df.index)}

In [81]:
train_tfidf = []

for idx in tqdm(df_train['movieId'].values):
    index = movie_idx_mapping[idx]
    train_tfidf.append(tfidf_hybrid[index])
    
len(train_tfidf) 

100%|██████████| 50886/50886 [00:05<00:00, 8844.37it/s]


50886

In [82]:
# tfidf vectors for test data
test_tfidf = []

for idx in tqdm(df_test['movieId'].values):
    index = movie_idx_mapping[idx]
    test_tfidf.append(tfidf_hybrid[index])
    
len(test_tfidf) 

100%|██████████| 10000/10000 [00:01<00:00, 8549.85it/s]


10000

In [83]:
# stack the sparse matrices
train_tfidf = vstack(train_tfidf)
test_tfidf = vstack(test_tfidf)

train_tfidf.shape, test_tfidf.shape

((50886, 9949), (10000, 9949))

In [84]:
type(train_tfidf)

scipy.sparse.csr.csr_matrix

## Deep learning architecture

In [96]:
# parameters
user_embed_dim = 256
movie_embed_dim = 256
userid_input_shape = 1
movieid_input_shape = 1
tfidf_input_shape = tfidf_hybrid.shape[1]

In [97]:
# input layers
user_id_input = Input(shape=(userid_input_shape,), name='user')
movie_id_input = Input(shape=(movieid_input_shape,), name='movie')

# tfidf input layer
tfidf_input = Input(shape=(tfidf_input_shape,), name='tfidf', sparse=True)

In [98]:
# embedding layers
# user embedding
user_embedding = Embedding(output_dim=user_embed_dim,
                          input_dim=len(user_id_mapping),
                           input_length=userid_input_shape,
                           name='user_embedding')(user_id_input)

movie_embedding = Embedding(output_dim=movie_embed_dim,
                           input_dim=len(movie_id_mapping),
                           input_length=movieid_input_shape,
                           name='movie_embedding')(movie_id_input)

In [99]:
# further transform tfidf with Dense layers
tfidf_vectors = Dense(256, activation='relu')(tfidf_input)
tfidf_vectors = Dense(128, activation='relu')(tfidf_vectors)

In [100]:
# Reshape
user_vectors = Reshape([user_embed_dim])(user_embedding)
movie_vectors = Reshape([movie_embed_dim])(movie_embedding)

In [101]:
# concate all inputs
hybrid_layer = Concatenate()([user_vectors, movie_vectors, tfidf_vectors])

In [102]:
# add fully connected layers to predict ratings
dense = Dense(256, activation='relu')(hybrid_layer)
dense = Dropout(0.25)(dense)
ouput = Dense(1)(dense)


In [103]:
# create model
model = Model(inputs=[user_id_input, movie_id_input, tfidf_input], outputs=ouput)
model.compile(loss='mse', optimizer='adam')
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
movie (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
tfidf (InputLayer)              (None, 9949)         0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 256)       156160      user[0][0]                       
____________________________________________________________________________________________

## train and test

In [104]:
batch_size=500
epochs = 20
X = [df_train['userId'], df_train['movieId'], train_tfidf]
y = df_train['rating']
model.fit(X, y,
         batch_size=batch_size,
         epochs=epochs,
         validation_split=0.1,
         shuffle=True)

Train on 45797 samples, validate on 5089 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fe50adf0a10>

In [105]:
# test dataset
X_test = [df_test['userId'], df_test['movieId'], test_tfidf]
y_true = df_test['rating'].values

In [106]:
# test result
y_pred = model.predict(X_test).ravel()
y_pred = list(map(lambda x: 1.0 if x<1 else 5.0 if x>5.0 else x, y_pred))

# rmse
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print(f'RMSE on test set is: {rmse}')

RMSE on test set is: 0.8397750207132764


In [108]:
# show comparison on ratings
results = pd.DataFrame({
    'userId': df_test['userId'].values,
    'movieId': df_test['movieId'].values,
    'title':[movies_df['title'].iloc[i] for i in df_test['movieId']],
    'predicted_rating': np.round(y_pred, 1),
    'actual_rating': y_true
})

results.head(20)

Unnamed: 0,userId,movieId,title,predicted_rating,actual_rating
0,306,31,Twelve Monkeys a.k.a. 12 Monkeys 1995,3.2,3.0
1,572,1119,Absolute Power 1997,4.3,5.0
2,495,1957,SLC Punk! 1998,3.6,3.5
3,381,1037,Vampire in Venice Nosferatu a Venezia Nosferat...,3.9,4.0
4,317,907,"Clockwork Orange, A 1971",4.1,5.0
5,225,82,Vampire in Brooklyn 1995,4.2,4.5
6,379,605,Stalingrad 1993,4.8,3.0
7,379,880,Delicatessen 1991,4.7,3.0
8,224,293,Underneath 1995,4.1,4.0
9,352,19,Money Train 1995,3.9,4.0
