Code adapted from: https://heartbeat.fritz.ai/build-train-and-deploy-a-book-recommender-system-using-keras-tensorflow-js-b96944b936a7

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Ratings table; later cells read its user_id, movie_id and rating_score columns.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
ratings = pd.read_pickle("./Data/small_random_selection_longform.pkl")
# ratings = pd.read_pickle("./Data/moderate_users_longform.pkl")
# Per-user statistics; center_ratings() looks up 'mean_score' in it by user_id.
user_stats = pd.read_pickle("./Data/moderate_user_stats.pkl")

In [4]:
# Map raw user ids onto contiguous integer labels (0 .. n_users-1) so they can
# be fed to an embedding layer.
raw_user_ids = ratings['user_id'].values
user_enc = LabelEncoder().fit(raw_user_ids)
ratings['user'] = user_enc.transform(raw_user_ids)
n_users = ratings['user'].nunique()
n_users

37235

In [5]:
# Map raw movie ids onto contiguous integer labels (0 .. n_movies-1) so they
# can be fed to an embedding layer.
raw_movie_ids = ratings['movie_id'].values
item_enc = LabelEncoder().fit(raw_movie_ids)
ratings['movie'] = item_enc.transform(raw_movie_ids)
n_movies = ratings['movie'].nunique()
n_movies

29758

In [6]:
# Preview the first rows, which now include the encoded 'user' and 'movie' columns.
ratings.head()

Unnamed: 0,movie_id,rating_score,user_id,user,movie
0,176,2,90476760,33647,136
1,20183,5,21347583,7941,5700
2,1819,5,24443276,9109,1586
3,2388,5,64167123,24036,2061
5,112482,3,55009665,20693,22140


In [7]:
def center_ratings(ratings_df, users_df):
    """Add a ``centered_rating`` column: each rating minus its user's mean score.

    Parameters
    ----------
    ratings_df : pandas.DataFrame
        Must contain ``user_id`` and ``rating_score`` columns. It is modified
        in place (the ``centered_rating`` column is added or overwritten).
    users_df : pandas.DataFrame
        Indexed by user id, with a ``mean_score`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``ratings_df`` object that was passed in.

    Raises
    ------
    KeyError
        If a ``user_id`` in ``ratings_df`` is not present in ``users_df.index``.
    """
    user_ids = ratings_df['user_id']

    # Fail loudly on unknown users, as a per-row label lookup would, instead of
    # silently producing NaN targets.
    unknown = ~user_ids.isin(users_df.index)
    if unknown.any():
        raise KeyError(user_ids[unknown].iloc[0])

    # Vectorised lookup of every row's user mean. This yields a numeric column
    # (not object dtype) and does not depend on ratings_df having unique row labels.
    user_means = user_ids.map(users_df['mean_score'])
    ratings_df['centered_rating'] = ratings_df['rating_score'] - user_means

    return ratings_df

In [8]:
# Adds 'centered_rating' (rating_score minus the user's mean_score) to ratings.
# center_ratings() modifies the frame in place and returns that same frame.
ratings = center_ratings(ratings, user_stats)
ratings.head()

Unnamed: 0,movie_id,rating_score,user_id,user,movie,centered_rating
0,176,2,90476760,33647,136,-0.931507
1,20183,5,21347583,7941,5700,1.596154
2,1819,5,24443276,9109,1586,0.867647
3,2388,5,64167123,24036,2061,1.558824
5,112482,3,55009665,20693,22140,-0.219512


In [None]:
# Distribution of the mean-centred ratings, in 10 bins.
sns.histplot(ratings, x='centered_rating', bins=10)
plt.show()

In [19]:
# 50/50 split of the modelling columns; fixed seed keeps the split reproducible.
model_columns = ['movie', 'user', 'centered_rating']
X_train, X_test = train_test_split(
    ratings[model_columns],
    test_size=0.5,
    random_state=42,
)

In [20]:
# Sanity check: (rows, columns) of the train and test frames.
X_train.shape, X_test.shape

((277577, 3), (277577, 3))

In [21]:
# Number of distinct raw ids; used to size the embedding tables.
nmovie_id = ratings['movie_id'].nunique()
nuser_id = ratings['user_id'].nunique()

In [22]:
# Drop any model left over from a previous run so the next cell builds a fresh one.
# Guarded so that Restart Kernel -> Run All does not stop with a NameError when
# no model has been created yet.
try:
    del model
except NameError:
    pass

In [23]:
# Embedding-based rating model: the movie index and the user index are each
# embedded into 15 dimensions, concatenated, and passed through a small MLP
# that regresses the centred rating.

# Movie branch: one integer index per sample -> 15-d embedding -> flat vector.
input_movies = keras.layers.Input(shape=[1])
embed_movies = keras.layers.Embedding(nmovie_id+1, 15)(input_movies)
movies_out = keras.layers.Flatten()(embed_movies)

# User branch: same structure as the movie branch.
input_users = keras.layers.Input(shape=[1])
embed_users = keras.layers.Embedding(nuser_id+1, 15)(input_users)
users_out = keras.layers.Flatten()(embed_users)

conc_layer = keras.layers.Concatenate()([movies_out, users_out])
x = keras.layers.Dense(32, activation='relu')(conc_layer)
dropout = x = keras.layers.Dropout(rate=.3, seed=42)(x)
# dense2 = x = keras.layers.Dense(32, activation='relu')(x)
dense3 = x = keras.layers.Dense(16, activation='relu')(x)
# The target (centered_rating) is negative for below-average ratings, so the
# output must be unbounded. A ReLU output clamps predictions at 0 and can
# never fit those targets.
x_out = x = keras.layers.Dense(1, activation='linear')(x)


model = keras.Model([input_movies, input_users], x_out)

In [24]:
# Adam with MSE loss; RMSE is tracked as an additional metric for the curves below.
opt = tf.optimizers.Adam(learning_rate=.001)
# `metrics` is passed as a list: newer Keras releases reject a bare metric object.
model.compile(optimizer=opt, loss='mean_squared_error', metrics=[keras.metrics.RootMeanSquaredError()])

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 15)        446385      input_3[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 15)        558540      input_4[0][0]                    
____________________________________________________________________________________________

In [None]:
# ratings.groupby('rating_score').count()['movie_id'].apply(lambda x: 1/x).reset_index(drop=True)

In [None]:
# class_weights = dict(ratings.groupby('rating_score').count()['movie_id'].apply(lambda x: 1/x).reset_index(drop=True))

In [None]:
# class_weights

In [25]:
# Train on (movie, user) index pairs against the centred rating; the last 20%
# of the training frame is held out by Keras for validation.
train_inputs = [X_train['movie'], X_train['user']]
train_target = X_train['centered_rating'].astype(float)
hist = model.fit(
    train_inputs,
    train_target,
    batch_size=64,
    epochs=20,
    verbose=1,
    validation_split=.2,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Per-epoch training vs. validation loss (MSE).
train_loss, val_loss = hist.history['loss'], hist.history['val_loss']
plt.plot(train_loss, 'r', label='Train Loss')
plt.plot(val_loss, 'b', label='Validation Loss')
plt.title("Train and Validation Loss Curve")
plt.legend()
plt.show()

In [None]:
# Per-epoch training vs. validation RMSE. The variables keep the names
# train_loss / val_loss but hold the RMSE series here.
train_loss, val_loss = (
    hist.history['root_mean_squared_error'],
    hist.history['val_root_mean_squared_error'],
)
plt.plot(train_loss, 'r', label='Train RMSE')
plt.plot(val_loss, 'b', label='Validation RMSE')
plt.title("Train and Validation RMSE Curve")
plt.legend()
plt.show()

In [None]:
# Non-null value counts per user, most active users first.
per_user_counts = ratings.groupby('user_id').count()
per_user_counts.sort_values('movie_id', ascending=False)

In [None]:
# All ratings made by one specific user (raw user_id 74769593).
ratings[ratings['user_id'] == 74769593]

In [None]:
# Build the model inputs for every movie rated by user_id 74769593.
target_rows = ratings[ratings.user_id==74769593]
movie_array = target_rows['movie'].values.astype(int)
# Read the encoded user index from the same rows instead of hard-coding it:
# the encoding comes from the LabelEncoder fit on the loaded data, so a literal
# index silently becomes wrong when the data file changes.
user = target_rows['user'].values.astype(int)

In [None]:
# Sanity check: both input arrays should have the same shape and an integer dtype.
movie_array.shape, movie_array.dtype, user.shape, user.dtype

In [None]:
# Predict for each (movie, user) pair; inputs are given in the same order as the
# model's inputs (movies first, then users). The model was fit on centered_rating.
pred = model.predict([movie_array, user])

In [None]:
# Show the predictions as a flat 1-D array.
pred.flatten()

In [None]:
# Pair each prediction with the actual centred rating and the encoded movie
# index, as ((predicted, actual), movie).
actual_centered = ratings[ratings.user_id==74769593].centered_rating.values
[
    ((predicted, actual), movie_idx)
    for predicted, actual, movie_idx in zip(pred.flatten(), actual_centered, movie_array)
]