In [1]:
import tensorflow as tf
# from tensorboard.plugins.hparams import api as hp
from tensorflow import keras

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


from sqlalchemy import create_engine
conn = create_engine('sqlite:///Data//mubi_db.sqlite').connect()

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
q = "SELECT movie_id, rating_score, user_id FROM ratings;"
ratings = pd.read_sql_query(q, conn)

In [6]:
ratings.shape

(15520005, 3)

In [7]:
# Map the raw user/movie ids onto dense, zero-based integer codes so they can
# be fed to embedding layers.
user_enc = LabelEncoder()
ratings['user'] = user_enc.fit_transform(ratings['user_id'].values)
# Every fitted class occurs in the encoded column, so the number of classes is
# the number of distinct codes; no extra pass over the column is needed.
n_users = len(user_enc.classes_)

item_enc = LabelEncoder()
ratings['movie'] = item_enc.fit_transform(ratings['movie_id'].values)
n_movies = len(item_enc.classes_)

# Aliases kept because train_test_model sizes its embedding tables from
# these two names.
nmovie_id = n_movies
nuser_id = n_users

n_users, n_movies

(451757, 142698)

In [8]:
ratings.shape

(15520005, 5)

In [9]:
X_train, X_test = train_test_split(ratings[['movie', 'user', 'rating_score']], test_size=0.3, random_state=42)

In [10]:
X_train.shape, X_test.shape

((10864003, 3), (4656002, 3))

In [17]:
X_train.head()

Unnamed: 0,movie,user,rating_score
275468,1366,322565,4.0
1539949,1961,135338,3.0
12073651,141,278761,2.0
3600240,3687,272869,4.0
7978075,25357,283006,3.0


In [23]:
def train_test_model(output_bias):
    """Build and compile the movie/user embedding regression network.

    Despite its name, this function only constructs and compiles the model;
    fitting is left to the caller.

    The movie code and the user code are each embedded into 15 dimensions,
    the two embeddings are concatenated, and the result goes through a
    128 -> 64 -> 32 -> 16 stack of L2-regularised ReLU layers (dropout after
    the first three) followed by a single-unit ReLU output layer.

    NOTE: the output layer used to be attached to the 32-unit layer, which
    left the last dropout and the 16-unit layer disconnected from the graph.
    It is now attached to the end of the stack, so weights saved from the
    old graph will not match this architecture.

    Parameters
    ----------
    output_bias : float
        Constant used to initialise the bias of the output unit.

    Returns
    -------
    keras.Model
        Compiled model taking ``[movie_codes, user_codes]`` as inputs, with
        mean-squared-error loss and a root-mean-squared-error metric.

    Notes
    -----
    Reads the notebook-level ``nmovie_id`` and ``nuser_id`` to size the
    embedding tables.
    """
    # One integer code per sample for each of the two inputs.
    input_movies = keras.layers.Input(shape=[1])
    embed_movies = keras.layers.Embedding(nmovie_id+1, 15)(input_movies)
    movies_out = keras.layers.Flatten()(embed_movies)

    input_users = keras.layers.Input(shape=[1])
    embed_users = keras.layers.Embedding(nuser_id+1, 15)(input_users)
    users_out = keras.layers.Flatten()(embed_users)

    hidden = keras.layers.Concatenate()([movies_out, users_out])

    # (units, dropout rate applied after the layer); the last hidden layer
    # is not followed by dropout.
    for units, drop_rate in ((128, .6), (64, .5), (32, .4), (16, None)):
        hidden = keras.layers.Dense(units, activation='relu', kernel_regularizer='l2')(hidden)
        if drop_rate is not None:
            hidden = keras.layers.Dropout(rate=drop_rate, seed=42)(hidden)

    bias = keras.initializers.Constant(output_bias)
    x_out = keras.layers.Dense(1, activation='relu', kernel_regularizer='l2', bias_initializer=bias)(hidden)

    model = keras.Model([input_movies, input_users], x_out)

    opt = tf.optimizers.Adam(learning_rate=.01, clipnorm=1.)
    # `metrics` is passed as a list, the form `compile` documents.
    model.compile(optimizer=opt, loss='mean_squared_error',
                  metrics=[keras.metrics.RootMeanSquaredError()])

    return model

In [24]:
# Early-stopping callback shared by the fit cells below. It watches the
# validation RMSE metric, treats changes smaller than `min_delta` as no
# improvement, waits `patience` epochs before stopping, and is configured to
# restore the best weights it saw.
callback = keras.callbacks.EarlyStopping(monitor='val_root_mean_squared_error',
                                        min_delta=.001,
                                        patience=30,
                                        mode="auto",
                                        restore_best_weights=True)

In [25]:
model = train_test_model(output_bias=7)

In [26]:
# First training run: inputs are the movie and user code columns of X_train,
# the target is rating_score. batch_size is one tenth of the row count of
# X_train, training is capped at 100 epochs, validation_split is .2 and the
# early-stopping `callback` is attached. The returned History object is kept
# in `output`.
output = model.fit([X_train.movie, X_train.user], 
             X_train.rating_score,  
             batch_size=int(len(X_train)/10), 
                   epochs=100, validation_split=.2,
                  callbacks=[callback])

Epoch 1/100
Epoch 2/100

KeyboardInterrupt: 

In [None]:
callback.best

In [None]:
callback.monitor

In [None]:
callback.model

In [None]:
# Second training run on the same `model` object (it is not rebuilt here),
# capped at 10 epochs, with batch_size set to the full row count of X_train.
# The History object goes to `output2`, a different name from the first
# run's `output`.
output2 = model.fit([X_train.movie, X_train.user], 
             X_train.rating_score,  
             batch_size=len(X_train), epochs=10, validation_split=.2,
                  callbacks=[callback])

In [None]:
# Training curves of the first fit (`output`): loss on the left, RMSE on the
# right, train in red and validation in blue.
history = output.history
val_rmse = np.asarray(history['val_root_mean_squared_error'])

# 0-based epoch with the lowest validation RMSE. argmin yields the scalar
# position that axvline expects (np.where would return a tuple of arrays).
best_epoch = int(np.argmin(val_rmse))
best_val_rmse = val_rmse[best_epoch]

fig, axs = plt.subplots(ncols=2, figsize=(10, 3))

axs[0].plot(history['loss'], color='r', label='train')
axs[0].plot(history['val_loss'], color='b', label='validation')

axs[1].plot(history['root_mean_squared_error'], color='r', label='train')
axs[1].plot(val_rmse, color='b', label='validation')
axs[1].axvline(best_epoch, color='k', lw=1)
# Anchor the label at the best epoch so it stays inside the axes however
# many epochs were actually run.
axs[1].annotate(str(best_val_rmse), xy=(best_epoch, 3),
                xytext=(5, 0), textcoords='offset points')

axs[0].set_title("mse")
axs[1].set_title("rmse")

for ax in axs:
    ax.set_xlabel("epoch")
    ax.legend()

axs[0].set_ylim(0,10)
axs[1].set_ylim(0,5)

# Reference line at RMSE = 1.
axs[1].axhline(1, color='k', lw=1)

plt.tight_layout()
plt.show()

In [None]:
len(output.history['loss'])

In [None]:
model.trainable_variables

In [None]:
!mkdir -p saved_model
model.save('saved_model/rmse_839')

In [None]:
model.save('saved_model/rmse_839/model.h5')

In [None]:
pd.to_pickle(X_train, 'saved_model/rmse_839/X_train.pkl')
pd.to_pickle(X_test, 'saved_model/rmse_839/X_test.pkl')

In [None]:
X_train.groupby('rating_score').count()

In [None]:
import random
from itertools import chain
def flatten(list_of_lists):
    """Return a lazy iterator over the items of every inner iterable, in order.

    Only the outermost level of nesting is removed; items that are
    themselves containers are yielded unchanged.
    """
    one_level_removed = chain.from_iterable(list_of_lists)
    return one_level_removed

In [None]:
X_train

In [None]:
X_train.groupby('user').count().sort_values(by='movie',ascending=False).head()

In [None]:
user = 58315
ratings[ratings.user==user]

In [None]:
# Model inputs for the selected user: the codes of every movie this user has
# a row for in `ratings`, paired with the user's own code repeated once per
# movie.
movie_array = ratings[ratings.user==user].movie.values.astype(int)
# np.full builds the constant array directly instead of looping in Python.
user_array = np.full(len(movie_array), int(user))

In [None]:
pred = model.predict([movie_array, user_array])

In [None]:
def is_close_enough(pred, ratings):
    """Flag, pair by pair, whether a prediction lands near its true rating.

    A rating of 5 counts as matched when the prediction is at least 4 (there
    is no upper limit). Any other rating counts as matched when the
    prediction lies within 0.5 of it, bounds included.

    Parameters
    ----------
    pred : iterable of numbers
        Predicted scores.
    ratings : iterable of numbers
        True scores, paired with ``pred`` by position. Pairing stops at the
        shorter of the two inputs.

    Returns
    -------
    list
        One truth value per (prediction, rating) pair.
    """
    def _matches(predicted, actual):
        # The top rating only has a lower bound.
        if actual == 5:
            return actual - 1 <= predicted
        return actual - .5 <= predicted <= actual + .5

    return [_matches(predicted, actual) for predicted, actual in zip(pred, ratings)]

In [None]:
val = is_close_enough(np.round(pred.flatten(), 1), ratings[ratings.user==user].rating_score.values)

In [None]:
sum(val) / len(pred.flatten())

In [None]:
# Red outline: distribution of the values in `pred`; blue outline: the
# selected user's rating_score values found in X_train.
# NOTE(review): the cell that builds `movie_array` reads this user's rows
# from `ratings`, whereas the blue histogram only uses rows from X_train, and
# the two histograms use different bin counts (4 vs 5). Confirm this is
# intended.
sns.histplot(pred.flatten(), bins=4, color='red', fill=False)
sns.histplot(X_train[X_train.user==user].rating_score.values, bins=5, color='blue', fill=False)

plt.show()

In [None]:
top_20_users = X_train.groupby('user').count().sort_values(by='movie',ascending=False).head(20).index

In [None]:
# Per-user hit rate (share of predictions accepted by is_close_enough) for
# the most active users; the raw predictions are collected for plotting.
predictions = []

for user in top_20_users:
    # Inputs and ground truth must come from the SAME rows. Mixing X_train
    # rows (inputs) with `ratings` rows (truth) pairs each prediction with
    # the rating of an unrelated movie, because the two selections differ in
    # length and order.
    user_rows = X_train[X_train.user==user]
    movie_array = user_rows.movie.values.astype(int)
    user_array = np.full(len(movie_array), int(user))
    # verbose=0 keeps Keras progress output from interleaving with the
    # per-user lines printed below.
    pred = model.predict([movie_array, user_array], verbose=0)
    val = is_close_enough(np.round(pred.flatten(), 1), user_rows.rating_score.values)

    predictions.append(list(pred.flatten()))
    print(user, sum(val) / len(pred.flatten()))

In [None]:
# Red outline: every value collected in `predictions`, flattened into one
# list. Blue outline: the rating_score values of the X_train rows belonging
# to `top_20_users`.
sns.histplot(list(flatten(predictions)), bins=5, color='red', fill=False)
sns.histplot(X_train[X_train.user.isin(top_20_users)].rating_score.values, bins=5, color='blue', fill=False)

plt.show()

In [None]:
import random

In [None]:
# Draw 20 distinct users (sampling without replacement, so no user is picked
# twice) with a fixed seed so that the same users are selected on every run.
# Requires at least 20 distinct users in X_train.
random_20_users = list(
    np.random.default_rng(42).choice(X_train.user.unique(), size=20, replace=False)
)

In [None]:
# Per-user hit rate (share of predictions accepted by is_close_enough) for
# the randomly drawn users; the raw predictions are collected for plotting.
predictions = []

for user in random_20_users:
    # Inputs and ground truth must come from the SAME rows. Mixing X_train
    # rows (inputs) with `ratings` rows (truth) pairs each prediction with
    # the rating of an unrelated movie, because the two selections differ in
    # length and order.
    user_rows = X_train[X_train.user==user]
    movie_array = user_rows.movie.values.astype(int)
    user_array = np.full(len(movie_array), int(user))
    # verbose=0 keeps Keras progress output from interleaving with the
    # per-user lines printed below.
    pred = model.predict([movie_array, user_array], verbose=0)
    val = is_close_enough(np.round(pred.flatten(), 1), user_rows.rating_score.values)

    predictions.append(list(pred.flatten()))
    print(user, sum(val) / len(pred.flatten()))

In [None]:
# Predicted vs. actual rating distribution for the randomly drawn users.
# Both histograms are density-normalised so they are comparable.
fig, ax = plt.subplots()

sns.histplot(list(flatten(predictions)), bins=5, color='red', fill=False,
            stat='density', common_norm=False, label='predicted', ax=ax)

# The actual ratings must come from the same users the predictions were made
# for, i.e. `random_20_users`, not `top_20_users`.
sns.histplot(X_train[X_train.user.isin(random_20_users)].rating_score.values, bins=5, color='blue', fill=False,
            stat='density', common_norm=False, label='actual', ax=ax)

ax.set_xlabel('rating')
ax.legend()

plt.show()