In [1]:
import datetime
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy import sparse
from typing import List

In [2]:
print(f"Tensorflow version: {tf.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"Numpy version: {np.__version__}")

Tensorflow version: 2.10.0
Pandas version: 2.0.3
Numpy version: 1.23.5


In [3]:
ratings = pd.read_csv('data/full_ratings.csv', low_memory=False)
print("Wgrano dane do pamięci")

Wgrano dane do pamięci


In [4]:
ratings

Unnamed: 0,userId,movieId,rating
0,0,0,1
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,0
...,...,...,...
534782,670,792,0
534783,670,793,0
534784,670,794,0
534785,670,795,0


In [5]:
n_users = len(ratings['userId'].unique())
n_items = len(ratings['movieId'].unique())
print("number of unique users: ", n_users)
print("number of unique items: ", n_items)

number of unique users:  671
number of unique items:  797


# The model (Neural Collaborative Filtering)

<center><img src="https://raw.githubusercontent.com/murilo-cunha/inteligencia-superficial/master/images/2020-09-11-neural_collaborative_filter/ncf_all_with_alpha.png" width="70%" url="https://developers.google.com/machine-learning/recommendation/collaborative/basics" description="Fonte: https://developers.google.com/machine-learning/recommendation/collaborative/basics" /> </center>

In [6]:
import tensorflow.keras as keras
from tensorflow.keras.layers import (
    Concatenate,
    Dense,
    Embedding,
    Flatten,
    Input,
    Multiply,
)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2


def create_ncf_movies(
    number_of_users: int,
    number_of_items: int,
    latent_dim_mf: int = 4,
    latent_dim_mlp: int = 16,
    reg_mf: int = 0,
    reg_mlp: int = 0.01,
    dense_layers: List[int] = [16, 4],
    reg_layers: List[int] = [0.01, 0.01],
    activation_dense: str = "relu",
) -> keras.Model:

    # input layer
    user = Input(shape=(), dtype="int64", name="userId")
    item = Input(shape=(), dtype="int64", name="movieId")

    # embedding layers
    mf_user_embedding = Embedding(
        input_dim=number_of_users,
        output_dim=latent_dim_mf,
        name="mf_user_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mf),
    )
    mf_item_embedding = Embedding(
        input_dim=number_of_items,
        output_dim=latent_dim_mf,
        name="mf_item_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mf),
    )

    mlp_user_embedding = Embedding(
        input_dim=number_of_users,
        output_dim=latent_dim_mlp,
        name="mlp_user_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mlp),
    )
    mlp_item_embedding = Embedding(
        input_dim=number_of_items,
        output_dim=latent_dim_mlp,
        name="mlp_item_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mlp),
    )

    # MF vector
    mf_user_latent = Flatten()(mf_user_embedding(user))
    mf_item_latent = Flatten()(mf_item_embedding(item))
    mf_cat_latent = Multiply()([mf_user_latent, mf_item_latent])

    # MLP vector
    mlp_user_latent = Flatten()(mlp_user_embedding(user))
    mlp_item_latent = Flatten()(mlp_item_embedding(item))
    mlp_cat_latent = Concatenate()([mlp_user_latent, mlp_item_latent])

    mlp_vector = mlp_cat_latent

    # build dense layers for model
    for i in range(len(dense_layers)):
        layer = Dense(
            dense_layers[i],
            activity_regularizer=l2(reg_layers[i]),
            activation=activation_dense,
            name="layer%d" % i,
        )
        mlp_vector = layer(mlp_vector)

    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])

    result = Dense(
        1, activation="sigmoid", kernel_initializer="lecun_uniform", name="interaction"
    )

    output = result(predict_layer)

    model = Model(
        inputs=[user, item],
        outputs=[output],
    )

    return model

### Parameters

In [19]:
num_epochs = 20
learning_rate = 0.001
batch_size = 128
val_split = 0.1
latent_dim_mf = 16
latent_dim_mlp = 4
dense_layers = [8,8]
reg_layers = [0.01, 0.01]
activation_dense = "relu"
reg_mf: int = 0
reg_mlp: int = 0.01

In [20]:
from tensorflow.keras.optimizers import Adam

ncf_model_movies = create_ncf_movies(
    n_users,
    n_items,
    latent_dim_mf,
    latent_dim_mlp,
    reg_mf,
    reg_mlp,
    dense_layers,
    reg_layers,
    activation_dense
)

ncf_model_movies.compile(
    optimizer=Adam(learning_rate=learning_rate),
    loss='mean_squared_error',
    metrics=[
        tf.keras.metrics.TruePositives(name="tp"),
        tf.keras.metrics.FalsePositives(name="fp"),
        tf.keras.metrics.TrueNegatives(name="tn"),
        tf.keras.metrics.FalseNegatives(name="fn"),
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall"),
        tf.keras.metrics.AUC(name="auc"),
    ],
)
ncf_model_movies._name = "neural_collaborative_filtering"
ncf_model_movies.summary()

Model: "neural_collaborative_filtering"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 userId (InputLayer)            [(None,)]            0           []                               
                                                                                                  
 movieId (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 mlp_user_embedding (Embedding)  (None, 4)           2684        ['userId[0][0]']                 
                                                                                                  
 mlp_item_embedding (Embedding)  (None, 4)           3188        ['movieId[0][0]']                
                                                                     

In [21]:
def make_tf_dataset(data: pd.DataFrame, target: str, batch_size: int = 128, val_split: float = 0.2):
    # Convert the DataFrame to TensorFlow dataset
    dataset = tf.data.Dataset.from_tensor_slices(({
        'userId': data['userId'].values,
        'movieId': data['movieId'].values
    }, data[target].values))
    
    # Shuffle and split the dataset
    dataset = dataset.shuffle(buffer_size=len(data))
    train_size = int((1 - val_split) * len(data))
    
    train_dataset = dataset.take(train_size)
    val_dataset = dataset.skip(train_size)
    
    # Batch the datasets
    train_dataset = train_dataset.batch(batch_size)
    val_dataset = val_dataset.batch(batch_size)
    
    return train_dataset, val_dataset

m_train, m_val = make_tf_dataset(ratings, ["rating"], batch_size, val_split)

In [22]:
num_elements = tf.data.experimental.cardinality(m_train).numpy()
print(f"Liczba elementów w ds_train: {num_elements}")
num_elements_val = tf.data.experimental.cardinality(m_val).numpy()
print(f"Liczba elementów w ds_val: {num_elements_val}")

Liczba elementów w ds_train: 3761
Liczba elementów w ds_val: 418


In [23]:
# define logs and callbacks
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=2
)

train_hist = ncf_model_movies.fit(
    m_train,
    validation_data=m_val,
    epochs=num_epochs,
    callbacks=[tensorboard_callback, early_stopping_callback],
    verbose=1,
)

ncf_evaluation = ncf_model_movies.evaluate(m_val)
print(f'Neural Collaborative Filtering Model Evaluation: {ncf_evaluation}')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Neural Collaborative Filtering Model Evaluation: [0.03246220201253891, 2251.0, 390.0, 49218.0, 1620.0, 0.9624151587486267, 0.8523286581039429, 0.5815035104751587, 0.9140697717666626]


In [18]:
import os.path

ncf_model_movies.save('models/NeuMF_model.h5')
print('saved')

saved
