In [1]:
import datetime
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy import sparse
from typing import List

In [2]:
print(f"Tensorflow version: {tf.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"Numpy version: {np.__version__}")

Tensorflow version: 2.10.0
Pandas version: 2.0.3
Numpy version: 1.23.5


In [3]:
ratings = pd.read_csv('data/full_ratings.csv', low_memory=False)
print("Wgrano dane do pamięci")

Wgrano dane do pamięci


In [4]:
ratings

Unnamed: 0,userId,movieId,rating
0,0,0,1
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,0
...,...,...,...
534782,670,792,0
534783,670,793,0
534784,670,794,0
534785,670,795,0


In [5]:
n_users = len(ratings['userId'].unique())
n_items = len(ratings['movieId'].unique())
print("number of unique users: ", n_users)
print("number of unique items: ", n_items)

number of unique users:  671
number of unique items:  797


In [30]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, Flatten, Dot, Add
import os
import datetime


def create_mf_model(num_users, num_items, embedding_size, learning_rate):
    # Wejścia dla użytkowników i filmów
    user_input = Input(shape=(1,), name='userId')
    item_input = Input(shape=(1,), name='movieId')

    # Warstwa osadzania dla użytkowników i filmów
    user_embedding = Embedding(num_users, embedding_size, embeddings_initializer='he_normal', name='user_embedding')(user_input)
    item_embedding = Embedding(num_items, embedding_size, embeddings_initializer='he_normal', name='item_embedding')(item_input)


    # Spłaszczenie wektorów osadzania
    user_vec = Flatten()(user_embedding)
    item_vec = Flatten()(item_embedding)

    # Obliczenie iloczynu skalarnego wektorów użytkowników i filmów
    dot_user_item = Dot(axes=1)([user_vec, item_vec])

    # Warstwa osadzania dla biasów użytkowników i filmów
    user_bias = Embedding(num_users, 1, name='user_bias')(user_input)
    item_bias = Embedding(num_items, 1, name='item_bias')(item_input)

    # Spłaszczenie wektorów biasów
    user_bias = Flatten()(user_bias)
    item_bias = Flatten()(item_bias)

    # Dodanie iloczynu skalarnego do biasów użytkowników i filmów
    add_bias = Add()([dot_user_item, user_bias, item_bias])

    # Zbudowanie modelu
    mf_model = Model(inputs=[user_input, item_input], outputs=add_bias)
    
    # Ustawienie optymalizatora z określonym współczynnikiem uczenia
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    
    mf_model.compile(optimizer=optimizer, 
                     loss='mean_squared_error',
                     metrics=[
                         tf.keras.metrics.TruePositives(name="tp"),
                         tf.keras.metrics.FalsePositives(name="fp"),
                         tf.keras.metrics.TrueNegatives(name="tn"),
                         tf.keras.metrics.FalseNegatives(name="fn"),
                         tf.keras.metrics.BinaryAccuracy(name="accuracy"),
                         tf.keras.metrics.Precision(name="precision"),
                         tf.keras.metrics.Recall(name="recall"),
                         tf.keras.metrics.AUC(name="auc"),
                     ])
    return mf_model

### Parameters

In [55]:
embedding_size = 4
learning_rate = 0.001
num_epochs = 10
batch_size=16
val_split=0.2

In [56]:
def make_tf_dataset(data: pd.DataFrame, target: str, batch_size: int = 128, val_split: float = 0.2):
    # Convert the DataFrame to TensorFlow dataset
    dataset = tf.data.Dataset.from_tensor_slices(({
        'userId': data['userId'].values,
        'movieId': data['movieId'].values
    }, data[target].values))
    
    # Shuffle and split the dataset
    dataset = dataset.shuffle(buffer_size=len(data))
    train_size = int((1 - val_split) * len(data))
    
    train_dataset = dataset.take(train_size)
    val_dataset = dataset.skip(train_size)
    
    # Batch the datasets
    train_dataset = train_dataset.batch(batch_size)
    val_dataset = val_dataset.batch(batch_size)
    
    return train_dataset, val_dataset

m_train, m_val = make_tf_dataset(ratings, ["rating"], batch_size, val_split)

In [57]:
num_elements = tf.data.experimental.cardinality(m_train).numpy()
print(f"Liczba elementów w ds_train: {num_elements}")
num_elements_val = tf.data.experimental.cardinality(m_val).numpy()
print(f"Liczba elementów w ds_val: {num_elements_val}")

Liczba elementów w ds_train: 26740
Liczba elementów w ds_val: 6685


In [58]:
%time

mf_model = create_mf_model(n_users, n_items, embedding_size, learning_rate)

# Definiowanie logów i callbacków
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)

# Trening modelu
history = mf_model.fit(
    m_train,
    validation_data=m_val,
    epochs=num_epochs,
    callbacks=[tensorboard_callback, early_stopping_callback],
    verbose=1,
)

# Ewaluacja modelu
evaluation = mf_model.evaluate(m_val)
print(f'Matrix Factorization Model Evaluation: {evaluation}')

CPU times: total: 0 ns
Wall time: 0 ns
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Matrix Factorization Model Evaluation: [0.05102419853210449, 1698.0, 674.0, 98445.0, 6141.0, 0.9362834095954895, 0.7158516049385071, 0.21660925447940826, 0.8653073906898499]


In [59]:
import os.path

mf_model.save('models/MF_model.h5')
print('saved')

saved
