In [1]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split


In [2]:
# Data downloaded from https://www.kaggle.com/datasets/jvanelteren/boardgamegeek-reviews?resource=download&select=bgg-15m-reviews.csv
# NOTE(review): the file read here is 'reviews_processed.csv', not the raw
# download; the step that produced it is not shown in the cells visible here.
# The path is relative, so the CSV must sit in the notebook's working directory.
REVIEWS_CSV = 'reviews_processed.csv'
processed = pd.read_csv(REVIEWS_CSV)


In [3]:
# Two column subsets of the review table: the rating events themselves and the
# (ID, name) pairs that identify the rated game.
rating_columns = ['user', 'rating', 'ID']
game_columns = ['ID', 'name']
ratings = processed.loc[:, rating_columns]
games = processed.loc[:, game_columns]


In [4]:
# `games` is sliced from `processed` before any NaN rows are removed, so drop
# incomplete (ID, name) pairs here too; otherwise a missing name would flow
# into the candidate catalogue built from this frame.
# Afterwards collapse the per-review rows to one row per distinct (ID, name).
games_dropped = games.dropna().drop_duplicates()

In [5]:
# Turn the de-duplicated game table into a tf.data pipeline; every element is
# a dict keyed by the frame's column names.
game_columns_by_name = {
    column: games_dropped[column] for column in games_dropped.columns
}
games_ds = tf.data.Dataset.from_tensor_slices(game_columns_by_name)

In [6]:
# Remove every rating row that has a missing value in any of its columns.
# NOTE(review): no later cell visible here reads `ratings`; the rating dataset
# is built from `processed` instead - confirm whether this frame is still needed.
ratings = ratings.dropna(axis=0, how='any')

In [8]:
# Keep only fully populated review rows (a NaN in any column drops the row).
# This rebinds `processed`, so the unfiltered frame is no longer reachable
# under this name once the cell has run.
processed = processed.dropna(axis='index', how='any')

In [9]:
# Only these four columns are read by the feature mapping applied to
# `ratings_ds` afterwards, so convert just them. Slicing every column of
# `processed` into tensors wastes memory and can fail on columns TensorFlow
# cannot convert.
rating_feature_columns = ['ID', 'name', 'user', 'rating']
ratings_ds = tf.data.Dataset.from_tensor_slices(
    dict(processed[rating_feature_columns])
)

In [10]:
def _to_rating_features(row):
    """Rename the raw review columns to the feature names used downstream."""
    return {
        "game_id": row['ID'],
        "game_name": row['name'],
        "user_name": row['user'],
        "user_rating": row['rating'],
    }


ratings_ds = ratings_ds.map(_to_rating_features)

In [11]:
def _to_game_features(row):
    """Keep only the title of a catalogue entry, under the key 'game_name'."""
    return {"game_name": row['name']}


# Rebinds `games_ds`: after this cell its elements no longer carry 'ID'/'name'.
games_ds = games_ds.map(_to_game_features)


In [12]:
# Fix the global TensorFlow seed, then shuffle once with a fixed op-level seed.
# reshuffle_each_iteration=False keeps the order identical on every pass, which
# the take/skip split of `shuffled` relies on to keep train and test disjoint.
tf.random.set_seed(42)
shuffled = ratings_ds.shuffle(
    buffer_size=10_000_000,
    seed=42,
    reshuffle_each_iteration=False,
)



In [13]:
# Hold-out split over the fixed shuffle order: the first 8M examples are used
# for training, the following 2M are kept for testing.
train_size = 8_000_000
test_size = 2_000_000
train = shuffled.take(train_size)
test = shuffled.skip(train_size).take(test_size)

In [14]:
# Batched streams of raw strings; they are materialised afterwards to build the
# lookup vocabularies. Extracting the field first and batching second yields
# the same batches as batching the dicts and then picking the field.
game_titles = games_ds.map(lambda game: game['game_name']).batch(1000)
user_names = ratings_ds.map(lambda review: review['user_name']).batch(1_000_000)

In [15]:
# Flat (unbatched) dataset of game-title strings, without the dict wrapper.
# It is the candidate source for the retrieval metric and the top-k index.
games_ds_tensors = games_ds.map(lambda game: game['game_name'])

In [16]:
# Materialise every batch of user names and reduce them to the sorted array of
# distinct values; this is the StringLookup vocabulary of the user tower.
user_name_batches = [batch for batch in user_names]
unique_users = np.unique(np.concatenate(user_name_batches))

In [18]:
# Same reduction for the game titles: gather all batches, then keep the sorted
# distinct values as the StringLookup vocabulary of the game tower.
game_title_batches = [batch for batch in game_titles]
unique_games = np.unique(np.concatenate(game_title_batches))

In [20]:
class GamesModel(tfrs.models.Model):
    """Multi-task recommender with a rating head and a retrieval head.

    Users and games are each embedded by a StringLookup + Embedding tower.
    The two embeddings feed (a) a dense network that predicts the rating and
    (b) a retrieval task whose top-k metric is evaluated against every game
    title. The training loss is the weighted sum of both task losses.

    The class reads the notebook-level ``unique_users``, ``unique_games`` and
    ``games_ds_tensors`` when it is instantiated.
    """

    def __init__(self, rating_weight: float, retrieval_weight: float,
                 embedding_dimension: int = 32) -> None:
        """Build both towers, the rating network and the two tasks.

        Args:
            rating_weight: Multiplier applied to the rating (MSE) loss.
            retrieval_weight: Multiplier applied to the retrieval loss.
            embedding_dimension: Width of the user and game embeddings.
        """
        super().__init__()

        # StringLookup with mask_token=None reserves one extra index for
        # out-of-vocabulary strings, hence the "+ 1" on the embedding tables.
        self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_users, mask_token=None),
            tf.keras.layers.Embedding(
                len(unique_users) + 1, embedding_dimension)
        ])

        self.game_model: tf.keras.layers.Layer = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_games, mask_token=None),
            tf.keras.layers.Embedding(
                len(unique_games) + 1, embedding_dimension)
        ])

        # Maps the concatenated [user, game] embedding to one predicted rating.
        self.rating_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(1)
        ])

        self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )

        # The top-k metric compares each query with the embedding of every
        # game title, produced by the game tower defined above.
        self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=games_ds_tensors.batch(128).map(self.game_model)
            )
        )

        self.rating_weight = rating_weight
        self.retrieval_weight = retrieval_weight

    def call(self, features: Dict[Text, tf.Tensor]) -> tuple:
        """Embed a batch and predict its ratings.

        Args:
            features: Batch dict; the keys 'user_name' and 'game_name' are read.

        Returns:
            A 3-tuple ``(user_embeddings, game_embeddings, rating_predictions)``.
        """
        user_embeddings = self.user_model(features['user_name'])
        game_embeddings = self.game_model(features['game_name'])

        return (
            user_embeddings,
            game_embeddings,
            self.rating_model(
                tf.concat([user_embeddings, game_embeddings], axis=1)
            ),
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the weighted sum of the rating loss and the retrieval loss.

        Args:
            features: Batch dict holding 'user_name', 'game_name' and the
                label 'user_rating'. The dict is not modified.
            training: True while fitting; used to skip the expensive
                retrieval metrics on training steps.
        """
        # Read the label and pass the model a copy without it, instead of
        # pop()-ing the key out of the caller's batch dict.
        ratings = features['user_rating']
        model_inputs = {
            key: value for key, value in features.items() if key != 'user_rating'
        }

        user_embeddings, game_embeddings, rating_predictions = self(model_inputs)

        rating_loss = self.rating_task(
            labels=ratings,
            predictions=rating_predictions,
        )

        # FactorizedTopK scores every query against the whole candidate set;
        # doing that on each training step dominates training time, so the
        # metrics are only computed outside of training (evaluation).
        retrieval_loss = self.retrieval_task(
            user_embeddings,
            game_embeddings,
            compute_metrics=not training,
        )

        return (self.rating_weight * rating_loss + self.retrieval_weight * retrieval_loss)

In [21]:
# The user and game towers of this model are later exported as a retrieval
# index, so the retrieval loss has to take part in training. With
# retrieval_weight=0.0 the embeddings are fitted only as inputs to the rating
# network, and the similarity between a user and a game embedding is never
# optimised. Both tasks are therefore trained jointly.
model = GamesModel(rating_weight=1.0, retrieval_weight=1.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))


In [22]:
# Batch both splits and cache the batched result so later passes reuse it.
# The training split goes through a 100k-element shuffle buffer before
# batching; because the cache sits after the shuffle, it stores the batches
# in the order they were first produced.
train_batches = train.shuffle(100_000).batch(8192)
cached_train = train_batches.cache()
test_batches = test.batch(4096)
cached_test = test_batches.cache()

In [27]:
# Train for three passes over the cached training batches, with per-epoch
# progress output.
model.fit(x=cached_train, epochs=3, verbose=1)

Epoch 1/3


In [26]:
def _title_with_embedding(title):
    """Pair a batch of game titles with their game-tower embeddings."""
    return title, model.game_model(title)


# Exhaustive top-k index: queries are embedded by the user tower, candidates
# are the (title, embedding) pairs of every game title.
bruteforce = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
candidate_pairs = games_ds_tensors.batch(128).map(_title_with_embedding)
bruteforce.index_from_dataset(candidate_pairs)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f7b4d52abb0>

In [31]:
# A Keras layer that has never been called has no traced call function, so a
# SavedModel exported from it would not be callable after loading. Run one
# query through the index first so its input signature is recorded; the
# query string itself is irrelevant (unknown names map to the OOV bucket).
_ = bruteforce(tf.constant(["warm-up"]))

tf.saved_model.save(
    bruteforce,
    "./bruteforce_saved",
    options=tf.saved_model.SaveOptions(namespace_whitelist=["BruteForce"]),
)



INFO:tensorflow:Assets written to: ./bruteforce_saved/assets


INFO:tensorflow:Assets written to: ./bruteforce_saved/assets
