In [1]:
!pip install -q tensorflow-recommenders

In [2]:
from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.layers as L

import tensorflow_recommenders as tfrs

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


This steam dataset was obtained from kaggle

https://www.kaggle.com/tamber/steam-video-games/version/1

In [4]:
raw_data = pd.read_csv('/content/drive/My Drive/colab/data/steam-200k.csv', header=None).rename({0: "user_id", 
                                                                                             1: "title",
                                                                                             2: "action",
                                                                                             3: "label"}, axis=1)
raw_data.head()

Unnamed: 0,user_id,title,action,label,4
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0


In [5]:
play_data = raw_data[raw_data["action"] == 'play'][["user_id","title", "label"]].drop_duplicates().astype("string").rename({"label": "play_duration"}, axis=1)
play_data["play_duration"] = play_data.play_duration.astype(np.float32)

data = play_data.fillna(0.0)
data

Unnamed: 0,user_id,title,play_duration
1,151603712,The Elder Scrolls V Skyrim,273.0
3,151603712,Fallout 4,87.0
5,151603712,Spore,14.9
7,151603712,Fallout New Vegas,12.1
9,151603712,Left 4 Dead 2,8.9
...,...,...,...
199991,128470551,Fallen Earth,2.4
199993,128470551,Magic Duels,2.2
199995,128470551,Titan Souls,1.5
199997,128470551,Grand Theft Auto Vice City,1.5


In [6]:
data.dtypes

user_id           string
title             string
play_duration    float32
dtype: object

In [7]:
data[(data["play_duration"]== 0.0)].title.value_counts().head(5)

Series([], Name: title, dtype: Int64)

In [29]:
members = tf.data.Dataset.from_tensors(tf.constant(data.user_id.unique()))
movies = tf.data.Dataset.from_tensors(tf.constant(data.title.unique()))

ratings = (tf.data.Dataset
             .from_tensor_slices((tf.cast(data.user_id.values, tf.string), 
                                  tf.cast(data.title.values, tf.string),
                                  tf.cast(data.play_duration.values, tf.float16)))
              .map(lambda x1,x2,x3: {
                                  "user_id": x1,
                                  "movie_title": x2,
                                  "play_duration": x3
                                }
                   )
              .shuffle(buffer_size=200000))

train_ratings = ratings.take(60000).batch(8000)
test_ratings = ratings.skip(60000).batch(5000)

for row in test_ratings.batch(1).take(1):
  print(row)

{'user_id': <tf.Tensor: shape=(1, 5000), dtype=string, numpy=
array([[b'163432200', b'101695880', b'77214425', ..., b'39361297',
        b'94851051', b'105384518']], dtype=object)>, 'movie_title': <tf.Tensor: shape=(1, 5000), dtype=string, numpy=
array([[b'Creativerse', b'Robocraft', b'Crysis 2 Maximum Edition', ...,
        b'Cannons Lasers Rockets', b'H1Z1', b'Mars War Logs']],
      dtype=object)>, 'play_duration': <tf.Tensor: shape=(1, 5000), dtype=float16, numpy=array([[36. ,  0.1, 29. , ...,  0.1, 10.3,  1. ]], dtype=float16)>}


In [30]:
EMBEDDING_SIZE = 16
MAX_TOKENS = 10_000

In [31]:
member_vocabulary = L.experimental.preprocessing.StringLookup()
member_vocabulary.adapt(members)

movie_titles_vocabulary = L.experimental.preprocessing.StringLookup(mask_token=None)
movie_titles_vocabulary.adapt(movies)

In [32]:
class MovieModel(tf.keras.Model):

  def __init__(self, max_tokens=MAX_TOKENS):
    super().__init__()

    self.title_embedding = tf.keras.Sequential([
        movie_titles_vocabulary,
        tf.keras.layers.Embedding(movie_titles_vocabulary.vocab_size(), EMBEDDING_SIZE)
    ])
    self.title_text_embedding = tf.keras.Sequential([
      tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens=max_tokens),
      tf.keras.layers.Embedding(max_tokens, EMBEDDING_SIZE, mask_zero=True),
      # We average the embedding of individual words to get one embedding vector
      # per title.
      tf.keras.layers.GlobalAveragePooling1D(),
    ])

  def call(self, inputs):
    return tf.concat([
        self.title_embedding(inputs["movie_title"]),
        self.title_text_embedding(inputs["movie_title"]),
    ], axis=1)

In [33]:
# # Define user and movie models.
# movie_model = MovieModel()
# movie_model.title_text_embedding.layers[0].adapt(movies)

movie_model = tf.keras.Sequential([
  movie_titles_vocabulary,
  tf.keras.layers.Embedding(movie_titles_vocabulary.vocab_size() + 1, EMBEDDING_SIZE)
])

user_model = tf.keras.Sequential([
    member_vocabulary,
    tf.keras.layers.Embedding(member_vocabulary.vocab_size(), EMBEDDING_SIZE)
])

In [34]:
class MovielensModel(tfrs.Model):

  def __init__(self, user_model, movie_model, rating_weight: float=0.5, retrieval_weight: float=0.5):
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model

    # A small model to take in user and movie embeddings and predict ratings.
    # We can make this as complicated as we want as long as we output a scalar
    # as our prediction.
    self.rating_model = tf.keras.Sequential([
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(1),
    ])

    # The tasks.
    self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )

    self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=movies.map(self.movie_model)
        )
    )

    # The loss weights.
    self.rating_weight = rating_weight
    self.retrieval_weight = retrieval_weight

  def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])
    # And pick out the movie features and pass them into the movie model.
    movie_embeddings = self.movie_model(features["movie_title"])

    return (
        user_embeddings,
        movie_embeddings,
        # We apply the multi-layered rating model to a concatentation of
        # user and movie embeddings.
        self.rating_model(
            tf.concat([user_embeddings, movie_embeddings], axis=1)
        ),
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:

    ratings = features.pop("play_duration")

    user_embeddings, movie_embeddings, rating_predictions = self(features)

    # We compute the loss for each task.
    rating_loss = self.rating_task(
        labels=ratings,
        predictions=rating_predictions,
    )
    retrieval_loss = self.retrieval_task(user_embeddings, movie_embeddings)

    # And combine them using the loss weights.
    return (self.rating_weight * rating_loss
            + self.retrieval_weight * retrieval_loss)

In [35]:
# Create a retrieval model.
model = MovielensModel(user_model, movie_model, rating_weight=0.3, retrieval_weight=0.7)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [36]:
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

monitor_metric = "val_total_loss"
checkpoint_filepath = "'/content/drive/My Drive/colab/model/tf-rec-sys-steam-multi-task-retrieval"

reduce_lr = ReduceLROnPlateau(monitor=monitor_metric, factor=0.1, verbose=1,
                                 patience=2)
early_stop = EarlyStopping(monitor=monitor_metric, patience=4,
                             verbose=1)
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor=monitor_metric,
    mode='max',
    save_best_only=True)

callbacks = [reduce_lr, early_stop, model_checkpoint_callback]

In [37]:
# Train for 3 epochs.
model.fit(train_ratings, epochs=100, 
          validation_data=test_ratings,
          callbacks=callbacks)

#The model weights (that are considered the best) are loaded into the model.
model.load_weights(checkpoint_filepath)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.010000000149011612.
Epoch 10/100
Epoch 11/100

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0009999999776482583.
Epoch 00011: early stopping


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fbb678381d0>

In [41]:
metrics = model.evaluate(test_ratings, return_dict=True)

print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

Retrieval top-100 accuracy: 0.287.
Ranking RMSE: 204.142.


In [47]:
lookup_user_id = data.sample(n=1).user_id.values[0]
data[data["user_id"] == lookup_user_id].sort_values("play_duration", ascending=False).head(7)

Unnamed: 0,user_id,title,play_duration
119034,48798067,Mount & Blade Warband,3178.0
119036,48798067,Counter-Strike Global Offensive,1302.0
119038,48798067,Counter-Strike Source,795.0
119040,48798067,War Thunder,249.0
119042,48798067,Napoleon Total War,174.0
119044,48798067,Europa Universalis IV,136.0
119046,48798067,Call of Duty Modern Warfare 2 - Multiplayer,123.0


In [48]:
# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index(movies.map(lambda x: {"movie_title": x}).map(model.movie_model), movies)

# Get some recommendations.
_, titles = index(np.array(["1"])) #lookup_user_id]))
print(f"Top 5 recommendations for user {lookup_user_id}")
print(titles[0, :5])

Top 5 recommendations for user 48798067
tf.Tensor(
[b'Torchlight II' b'Sanctum 2' b'Tomb Raider' b'PAYDAY The Heist'
 b'Rogue Legacy'], shape=(5,), dtype=string)
