In [1]:
!pip install -q tensorflow-recommenders

In [2]:
from typing import Dict, Text

import os
import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.layers as L

import tensorflow_recommenders as tfrs

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


This steam dataset was obtained from kaggle

https://www.kaggle.com/tamber/steam-video-games/version/1

In [4]:
raw_data = pd.read_csv('/content/drive/My Drive/colab/data/steam-200k.csv', header=None).rename({0: "user_id", 
                                                                                             1: "title",
                                                                                             2: "action",
                                                                                             3: "label"}, axis=1)
raw_data.head()

Unnamed: 0,user_id,title,action,label,4
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0


In [5]:
play_data = raw_data[raw_data["action"] == 'play'][["user_id","title", "label"]].drop_duplicates().astype("string").rename({"label": "play_duration"}, axis=1)
play_data["play_duration"] = play_data.play_duration.astype(np.float32)

data = play_data.fillna(0.0)
data

Unnamed: 0,user_id,title,play_duration
1,151603712,The Elder Scrolls V Skyrim,273.0
3,151603712,Fallout 4,87.0
5,151603712,Spore,14.9
7,151603712,Fallout New Vegas,12.1
9,151603712,Left 4 Dead 2,8.9
...,...,...,...
199991,128470551,Fallen Earth,2.4
199993,128470551,Magic Duels,2.2
199995,128470551,Titan Souls,1.5
199997,128470551,Grand Theft Auto Vice City,1.5


In [6]:
data.dtypes

user_id           string
title             string
play_duration    float32
dtype: object

In [7]:
data[(data["play_duration"]== 0.0)].title.value_counts().head(5)

Series([], Name: title, dtype: Int64)

In [9]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir logs

In [11]:
members = tf.data.Dataset.from_tensors(tf.constant(data.user_id.unique())).map(lambda x: {"user_id": x})
movies = tf.data.Dataset.from_tensors(tf.constant(data.title.unique())).map(lambda x: {"movie_title": x})

ratings = (tf.data.Dataset
             .from_tensor_slices((tf.cast(data.user_id.values, tf.string), 
                                  tf.cast(data.title.values, tf.string),
                                  tf.cast(data.play_duration.values, tf.float16)))
              .map(lambda x1,x2,x3: {
                                  "user_id": x1,
                                  "movie_title": x2,
                                  "play_duration": x3
                                }
                   )
              .shuffle(buffer_size=200000))

train_ratings = ratings.take(60000).batch(8000)
test_ratings = ratings.skip(60000).batch(5000)

for row in test_ratings.batch(1).take(1):
  print(row)

{'user_id': <tf.Tensor: shape=(1, 5000), dtype=string, numpy=
array([[b'105877396', b'168881981', b'101725007', ..., b'11403772',
        b'129478920', b'118852041']], dtype=object)>, 'movie_title': <tf.Tensor: shape=(1, 5000), dtype=string, numpy=
array([[b"Sid Meier's Civilization III Complete",
        b'Counter-Strike Global Offensive', b'Magicka', ..., b'Firefall',
        b'Dota 2', b'H1Z1']], dtype=object)>, 'play_duration': <tf.Tensor: shape=(1, 5000), dtype=float16, numpy=
array([[1.280e+01, 1.515e+03, 3.999e-01, ..., 2.100e+00, 1.851e+03,
        1.430e+01]], dtype=float16)>}


In [12]:
EMBEDDING_SIZE = 16
MAX_TOKENS = 10_000

In [13]:
member_vocabulary = L.experimental.preprocessing.StringLookup()
member_vocabulary.adapt(members.map(lambda x: x["user_id"]))

movie_titles_vocabulary = L.experimental.preprocessing.StringLookup(mask_token=None)
movie_titles_vocabulary.adapt(movies.map(lambda x: x["movie_title"]))

print(member_vocabulary.get_vocabulary()[:5])
print(movie_titles_vocabulary.get_vocabulary()[:5])

['', '[UNK]', '99992274', '99961115', '99940330']
['[UNK]', 'theHunter Primal', 'theHunter', 'the static speaks my name', 'sZone-Online']


In [14]:
class MovieModel(tf.keras.Model):

  def __init__(self, max_tokens=MAX_TOKENS):
    super().__init__()

    self.title_embedding = tf.keras.Sequential([
        movie_titles_vocabulary,
        tf.keras.layers.Embedding(movie_titles_vocabulary.vocab_size(), EMBEDDING_SIZE)
    ])
    self.title_text_embedding = tf.keras.Sequential([
      tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens=max_tokens),
      tf.keras.layers.Embedding(max_tokens, EMBEDDING_SIZE, mask_zero=True),
      # We average the embedding of individual words to get one embedding vector
      # per title.
      tf.keras.layers.GlobalAveragePooling1D(),
    ])

  def call(self, inputs):
    return tf.concat([
        self.title_embedding(inputs["movie_title"]),
        self.title_text_embedding(inputs["movie_title"]),
    ], axis=1)

In [15]:
# # # Define user and movie models.
movie_model = MovieModel()
movie_model.title_text_embedding.layers[0].adapt(movies.map(lambda x: x["movie_title"]))

# movie_model = tf.keras.Sequential([
#   movie_titles_vocabulary,
#   tf.keras.layers.Embedding(movie_titles_vocabulary.vocab_size() + 1, EMBEDDING_SIZE)
# ])

user_model = tf.keras.Sequential([
    member_vocabulary,
    tf.keras.layers.Embedding(member_vocabulary.vocab_size(), EMBEDDING_SIZE * 2)
])

In [16]:
class MovielensModel(tfrs.Model):

  def __init__(self, user_model, movie_model, rating_weight: float=0.5, retrieval_weight: float=0.5):
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model

    # A small model to take in user and movie embeddings and predict ratings.
    # We can make this as complicated as we want as long as we output a scalar
    # as our prediction.
    self.rating_model = tf.keras.Sequential([
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(1),
    ])

    # The tasks.
    self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )

    self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=movies.map(self.movie_model)
        )
    )

    # The loss weights.
    self.rating_weight = rating_weight
    self.retrieval_weight = retrieval_weight

  def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])
    # And pick out the movie features and pass them into the movie model.
    movie_embeddings = self.movie_model(features)

    return (
        user_embeddings,
        movie_embeddings,
        # We apply the multi-layered rating model to a concatentation of
        # user and movie embeddings.
        self.rating_model(
            tf.concat([user_embeddings, movie_embeddings], axis=1)
        ),
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:

    ratings = features.pop("play_duration")

    user_embeddings, movie_embeddings, rating_predictions = self(features)

    # We compute the loss for each task.
    rating_loss = self.rating_task(
        labels=ratings,
        predictions=rating_predictions,
    )
    retrieval_loss = self.retrieval_task(user_embeddings, movie_embeddings)

    # And combine them using the loss weights.
    return (self.rating_weight * rating_loss
            + self.retrieval_weight * retrieval_loss)

In [17]:
# Create a retrieval model.
model = MovielensModel(user_model, movie_model, rating_weight=0.2, retrieval_weight=0.8)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [20]:
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, TensorBoard

monitor_metric = "val_total_loss"
checkpoint_filepath = "'/content/drive/My Drive/colab/model/tf-rec-sys-steam-multi-task-retrieval"

reduce_lr = ReduceLROnPlateau(monitor=monitor_metric, factor=0.1, verbose=1,
                                 patience=2)
early_stop = EarlyStopping(monitor=monitor_metric, patience=4,
                             verbose=1)
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor=monitor_metric,
    mode='max',
    save_best_only=True)

logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard = TensorBoard(logdir)

callbacks = [reduce_lr, early_stop, model_checkpoint_callback, tensorboard]

In [21]:
# Train for 3 epochs.
model.fit(train_ratings, epochs=10, 
          validation_data=test_ratings,
          callbacks=callbacks)

#The model weights (that are considered the best) are loaded into the model.
model.load_weights(checkpoint_filepath)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.010000000149011612.
Epoch 7/10
Epoch 8/10

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0009999999776482583.
Epoch 00008: early stopping


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f7a6c05a490>

In [22]:
metrics = model.evaluate(test_ratings, return_dict=True)

print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

Retrieval top-100 accuracy: 0.424.
Ranking RMSE: 235.338.


In [23]:
lookup_user_id = data.sample(n=1).user_id.values[0]
data[data["user_id"] == lookup_user_id].sort_values("play_duration", ascending=False).head(7)

Unnamed: 0,user_id,title,play_duration
111469,121133260,Call of Duty Black Ops III,91.0
111471,121133260,Call of Duty Black Ops - Multiplayer,13.4
111473,121133260,The Elder Scrolls V Skyrim,7.9
111475,121133260,Call of Duty Black Ops,7.4
111477,121133260,Call of Duty World at War,5.4
111479,121133260,No More Room in Hell,0.3
111481,121133260,sZone-Online,0.2


In [35]:
# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index(movies.map(model.movie_model), movies.map(lambda x: x["movie_title"]))

# Get some recommendations.
_, titles = index(np.array([lookup_user_id]))
print(f"Top 5 recommendations for user {lookup_user_id}")
print(titles[0, :5])

Top 5 recommendations for user 121133260
tf.Tensor(
[b'Call of Duty Black Ops' b'sZone-Online'
 b'Call of Duty Black Ops - Multiplayer' b'Warface' b'BLOCKADE 3D'], shape=(5,), dtype=string)
