In [None]:
!pip install -q tensorflow-recommenders

In [2]:
from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.layers as L

import tensorflow_recommenders as tfrs

In [3]:
# Mount Google Drive at /content/drive; the CSV loaded below is read from
# that mount point. The google.colab module is only available inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


This Steam dataset was obtained from Kaggle:

https://www.kaggle.com/tamber/steam-video-games/version/1

In [7]:
# Load the raw Steam interaction log. The file is read without a header row,
# so pandas labels the columns positionally; the first four are given
# descriptive names and any remaining column keeps its positional label.
data = (
    pd.read_csv("/content/drive/My Drive/colab/data/steam-200k.csv", header=None)
    .rename(columns={0: "user_id", 1: "title", 2: "action", 3: "label"})
)
data.head()

Unnamed: 0,user_id,title,action,label,4
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0


In [None]:
# Keep only the "purchase" rows, reduce them to unique (user, title) pairs and
# store both columns with pandas' string dtype.
purchase_data = (
    data.loc[data["action"] == "purchase", ["user_id", "title"]]
    .drop_duplicates()
    .astype("string")
)
purchase_data

In [None]:
# `from_tensors` wraps each array as a single dataset element, i.e. one batch
# holding every unique user id / every unique title.
members = tf.data.Dataset.from_tensors(tf.constant(purchase_data.user_id.unique()))
movies = tf.data.Dataset.from_tensors(tf.constant(purchase_data.title.unique()))

# One element per (user, title) purchase, keyed the way the models expect.
ratings = (
    tf.data.Dataset
    .from_tensor_slices((tf.cast(purchase_data.user_id.values, tf.string),
                         tf.cast(purchase_data.title.values, tf.string)))
    .map(lambda x1, x2: {"user_id": x1, "movie_title": x2})
    # Shuffle ONCE, with a fixed seed. By default `shuffle` reshuffles on every
    # iteration, so the `take`/`skip` split below would select different rows
    # each epoch and training examples would leak into the validation set.
    # NOTE(review): the buffer is presumably meant to be >= the number of rows
    # so the shuffle is uniform -- confirm against len(purchase_data).
    .shuffle(buffer_size=200000, seed=42, reshuffle_each_iteration=False)
)

# Fixed split: the first 100000 shuffled rows train, the remainder validate.
train_ratings = ratings.take(100000)
test_ratings = ratings.skip(100000).batch(8000)

# Peek at one validation batch (the extra batch(1) only adds a leading axis).
for row in test_ratings.batch(1).take(1):
  print(row)

In [14]:
# Output width of each title embedding in MovieModel; the user embedding is
# built with twice this width.
EMBEDDING_SIZE = 16
# Default `max_tokens` for the TextVectorization layer inside MovieModel.
MAX_TOKENS = 10_000

In [15]:
# Lookup layer that maps raw user-id strings to integer indices; its
# vocabulary is learned from the ids held in `members`.
member_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup()
member_vocabulary.adapt(members)

# Same idea for titles, built without a mask token and adapted on `movies`.
movie_titles_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token=None
)
movie_titles_vocabulary.adapt(movies)

In [16]:
class MovieModel(tf.keras.Model):
  """Item tower that embeds a title in two ways and concatenates the results.

  One branch looks the whole title up as a single vocabulary entry; the other
  tokenizes the title and averages the embeddings of its tokens. Each branch
  yields EMBEDDING_SIZE values, so the output has 2 * EMBEDDING_SIZE columns.
  """

  def __init__(self, max_tokens=MAX_TOKENS):
    """Build both embedding branches.

    Args:
      max_tokens: vocabulary cap for the text vectorizer, also used as the
        input dimension of the token embedding table.
    """
    super().__init__()

    # Branch 1: whole-title id lookup followed by an embedding table.
    whole_title_table = tf.keras.layers.Embedding(
        input_dim=movie_titles_vocabulary.vocab_size(),
        output_dim=EMBEDDING_SIZE)
    self.title_embedding = tf.keras.Sequential(
        [movie_titles_vocabulary, whole_title_table])

    # Branch 2: tokenize, embed every token (index 0 is treated as padding),
    # then average the token embeddings into one vector per title.
    # The vectorizer must be adapted by the caller before use.
    tokenizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
        max_tokens=max_tokens)
    token_table = tf.keras.layers.Embedding(
        input_dim=max_tokens, output_dim=EMBEDDING_SIZE, mask_zero=True)
    token_average = tf.keras.layers.GlobalAveragePooling1D()
    self.title_text_embedding = tf.keras.Sequential(
        [tokenizer, token_table, token_average])

  def call(self, inputs):
    """Embed the titles found under `inputs["movie_title"]`.

    Assumes `inputs` is a dict whose "movie_title" entry is a batch of title
    strings -- TODO confirm against callers.
    """
    titles = inputs["movie_title"]
    whole_title_vectors = self.title_embedding(titles)
    averaged_token_vectors = self.title_text_embedding(titles)
    return tf.concat([whole_title_vectors, averaged_token_vectors], axis=1)

In [17]:
# Item tower: instantiate it, then fit its word-level vectorizer (the first
# layer of the text branch) on the titles in the interaction data.
movie_model = MovieModel()
movie_model.title_text_embedding.layers[0].adapt(
    ratings.map(lambda features: features["movie_title"])
)

# User tower: id lookup followed by an embedding twice EMBEDDING_SIZE wide,
# matching the width of the concatenated item-tower output.
user_model = tf.keras.Sequential([
    member_vocabulary,
    tf.keras.layers.Embedding(
        input_dim=member_vocabulary.vocab_size(),
        output_dim=EMBEDDING_SIZE * 2,
    ),
])

# Top-K metrics are evaluated against the embeddings of every known title.
metrics = tfrs.metrics.FactorizedTopK(
    candidates=movies.map(lambda title: {"movie_title": title}).map(movie_model)
)

task = tfrs.tasks.Retrieval(metrics=metrics)

In [18]:
class MovielensModel(tfrs.Model):
  """Two-tower retrieval model: a user tower, an item tower and a task."""

  def __init__(self, user_model, movie_model, task):
    """Store the two towers and the task that scores them.

    Args:
      user_model: model turning user ids into embeddings.
      movie_model: model turning a feature dict into item embeddings.
      task: layer that takes both embedding batches and returns the loss.
    """
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the task's loss for one batch of features.

    `training` is accepted for interface compatibility but is not used here.
    """
    # Query side: only the user id is fed to the user tower.
    query_embeddings = self.user_model(features["user_id"])
    # Candidate side: the item tower receives the whole feature dict.
    candidate_embeddings = self.movie_model(features)

    # The task turns the paired embeddings into the loss (and its metrics).
    return self.task(query_embeddings, candidate_embeddings)

In [19]:
# Assemble the retrieval model from the two towers and the task, then compile
# it with Adagrad at a learning rate of 0.01.
model = MovielensModel(user_model=user_model, movie_model=movie_model, task=task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.01))

In [20]:
# Metric name both callbacks monitor during `model.fit`.
monitor_metric = "val_total_loss"

# Take the callbacks from tf.keras, the same Keras implementation the model is
# built on, instead of importing them from the standalone `keras` package.
# Cut the learning rate by 10x after 2 epochs without improvement...
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor=monitor_metric, factor=0.1, patience=2, verbose=1)
# ...and stop training after 4 epochs without improvement.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor=monitor_metric, patience=4, verbose=1)
callbacks = [reduce_lr, early_stop]

In [None]:
# Train for up to 50 epochs in batches of 10000; the callbacks may lower the
# learning rate or stop training early based on the monitored validation metric.
model.fit(train_ratings.batch(10000), 
          epochs=50, 
          validation_data=test_ratings,
          callbacks=callbacks)

In [34]:
# Pick a user to inspect and show the first five titles they purchased.
lookup_user_id = "151603712"
purchase_data.loc[purchase_data["user_id"] == lookup_user_id].head(5)

Unnamed: 0,user_id,title
0,151603712,The Elder Scrolls V Skyrim
2,151603712,Fallout 4
4,151603712,Spore
6,151603712,Fallout New Vegas
8,151603712,Left 4 Dead 2


In [35]:
# Exhaustive (brute-force) retrieval over the trained representations: the
# user tower encodes queries, the item tower supplies the candidate embeddings.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
candidate_embeddings = (
    movies
    .map(lambda title: {"movie_title": title})
    .map(model.movie_model)
)
index.index(candidate_embeddings, movies)

# Query the index for the selected user and show the three best titles.
query = np.array([lookup_user_id])
_, titles = index(query)
print(f"Top 3 recommendations for user {lookup_user_id}: {titles[0, :3]}")

Top 3 recommendations for user 42: [b'The Elder Scrolls V Skyrim - Dragonborn'
 b'The Elder Scrolls V Skyrim - Dawnguard'
 b'The Elder Scrolls V Skyrim - Hearthfire']
