In [1]:
!pip install -q tensorflow-recommenders

[?25l[K     |████▏                           | 10kB 21.5MB/s eta 0:00:01[K     |████████▍                       | 20kB 16.9MB/s eta 0:00:01[K     |████████████▌                   | 30kB 17.0MB/s eta 0:00:01[K     |████████████████▊               | 40kB 15.0MB/s eta 0:00:01[K     |█████████████████████           | 51kB 9.1MB/s eta 0:00:01[K     |█████████████████████████       | 61kB 8.3MB/s eta 0:00:01[K     |█████████████████████████████▎  | 71kB 9.4MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 5.1MB/s 
[?25h

In [2]:
from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.layers as L

import tensorflow_recommenders as tfrs

In [3]:
from google.colab import drive

# Expose Google Drive under /content/drive; the dataset CSV is read from there.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [4]:
# Read the CSV without treating any row as a header, then give the first four
# positional columns descriptive names. Any further column keeps its integer
# label.
raw_column_names = {0: "member_id", 1: "title", 2: "action", 3: "label"}
data = pd.read_csv(
    '/content/drive/My Drive/colab/data/steam-200k.csv',
    header=None,
).rename(columns=raw_column_names)
data.head()

Unnamed: 0,member_id,title,action,label,4
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0


In [5]:
# Distinct (member_id, title) pairs taken from the 'purchase' rows, with both
# columns converted to the pandas string dtype.
purchase_data = (
    data.loc[data["action"] == 'purchase', ["member_id", "title"]]
    .drop_duplicates()
    .astype("string")
)

# Same pairs plus a constant flag column, so purchases can be told apart from
# rows that only exist on the play side after the outer merge below.
purchase_data_tmp = purchase_data.assign(purchased=1.0)

# Distinct 'play' rows; the "label" column is renamed to "play_duration" and
# converted back to a float after the string conversion.
play_data = (
    data.loc[data["action"] == 'play', ["member_id", "title", "label"]]
    .drop_duplicates()
    .astype("string")
    .rename(columns={"label": "play_duration"})
    .astype({"play_duration": "float32"})
)

# Outer-join plays with purchases on (member_id, title); values missing on
# either side are filled with 0.0.
new_data = play_data.merge(
    purchase_data_tmp,
    how="outer",
    on=["member_id", "title"],
).fillna(0.0)
new_data

Unnamed: 0,member_id,title,play_duration,purchased
0,151603712,The Elder Scrolls V Skyrim,273.0,1.0
1,151603712,Fallout 4,87.0,1.0
2,151603712,Spore,14.9,1.0
3,151603712,Fallout New Vegas,12.1,1.0
4,151603712,Left 4 Dead 2,8.9,1.0
...,...,...,...,...
128811,99096740,The Elder Scrolls V Skyrim - Hearthfire,0.0,1.0
128812,176449171,Counter-Strike,0.0,1.0
128813,176449171,Counter-Strike Condition Zero,0.0,1.0
128814,176449171,Counter-Strike Condition Zero Deleted Scenes,0.0,1.0


In [6]:
# Preview the distinct purchased titles as a string tensor (displayed only;
# nothing is assigned here).
tf.constant(purchase_data["title"].unique())

<tf.Tensor: shape=(5155,), dtype=string, numpy=
array([b'The Elder Scrolls V Skyrim', b'Fallout 4', b'Spore', ...,
       b'Space Colony', b'Life is Hard', b'Executive Assault'],
      dtype=object)>

In [7]:
# Vocabulary sources. `from_tensors` yields a dataset with a single element:
# the full 1-D string tensor of distinct member ids / distinct titles.
members = tf.data.Dataset.from_tensors(tf.constant(purchase_data.member_id.unique()))
movies = tf.data.Dataset.from_tensors(tf.constant(purchase_data.title.unique()))

# One example per (member, title) purchase. The keys must match what the
# models read downstream: member ids under "user_id" (looked up by the user
# model) and titles under "movie_title" (looked up by the movie model).
#
# The shuffle is seeded and uses reshuffle_each_iteration=False so the order is
# fixed once. Otherwise every pass over `ratings` would be reshuffled and the
# take/skip split below would hand different, overlapping examples to train
# and test on each epoch.
ratings = tf.data.Dataset.from_tensor_slices({
    "user_id": tf.cast(purchase_data.member_id.values, tf.string),
    "movie_title": tf.cast(purchase_data.title.values, tf.string),
}).shuffle(
    buffer_size=200000,
    seed=42,
    reshuffle_each_iteration=False,
)

# First 100000 shuffled examples train, the remainder evaluate. The training
# split is shuffled again (reshuffled on each epoch by default) so batch
# composition still varies between epochs without leaking into the test split.
train_ratings = ratings.take(100000).shuffle(100000).batch(4000)
test_ratings = ratings.skip(100000).batch(8000)

In [8]:
# Output width of each title embedding table. The user embedding is built
# with twice this width, matching the two title embeddings concatenated by
# MovieModel.
EMBEDDING_SIZE = 16
# Default `max_tokens` for MovieModel: passed to its TextVectorization layer
# and used as the input dimension of the token embedding table.
MAX_TOKENS = 10_000

In [9]:
# String -> integer index lookup for member ids, with the vocabulary taken
# from the `members` dataset.
member_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup()
member_vocabulary.adapt(members)

# String -> integer index lookup for titles, created with mask_token=None and
# adapted on the `movies` dataset.
movie_titles_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(
    mask_token=None
)
movie_titles_vocabulary.adapt(movies)

In [10]:
class MovieModel(tf.keras.Model):
  """Embeds a title in two ways and concatenates the results.

  One branch looks the whole title string up in `movie_titles_vocabulary` and
  embeds the resulting index. The other tokenizes the title text, embeds each
  token and averages the token embeddings.
  """

  def __init__(self, max_tokens=MAX_TOKENS):
    """Builds both embedding branches.

    Args:
      max_tokens: `max_tokens` of the TextVectorization layer; also the input
        dimension of the token embedding table.
    """
    super().__init__()

    # Branch 1: whole-title lookup followed by an embedding table sized to
    # the lookup's vocabulary.
    whole_title_table = tf.keras.layers.Embedding(
        movie_titles_vocabulary.vocab_size(), EMBEDDING_SIZE)
    self.title_embedding = tf.keras.Sequential(
        [movie_titles_vocabulary, whole_title_table])

    # Branch 2: text vectorization -> token embeddings (index 0 is masked)
    # -> average over the token axis, giving one vector per title.
    text_vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
        max_tokens=max_tokens)
    token_table = tf.keras.layers.Embedding(
        max_tokens, EMBEDDING_SIZE, mask_zero=True)
    token_average = tf.keras.layers.GlobalAveragePooling1D()
    self.title_text_embedding = tf.keras.Sequential(
        [text_vectorizer, token_table, token_average])

  def call(self, inputs):
    """Returns both title embeddings of `inputs["movie_title"]`, joined on axis 1."""
    titles = inputs["movie_title"]
    whole_title_vectors = self.title_embedding(titles)
    token_average_vectors = self.title_text_embedding(titles)
    return tf.concat([whole_title_vectors, token_average_vectors], axis=1)

In [11]:
# Title tower: build it, then fit its text vectorizer (the first layer of the
# text branch) on the "movie_title" feature of the interaction dataset.
movie_model = MovieModel()
movie_model.title_text_embedding.layers[0].adapt(
    ratings.map(lambda features: features["movie_title"]))

# User tower: id lookup followed by an embedding of width 2 * EMBEDDING_SIZE,
# the same width as MovieModel's concatenated output.
user_embedding_table = tf.keras.layers.Embedding(
    member_vocabulary.vocab_size(), EMBEDDING_SIZE * 2)
user_model = tf.keras.Sequential([member_vocabulary, user_embedding_table])

# Top-K metric computed against the embeddings of every candidate title.
title_candidates = movies.map(
    lambda titles: {"movie_title": titles}).map(movie_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=title_candidates)

# Retrieval task that reports the metric above.
task = tfrs.tasks.Retrieval(metrics=metrics)

In [12]:
class MovielensModel(tfrs.Model):
  """Retrieval model pairing a user model with a movie model and a task."""

  def __init__(self, user_model, movie_model, task):
    """Stores the two embedding models and the task used to score them.

    Args:
      user_model: called with `features["user_id"]` to get user embeddings.
      movie_model: called with the whole feature dict to get movie embeddings.
      task: called with (user embeddings, movie embeddings); its result is
        returned as the loss.
    """
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Embeds both sides of a batch and returns the task's output for them."""
    # Query side: only the user id feature is needed.
    query_vectors = self.user_model(features["user_id"])
    # Candidate side: the movie model picks the features it needs itself.
    candidate_vectors = self.movie_model(features)

    # The task turns the two embedding batches into the loss (and metrics).
    return self.task(query_vectors, candidate_vectors)

In [13]:
# Assemble the retrieval model from the two towers and the task, then set it
# up to train with Adagrad at a learning rate of 0.01.
model = MovielensModel(user_model=user_model, movie_model=movie_model, task=task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.01))

In [None]:
# Train for 50 epochs, passing test_ratings as the validation data.
model.fit(train_ratings, 
          epochs=50, 
          validation_data=test_ratings)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50

In [None]:
# Build a brute-force top-k index over the trained representations: queries
# pass through the user model, candidates are the movie-model embeddings of
# every title, and the raw titles are used as their identifiers.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
indexed_title_embeddings = movies.map(
    lambda titles: {"movie_title": titles}).map(model.movie_model)
index.index(indexed_title_embeddings, movies)

# Query the index for a single user id and show the three best-scoring titles.
# NOTE(review): "42" may not be a member_id present in the data; confirm, or
# query with an id taken from purchase_data.
query_member = np.array(["42"])
_, titles = index(query_member)
print(f"Top 3 recommendations for user 42: {titles[0, :3]}")