In [47]:
from typing import Dict, Text
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [48]:
import pandas as pd
movie_df = pd.read_csv("/content/movies.csv")
ratings_df = pd.read_csv("/content/ratings.csv")

In [49]:
merged_df.columns

Index(['movieId', 'title', 'genres', 'userId', 'rating', 'timestamp'], dtype='object')

In [50]:
merged_df = movie_df.merge(ratings_df, on=["movieId"])
ratings = np.array(merged_df[["userId","title"]].values.tolist())

In [51]:
ratings

array([['1', 'Toy Story (1995)'],
       ['5', 'Toy Story (1995)'],
       ['7', 'Toy Story (1995)'],
       ...,
       ['184', 'Flint (2017)'],
       ['184', 'Bungo Stray Dogs: Dead Apple (2018)'],
       ['331', 'Andrew Dice Clay: Dice Rules (1991)']], dtype='<U158')

In [52]:
ratings = tf.data.Dataset.from_tensor_slices(ratings)
ratings

<TensorSliceDataset element_spec=TensorSpec(shape=(2,), dtype=tf.string, name=None)>

In [53]:
a = list(ratings)
a[0]

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'1', b'Toy Story (1995)'], dtype=object)>

In [54]:
ratings = ratings.map(lambda x: {
    "title": x[0],
    "userId": x[1],
})

In [55]:
movies = np.array(merged_df[["userId","title"]].values.tolist())
movies = tf.data.Dataset.from_tensor_slices(movies)

In [56]:
movies = movies.map(lambda x: x[1])

In [57]:
list(movies)[0]

<tf.Tensor: shape=(), dtype=string, numpy=b'Toy Story (1995)'>

In [58]:
movie_titles = movies.batch(1_000)
user_ids = ratings.batch(1_000_000).map(lambda x: x["userId"])

unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

unique_movie_titles[:10]

array([b"'71 (2014)", b"'Hellboy': The Seeds of Creation (2004)",
       b"'Round Midnight (1986)", b"'Salem's Lot (2004)",
       b"'Til There Was You (1997)", b"'Tis the Season for Love (2015)",
       b"'burbs, The (1989)", b"'night Mother (1986)",
       b'(500) Days of Summer (2009)', b'*batteries not included (1987)'],
      dtype=object)

In [59]:
embedding_dimension = 32
user_model = tf.keras.Sequential(tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=unique_user_ids, mask_token=None),
tf.keras.layers.Embedding(len(unique_user_ids)+1, embedding_dimension))
user_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_user_ids, mask_token=None),
  # We add an additional embedding to account for unknown tokens.
  tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])
movie_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_movie_titles, mask_token=None),
  tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
])
metrics = tfrs.metrics.FactorizedTopK(
  candidates=movies.batch(128).map(movie_model)
)
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [60]:
class MovielensModel(tfrs.Model):

  def __init__(self, user_model, movie_model):
    super().__init__()
    self.movie_model: tf.keras.Model = movie_model
    self.user_model: tf.keras.Model = user_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["userId"])
    # And pick out the movie features and pass them into the movie model,
    # getting embeddings back.
    positive_movie_embeddings = self.movie_model(features["title"])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_movie_embeddings)

In [61]:
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

In [62]:
model = MovielensModel(user_model, movie_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [64]:
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fe2f2ce1d50>

In [65]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0,
 'factorized_top_k/top_5_categorical_accuracy': 0.0,
 'factorized_top_k/top_10_categorical_accuracy': 0.0,
 'factorized_top_k/top_50_categorical_accuracy': 0.0016499999910593033,
 'factorized_top_k/top_100_categorical_accuracy': 0.0016499999910593033,
 'loss': 29626.31640625,
 'regularization_loss': 0,
 'total_loss': 29626.31640625}

In [66]:
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model))))

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7fe2ece8c5d0>

In [69]:
_, titles = index(tf.constant(["89"]))
print(f"Recommendations for user 110: {titles[0, :5]}")

Recommendations for user 110: [b'Tears for Sale (2008)' b'Bananas (1971)' b'Bananas (1971)'
 b'Bananas (1971)' b'Bananas (1971)']
