In [None]:
!pip install tensorflow_recommenders

In [9]:
from typing import Dict, Text
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [10]:
movies_df = pd.read_csv("/content/movies.csv")
ratings_df = pd.read_csv("/content/ratings.csv", dtype={"userId":str})

In [11]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  object 
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 3.1+ MB


In [4]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [12]:
merge_df = ratings_df.merge(movies_df, on="movieId", how="left")
merge_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


### Selecting the required parameters for our model: 
- For the user we are considering the userId and timestamp for clustering
- For the movie we are considering the movie title and the context of the title

## Creating query model for user data pre-processing

In [13]:
class UserModel(tf.keras.Model):

  def __init__(self, use_timestamp, unique_user_ids=None, bucketised_timestamp=None, timestamp=None):
    super().__init__()
    self.use_timestamp = use_timestamp
    self.user_embeddings = tf.keras.Sequential(
        # layer to convert user string data to integer
        [tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
         # layer to convert integer values to embeddings, of size (len(unique_user_ids)+1, 32)
         tf.keras.layers.Embedding(len(unique_user_ids)+1, 32)])
    
    if use_timestamp:
      self.timestamp_embeddings = tf.keras.Sequential(
          [tf.keras.layers.Discretization(bucketised_timestamp),
           tf.keras.layers.Embedding(len(bucketised_timestamp)+1, 32)])
      self.normalised_timestamp = tf.keras.layers.Normalization(axis=None)

      self.normalised_timestamp.adapt(timestamp)
    
  def call(self, inputs):
    if not self.use_timestamp:
      return self.user_embeddings(inputs["userId"])
    else:
      return tf.concat([self.user_embeddings(inputs["userId"]), 
                       self.timestamp_embeddings(inputs["timestamp"]), 
                       tf.reshape(self.normalised_timestamp(inputs["timestamp"]),(-1,1))], axis=1)

Converting timestamp to tensor dataset

In [14]:
test = dict(merge_df[["timestamp"]])
test_tf = tf.data.Dataset.from_tensor_slices(test)

The tensor value
```
{'timestamp': 964982703}
```
when a batch of 10 is created the single value is converted into a batch containing 10 values from the rest of the data itself

```
tf.Tensor(
[964982703 964981247 964982224 964983815 964982931 964982400 964980868
 964982176 964984041 964984100], shape=(10,), dtype=int64)
 ```

In [15]:
# converts the entire datset to batches of 100 and stores it as a single value in a list
timestamps = np.concatenate(list(test_tf.map(lambda x: x["timestamp"]).batch(100)))

In [16]:
max_time = timestamps.max()
min_time = timestamps.min()

In [17]:
max_time, min_time

(1537799250, 828124615)

In [38]:
# Equidistant time points are created so that it helps us to create buckets for the constinuous time given
time_bucket = np.linspace(min_time, max_time, num=1000)
time_bucket.sort()

Converting the rating and movie title data to tensors as well. The steps followed for them is as follows:

1. Convert movie title data and userid data to equivalent tensors
2. Take the unique users and movie titles from the given data 

In [39]:
movie_title_tf = tf.data.Dataset.from_tensor_slices(dict(merge_df[["title"]]))
movie_title_tf_mapped = movie_title_tf.map(lambda x: x["title"])

In [40]:
x = merge_df[["title","userId","timestamp"]]
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   title      100836 non-null  object
 1   userId     100836 non-null  object
 2   timestamp  100836 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 3.1+ MB


In [41]:
rating_info_tf = tf.data.Dataset.from_tensor_slices(dict(merge_df[["title","userId","timestamp"]]))

Finding unique movies and users from the entire data

In [42]:
# unique_movie_titles = np.unique(np.concatenate(list(movie_title_tf_mapped.batch(1000))))
unique_movies = merge_df["title"].unique()

In [43]:
# unique_users = np.unique(np.concatenate(list(rating_info_tf.batch(1_000).map(
#     lambda x: x["userId"]))))
unique_users = merge_df["userId"].unique()

In [24]:
c = 0
for x in movie_title_tf_mapped:
  print(x)
  c+=1
  if c==2:
    break

tf.Tensor(b'Toy Story (1995)', shape=(), dtype=string)
tf.Tensor(b'Grumpier Old Men (1995)', shape=(), dtype=string)


In [44]:
movie_title_tf

<TensorSliceDataset element_spec={'title': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [45]:
class MovieModel(tf.keras.Model):
  def __init__(self, unique_movies, movies_names_tf):
    """
    unique_movies: (numpy array) - numpy array containing unique names
    movies_names_tf: (mapped_tensor) - mapped tensorflow tensor having all the movie names
    """
    super().__init__()
    max_tokens = 10_000
    self.movie_embedding = tf.keras.Sequential([tf.keras.layers.StringLookup(vocabulary=unique_movies, mask_token=None),
                                                tf.keras.layers.Embedding(len(unique_movies)+1, 32)])
    self.text_vectorizer = tf.keras.layers.TextVectorization(max_tokens=max_tokens)

    self.movie_title_context_embedding = tf.keras.Sequential([
                                                self.text_vectorizer,
                                                tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
                                                tf.keras.layers.GlobalAveragePooling1D()
                                                ])
    self.text_vectorizer.adapt(movies_names_tf)
  
  def call(self, titles):
    return tf.concat([self.movie_embedding(titles),
                      self.movie_title_context_embedding(titles)],
                     axis=1)

In [49]:
class MovieRecommendModel(tfrs.models.Model):

  def __init__(self, user_model, movie_model, movies):
    """
    user_model: Object of the class user model
    movie_model: Object of the class movie model
    movies: Tensorflow tensor having movie title data
    """
    super().__init__()
    self.query_model = tf.keras.Sequential([user_model, tf.keras.layers.Dense(32)])
    self.candidate_model = tf.keras.Sequential([movie_model, tf.keras.layers.Dense(32)])
    
    # Tasks are the metrics defined according to which the recommendation is made
    self.retrieval_task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(candidates=movies.batch(128).map(self.candidate_model)))

  def call(self, features):
    user_embeddings = self.query_model({
        "userId": features["userId"],
        "timestamp": features["timestamp"],
    })

    movie_embeddings = self.candidate_model(features["title"])

    return (user_embeddings, movie_embeddings, self.ranking_tasks([user_embeddings, movie_embeddings], axis=1))

  def compute_loss(self, features, training=False):
    user_embeddings = self.query_model({
        "userId": features["userId"],
        "timestamp": features["timestamp"],
    })

    movie_embeddings = self.candidate_model(features["title"])



    return self.retrieval_task(user_embeddings, movie_embeddings)

In [50]:
tf.random.set_seed(42)
shuffled = rating_info_tf.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

In [51]:
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()

In [52]:
user_model_obj = UserModel(use_timestamp=True, unique_user_ids=unique_users, bucketised_timestamp=time_bucket, timestamp=timestamps)


In [33]:
movie_model_obj = MovieModel(unique_movies=unique_movies, movies_names_tf=movie_title_tf_mapped)

In [53]:
model = MovieRecommendModel(user_model_obj, movie_model_obj, movie_title_tf_mapped)

In [54]:
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [55]:
cached_train

<BatchDataset element_spec={'title': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'userId': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'timestamp': TensorSpec(shape=(None,), dtype=tf.int64, name=None)}>

In [56]:
model.fit(cached_train, epochs=1)





<keras.callbacks.History at 0x7f87bddf3f50>

In [46]:
model.evaluate(cached_test, return_dict=True)

> <ipython-input-23-485ac8022669>(18)compute_loss()
-> user_embeddings = self.query_model({
(Pdb) c






{'factorized_top_k/top_1_categorical_accuracy': 0.0006000000284984708,
 'factorized_top_k/top_5_categorical_accuracy': 0.0008500000112690032,
 'factorized_top_k/top_10_categorical_accuracy': 0.0009500000160187483,
 'factorized_top_k/top_50_categorical_accuracy': 0.002850000048056245,
 'factorized_top_k/top_100_categorical_accuracy': 0.00494999997317791,
 'loss': 28231.33203125,
 'regularization_loss': 0,
 'total_loss': 28231.33203125}

In [47]:
train_accuracy = model.evaluate(
    cached_train, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
test_accuracy = model.evaluate(
    cached_test, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]



In [57]:
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
index.index_from_dataset(tf.data.Dataset.zip((movie_title_tf_mapped.batch(100), movie_title_tf_mapped.batch(100).map(model.candidate_model))))

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f87bbd07350>

In [65]:
_, titles = index({"userId":tf.constant(["221"]),
                   "timestamp":tf.constant([1981])})
print(f"Recommendations for user 112: {titles[0, :100]}")



Recommendations for user 112: [b'Bogus (1996)' b'Bogus (1996)' b'Foxfire (1996)' b'Foxfire (1996)'
 b'Foxfire (1996)' b'Bed of Roses (1996)' b'Bed of Roses (1996)'
 b'Bed of Roses (1996)' b'Bed of Roses (1996)' b'Bed of Roses (1996)']
