The retrieval stage is responsible for selecting an initial set of hundreds of candidates from all possible candidates. The main objective of this model is to efficiently weed out all candidates that the user is not interested in. Because the retrieval model may be dealing with millions of candidates, it has to be computationally efficient.

Retrieval models are often composed of two sub-models:

1. A query model computing the query representation (normally a fixed-dimensionality embedding vector) using query features.
2. A candidate model computing the candidate representation (an equally-sized vector) using the candidate features.
The outputs of the two models are then multiplied together to give a query-candidate affinity score, with higher scores expressing a better match between the candidate and the query.

Candidate representation will be created from:
1. movie title
2. movie genre 
User representation will be created from:
1. user age.
2. user occupation.
3. user gender.
4. time as a contextual feature

# importing necessary libraries

In [5]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets


[K     |████████████████████████████████| 85 kB 3.1 MB/s 
[K     |████████████████████████████████| 462 kB 40.2 MB/s 
[K     |████████████████████████████████| 4.2 MB 13.7 MB/s 
[?25h

In [6]:
import os
import tensorflow as tf
import tensorflow_recommenders as tfrs
import numpy as np
import pprint

# loading data from drive

In [8]:
# Load the two MovieLens datasets that were saved under the Drive folder.
ratings = tf.data.experimental.load(
    '/content/drive/MyDrive/datasets/movielens_ratings'
)
movies = tf.data.experimental.load(
    '/content/drive/MyDrive/datasets/movielens_movies'
)
# Print a single rating record to inspect the available feature keys.
for sample_rating in ratings.take(1).as_numpy_iterator():
  pprint.pprint(sample_rating)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}


In [9]:
# Print a single movie record to inspect the candidate-side feature keys.
for sample_movie in movies.take(1).as_numpy_iterator():
  pprint.pprint(sample_movie)

{'movie_genres': array([4]),
 'movie_id': b'1681',
 'movie_title': b'You So Crazy (1994)'}


In [77]:
def _select_rating_features(x):
  """Return the subset of rating features used by this notebook.

  Keys are renamed to short names; only the first entry of ``movie_genres``
  is kept, so every row carries a single genre id.
  """
  return {
      'age': x['raw_user_age'],
      'user_id': x['user_id'],
      'movie_id': x['movie_id'],
      'time': x['timestamp'],
      'gender': x['user_gender'],
      'occupation': x['user_occupation_text'],
      'movie_title': x['movie_title'],
      'genre': x['movie_genres'][0],
  }


# `data` and `user_data` hold the same per-rating feature dictionaries; both
# names are kept because later cells refer to each of them.
data = ratings.map(_select_rating_features)
user_data = ratings.map(_select_rating_features)

# Candidate-side features: one row per movie with its title and first genre.
# (The original cell had a stray `S` after `[0]`, which was a syntax error.)
movie_data = movies.map(lambda x: {
    'movie_title': x['movie_title'],
    'genre': x['movie_genres'][0],
})

# Plain identifier datasets, usable as candidate identifiers for an index.
movie_title = movies.map(lambda x: x['movie_title'])
movie_id = movies.map(lambda x: x['movie_id'])

In [12]:
# Show one transformed row from the rating side, then one from the movie side.
for sample_user_row in user_data.take(1).as_numpy_iterator():
  pprint.pprint(sample_user_row)
print('-------------------------------')
for sample_movie_row in movie_data.take(1).as_numpy_iterator():
  pprint.pprint(sample_movie_row)

{'age': 46.0,
 'gender': 1.0,
 'genre': 7.0,
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'occupation': b'doctor',
 'time': 879024300.0}
-------------------------------
{'genre': 4.0, 'movie_title': b'You So Crazy (1994)'}


# creating a vocabulary for movie title and user occupation

In [69]:
# Each vocabulary is the sorted set of distinct values of one feature, gathered
# by batching the rating rows and concatenating the batches into one array.
movie_titles = user_data.map(lambda x: x['movie_title'])
unique_movie_titles = np.unique(
    np.concatenate(list(movie_titles.batch(1_000))))

user_occupation = user_data.map(lambda x: x['occupation'])
unique_user_occupation = np.unique(
    np.concatenate(list(user_occupation.batch(1_000))))

movie_genres = user_data.map(lambda x: x['genre'])
unique_movie_genres = np.unique(
    np.concatenate(list(movie_genres.batch(1_000))))

# The original line was `user_gender=movie_genres=...`, which also rebound
# `movie_genres` to the gender dataset; only `user_gender` is assigned now.
user_gender = user_data.map(lambda x: x['gender'])
unique_user_gender = np.unique(
    np.concatenate(list(user_gender.batch(1_000))))

# creating a normalizer for time feature

In [14]:
timestamp = user_data.map(lambda x: x['time'])
# Materialise the timestamps once. The minimum and maximum of an array are the
# same with or without de-duplication, so the two np.unique passes are dropped.
_all_timestamps = np.concatenate(list(timestamp.batch(1_000)))
min_timestamp = _all_timestamps.min()
max_timestamp = _all_timestamps.max()
# 1000 evenly spaced boundaries covering the observed timestamp range.
time_bucket = np.linspace(min_timestamp, max_timestamp, 1000)
ages = user_data.map(lambda x: x['age'])

# splitting data into train and test

In [15]:
# Seed TensorFlow's global RNG before the shuffle is created.
tf.random.set_seed(123)
# reshuffle_each_iteration=False keeps one fixed order, so the take/skip split
# below selects the same rows on every pass over the data.
# NOTE(review): a 1_000-element buffer only shuffles locally; confirm whether a
# buffer covering the whole dataset was intended.
shuffled=user_data.shuffle(1_000,reshuffle_each_iteration=False)

# First 80_000 shuffled rows for training, the next 10_000 for evaluation.
# NOTE(review): any rows after the first 90_000 are not used — confirm intent.
train=shuffled.take(80_000)
test=shuffled.skip(80_000).take(10_000)

# building model
Because we are building a two-tower retrieval model, we can build each tower separately and then combine them in the final model.

# creating separate models

In [44]:
# Width used for most of the learned embeddings below.
embedding_dim = 128

# Scalar normaliser for the raw user age (statistics are fitted in a later cell).
age_normalizer = tf.keras.layers.Normalization(axis=None)

# Gender: vocabulary lookup followed by a 2-dimensional embedding.
gender_model = tf.keras.Sequential([
    tf.keras.layers.IntegerLookup(vocabulary=unique_user_gender),
    tf.keras.layers.Embedding(
        input_dim=len(unique_user_gender) + 1,
        output_dim=2,
    ),
])

# Genre: vocabulary lookup followed by an embedding.
genre_model = tf.keras.Sequential([
    tf.keras.layers.IntegerLookup(vocabulary=unique_movie_genres),
    tf.keras.layers.Embedding(
        input_dim=len(unique_movie_genres) + 1,
        output_dim=embedding_dim,
    ),
])

# Movie title: string lookup followed by an embedding.
movie_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_movie_titles),
    tf.keras.layers.Embedding(
        input_dim=len(unique_movie_titles) + 1,
        output_dim=embedding_dim,
    ),
])

# Occupation: string lookup followed by an embedding.
occupation_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_user_occupation),
    tf.keras.layers.Embedding(
        input_dim=len(unique_user_occupation) + 1,
        output_dim=embedding_dim,
    ),
])

# Time embedding: bucketise the timestamp on the `time_bucket` boundaries, then
# embed the bucket index (embedding width is embedding_dim - 4).
time_model = tf.keras.Sequential([
    tf.keras.layers.Discretization(time_bucket.tolist()),
    tf.keras.layers.Embedding(
        input_dim=len(time_bucket) + 1,
        output_dim=embedding_dim - 4,
    ),
])

# Scalar normaliser for the raw timestamp. The variable name keeps its original
# spelling because other cells refer to it.
time_nirmalizer = tf.keras.layers.Normalization(axis=None)


In [None]:
# Fit the mean/variance of both scalar normalisers. The source datasets yield
# one scalar per element, so they are batched first: adapting element by
# element means one step per rating. With axis=None the statistics are taken
# over all values, so batching does not change what is learned.
time_nirmalizer.adapt(timestamp.batch(1024))
age_normalizer.adapt(ages.batch(1024))

# query tower

* user model

In [46]:
class UserModel(tf.keras.Model):
  """Turns user/context features into one concatenated feature vector.

  The preprocessing and embedding layers are the module-level objects defined
  in the cells above; this class only wires them together.
  """

  def __init__(self):
    super().__init__()
    self.age_normalizer = age_normalizer
    self.gender_model = gender_model
    self.occupation_model = occupation_model
    self.time_model = time_model
    self.time_normalizer = time_nirmalizer

  def call(self, inputs):
    """Concatenate the per-feature representations along axis 1.

    Args:
      inputs: mapping with the keys 'age', 'gender', 'occupation' and 'time'.

    Returns:
      The concatenation of: normalised age (as a column), gender embedding,
      occupation embedding, time-bucket embedding, normalised time (as a
      column).
    """
    # The two normalised scalars are reshaped to (-1, 1) so they can be
    # concatenated with the 2-D embedding outputs.
    age_column = tf.reshape(self.age_normalizer(inputs['age']), (-1, 1))
    gender_embedding = self.gender_model(inputs['gender'])
    occupation_embedding = self.occupation_model(inputs['occupation'])
    time_embedding = self.time_model(inputs['time'])
    time_column = tf.reshape(self.time_normalizer(inputs['time']), (-1, 1))

    feature_parts = [
        age_column,
        gender_embedding,
        occupation_embedding,
        time_embedding,
        time_column,
    ]
    return tf.concat(feature_parts, axis=1)

In [30]:
class QueryTower(tf.keras.Model):
  """Query tower: UserModel features followed by a stack of dense layers."""

  def __init__(self, layers):
    """Build the tower.

    Args:
      layers: sequence of layer widths. Every width but the last gets a ReLU
        dense layer; the last one gets a dense layer without activation.
    """
    super().__init__()
    self.user_model = UserModel()
    self.dense_model = tf.keras.Sequential()
    final_position = len(layers) - 1
    for position, units in enumerate(layers):
      # Only the hidden layers are non-linear; the output layer stays linear.
      activation = 'relu' if position < final_position else None
      self.dense_model.add(
          tf.keras.layers.Dense(units, activation=activation))

  def call(self, inputs):
    """Return the query embedding for a batch of user/context features."""
    user_features = self.user_model(inputs)
    return self.dense_model(user_features)


# candidate tower
* movie model

In [49]:
class MovieModel(tf.keras.Model):
  """Turns movie features into one concatenated feature vector.

  Uses the module-level `movie_model` and `genre_model` layers.
  """

  def __init__(self):
    super().__init__()
    self.movie_model = movie_model
    self.genre_model = genre_model
    # Second reference to the same `genre_model` object; `call` never reads it.
    self.genre_normalizer = genre_model

  def call(self, inputs):
    """Concatenate the title and genre embeddings along axis 1.

    Args:
      inputs: mapping with the keys 'movie_title' and 'genre'.
    """
    title_embedding = self.movie_model(inputs['movie_title'])
    genre_embedding = self.genre_model(inputs['genre'])
    return tf.concat([title_embedding, genre_embedding], axis=1)

* candidate tower

In [48]:
class CandidateTower(tf.keras.Model):
  """Candidate tower: MovieModel features followed by a stack of dense layers."""

  def __init__(self, layers):
    """Build the tower.

    Args:
      layers: sequence of layer widths. Every width but the last gets a ReLU
        dense layer; the last one gets a dense layer without activation.
    """
    super().__init__()
    self.movie_model = MovieModel()
    self.dense = tf.keras.Sequential()
    final_position = len(layers) - 1
    for position, units in enumerate(layers):
      # Only the hidden layers are non-linear; the output layer stays linear.
      activation = 'relu' if position < final_position else None
      self.dense.add(tf.keras.layers.Dense(units, activation=activation))

  def call(self, inputs):
    """Return the candidate embedding for a batch of movie features."""
    movie_features = self.movie_model(inputs)
    return self.dense(movie_features)

# Metrics
In our training data we have positive (user, movie) pairs. To figure out how good our model is, we need to compare the affinity score that the model calculates for this pair to the scores of all the other possible candidates: if the score for the positive pair is higher than for all other candidates, our model is highly accurate.

To do this, we can use the tfrs.metrics.FactorizedTopK metric. The metric has one required argument: the dataset of candidates that are used as implicit negatives for evaluation.

In our case, that's the movies dataset, converted into embeddings via our movie model:

# task
The task itself is a Keras layer that takes the query and candidate embeddings as arguments, and returns the computed loss: we'll use that to implement the model's training loop.

# the full model
We can now put it all together into a model. TFRS exposes a base model class (tfrs.models.Model) which streamlines building models: all we need to do is to set up the components in the __init__ method, and implement the compute_loss method, taking in the raw features and returning a loss value.

In [50]:
class CombinedModels(tfrs.models.Model):
  """Two-tower retrieval model: query tower, candidate tower and retrieval task."""

  # Feature keys forwarded to each tower by compute_loss.
  _QUERY_KEYS = ('age', 'gender', 'occupation', 'time')
  _CANDIDATE_KEYS = ('movie_title', 'genre')

  def __init__(self, layers):
    """Create both towers with the same dense layer widths and the task.

    Args:
      layers: sequence of dense layer widths passed to both towers.
    """
    super().__init__()
    self.query_tower = QueryTower(layers)
    self.candidate_tower = CandidateTower(layers)
    # The top-k metric scores against every movie, embedded by the candidate
    # tower in batches of 128.
    all_candidate_embeddings = movie_data.batch(128).map(self.candidate_tower)
    top_k_metric = tfrs.metrics.FactorizedTopK(
        candidates=all_candidate_embeddings)
    self.task = tfrs.tasks.Retrieval(metrics=top_k_metric)

  def compute_loss(self, features, training=False):
    """Return the retrieval loss for one batch of rating features.

    Only the keys each tower needs are forwarded, so the towers always see the
    same input structure regardless of what else `features` contains.
    Metrics are computed only when `training` is False.
    """
    query_inputs = {key: features[key] for key in self._QUERY_KEYS}
    candidate_inputs = {key: features[key] for key in self._CANDIDATE_KEYS}
    query_embeddings = self.query_tower(query_inputs)
    candidate_embeddings = self.candidate_tower(candidate_inputs)
    return self.task(
        query_embeddings,
        candidate_embeddings,
        compute_metrics=not training,
    )

# preparing data to train model

In [28]:
# Training input: shuffled with a 100_000-element buffer, then grouped into
# batches of 2048. NOTE: despite the name, no .cache() is applied here.
cached_train = train.shuffle(100_000).batch(2048)
# Evaluation input: batches of 4096, cached so later passes reuse the first one.
cached_test = test.batch(4096).cache()

# training 

In [51]:
# Train a model whose towers both use dense widths 64 -> 32 -> 16.
num_epochs = 5
model = CombinedModels([64, 32, 16])
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

# validation_freq=5 with 5 epochs means validation runs once, after the
# final epoch.
one_layer_history = model.fit(
    cached_train,
    epochs=num_epochs,
    validation_data=cached_test,
    validation_freq=5,
    verbose=1,
)

# Report the most recent validation top-100 accuracy.
accuracy = one_layer_history.history[
    "val_factorized_top_k/top_100_categorical_accuracy"
][-1]
print(f"Top-100 accuracy: {accuracy:.2f}.")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Top-100 accuracy: 0.10.


In [117]:
# Twenty single-row batches kept as example queries.
d = list(data.batch(1).take(20).cache())

# Brute-force top-20 retrieval on top of the trained query tower.
index = tfrs.layers.factorized_top_k.BruteForce(model.query_tower, k=20)

# index_from_dataset takes a dataset of
# (candidate identifiers, candidate embeddings) pairs; here the identifiers are
# movie titles and the embeddings come from the trained candidate tower.
index.index_from_dataset(
    tf.data.Dataset.zip((
        movie_title.batch(100),
        movie_data.batch(100).map(model.candidate_tower),
    ))
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f14f0e2b590>

# utility functions

In [52]:
def output_cleaner(outputs):
  """Return the first row of `outputs` without duplicates, as ranked labels.

  Args:
    outputs: object whose `.numpy()` returns an array; only row 0 is used.

  Returns:
    A list of `(label, value)` tuples, one per distinct value of row 0, in
    order of first appearance. `label` is the 1-based position formatted as
    `'<position>th movie '`.
  """
  first_row = outputs.numpy()[0]
  # np.unique reports the index of each value's first occurrence; sorting
  # those indices restores the original order while dropping later repeats.
  _, first_seen = np.unique(first_row, return_index=True)
  deduplicated = first_row[np.sort(first_seen)]
  return [
      (f'{position}th movie ', value)
      for position, value in enumerate(deduplicated, start=1)
  ]
      


In [53]:
def movie_feature_extractor(id_wanted):
  """Print title, first genre and id of the first movie whose id matches.

  The original looked up the keys 'id' and 'title' on `movie_data`, which only
  has 'movie_title' and 'genre', so it raised KeyError. This version scans the
  raw `movies` dataset, which carries 'movie_id', 'movie_title' and
  'movie_genres'.

  Args:
    id_wanted: movie id to look for, compared with `==` against each
      'movie_id' value.

  Prints nothing when no movie matches.
  NOTE(review): the retrieval index in this notebook is built with movie
  titles as identifiers; confirm callers pass ids rather than titles.
  """
  for movie in movies:
    current_id = movie['movie_id'].numpy()
    if current_id != id_wanted:
      continue
    print('movie features: \n')
    print('title: ', movie['movie_title'].numpy())
    # Only the first genre is shown, matching the single 'genre' feature used
    # elsewhere in the notebook.
    print('genre ', movie['movie_genres'].numpy()[0])
    print('id ', current_id, '\n\n')
    break




In [54]:
def n_first_recommended_movies(results, n):
  """Print the features of the first `n` entries of `results`.

  The original iterated an undefined global `out` and ignored `results`.

  Args:
    results: sequence of entries whose element at index 1 is the movie
      identifier (the shape produced by `output_cleaner`).
    n: number of leading entries to show.
  """
  for entry in results[:n]:
    movie_feature_extractor(entry[1])
    print('-------------------------')


In [55]:
def n_last_recommended_movies(results, n):
  """Print the features of the last `n` entries of `results`.

  The original iterated an undefined global `out` and ignored `results`.

  Args:
    results: sequence of entries whose element at index 1 is the movie
      identifier (the shape produced by `output_cleaner`).
    n: number of trailing entries to show; values <= 0 show nothing.
  """
  # `results[-0:]` is the whole sequence, so a non-positive n needs a guard.
  if n <= 0:
    return
  for entry in results[-n:]:
    movie_feature_extractor(entry[1])
    print('-------------------------')


# making prediction

In [118]:
# Query the index with the first example row; the first returned value is
# discarded and the second is kept as `recommes`.
_,recommes=index(d[0])


In [114]:
def data_creator_for_ranker_from_UTI(results, n):
  """Collect one `data` row per requested movie id.

  For each of the first `n` values in `results[0]`, the first single-row batch
  of `data` whose 'movie_id' equals that value is returned. Values with no
  matching row are skipped; a repeated value yields the same row again.

  The original rescanned the whole dataset for every value and reused the loop
  variable `i` as a counter that was never read. This version makes one pass
  and stops as soon as every requested id has been found.

  Args:
    results: indexable whose element 0 is a sequence of items with `.numpy()`.
    n: how many leading items of `results[0]` to look up.

  Returns:
    List of single-row batches from `data`, in the order of the requested ids.

  NOTE(review): the retrieval index in this notebook is built with movie
  titles as identifiers; confirm that `results` holds movie ids here.
  """
  wanted_ids = [item.numpy() for item in results[0][:n]]
  remaining = set(wanted_ids)
  first_match = {}
  if remaining:
    for row in data.batch(1):
      row_id = row['movie_id'].numpy()[0]
      if row_id in remaining:
        first_match[row_id] = row
        remaining.discard(row_id)
        if not remaining:
          break
  return [first_match[wanted] for wanted in wanted_ids if wanted in first_match]
