The retrieval stage is responsible for selecting an initial set of hundreds of candidates from all possible candidates. The main objective of this model is to efficiently weed out all candidates that the user is not interested in. Because the retrieval model may be dealing with millions of candidates, it has to be computationally efficient.

Retrieval models are often composed of two sub-models:

1. A query model computing the query representation (normally a fixed-dimensionality embedding vector) using query features.
2. A candidate model computing the candidate representation (an equally-sized vector) using the candidate features
The outputs of the two models are then multiplied together to give a query-candidate affinity score, with higher scores expressing a better match between the candidate and the query.

Candidate representation will be created from:
1. movie title
2. movie genre 
user representation will be created from:
1. user age.
2. user occupation.
3. user gender.
4. time as a contextual feature

# importing necessary libraries

In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets


[K     |████████████████████████████████| 85 kB 2.9 MB/s 
[K     |████████████████████████████████| 462 kB 32.3 MB/s 
[K     |████████████████████████████████| 4.2 MB 5.4 MB/s 
[?25h

In [None]:
import os
import tensorflow as tf
import tensorflow_recommenders as tfrs
import numpy as np
import pprint

# loading data from drive

In [None]:
# Load the previously saved ratings dataset from the mounted drive path.
# The movies dataset load is left disabled; no visible cell reads `movies`.
#movies=tf.data.experimental.load('/content/drive/MyDrive/datasets/movielens_movies')
ratings=tf.data.experimental.load('/content/drive/MyDrive/datasets/movielens_ratings')
# Pretty-print the first record to inspect which feature keys are available.
for x in ratings.take(1).as_numpy_iterator():
  pprint.pprint(x)

In [None]:

# Project every rating record down to the user-side features used below,
# renaming the keys to 'id', 'age', 'gender' and 'occupation'.
# Each rating yields one element, so a user appears once per rating they made.
user_data=ratings.map(lambda x:{
    'id':x['user_id'],
    'age':x['raw_user_age'],
    'gender':x['user_gender'],
    'occupation':x['user_occupation_text'],

})





In [6]:
# Split the first physical GPU (if any) into two logical devices and build a
# MirroredStrategy. Every `with strategy.scope()` line in the visible cells is
# commented out, so `strategy` is created here but not otherwise used.
gpus = tf.config.list_physical_devices("GPU")
if gpus:
  # Create 2 virtual GPUs with 1GB memory each
  try:
    tf.config.set_logical_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=1024),
         tf.config.LogicalDeviceConfiguration(memory_limit=1024)])
    logical_gpus = tf.config.list_logical_devices("GPU")
    print(len(gpus), "Physical GPU,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    # The error is only printed, so the cell continues with the default devices.
    print(e)

strategy = tf.distribute.MirroredStrategy()

Virtual devices cannot be modified after being initialized
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


In [None]:
# Show one projected user record to confirm the renamed keys and value types.
for x in user_data.take(1).as_numpy_iterator():
  pprint.pprint(x)
print('-------------------------------')


{'age': 46.0, 'gender': 1.0, 'id': b'138', 'occupation': b'doctor'}
-------------------------------


# creating a vocabulary for user occupation and user id

In [None]:

# Vocabulary of distinct occupation strings: batch the dataset, concatenate
# the batches into one array, then de-duplicate with np.unique.
user_occupation=user_data.map(lambda x :x['occupation'])
unique_user_occupation=np.unique(np.concatenate(list(user_occupation.batch(1_000))))



# Same procedure for user ids.
# NOTE(review): the chained assignment also binds `movie_genres` to the id
# dataset; the name looks like a leftover and is not read in the visible cells.
user_id=movie_genres=user_data.map(lambda x :x['id'])
unique_user_id=np.unique(np.concatenate(list(user_id.batch(1_000))))


# splitting data to train and test

In [None]:
# Fix the global TensorFlow seed so the shuffle below is repeatable.
tf.random.set_seed(123)
# reshuffle_each_iteration=False keeps one fixed shuffled order, so the
# take/skip slices below stay disjoint every time the dataset is iterated.
# NOTE(review): the shuffle buffer holds only 1,000 elements, so records are
# mixed locally rather than across the whole dataset.
shuffled=user_data.shuffle(1_000,reshuffle_each_iteration=False)

# First 80,000 records for training, the following 10,000 for testing.
train=shuffled.take(80_000)
test=shuffled.skip(80_000).take(10_000)

# building model
Because we are building a two-tower retrieval model, we can build each tower separately and then combine them in the final model.

#  creating separate models

In [None]:
# with strategy.scope():
# age normalizer
# Normalization layer for the age feature; axis=None makes it use a single
# scalar mean and variance for all values.
age_normalizer=tf.keras.layers.Normalization(
    axis=None
)
# Learn the statistics from every age value in user_data (train and test).
ages=user_data.map(lambda x :x['age'])
age_normalizer.adapt(ages)


In [None]:
# Width of the occupation embedding built further down in this cell.
embedding_dim=128
# gender model
# with strategy.scope():
# Maps the gender value to a 2-dimensional embedding.
# IntegerLookup reserves index 0 for out-of-vocabulary values and numbers the
# vocabulary entries from 1, so a vocabulary of [2] would send both 0 and 1 to
# the same OOV row and the feature would carry no information.
# Listing both values gives each its own row: OOV -> 0, 0 -> 1, 1 -> 2, which
# is exactly the 3 rows the Embedding already allocates.
# NOTE(review): assumes gender is encoded as 0/1 (the sample record shows
# 1.0) - confirm against the saved dataset.
gender_model=tf.keras.Sequential(
    [tf.keras.layers.IntegerLookup(vocabulary=[0,1])
    ,tf.keras.layers.Embedding(3,2)

    ]
)


# occupation model
# Occupation string -> integer index -> dense embedding of width embedding_dim.
occupation_model = tf.keras.Sequential()
occupation_model.add(
    tf.keras.layers.StringLookup(vocabulary=unique_user_occupation)
)
# One extra embedding row beyond the vocabulary size.
occupation_model.add(
    tf.keras.layers.Embedding(len(unique_user_occupation) + 1, embedding_dim)
)


# query tower

* user model

In [None]:
# with strategy.scope():  
class UserModel(tf.keras.Model):
  """Turn raw user features into one concatenated feature vector.

  The sub-layers are the module-level ``age_normalizer``, ``gender_model`` and
  ``occupation_model`` objects, so every ``UserModel`` instance refers to the
  same layer instances.
  """

  def __init__(self):
    super().__init__()
    self.age_normalizer=age_normalizer
    self.gender_model=gender_model
    self.occupation_model=occupation_model

  @tf.function
  def call(self,inputs):
    """Embed one batch of user features.

    Args:
      inputs: mapping with 'age', 'gender' and 'occupation' entries.

    Returns:
      The normalized age (reshaped to one column), the gender embedding and
      the occupation embedding concatenated along axis 1.
    """
    age_column=tf.reshape(self.age_normalizer(inputs['age']),(-1,1))
    # IntegerLookup is keyed on integers: hand it int64 ids explicitly instead
    # of float32 values that rely on an implicit conversion inside the layer.
    gender_ids=tf.cast(inputs['gender'],dtype=tf.int64)
    gender_embedding=tf.cast(self.gender_model(gender_ids),dtype=tf.float32)
    occupation_embedding=self.occupation_model(inputs['occupation'])
    return tf.concat(
        [age_column,gender_embedding,occupation_embedding],axis=1
    )

In [None]:
# with strategy.scope():
class QueryTower(tf.keras.Model):
  """Query side of the two-tower model: user features, then a dense stack."""

  def __init__(self,layers):
    """Build the tower.

    Args:
      layers: sequence of Dense layer widths. All but the last use a ReLU
        activation; the last one has no activation.
    """
    super().__init__()
    self.user_model=UserModel()
    dense_stack=[
        tf.keras.layers.Dense(units,activation='relu') for units in layers[:-1]
    ]
    # Slicing (rather than indexing) keeps an empty `layers` valid.
    dense_stack.extend(tf.keras.layers.Dense(units) for units in layers[-1:])
    self.dense_model=tf.keras.Sequential(dense_stack)

  @tf.function
  def call(self,inputs):
    """Return the query embedding for a batch of user features."""
    user_features=self.user_model(inputs)
    return self.dense_model(user_features)


# candidate tower
* movie model

In [None]:
from typing import cast
# with strategy.scope():  
class MovieModel(tf.keras.Model):
  """Feature model that reuses the shared age/gender/occupation layers.

  NOTE(review): despite the name, this reads the user features 'age',
  'gender' and 'occupation'; no visible cell instantiates it (CandidateTower
  builds a UserModel instead).
  """

  def __init__(self):
    super().__init__()
    self.age_normalizer=age_normalizer
    self.gender_model=gender_model
    self.occupation_model=occupation_model

  @tf.function
  def call(self,inputs):
    """Concatenate normalized age, gender embedding and occupation embedding."""
    age_column=tf.reshape(self.age_normalizer(inputs['age']),(-1,1))
    # Unlike UserModel, the gender input is passed to the lookup unchanged.
    gender_embedding=tf.cast(self.gender_model(inputs['gender']),dtype=tf.float32)
    occupation_embedding=self.occupation_model(inputs['occupation'])
    return tf.concat(
        [age_column,gender_embedding,occupation_embedding],axis=1
    )

* candidate tower

In [None]:
# with strategy.scope():
class CandidateTower(tf.keras.Model):
  """Candidate side of the two-tower model.

  Built the same way as the query tower: a ``UserModel`` followed by a stack
  of Dense layers.
  """

  def __init__(self,layers):
    """Build the tower.

    Args:
      layers: sequence of Dense layer widths; every layer except the last
        uses a ReLU activation.
    """
    super().__init__()
    self.user_model=UserModel()
    self.dense_model=tf.keras.Sequential()
    hidden_widths,output_widths=layers[:-1],layers[-1:]
    for width in hidden_widths:
      self.dense_model.add(tf.keras.layers.Dense(width,activation='relu'))
    # `output_widths` holds at most one entry; it is empty when `layers` is.
    for width in output_widths:
      self.dense_model.add(tf.keras.layers.Dense(width))

  @tf.function
  def call(self,inputs):
    """Return the candidate embedding for a batch of user features."""
    return self.dense_model(self.user_model(inputs))

# Metrics
In our training data we have positive (user, user) pairs. To figure out how good our model is, we need to compare the affinity score that the model calculates for this pair to the scores of all the other possible candidates: if the score for the positive pair is higher than for all other candidates, our model is highly accurate.

To do this, we can use the tfrs.metrics.FactorizedTopK metric. The metric has one required argument: the dataset of candidates that are used as implicit negatives for evaluation.

In our case, that's the user dataset (`user_data`), converted into embeddings via our candidate tower:

# task
The task itself is a Keras layer that takes the query and candidate embeddings as arguments, and returns the computed loss: we'll use that to implement the model's training loop.

# the full model
We can now put it all together into a model. TFRS exposes a base model class (tfrs.models.Model) which streamlines building models: all we need to do is to set up the components in the __init__ method, and implement the compute_loss method, taking in the raw features and returning a loss value.

In [None]:
# with strategy.scope():
class CombinedModels(tfrs.models.Model):
  """Two-tower retrieval model joining the query and candidate towers."""

  def __init__(self,layers):
    """Create both towers and the retrieval task.

    Args:
      layers: Dense layer widths, passed unchanged to both towers.
    """
    super().__init__()
    self.query_tower=QueryTower(layers)
    self.candidate_tower=CandidateTower(layers)
    # The top-K metric ranks against every user_data record embedded by the
    # candidate tower.
    candidate_embeddings=user_data.batch(128).map(self.candidate_tower)
    self.task=tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
    )

  @tf.function
  def compute_loss(self, features,training=False):
    """Return the retrieval loss for one batch of features.

    Only 'age', 'gender' and 'occupation' are forwarded, so the towers see
    the same keys regardless of what else `features` contains. Both towers
    receive the same three features.
    """
    tower_inputs={
        key:features[key] for key in ('age','gender','occupation')
    }
    query_embeddings=self.query_tower(tower_inputs)
    candidate_embeddings=self.candidate_tower(tower_inputs)
    # Metrics are only requested when not training.
    return self.task(query_embeddings,candidate_embeddings,compute_metrics=not training)

# preparing data to train model

In [None]:
# Batch the splits for training and evaluation.
# NOTE(review): despite its name, cached_train has no .cache() call; only the
# test pipeline is cached.
cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()

# training 

In [None]:
num_epochs = 5
# with strategy.scope():
# Both towers get Dense layers of width 64, 32 and 16.
model = CombinedModels([64,32,16])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# validation_freq=5 with 5 epochs means validation runs once, on the last epoch.
one_layer_history = model.fit(
    cached_train,
    validation_data=cached_test,
    validation_freq=5,
    epochs=num_epochs,
    verbose=1)

# Last recorded validation top-100 accuracy; the print that would report it
# is commented out below.
accuracy = one_layer_history.history["val_factorized_top_k/top_100_categorical_accuracy"][-1]
print('========================================')
#print(f"Top-100 accuracy: {accuracy:.2f}.")

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu

# making prediction

In [None]:
# Materialise 20 single-record batches from the training split as sample queries.
d=list(train.batch(1).take(20).cache())


In [None]:
top_k=1000
# Brute-force top-k index that embeds queries with the trained query tower.
index=tfrs.layers.factorized_top_k.BruteForce(model.query_tower,k=top_k)
# index_from_dataset expects a dataset shaped like:
#   tf.data.Dataset.zip((candidate identifiers, candidate embeddings))
# Both halves are batched by 100 so ids and embeddings stay aligned.
candidate_ids=user_data.map(lambda x : x['id']).batch(100)
candidate_vectors=user_data.batch(100).map(model.candidate_tower)
index.index_from_dataset(
    tf.data.Dataset.zip((candidate_ids,candidate_vectors))
)


<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f2cd390fb90>

# utility functions

In [None]:
def output_cleaner(outputs):
  """De-duplicate the first row of an index result, keeping rank order.

  Args:
    outputs: object whose ``.numpy()`` returns a 2-D array; only row 0 is used.

  Returns:
    A list of ``(label, value)`` tuples, one per distinct value of the row in
    order of first appearance. ``label`` is ``'<position>th person '`` with
    positions starting at 1.
  """
  row=outputs.numpy()[0]
  # return_index gives the position of each value's first occurrence; sorting
  # those positions restores the original ranking order.
  _,first_positions=np.unique(row,return_index=True)
  deduplicated=row[np.sort(first_positions)]
  return [
      (f'{rank}th person ', value)
      for rank,value in enumerate(deduplicated,start=1)
  ]
      


In [None]:
def user_feature_extractor(wanted_id):
  """Print the features of the first ``user_data`` record with the given id.

  Scans ``user_data`` one record at a time and stops at the first match.
  Nothing is printed when no record matches.

  Args:
    wanted_id: id value compared against each record's 'id' entry.
  """
  for record in user_data.batch(1):
    record_id=record['id'].numpy()[0]
    if record_id==wanted_id:
      print('user features: \n')
      print('age: ',record['age'].numpy()[0] )
      print('gender ',record['gender'].numpy()[0])
      print('occupation ',record['occupation'].numpy()[0])
      print('id ',record_id,'\n\n')
      break




In [None]:
def input_data_creater_for_item_item(results,n):
  """Collect the ``user_data`` records for the first ``n`` recommended ids.

  Args:
    results: sequence of entries whose element at index 1 is a user id.
    n: number of leading entries of ``results`` to look up.

  Returns:
    A list with, for each of the first ``n`` entries in order, the first
    ``user_data`` record (batched with size 1) whose 'id' equals that entry's
    id. Ids with no matching record are skipped.
  """
  wanted_ids=[entry[1] for entry in results[:n]]
  if not wanted_ids:
    return []

  # One pass over the dataset instead of one full scan per requested id.
  pending=set(wanted_ids)
  first_match={}
  for record in user_data.batch(1):
    record_id=record['id'].numpy()[0]
    if record_id in pending:
      first_match[record_id]=record
      pending.discard(record_id)
      if not pending:
        break

  # Re-assemble in the order of `results`, repeating records for repeated ids.
  return [first_match[wanted] for wanted in wanted_ids if wanted in first_match]


In [None]:
def n_first_recommended_users(results,n):
  """Print the features of the first ``n`` entries of ``results``.

  Each entry's element at index 1 is passed to ``user_feature_extractor``,
  and a separator line is printed after it.
  """
  leading_entries=results[:n]
  for entry in leading_entries:
    person_id=entry[1]
    user_feature_extractor(person_id)
    print('-------------------------')


In [None]:
def n_last_recommended_users(results,n):
  """Print the features of the last ``n`` entries of ``results``.

  Each entry's element at index 1 is passed to ``user_feature_extractor``,
  and a separator line is printed after it. A non-positive ``n`` prints
  nothing.
  """
  # results[-0:] is the whole list, so n == 0 must be handled before slicing.
  if n<=0:
    return
  for entry in results[-n:]:
    user_feature_extractor(entry[1])
    print('-------------------------')


# print predictions

In [None]:
# Use the second sample batch (d[1]) as the query user.
print(f"{top_k} users similar to user:{d[1]['id'].numpy()[0]} with below features \n\n")
user_feature_extractor(d[1]['id'].numpy()[0])
# The index call returns a pair; only the second element (presumably the
# candidate identifiers - confirm against the TFRS docs) is kept.
_,recommes=index(d[1])
# Drop repeated ids while keeping the ranking order.
recommended_users=output_cleaner(recommes)


In [None]:
# Print the features of the ten highest-ranked distinct users.
n_first_recommended_users(recommended_users,10)