<a href="https://colab.research.google.com/github/seyone22/ai_testing/blob/main/Movies_DB_Sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow_datasets as tfds;

ratings_dataset, ratings_dataset_info = tfds.load(
    name='movielens/100k-ratings',
    split='train',
    with_info=True
)

Downloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movielens/100k-ratings/0.1.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/100000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/movielens/100k-ratings/0.1.1.incomplete6L08NH/movielens-train.tfrecord*...…

Dataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-ratings/0.1.1. Subsequent calls will reuse this data.


In [None]:
import tensorflow as tf
assert isinstance(ratings_dataset, tf.data.Dataset)

print("ratings_dataset size: %d" % ratings_dataset.__len__())

ratings_dataset size: 100000


In [None]:
print(
    tfds.as_dataframe(ratings_dataset.take(5), ratings_dataset_info)
)

   bucketized_user_age movie_genres movie_id  \
0                 45.0          [7]   b'357'   
1                 25.0      [4, 14]   b'709'   
2                 18.0          [4]   b'412'   
3                 50.0       [5, 7]    b'56'   
4                 50.0     [10, 16]   b'895'   

                                 movie_title  raw_user_age  timestamp  \
0  b"One Flew Over the Cuckoo's Nest (1975)"          46.0  879024327   
1                b'Strictly Ballroom (1992)'          32.0  875654590   
2             b'Very Brady Sequel, A (1996)'          24.0  882075110   
3                     b'Pulp Fiction (1994)'          50.0  883326919   
4                         b'Scream 2 (1997)'          55.0  891409199   

   user_gender user_id  user_occupation_label user_occupation_text  \
0         True  b'138'                      4            b'doctor'   
1         True   b'92'                      5     b'entertainment'   
2         True  b'301'                     17           b'stud

In [None]:
ratings_dataset = ratings_dataset.map(
    lambda rating: {
        'user_id' : rating['user_id'],
        'movie_id' : rating['movie_id'],
        'movie_title' : rating['movie_title'],
        'user_rating' : rating['user_rating'],
        'timestamp' : rating['timestamp']
    }
)

print(
    tfds.as_dataframe(ratings_dataset.take(5), ratings_dataset_info)
)

  movie_id                                movie_title  timestamp user_id  \
0   b'357'  b"One Flew Over the Cuckoo's Nest (1975)"  879024327  b'138'   
1   b'709'                b'Strictly Ballroom (1992)'  875654590   b'92'   
2   b'412'             b'Very Brady Sequel, A (1996)'  882075110  b'301'   
3    b'56'                     b'Pulp Fiction (1994)'  883326919   b'60'   
4   b'895'                         b'Scream 2 (1997)'  891409199  b'197'   

   user_rating  
0          4.0  
1          2.0  
2          4.0  
3          4.0  
4          3.0  


In [None]:
trainset_size = 0.8 * ratings_dataset.__len__().numpy()

tf.random.set_seed(42)

ratings_dataset_shuffled = ratings_dataset.shuffle(
    buffer_size = 100_000,
    seed = 42,
    reshuffle_each_iteration = False
)

ratings_trainset = ratings_dataset_shuffled.take(trainset_size)
ratings_testset = ratings_dataset_shuffled.skip(trainset_size)

print(
    "ratings_trainset size: %d" % ratings_trainset.__len__()
)
print(
    "ratings_testset size: %d" % ratings_testset.__len__()
)

ratings_trainset size: 80000
ratings_testset size: 20000


## Preprocessing

In [None]:
from pprint import pprint

for rating in ratings_trainset.take(1).as_numpy_iterator():
  pprint(rating)

{'movie_id': b'898',
 'movie_title': b'Postman, The (1997)',
 'timestamp': 885409515,
 'user_id': b'681',
 'user_rating': 4.0}


In [None]:
timestamp_normalization_layer = \
  tf.keras.layers.experimental.preprocessing.Normalization(axis=None)

timestamp_normalization_layer.adapt(
    ratings_trainset.map(
        lambda x: x['timestamp']
    )
)

for rating in ratings_trainset.take(3).as_numpy_iterator():
  print(
      f"Raw Timestamp: {rating['timestamp']} ->",
      f"Normalized Timestamp: {timestamp_normalization_layer(rating['timestamp'])}"
  )

Raw Timestamp: 885409515 -> Normalized Timestamp: 0.3537561595439911
Raw Timestamp: 883388887 -> Normalized Timestamp: -0.02487170137465
Raw Timestamp: 891249586 -> Normalized Timestamp: 1.4480509757995605


In [None]:
user_id_lookup_layer = \
  tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)

user_id_lookup_layer.adapt(
    ratings_trainset.map(
        lambda x: x['user_id']
    )
)

print(
    f"Vocabulary[:10] -> {user_id_lookup_layer.get_vocabulary()[:10]}"
)

print(
    "Mapped integer for user ids: ['-2', '13', '655', 'xxx']\n",
    user_id_lookup_layer(
        ['-2', '13', '655', 'xxx']
    )
)

user_id_embedding_dim = 32

user_id_embedding_layer = tf.keras.layers.Embedding(
    input_dim=user_id_lookup_layer.vocabulary_size(),
    output_dim=user_id_embedding_dim
)

user_id_model = tf.keras.Sequential(
    [
        user_id_lookup_layer,
        user_id_embedding_layer
    ]
)

print(
    "Embeddings for user ids: ['-2', '13', '655', 'xxx]\n",
    user_id_model(
        ['-2', '13', '655', 'xxx']
    )
)

Vocabulary[:10] -> ['[UNK]', '405', '655', '13', '450', '276', '303', '416', '537', '234']
Mapped integer for user ids: ['-2', '13', '655', 'xxx']
 tf.Tensor([0 3 2 0], shape=(4,), dtype=int64)
Embeddings for user ids: ['-2', '13', '655', 'xxx]
 tf.Tensor(
[[-0.00010215  0.03272604  0.04851306 -0.0388288   0.03742054  0.00798118
  -0.02392949 -0.02633275  0.03902603 -0.01342239 -0.01201311  0.04303907
   0.0253997   0.0380276   0.00792763 -0.04685153 -0.03707243  0.03793109
   0.01287177  0.03135307  0.04443446 -0.01292812  0.03648994 -0.04899253
  -0.03571009 -0.04297311 -0.02164481 -0.00864841 -0.02694016  0.02636368
   0.02108225  0.04794696]
 [ 0.00881778  0.0259969  -0.00660605 -0.04881135 -0.04346757  0.02167315
   0.03481412  0.0236656  -0.01279006  0.01893636 -0.0100298  -0.00731535
   0.02948416 -0.02665333  0.0478559  -0.01779496  0.02985099 -0.02123188
   0.02380437 -0.0098368  -0.01927636 -0.02980432 -0.01810442 -0.03447986
  -0.03480498  0.03611714  0.00071067 -0.00638155 

In [None]:
movie_id_lookup_layer = \
  tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)

movie_id_lookup_layer.adapt(
    ratings_trainset.map(
        lambda x: x['movie_id']
    )
)

movie_id_embedding_dim = 32

movie_id_embedding_layer = tf.keras.layers.Embedding(
    input_dim=movie_id_lookup_layer.vocab_size(),
    output_dim=movie_id_embedding_dim
)

movie_id_model = tf.keras.Sequential(
    [
        movie_id_lookup_layer,
        movie_id_embedding_layer
    ]
)

print(
    f"Embedding for the movie 898:\n {movie_id_model('898')}"
)



Embedding for the movie 898:
 [ 0.01910404 -0.03749777 -0.04638615 -0.02166903  0.03420725  0.01277484
  0.00469745  0.03847973 -0.00670559  0.04122807 -0.02789997  0.02906981
 -0.0409052  -0.0399387  -0.04481171  0.02097292 -0.02037445 -0.04249878
  0.01048335  0.04407717  0.03340428  0.00127715  0.01452162  0.04426448
 -0.01428209 -0.02973808 -0.04910673 -0.03422158  0.03295201 -0.00646251
 -0.04213738  0.03063088]


In [None]:
movie_title_vectorization_layer = \
  tf.keras.layers.experimental.preprocessing.TextVectorization()
movie_title_vectorization_layer.adapt(
    ratings_trainset.map(
        lambda rating: rating['movie_title']
    )
)

print(
    "Vocabulary[40:50] -> ",
    movie_title_vectorization_layer('Postman, The (1997)')
)

movie_title_embedding_dim = 32
movie_title_embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(movie_title_vectorization_layer.get_vocabulary()),
    output_dim=movie_title_embedding_dim,
    mask_zero=True
)

movie_title_model = tf.keras.Sequential(
    [
        movie_title_vectorization_layer,
        movie_title_embedding_layer,
        tf.keras.layers.GlobalAveragePooling1D()
    ]
)

Vocabulary[40:50] ->  tf.Tensor([1120    2    4], shape=(3,), dtype=int64)


## Query and Candidate Representation


In [None]:
query_model = user_id_model

candidate_model = movie_id_model

retrieval_ratings_trainset = ratings_trainset.map(
    lambda rating: {
        'user_id' : rating['user_id'],
        'movie_id' : rating['movie_id']
    }
)
retrieval_ratings_testset = ratings_testset.map(
    lambda rating: {
        'user_id' : rating['user_id'],
        'movie_id' : rating['movie_id']
    }
)

In [None]:
movies_dataset, movies_dataset_info = tfds.load(
    name='movielens/100k-movies',
    split='train',
    with_info=True
)

print(
    tfds.as_dataframe(movies_dataset.take(5), movies_dataset_info)
)

candidates_corpus_dataset = movies_dataset.map(
    lambda movie: movie['movie_id']
)

Downloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 150.35 KiB, total: 4.84 MiB) to /root/tensorflow_datasets/movielens/100k-movies/0.1.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/1682 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/movielens/100k-movies/0.1.1.incomplete1SU0VS/movielens-train.tfrecord*...:…

Dataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-movies/0.1.1. Subsequent calls will reuse this data.
  movie_genres movie_id                      movie_title
0          [4]  b'1681'           b'You So Crazy (1994)'
1       [4, 7]  b'1457'   b'Love Is All There Is (1996)'
2       [1, 3]   b'500'          b'Fly Away Home (1996)'
3          [0]   b'838'  b'In the Line of Duty 2 (1987)'
4          [7]  b'1648'       b'Niagara, Niagara (1997)'


In [15]:
!pip install -q scann tensorflow-recommenders
import tensorflow_recommenders as tfrs

factorized_top_k_metrics = tfrs.metrics.FactorizedTopK(
    candidates=candidates_corpus_dataset.batch(128).map(
        candidate_model
    )
)