In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import tensorflow_recommenders as tfrs
from typing import Dict, Text

In [2]:
df = pd.read_pickle('./datasets/clean_df.csv', compression='zip')

In [3]:
df.shape

(772661, 4)

https://medium.com/when-i-work-data/converting-a-pandas-dataframe-into-a-tensorflow-dataset-752f3783c168

In [4]:
dataset = (
    tf.data.Dataset.from_tensor_slices(
            {
                'user' : df['user'].to_numpy(),
                'song' : df['song'].to_numpy(),
                'Song - Artist' : df['Song - Artist'].to_numpy(),
                'count' : df['count'].to_numpy()
            }
        ))

In [5]:
# Select the basic features.
listens = dataset.map(lambda x: {
    'song' : x['song'],
    'user' : x['user']
})

songs = dataset.map(lambda x: x['song'])

In [6]:
user_ids_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(dataset.map(lambda x: x['user']))

song_ids_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
song_ids_vocabulary.adapt(dataset.map(lambda x: x['song']))

In [7]:
class UserSongModel(tfrs.Model):
    # We derive from a custom base class to help reduce boilerplate. Under the hood,
    # these are still plain Keras Models.

    def __init__(self,
                 user_model: tf.keras.Model,
                 song_model: tf.keras.Model,
                 task: tfrs.tasks.Retrieval):
        super().__init__()

        # Set up user and movie representations.
        self.user_model = user_model
        self.song_model = song_model

        # Set up a retrieval task.
        self.task = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        # Define how the loss is computed.

        user_embeddings = self.user_model(features['user'])
        song_embeddings = self.song_model(features['song'])

        return self.task(user_embeddings, song_embeddings)

In [8]:
# Define user and movie models.
user_model = tf.keras.Sequential([
    user_ids_vocabulary,
    tf.keras.layers.Embedding(user_ids_vocabulary.vocabulary_size(), 64)
])
song_model = tf.keras.Sequential([
    song_ids_vocabulary,
    tf.keras.layers.Embedding(song_ids_vocabulary.vocabulary_size(), 64)
])

# Define your objectives.
task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
    songs.batch(128).map(song_model)
  )
)

In [9]:
# Create a retrieval model.
model = UserSongModel(user_model, song_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

In [None]:
# Train for 3 epochs.
model.fit(listens.batch(2048), epochs=3)

Epoch 1/3
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
  2/378 [..............................] - ETA: 8:33:59 - factorized_top_k/top_1_categorical_accuracy: 0.0691 - factorized_top_k/top_5_categorical_accuracy: 0.0696 - factorized_top_k/top_10_categorical_accuracy: 0.0696 - factorized_top_k/top_50_categorical_accuracy: 0.0703 - factorized_top_k/top_100_categorical_accuracy: 0.0706 - loss: 15615.0811 - regularization_loss: 0.0000e+00 - total_loss: 15615.0811                    

In [None]:
# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index(songs.batch(100).map(model.song_model), songs)

# Get some recommendations.
_, titles = index(np.array(["42"]))
print(f"Top 3 recommendations for user 42: {titles[0, :3]}")

In [None]:
# dataset = (
#     tf.data.Dataset.from_tensor_slices(
#         (
#             tf.cast(df['user'].values, tf.string),
#             tf.cast(df['song'].values, tf.string),
#             tf.cast(df['Song - Artist'].values, tf.string),
#             tf.cast(df['count'].values, tf.int32)
#         )))

In [None]:
# # Convert to list
# user_list = df['user'].to_list()
# song_list = df['song'].to_list()
# count_list = df['count'].to_list()
# song_artist_list = df['Song - Artist'].to_list()

In [None]:
# # Convert user id and song id to numerical ids
# df['user_id'] = df['user'].astype('category').cat.codes
# df['song_id'] = df['song'].astype('category').cat.codes

# # Create lookup frame so we can get the 'Song - Artist' later
# item_lookup = df[['song_id','Song - Artist']].drop_duplicates()
# item_lookup['song_id'] = item_lookup['song_id'].astype(str)

# # Drop 'user' and 'song' and 'Song - Artist'
# df = df.drop(['user','song','Song - Artist'], axis=1)

# df = df[['song_id','user_id','count']]

# # Get number of unique entities in songs & users columns
# nsong_id = df.song_id.nunique()
# nuser_id = df.user_id.nunique()

# df.head()