## Import Data from drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip ./drive/MyDrive/ds-assignment.zip -d .

Archive:  ./drive/MyDrive/ds-assignment.zip
   creating: ./ds-assignment/
  inflating: ./ds-assignment/user-interactions.csv  
  inflating: ./ds-assignment/metadata.csv  
   creating: ./__MACOSX/
   creating: ./__MACOSX/ds-assignment/
  inflating: ./__MACOSX/ds-assignment/._metadata.csv  
  inflating: ./__MACOSX/._ds-assignment  


## EDA

In [3]:
!pip install -q tensorflow-recommenders

[?25l[K     |███▊                            | 10 kB 31.4 MB/s eta 0:00:01[K     |███████▍                        | 20 kB 37.6 MB/s eta 0:00:01[K     |███████████                     | 30 kB 45.9 MB/s eta 0:00:01[K     |██████████████▊                 | 40 kB 38.2 MB/s eta 0:00:01[K     |██████████████████▍             | 51 kB 41.7 MB/s eta 0:00:01[K     |██████████████████████          | 61 kB 46.2 MB/s eta 0:00:01[K     |█████████████████████████▊      | 71 kB 32.0 MB/s eta 0:00:01[K     |█████████████████████████████▍  | 81 kB 33.6 MB/s eta 0:00:01[K     |████████████████████████████████| 89 kB 5.6 MB/s 
[?25h

In [4]:
import pandas as pd
import numpy as np
from datetime import datetime
import tensorflow as tf
import tensorflow_recommenders as tfrs

In [5]:
# Reading 100000 rows
df = pd.read_csv('./ds-assignment/user-interactions.csv', index_col=0, nrows=10000)
df.sort_values(by='updated_at')

Unnamed: 0,user_id,pratilipi_id,read_percent,updated_at
9999,5506791959794458,1377786226624397,100.0,2022-03-22 23:45:38.176
9998,5506791989472705,1377786223830664,100.0,2022-03-22 23:45:38.356
9997,5506791953970553,1377786217369353,100.0,2022-03-22 23:45:38.515
9996,5506791952924850,1377786226612455,100.0,2022-03-22 23:45:38.576
9995,5506791976797196,1377786228368115,100.0,2022-03-22 23:45:38.726
...,...,...,...,...
4,5506791992372558,1377786218111595,100.0,2022-03-23 00:08:25.250
3,5506791988747277,1377786224767880,100.0,2022-03-23 00:08:25.306
2,5506791980256358,1377786217096334,22.0,2022-03-23 00:08:26.020
1,5506791979071996,1377786219742624,29.0,2022-03-23 00:08:26.220


In [6]:
ratings = df.drop(['updated_at'], axis=1)
ratings

Unnamed: 0,user_id,pratilipi_id,read_percent
0,5506791963854965,1377786220672965,100.0
1,5506791979071996,1377786219742624,29.0
2,5506791980256358,1377786217096334,22.0
3,5506791988747277,1377786224767880,100.0
4,5506791992372558,1377786218111595,100.0
...,...,...,...
9995,5506791976797196,1377786228368115,100.0
9996,5506791952924850,1377786226612455,100.0
9997,5506791953970553,1377786217369353,100.0
9998,5506791989472705,1377786223830664,100.0


In [7]:
dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(ratings['user_id'].astype(str).values.reshape(-1,1), tf.string),
     tf.cast(ratings['pratilipi_id'].astype(str).values.reshape(-1,1), tf.string),
     tf.cast(ratings['read_percent'].values.reshape(-1,1), tf.float32)))
train_ds = dataset.take(int(0.75 * len(dataset)))
len(train_ds)

7500

In [8]:
test_ds = dataset.skip(int(0.75 * len(dataset)))
len(test_ds)

2500

In [9]:
@tf.function
def rename(x0,x1,x2):
    y = {}
    y['user_id'] = x0
    y['pratilipi_id'] = x1
    y['read_percent'] = x2
    return y

train_ds = train_ds.map(rename)
test_ds = test_ds.map(rename)

In [10]:
train_ds

<MapDataset element_spec={'user_id': TensorSpec(shape=(1,), dtype=tf.string, name=None), 'pratilipi_id': TensorSpec(shape=(1,), dtype=tf.string, name=None), 'read_percent': TensorSpec(shape=(1,), dtype=tf.float32, name=None)}>

In [11]:
books = ratings['pratilipi_id'].astype(str).values
users = ratings['user_id'].astype(str).values

unique_book_titles = np.unique(list(books))
unique_user_ids = np.unique(list(users))

In [12]:
class RankingModel(tf.keras.Model):

  def __init__(self):
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    # Compute embeddings for books.
    self.book_embeddings = tf.keras.Sequential([
      tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_book_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_book_titles) + 1, embedding_dimension)
    ])

    # Compute predictions.
    self.ratings = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
  ])

  def __call__(self, x):
    
    user_id, book_name = x
    user_embedding = self.user_embeddings(user_id)
    book_embedding = self.book_embeddings(book_name)

    return self.ratings(tf.concat([user_embedding, book_embedding], axis=1))

In [13]:
class BooksModel(tfrs.models.Model):

  def __init__(self):
    super().__init__()

    self.ranking_model: tf.keras.Model = RankingModel()

    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss = tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

  def compute_loss(self, features, training=False) -> tf.Tensor:
    print(features)
    rating_predictions = self.ranking_model((features['user_id'], features['pratilipi_id']))

    # The task computes the loss and the metrics.
    return self.task(labels=features['read_percent'], predictions=rating_predictions)

In [14]:
model = BooksModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
# Cache the dataset 
cache_train_ds = train_ds.cache()
# Tensorboard 
logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)
# Training 
model.fit(cache_train_ds, validation_data=test_ds, epochs=2,
                    verbose=1, callbacks=[tensorboard_callback])

Epoch 1/2
{'user_id': <tf.Tensor 'IteratorGetNext:2' shape=(1,) dtype=string>, 'pratilipi_id': <tf.Tensor 'IteratorGetNext:0' shape=(1,) dtype=string>, 'read_percent': <tf.Tensor 'IteratorGetNext:1' shape=(1,) dtype=float32>}
{'user_id': <tf.Tensor 'IteratorGetNext:2' shape=(1,) dtype=string>, 'pratilipi_id': <tf.Tensor 'IteratorGetNext:0' shape=(1,) dtype=string>, 'read_percent': <tf.Tensor 'IteratorGetNext:1' shape=(1,) dtype=float32>}
Epoch 2/2


<keras.callbacks.History at 0x7f8c02d32a10>

In [40]:
# Find top 20 books for the user_id "5506791992269694"
user8 = np.array(["5506791992269694" for i in range(len(unique_book_titles))])
# Convert it to tf.data.Dataset 
test_data = tf.data.Dataset.from_tensor_slices((tf.cast(user8.reshape(-1,1), tf.string), tf.cast(unique_book_titles.reshape(-1,1), tf.string)))
# Name the columns 
@tf.function
def rename_test(x0,x1):
    y = {}
    y["user_id"] = x0
    y['pratilipi_id'] = x1
    return y
test_data = test_data.map(rename_test)
# Now lets make predictions and store them in to dictionary
test_ratings = {}
for b in test_data:
    test_ratings[b['pratilipi_id'].numpy()[0]] = model.ranking_model((b['user_id'],b['pratilipi_id']))
# sort them by score and print the titles 
i = 0
for b in sorted(test_ratings, key=test_ratings.get, reverse=True):
    print(b)
    if i==10:
      break
    i += 1

b'1377786225708711'
b'1377786228189551'
b'1377786227169761'
b'1377786221510764'
b'1377786227147146'
b'1377786222463691'
b'1377786224822301'
b'1377786224860795'
b'1377786223040210'
b'1377786224821569'
b'1377786228109947'
