In [None]:
!pip install tensorflow-recommenders


In [None]:
!pip install scann

In [None]:
import tensorflow as tf
print(tf.__version__)

import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds

import os
import pprint

from typing import Dict, Text

import pandas as pd
import numpy as np
import time


from sklearn.model_selection import train_test_split

In [None]:

def read_files(file_path, **kwargs):
    """Load the articles, customers and transactions CSVs and index their ids.

    Each customer and article is assigned the string form of its row number in
    its own CSV; transactions are then re-keyed with those compact ids.

    Parameters
    ----------
    file_path : str or path-like
        Directory holding ``articles.csv``, ``customers.csv`` and
        ``transactions_train.csv``. A trailing separator is optional.
    **kwargs
        Optional column subsets forwarded to ``pd.read_csv(usecols=...)``:
        ``art_cols`` (articles), ``cust_cols`` (customers) and ``trans_cols``
        (transactions). ``trans_cols`` must keep ``customer_id`` and
        ``article_id``; the other two must keep their respective id column.

    Returns
    -------
    tuple
        ``(customer_lookup, article_lookup, trans_df, unique_users, unique_items)``
        where the lookups map original id -> row number as ``str``, ``trans_df``
        has ``user_id`` / ``item_id`` in place of ``customer_id`` /
        ``article_id``, and the last two are arrays of the distinct mapped ids
        that occur in ``trans_df``.

    Notes
    -----
    Transactions whose customer or article is absent from the lookup tables
    cannot be mapped; they are dropped (with a warning) so that no NaN ends up
    in the returned vocabularies.
    """
    import warnings

    def _load(file_name, cols_key):
        # Join with os.path so the directory works with or without a trailing slash.
        read_args = dict(
            filepath_or_buffer=os.path.join(file_path, file_name),
            low_memory=False,
        )
        # Restrict columns only when the caller asked for it.
        if cols_key in kwargs:
            read_args['usecols'] = kwargs[cols_key]
        return pd.read_csv(**read_args)

    def _row_number_lookup(frame, id_col):
        # original id -> positional row number, stored as a string
        return frame.reset_index().set_index(id_col)['index'].astype(str).to_dict()

    art_df = _load('articles.csv', 'art_cols')
    cust_df = _load('customers.csv', 'cust_cols')
    trans_df = _load('transactions_train.csv', 'trans_cols')

    customer_lookup = _row_number_lookup(cust_df, 'customer_id')
    article_lookup = _row_number_lookup(art_df, 'article_id')

    trans_df['user_id'] = trans_df['customer_id'].map(customer_lookup)
    trans_df['item_id'] = trans_df['article_id'].map(article_lookup)

    # .map() yields NaN for ids missing from the lookups; keep only mapped rows.
    unmapped = trans_df['user_id'].isna() | trans_df['item_id'].isna()
    if unmapped.any():
        warnings.warn(
            f"Dropping {int(unmapped.sum())} transaction(s) with unknown customer_id or article_id"
        )
        trans_df = trans_df.loc[~unmapped]

    unique_users = trans_df['user_id'].unique()
    unique_items = trans_df['item_id'].unique()

    trans_df = trans_df.drop(columns=['customer_id', 'article_id'])

    return customer_lookup, article_lookup, trans_df, unique_users, unique_items

    

In [None]:
%%time
#cust_cols=['customer_id']
#trans_cols= ['customer_id','article_id']
file_path = '../input/h-and-m-personalized-fashion-recommendations/'

# Only the id columns are read from the customers and transactions files.
(
    customer_lookup,
    article_lookup,
    trans_data,
    user_vocab,
    item_vocab,
) = read_files(
    file_path,
    cust_cols=['customer_id'],
    trans_cols=['customer_id', 'article_id'],
)

## For the retrieval model we need -
1. Query tower - embeddings for user_ids
2. Candidate tower - embeddings for article_ids

Follow the steps below:

    1. Keep just the user_id and article_id
    
    2. Convert pd.DataFrame to tf.data.dataset 
    
    3. Split into train and test data
    
    4. Convert user_ids to integers and turn them into embeddings via an Embedding layer
   

In [None]:
%%time
train_size = 0.80
# DataFrame.sample() draws from NumPy's global RNG when no random_state is given,
# so seeding it here makes the split repeatable.
np.random.seed(1221)

# Random 80/20 row split of the (user_id, item_id) interactions.
interactions = trans_data[['user_id', 'item_id']]
train_rows = interactions.sample(frac=train_size)
test_rows = interactions.drop(index=train_rows.index)

# One dataset element per interaction: {'user_id': ..., 'item_id': ...}
train = tf.data.Dataset.from_tensor_slices(dict(train_rows))
test = tf.data.Dataset.from_tensor_slices(dict(test_rows))

# Release the intermediate frames; only the datasets are used downstream.
del interactions, train_rows, test_rows


In [None]:
# Candidate corpus: one dataset element per distinct item_id returned by read_files.
items = tf.data.Dataset.from_tensor_slices(item_vocab)

## Batches in tensorflow dataset
#### ratings.batch(1_000_000, drop_remainder = True) - this divides a TensorFlow dataset into equal batches of batch size = 1,000,000. The total number of unique user_ids is 31.78 million.

##### 31.78 million / 1 million → 32 batches (the last batch is only partially filled)
##### 31.78 million / 1 million → 31 batches if drop_remainder is True (the partial batch is discarded)

### tf.keras.layers.StringLookup - A preprocessing layer that maps string features to integers
### tf.keras.layers.Embedding - Turns indexes into dense vectors of fixed size



In [None]:
# Width of the embedding vectors; the user and item towers both use this value.
embedding_dimension = 32

In [None]:
## 4. Convert user_ids to integers and turn them into embeddings via an Embedding layer
## Query tower

# String user_id -> integer index.
user_id_lookup = tf.keras.layers.StringLookup(vocabulary=user_vocab, mask_token=None)

# +1 leaves room for the out-of-vocabulary index that StringLookup adds by default.
user_embedding = tf.keras.layers.Embedding(
    input_dim=len(user_vocab) + 1,
    output_dim=embedding_dimension,
)

user_model = tf.keras.Sequential([user_id_lookup, user_embedding])

In [None]:
## Candidate tower

# String item_id -> integer index.
item_id_lookup = tf.keras.layers.StringLookup(vocabulary=item_vocab, mask_token=None)

# +1 leaves room for the out-of-vocabulary index that StringLookup adds by default.
item_embedding = tf.keras.layers.Embedding(
    input_dim=len(item_vocab) + 1,
    output_dim=embedding_dimension,
)

item_model = tf.keras.Sequential([item_id_lookup, item_embedding])

#### This computes metrics across the top K candidates surfaced by a retrieval model.
#### The default metric is top K categorical accuracy: how often the true candidate is in the top K candidates for a given query.

In [None]:
# Item ids, in batches of 256, mapped through the candidate tower.
candidate_embeddings = items.batch(256).map(item_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)

# Retrieval task configured with the top-K metrics above.
task = tfrs.tasks.Retrieval(metrics=metrics)

## We can put it all together in a model: (Returns user embeddings and positive item embeddings)
#### 1. User model
#### 2. Item model
#### 3. Retrieval task layer


In [None]:
class UserItemModel(tfrs.Model):
    """Two-tower retrieval model: a user (query) tower, an item (candidate) tower and a task layer."""

    def __init__(self, user_model, item_model, retrieval_task=None):
        """Store the two towers and the task layer used to compute the loss.

        Args:
            user_model: Model applied to ``features['user_id']``.
            item_model: Model applied to ``features['item_id']``.
            retrieval_task: Optional task layer called with the two embedding
                tensors. When omitted, the notebook-level ``task`` object is
                used, so existing two-argument calls behave as before.
        """
        super().__init__()
        self.user_model : tf.keras.Model = user_model
        self.item_model : tf.keras.Model = item_model
        # Prefer the explicitly injected task; fall back to the global only for
        # backward compatibility with UserItemModel(user_model, item_model).
        self.task : tf.keras.layers.Layer = task if retrieval_task is None else retrieval_task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Embed the user and item ids of a batch and return the task's result.

        Args:
            features: Batch dictionary holding ``'user_id'`` and ``'item_id'`` tensors.
            training: Accepted for interface compatibility; not used here.

        Returns:
            Whatever ``self.task`` returns for the (user, item) embedding pair.
        """
        user_embeddings = self.user_model(features['user_id'])
        positive_item_embeddings = self.item_model(features['item_id'])

        return self.task(user_embeddings, positive_item_embeddings)

In [None]:
# Wire the two towers into the retrieval model and attach the optimizer.
model = UserItemModel(user_model=user_model, item_model=item_model)
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=optimizer)

In [None]:
# Batch sizes for the training and test pipelines.
train_batch_size, test_batch_size = 16384, 4096

# Batch first, then cache the batched datasets.
cached_train = train.batch(train_batch_size).cache()
cached_test = test.batch(test_batch_size).cache()

In [None]:
%%time
# Train on the cached, batched training set for three epochs.
model.fit(cached_train, epochs=3)