In [1]:
import tensorflow as tf
print(tf.__version__)

import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

import os
import pprint

from typing import Dict, Text
import pandas as pd
import numpy as np

import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split

2.8.0


In [2]:
def read_files(file_path, **kwargs):
    
    art_df_args = dict(filepath_or_buffer=file_path + 'articles.csv',low_memory = False)
    if 'art_cols' in kwargs:
        art_df_args['usecols']=kwargs['art_cols']
    
    cust_df_args = dict(filepath_or_buffer=file_path + 'customers.csv', low_memory = False)
    if  'cust_cols' in  kwargs:
        cust_df_args['usecols']=kwargs['cust_cols']
    
    trans_df_args= dict(filepath_or_buffer=file_path + 'transactions_train.csv', low_memory = False)
    if  'trans_cols' in kwargs:
        trans_df_args['usecols']=kwargs['trans_cols']
    
    art_df = pd.read_csv(**art_df_args)
    cust_df = pd.read_csv(**cust_df_args)
    trans_df= pd.read_csv(**trans_df_args)
    
    customer_lookup = cust_df.reset_index().set_index('customer_id')['index'].astype(str).to_dict()
    article_lookup =art_df.reset_index().set_index('article_id')['index'].astype(str).to_dict()
    
    trans_df['user_id']= trans_df['customer_id'].map(customer_lookup)
    trans_df['item_id']= trans_df['article_id'].map(article_lookup)
    
    unique_users = trans_df['user_id'].unique()
    unique_items = trans_df['item_id'].unique()
    
    trans_df = trans_df.drop(columns =['customer_id','article_id'])
    
    return customer_lookup, article_lookup, trans_df, unique_users, unique_items

In [3]:
%%time
#cust_cols=['customer_id']
#trans_cols= ['customer_id','article_id']
file_path = '../input/h-and-m-personalized-fashion-recommendations/'
customer_lookup, article_lookup, trans_data, user_vocab, item_vocab = read_files(file_path, cust_cols=['customer_id'], trans_cols= ['customer_id','article_id'])

Wall time: 37.1 s


In [4]:
%%time
train_size =0.80
np.random.seed(1221)
train = trans_data[['user_id','item_id']].sample(frac=train_size)
test =  trans_data[['user_id','item_id']].drop(train.index)

train = tf.data.Dataset.from_tensor_slices(dict(train))
test = tf.data.Dataset.from_tensor_slices(dict(test))

Wall time: 8.25 s


In [5]:
items = tf.data.Dataset.from_tensor_slices(item_vocab)

In [7]:
embedding_dimension = 32

In [8]:
## 4. Convert user_ids to integers and convert them embeddings visa Embedding layer
## Query tower

user_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(
        vocabulary = user_vocab, mask_token =None),
    tf.keras.layers.Embedding(len(user_vocab)+1, embedding_dimension)])

In [9]:
## Candidate tower

item_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(
        vocabulary = item_vocab, mask_token =None),
    tf.keras.layers.Embedding(len(item_vocab)+1, embedding_dimension)
])

In [10]:
metrics = tfrs.metrics.FactorizedTopK(
    candidates=items.batch(256).map(item_model))

task = tfrs.tasks.Retrieval(metrics=metrics)

In [11]:
class UserItemModel(tfrs.Model):
    
    def __init__(self, user_model, item_model):
        super().__init__()
        self.user_model : tf.keras.Model = user_model
        self.item_model : tf.keras.Model = item_model
        self.task : tf.keras.layers.Layer = task
            
    def compute_loss(self,features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        
        user_embeddings = self.user_model(features['user_id'])
        positive_item_embeddings = self.item_model(features['item_id'])
        
        return self.task(user_embeddings,positive_item_embeddings)

In [12]:
model = UserItemModel(user_model, item_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [13]:
cached_train = train.batch(16384).cache()
cached_test = test.batch(4096).cache()

In [14]:
%%time
model.fit(cached_train, epochs=3)

Epoch 1/3
  34/1553 [..............................] - ETA: 5:10:38 - factorized_top_k/top_1_categorical_accuracy: 1.9747e-05 - factorized_top_k/top_5_categorical_accuracy: 0.0017 - factorized_top_k/top_10_categorical_accuracy: 0.0028 - factorized_top_k/top_50_categorical_accuracy: 0.0078 - factorized_top_k/top_100_categorical_accuracy: 0.0119 - loss: 158989.9104 - regularization_loss: 0.0000e+00 - total_loss: 158989.9104

KeyboardInterrupt: 