# Sample Neural Collaborative Filtering model

Based on the paper by He et al. and on a tutorial derived from it, found at https://www.kaggle.com/code/curiousraccoon/deep-learning-based-recommender-systems/edit.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv

from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

np.random.seed(123)


In [None]:
# Number of most-purchased articles kept as popularity-based candidates.
num_pop_items = 15
# First day of the window over which article popularity is counted.
one_month_before_val_date = '2020-08-15'
# Target number of negative samples drawn for each positive interaction.
neg_sample_num = 3
# Purchases older than this many days before a customer's latest purchase
# are not used as candidates.
days_max_diff = 14
# Inclusive first and last day of the training window.
train_begin_date = "2020-03-15"
train_end_date = "2020-09-15"
# Inclusive last day of the validation window.
val_end_date = "2020-09-22"

## Load data

In [None]:
def load_data_frames(data_path='../input/h-and-m-personalized-fashion-recommendations/'):
    """Load the four competition CSV files into DataFrames.

    Args:
        data_path: Directory containing ``transactions_train.csv``,
            ``sample_submission.csv``, ``customers.csv`` and
            ``articles.csv``. Defaults to the input directory this notebook
            was written for. A trailing separator is optional.

    Returns:
        Tuple ``(df, df_sub, dfu, dfi)``: transactions, sample submission,
        customers and articles, in that order.
    """
    csv_train = os.path.join(data_path, 'transactions_train.csv')
    csv_sub = os.path.join(data_path, 'sample_submission.csv')
    csv_users = os.path.join(data_path, 'customers.csv')
    csv_items = os.path.join(data_path, 'articles.csv')

    # article_id is read as text so the ids stay exactly as written in the
    # files; t_dat is parsed so it can be compared against Timestamps.
    df = pd.read_csv(csv_train, dtype={'article_id': str}, parse_dates=['t_dat'])
    df_sub = pd.read_csv(csv_sub)
    dfu = pd.read_csv(csv_users)
    dfi = pd.read_csv(csv_items, dtype={'article_id': str})

    return df, df_sub, dfu, dfi

# df - transaction dataframe
# df_sub - submission dataframe
# dfu - customers dataframe
# dfi - articles dataframe

df, df_sub, dfu, dfi = load_data_frames()


In [None]:
# create the dictionary
def create_item_group_dict(articles=None):
    """Build a four-level nested dictionary of article row labels.

    The levels are, in order, ``index_name``, ``product_group_name``,
    ``section_name`` and ``product_type_name``; each leaf is the list of row
    labels of the articles sharing those four values.

    Args:
        articles: Articles DataFrame holding the four grouping columns.
            Defaults to the module-level ``dfi``.

    Returns:
        dict: ``{index_name: {product_group_name: {section_name:
        {product_type_name: [row labels]}}}}``.
    """
    if articles is None:
        articles = dfi

    group_dict = {}
    groups = articles.groupby(
        ['index_name', 'product_group_name', 'section_name', 'product_type_name']
    ).groups

    for (index_name, product_group_name, section_name, product_type_name), rows in groups.items():
        # setdefault creates each missing intermediate level on the way down.
        section_level = (
            group_dict
            .setdefault(index_name, {})
            .setdefault(product_group_name, {})
            .setdefault(section_name, {})
        )
        section_level[product_type_name] = list(rows)

    return group_dict

group_dict = create_item_group_dict()

In [None]:
# Within each section, merge every product-type group that is too small for negative sampling into one
# combined item list, and store that same combined list under each of those small groups.
def merge_small_groups(group_dict, min_group_size=None):
    """Pool undersized product-type groups within each section.

    For every section, all product-type groups holding fewer than
    ``min_group_size`` items are concatenated into one list, and that same
    list object is stored under each of those product types. Groups that are
    large enough are left untouched.

    Args:
        group_dict: Nested dictionary as produced by
            ``create_item_group_dict``. It is modified in place.
        min_group_size: Smallest group size that is left unmerged. Defaults
            to ``neg_sample_num + 1``.

    Returns:
        dict: The same ``group_dict`` object that was passed in.
    """
    if min_group_size is None:
        min_group_size = neg_sample_num + 1

    for product_groups in group_dict.values():
        for sections in product_groups.values():
            for product_types in sections.values():
                small_names = [
                    name for name, members in product_types.items()
                    if len(members) < min_group_size
                ]
                pooled = [item for name in small_names for item in product_types[name]]

                # Keys are only reassigned here, after the scan above, so the
                # dictionary never changes size while it is being iterated.
                for name in small_names:
                    product_types[name] = pooled

    return group_dict
                    
group_dict_merged = merge_small_groups(group_dict)

In [None]:
# get reverse dictionary (address book): article id -> group
def create_article_group_dict(articles=None):
    """Map each article row label to its four-part group key.

    Args:
        articles: Articles DataFrame holding the columns ``index_name``,
            ``product_group_name``, ``section_name`` and
            ``product_type_name``. Defaults to the module-level ``dfi``.

    Returns:
        dict: ``{row label: (index_name, product_group_name, section_name,
        product_type_name)}``.
    """
    if articles is None:
        articles = dfi

    groups = articles.groupby(
        ['index_name', 'product_group_name', 'section_name', 'product_type_name']
    ).groups

    return {row: key for key, rows in groups.items() for row in rows}

item_group_dict = create_article_group_dict()    

## Define train and test sets

In [None]:
# Split the transactions by date: the training window is inclusive on both
# ends, the validation window starts strictly after the last training day.
train_set = df.loc[
    df.t_dat.between(pd.Timestamp(train_begin_date), pd.Timestamp(train_end_date))
]
val_set = df.loc[
    (df.t_dat > pd.Timestamp(train_end_date)) & (df.t_dat <= pd.Timestamp(val_end_date))
]

# keep only the columns that are used later
val_set = val_set[['t_dat', 'customer_id', 'article_id', 'price']]
train_set = train_set[['customer_id', 'article_id', 'price']]

# implicit feedback: every recorded purchase counts as an interaction of 1
train_set.loc[:, 'price'] = 1



In [None]:
train_set.shape[0]

### Make training data more memory efficient

In [None]:
# Replace the long string ids by the row labels of the customers / articles
# frames, and keep lookup tables for both directions.
index_to_id_dict = dfu["customer_id"].to_dict()
id_to_index_dict = dict(zip(dfu["customer_id"], dfu.index))
inx2idArt = dfi["article_id"].to_dict()
id2inxArt = dict(zip(dfi["article_id"], dfi.index))

train_set["customer_id"] = train_set["customer_id"].map(id_to_index_dict).astype('int32')
train_set["article_id"] = train_set["article_id"].map(id2inxArt)

# for switching back for submission use:
# sub["customer_id"] = sub["customer_id"].map(index_to_id_dict)

# constants needed by the dataset and the model
num_items = dfi['article_id'].unique().shape[0]
num_customers = dfu['customer_id'].unique().shape[0]
all_itemIds = dfi['article_id'].map(id2inxArt)

## Define data processing class for training data and create training data object

In [None]:
# Goal: develop a negative sampling procedure
# Plan: for now, just add negative examples where possible.
def select_neg_items(article_id, u, user_item_set):
    """Draw negative items for one positive (user, article) interaction.

    Candidates come from the (merged) group the positive article belongs to.
    A draw is rejected when the user already interacted with the item or the
    item was already picked for this call; after too many rejections the
    slot is skipped, so fewer items than requested may be returned.

    Args:
        article_id: Article index of the positive interaction.
        u: Customer index.
        user_item_set: Set of ``(customer, article)`` pairs that count as
            positive interactions.

    Returns:
        list: Selected negative article indices (possibly empty).
    """
    # group key of the positive article -> list of candidate items
    index_name, product_group, section, product_type = item_group_dict[article_id]
    candidates = group_dict_merged[index_name][product_group][section][product_type]

    # A group no larger than the requested count cannot yield that many
    # distinct negatives, so ask for one fewer than its size.
    pool_size = len(candidates)
    if pool_size > neg_sample_num:
        wanted = neg_sample_num
    elif pool_size != 0:
        wanted = pool_size - 1
    else:
        wanted = 0

    negatives = []
    for _ in range(wanted):
        pick = np.random.choice(candidates)
        retries = 0
        accepted = True

        while (u, pick) in user_item_set or pick in negatives:
            retries += 1
            pick = np.random.choice(candidates)
            # Give up on this slot once the retry budget is spent.
            if retries > pool_size * 3:
                accepted = False
                break

        if accepted:
            negatives.append(pick)

    return negatives
        
    

In [None]:
class TrainDataset(Dataset):
    """PyTorch dataset of positive interactions plus sampled negatives.

    Args:
        ratings (pd.DataFrame): Transactions with ``customer_id`` and
            ``article_id`` columns.
        all_itemIds: All article ids; accepted for interface compatibility
            and not read by this class.
    """

    def __init__(self, ratings, all_itemIds):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_itemIds)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, df, all_itemIds):
        """Return ``(users, items, labels)`` tensors for training.

        Each distinct (customer, article) pair contributes one row labelled 1,
        followed by the rows for the negatives that ``select_neg_items``
        returns for it, labelled 0.
        """
        user_item_set = set(zip(df['customer_id'], df['article_id']))
        print(f'The length of the set is {len(user_item_set)}')

        users, items, labels = [], [], []
        for index, (u, i) in enumerate(user_item_set, start=1):
            # progress report
            if index % 100000 == 0:
                print(f'The index is {index}')

            negatives = select_neg_items(i, u, user_item_set)

            users.extend([u] * (1 + len(negatives)))
            items.append(i)
            items.extend(negatives)
            labels.append(1)
            labels.extend([0] * len(negatives))

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [None]:

data = TrainDataset(train_set, all_itemIds)


In [None]:
torch.save(data, "/kaggle/working/dataset-2020.pt")

In [None]:
new_data = torch.load("/kaggle/working/dataset-2020.pt")
new_data[0]

## Define the model class

In [None]:
class NCF(pl.LightningModule):
    """Neural Collaborative Filtering (NCF) model.

    Users and items are embedded into 8 dimensions each; the concatenated
    embeddings go through two ReLU layers and a sigmoid output.

    Args:
        num_users (int): Number of rows of the user embedding table.
        num_items (int): Number of rows of the item embedding table.
        ratings (pd.DataFrame): Kept on the module as ``self.ratings``.
        all_itemIds: Kept on the module as ``self.all_itemIds``.
    """

    def __init__(self, num_users, num_items, ratings, all_itemIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        # 16 = user embedding (8) + item embedding (8)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_itemIds = all_itemIds

    def forward(self, user_input, item_input):
        """Return the interaction score in (0, 1) for each (user, item) pair."""
        embedded = torch.cat(
            [self.user_embedding(user_input), self.item_embedding(item_input)],
            dim=-1,
        )
        hidden = torch.relu(self.fc1(embedded))
        hidden = torch.relu(self.fc2(hidden))
        return torch.sigmoid(self.output(hidden))

    def training_step(self, batch, batch_idx):
        """Binary cross-entropy between the predicted scores and the labels."""
        user_input, item_input, labels = batch
        targets = labels.view(-1, 1).float()
        return nn.functional.binary_cross_entropy(self(user_input, item_input), targets)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        # Serves the module-level ``data`` dataset.
        return DataLoader(data, batch_size=512, num_workers=2)

## Train the model

In [None]:
# instantiate the model
model = NCF(num_customers, num_items, train_set, all_itemIds)

In [None]:
# NOTE(review): `gpus`, `reload_dataloaders_every_epoch` and
# `progress_bar_refresh_rate` are presumably only accepted by older
# pytorch-lightning releases — confirm against the installed version.
trainer = pl.Trainer(
    enable_checkpointing=True,
    max_epochs=5, 
    gpus=1, 
    reload_dataloaders_every_epoch=True, 
    progress_bar_refresh_rate=50, 
    logger=False,
    default_root_dir="/kaggle/working"
)

# The dataloader comes from model.train_dataloader().
trainer.fit(model)

### Save the model as a file - do not forget to download

In [None]:
trainer.save_checkpoint("/kaggle/working/curr_checkpoint-2020.ckpt")

In [None]:
# export the model
filepath = '/kaggle/working/model.pt'
torch.save(model.state_dict(), filepath)

## code to load the model
# loaded_model = NCF(num_customers, num_items, train_set, all_itemIds)
# loaded_model.load_state_dict(torch.load(filepath))

In [None]:
# NOTE(review): PATH is an empty placeholder; point it at a checkpoint file
# before running this cell.
PATH = ""
# NOTE(review): NCF.__init__ takes four required arguments and never calls
# save_hyperparameters(), so load_from_checkpoint presumably needs them
# passed as keyword arguments — confirm against the installed version.
model = NCF.load_from_checkpoint(PATH)
trainer = pl.Trainer()
# Continue training from the same checkpoint.
trainer.fit(model, ckpt_path=PATH)

## Create potential predictions set for each customer in validation set

In [None]:
# Validation-period transactions, re-encoded with the integer customer and
# article indices.
# NOTE(review): the lower bound is inclusive here, so purchases made on
# train_end_date are in this frame as well as in the training window.
val_set = df.loc[
    (df.t_dat >= pd.Timestamp(train_end_date)) & (df.t_dat <= pd.Timestamp(val_end_date))
]
val_set = val_set[['t_dat', 'customer_id', 'article_id', 'price']]
val_set["customer_id"] = val_set["customer_id"].map(id_to_index_dict).astype('int32')
val_set["article_id"] = val_set["article_id"].map(id2inxArt)
val_set = val_set[['t_dat','customer_id','article_id']]
val_set.head(10)

In [None]:
# FIND PREVIOUS PURCHASES
# Rebuild the training window from df, this time keeping the purchase date.
train_set = df.loc[
    df.t_dat.between(pd.Timestamp(train_begin_date), pd.Timestamp(train_end_date))
]
train_set = train_set[['t_dat', 'customer_id', 'article_id', 'price']]

# use more memory efficient ids
train_set["customer_id"] = train_set["customer_id"].map(id_to_index_dict).astype('int32')
train_set["article_id"] = train_set["article_id"].map(id2inxArt)

# Days between each purchase and the same customer's most recent purchase.
tmp = train_set.groupby('customer_id').t_dat.max().reset_index()
tmp.columns = ['customer_id','max_dat']
train_set = train_set.merge(tmp, on=['customer_id'], how='left')
train_set['diff_dat'] = (train_set.max_dat - train_set.t_dat).dt.days

# keep only purchases close enough to the customer's latest one
train_set = train_set.loc[train_set['diff_dat'] <= days_max_diff]
print('Train shape:',train_set.shape)

# only leave customers in the validation set, one row per (customer, article)
USERS = val_set['customer_id'].unique()
train_set = train_set[train_set['customer_id'].isin(USERS)]
train_set = train_set.drop_duplicates(['customer_id','article_id'])
train_set.head(10)

In [None]:
train_set[['customer_id', 'article_id']].groupby('customer_id').count().mean()

In [None]:
# add items purchased together
# USE PANDAS TO MAP COLUMN WITH DICTIONARY
pairs = np.load('../input/hmitempairs/pairs_cudf.npy',allow_pickle=True).item()

In [None]:
# Re-key the article pairs with the integer article indices: every raw id is
# turned into its '0'-prefixed string form before the lookup.
pairs_converted = {
    id2inxArt['0' + str(key)]: id2inxArt['0' + str(paired)]
    for key, paired in pairs.items()
}

train_set['article_id2'] = train_set['article_id'].map(pairs_converted)

# RECOMMENDATION OF PAIRED ITEMS
# One row per (customer, paired article); purchases without a pair are dropped.
train2 = (
    train_set.loc[train_set['article_id2'].notnull(), ['customer_id', 'article_id2']]
    .drop_duplicates(['customer_id', 'article_id2'])
    .rename(columns={'article_id2': 'article_id'})
)
train2.shape[0]

In [None]:
train2.groupby('customer_id').count().mean()

In [None]:
# CONCATENATE PAIRED ITEM RECOMMENDATION AFTER PREVIOUS PURCHASED RECOMMENDATIONS
train_set = pd.concat(
    [train_set[['customer_id', 'article_id']], train2],
    axis=0,
    ignore_index=True,
)
train_set['article_id'] = train_set['article_id'].astype('int32')
# first occurrence wins, so previous purchases stay ahead of paired items
train_set = train_set.drop_duplicates(['customer_id', 'article_id'])

In [None]:
train_set = train_set.reset_index(drop=True)

# create dictionary -> customer: list of candidate items
# A single grouped apply replaces one get_group() call per customer, which
# re-selected the rows of the frame for every key.
train_preds = train_set.groupby('customer_id')
pred_list_dict = train_preds['article_id'].apply(list).to_dict()

pred_list_dict

In [None]:
data_path = '../input/h-and-m-personalized-fashion-recommendations/'
csv_train = f'{data_path}transactions_train.csv'

df = pd.read_csv(csv_train, dtype={'article_id': str}, parse_dates=['t_dat'])

# RECOMMEND LAST MONTH'S MOST POPULAR ITEMS
# Purchase counts between one_month_before_val_date and train_end_date
# (both inclusive); the num_pop_items most frequent articles are kept.
pop_train = df.loc[
    df.t_dat.between(pd.Timestamp(one_month_before_val_date), pd.Timestamp(train_end_date))
]
pop_items = dict(pop_train['article_id'].value_counts()[:num_pop_items]).keys()

# translate the string article ids into integer article indices
pop_items_converted = [id2inxArt[item] for item in pop_items]

pop_items_converted

## Make prediction

In [None]:
# create predictions for all the customers in the validation set
# return submission-formatted dataframe
def make_predictions(val_subm_set, item_candidate_list, pop_items):
    """Score each customer's candidate items and keep the 12 best.

    Args:
        val_subm_set: DataFrame with two columns, the first one being
            ``customer_id`` with integer customer indices; one row per
            customer.
        item_candidate_list: Mapping from customer index to a list of
            candidate item indices.
        pop_items: List of item indices appended to every customer's
            candidates, and used alone for customers without candidates.

    Returns:
        A deep copy of ``val_subm_set`` in which each row is overwritten with
        ``[original customer id, space-separated article ids]``.
    """
    predictions = val_subm_set.copy(deep=True)
    customer_list = list(val_subm_set['customer_id'])
    pop_items = list(pop_items)

    # tqdm is a progress bar
    for index, u in tqdm(enumerate(customer_list)):
        # Only a missing customer means "no personal candidates"; any other
        # error is a real problem and must not be hidden.
        try:
            candidates = list(item_candidate_list[u])
        except LookupError:
            candidates = []

        # Drop duplicates (order preserved) so an item that is both a
        # personal candidate and a popular item cannot take two slots.
        test_items = list(dict.fromkeys(candidates + pop_items))

        if test_items:
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                scores = model(torch.tensor([u] * len(test_items)),
                               torch.tensor(test_items))
            # reshape(-1) keeps a 1-D array even for a single candidate.
            predicted_labels = scores.detach().reshape(-1).numpy()
            top12_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:12].tolist()]
        else:
            top12_items = []

        # convert predictions to article ids and make them a string
        top_articles_ids = list(map(inx2idArt.get, top12_items))
        top_articles_ids_str = ' '.join([str(k) for k in top_articles_ids])

        # overwrite the row with the original customer id and its predictions
        predictions.iloc[index] = [index_to_id_dict[u], top_articles_ids_str]

    return predictions

#### Convert test set into submission like format for evaluation

In [None]:
# Bring the validation set into the submission layout: one row per customer
# with all purchased article ids joined by spaces in a 'prediction' column.
valid = (
    val_set.groupby('customer_id')['article_id']
    .apply(list)
    .reset_index()
    .rename(columns={'article_id': 'prediction'})
)
valid['prediction'] = valid['prediction'].apply(lambda x: ' '.join(map(str, x)))
valid.head()

In [None]:
valid.shape[0]

In [None]:
predictions = make_predictions(valid, pred_list_dict, pop_items_converted)
                               
predictions.head()

## Evaluate on test set

In [None]:
# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/306007
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py


def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average precision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists; 0.0 when
            `actual` is empty
    """
    # Without relevant items there is nothing to hit, and the normalisation
    # below would divide by zero.
    if len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A repeated prediction is only credited the first time it appears.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    The average precision at k is computed for every pair of lists taken in
    lockstep from `actual` and `predicted`, and the mean of those scores is
    returned.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    per_list_scores = []
    for wanted, guessed in zip(actual, predicted):
        per_list_scores.append(apk(wanted, guessed, k))
    return np.mean(per_list_scores)

#### Calculate MAP@12

In [None]:
predictions.head()

In [None]:
data_path = '../input/h-and-m-personalized-fashion-recommendations/'
csv_train = f'{data_path}transactions_train.csv'
df = pd.read_csv(csv_train, dtype={'article_id': str}, parse_dates=['t_dat'])

# Ground truth for scoring: purchases strictly after train_end_date. The
# training window is inclusive of train_end_date, so a `>=` bound here would
# score purchases that were also used to build the candidates.
val_set = df.loc[
    (df.t_dat > pd.Timestamp(train_end_date)) & (df.t_dat <= pd.Timestamp(val_end_date))
]
val_set = val_set[['customer_id','article_id']]

# convert val set to submission file equivalent
valid = val_set.groupby('customer_id').article_id.apply(list).reset_index()
valid = valid.rename({'article_id':'prediction'},axis=1)
valid['prediction'] =\
    valid.prediction.apply(lambda x: ' '.join([str(k) for k in x]))
valid.head()

In [None]:
# calculate MAP@12
# Put the predictions in the same customer order as the ground truth, then
# compare the space-separated id strings as lists.
pred_arranged = (
    predictions
    .set_index('customer_id')
    .loc[valid.customer_id]
    .reset_index()
)
evaluation = mapk(
    valid['prediction'].str.split(),
    pred_arranged['prediction'].str.split(),
    k=12,
)
print(f"Test set results: MAP@12 = {evaluation}.")

## Create submission file and submit

In [None]:
# create submission file
# make_predictions needs a frame of integer customer indices, the candidate
# dictionary and the popular-item fallback; customers without an entry in
# pred_list_dict are scored on the popular items only.
# NOTE(review): assumes df_sub has exactly the columns customer_id and
# prediction, and that every customer_id is present in dfu — confirm.
sub_input = df_sub.copy()
sub_input['customer_id'] = sub_input['customer_id'].map(id_to_index_dict)
sub = make_predictions(sub_input, pred_list_dict, pop_items_converted)

# submit competition
# To be done

