In [177]:
import random
import math
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from copy import deepcopy
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler
from collections import OrderedDict
from tqdm import tqdm
import torch.nn.functional as F
import implicit

## Data Loading

In [178]:
colnames = ['user_id', 'movie_id', 'rating', 'time']
df_train = pd.read_csv('ml-100k/u1.base', delimiter='\t', header=None, names=colnames)
df_test = pd.read_csv('ml-100k/u1.test', delimiter='\t', header=None, names=colnames)
df_full = pd.concat([df_train, df_test])
df_full.head()

Unnamed: 0,user_id,movie_id,rating,time
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


## Preprocessing modules

In [179]:
def binary_rating(data):
    temp = data.copy()
    temp['rating']=np.where(temp['rating']>0,1,0)
    return temp

def sample_negative(data,item_list):
    interact_status=data.groupby('user_id')['movie_id'].apply(set).reset_index().rename(columns={'movie_id':'interacted_items'})
    interact_status['negative_items']=interact_status['interacted_items'].apply(lambda x: item_pool-x)
    interact_status['negative_sample']=interact_status['negative_items'].apply(lambda x: random.sample(list(x),99)) #Sample negative examples
    #interact_status['negative_sample']=interact_status['negative_sample'].apply(set)
    return interact_status[['user_id', 'negative_items', 'negative_sample']]

def split_loo(ratings):
    """train/test split using leave one out strategy"""

    ratings['rank_latest'] = ratings.groupby(['user_id'])['time'].rank(
        method='first', ascending=False)
    
    test = ratings[ratings['rank_latest'] == 1]
    train = ratings[ratings['rank_latest'] > 1]

    assert train['user_id'].nunique() == test['user_id'].nunique()
    return train[['user_id', 'movie_id', 'rating']], test[['user_id', 'movie_id', 'rating']]

class UserItemRatingDataset(Dataset):
    """Wrapper, convert <user, item, rating> Tensor into Pytorch Dataset"""
    
    def __init__(self, user_tensor, item_tensor, target_tensor):
        self.user_tensor = user_tensor
        self.item_tensor = item_tensor
        self.target_tensor = target_tensor

    def __getitem__(self, index):
        return self.user_tensor[index], self.item_tensor[index], self.target_tensor[index]

    def __len__(self):
        return self.user_tensor.size(0)



## Utils

In [180]:
### utils
def resume_checkpoint(model, model_dir, device_id):
    state_dict = torch.load(
        model_dir,
        map_location=lambda storage, loc: storage.cuda(device=device_id)
        ) 
    model.load_state_dict(state_dict)

In [181]:
df_process=binary_rating(df_full)
df_process

Unnamed: 0,user_id,movie_id,rating,time
0,1,1,1,874965758
1,1,2,1,876893171
2,1,3,1,878542960
3,1,4,1,876893119
4,1,5,1,889751712
...,...,...,...,...
19995,458,648,1,886395899
19996,458,1101,1,886397931
19997,459,934,1,879563639
19998,460,10,1,882912371


In [182]:
user_pool = set(df_process['user_id'].unique())
item_pool = set(df_process['movie_id'].unique())

In [183]:
df_negative=sample_negative(df_process,item_pool)
df_negative

Unnamed: 0,user_id,negative_items,negative_sample
0,1,"{273, 274, 275, 276, 277, 278, 279, 280, 281, ...","[704, 460, 1463, 942, 847, 930, 1544, 680, 157..."
1,2,"{2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 1...","[130, 1042, 1015, 233, 112, 1023, 1526, 1074, ..."
2,3,"{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[1527, 1098, 129, 1099, 1424, 474, 1116, 930, ..."
3,4,"{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15...","[1042, 624, 1158, 1405, 1570, 1668, 291, 69, 7..."
4,5,"{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...","[1549, 331, 1600, 1632, 1420, 1261, 1268, 983,..."
...,...,...,...
938,939,"{1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 1...","[1125, 915, 604, 277, 674, 1559, 1398, 761, 75..."
939,940,"{1, 2, 3, 5, 6, 10, 11, 13, 15, 16, 17, 18, 19...","[492, 1396, 1162, 1500, 569, 1523, 268, 1064, ..."
940,941,"{2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, ...","[1378, 185, 525, 1436, 1169, 1580, 553, 1281, ..."
941,942,"{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[802, 549, 369, 1211, 1084, 147, 138, 1145, 10..."


In [184]:
df_train, df_test = split_loo(df_process)


In [185]:
# add negative data for implicit data (no rating data)
num_neg = 2
df_train = pd.merge(df_train, df_negative[['user_id', 'negative_items']], on='user_id')
df_train['negatives'] = df_train['negative_items'].apply(lambda x: random.sample(list(x), num_neg))
df_test = pd.merge(df_test, df_negative[['user_id', 'negative_items']], on='user_id')
df_test['negatives'] = df_test['negative_items'].apply(lambda x: random.sample(list(x), num_neg))


In [186]:

users_train = df_train['user_id'].values.tolist()
items_train = df_train['movie_id'].values.tolist()
ratings_train = df_train['rating'].astype(float).values.tolist()


In [187]:
# Adding Item, Users, Rating to the pool of + ves examples
for i in range(num_neg):
    users_train += df_train['user_id'].values.tolist()
    items_train += df_train['negatives'].apply(lambda x: x[i]).values.tolist()
    ratings_train += [0]*len(df_train)

In [188]:
train_dataset = UserItemRatingDataset(
    user_tensor=torch.LongTensor(users_train),
    item_tensor=torch.LongTensor(items_train),
    target_tensor=torch.FloatTensor(ratings_train))

In [189]:
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)

In [190]:
user_sample, item_sample, target_sample = next(iter(train_loader))

## GMF (Generalized Matrix Factorization)

In [191]:
gmf_config = {
    'alias': 'gmf_factor8neg2-implict',
    'num_epoch': 20,
    'batch_size': 256,
    'optimizer': 'adam',
    'adam_lr': 1e-3,
    'num_users': len(user_pool) + 1,
    'num_items': len(item_pool) + 1,
    'latent_dim': 8,
    'num_negative': num_neg,
    'l2_regularization': 0.01,
    'use_cuda': True,
    'device_id': 0,
    'model_dir':'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model'
}

class GMF(torch.nn.Module):
    def __init__(self, config):
        super(GMF, self).__init__()
        self.num_users = config['num_users']
        self.num_items = config['num_items']
        self.latent_dim = config['latent_dim']

        self.embedding_user = torch.nn.Embedding(
            num_embeddings=self.num_users, embedding_dim=self.latent_dim)
        
        self.embedding_item = torch.nn.Embedding(
            num_embeddings=self.num_items, embedding_dim=self.latent_dim)

        self.affine_output = torch.nn.Linear(
            in_features=self.latent_dim, out_features=1)
        
        self.logistic = torch.nn.Sigmoid()

    def forward(self, user_indices, item_indices):

        # (m, latent_dim)
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)
        element_product = torch.mul(user_embedding, item_embedding) # Element wise product

        # (m, 1)
        logits = self.affine_output(element_product)
        rating = self.logistic(logits)
        return rating

    def init_weight(self):
        pass

### testing
gmf_model = GMF(gmf_config)
result = gmf_model(user_sample, item_sample)
print(f"Result Size: {result.size()}")

Result Size: torch.Size([256, 1])


## MLP Layer
- https://pytorch.org/tutorials/recipes/recipes/defining_a_neural_network.html

In [192]:
# fit trained gmf layer into MLP

mlp_config = {
    'alias': 'mlp_factor8neg4_bz256_166432168_pretrain_reg_0.0000001',
    'num_epoch': 20,
    'batch_size': 256,
    'optimizer': 'adam',
    'adam_lr': 1e-3,
    'num_users': len(user_pool) + 1,
    'num_items': len(item_pool) + 1,
    'latent_dim': 8,
    'num_negative': 2,
    'layers': [16, 64, 32, 16, 8],  # layers[0] is the concat of latent user & latent item
    'l2_regularization': 0.0000001,  # MLP model is sensitive to hyper params
    'use_cuda': False,
    'device_id': 7,
    'pretrain': False,
    'model_dir':'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model',
    'pretrain_mf': True,
    'pretrain_mf_loc': 'checkpoints/{}'.format('gmf_factor8neg4_Epoch100_HR0.6391_NDCG0.2852.model'),
    }

class MLP(torch.nn.Module):
    def __init__(self, config, gmf_model=None):

        super(MLP, self).__init__()
        self.config = config
        self.num_users = config['num_users']
        self.num_items = config['num_items']
        self.latent_dim = config['latent_dim']

        self.embedding_user = torch.nn.Embedding(
            num_embeddings=self.num_users, embedding_dim=self.latent_dim)
        
        self.embedding_item = torch.nn.Embedding(
            num_embeddings=self.num_items, embedding_dim=self.latent_dim)

        self.fc_layers = torch.nn.ModuleList()
        for idx, (in_size, out_size) in enumerate(zip(config['layers'][:-1], config['layers'][1:])):
            self.fc_layers.append(torch.nn.Linear(in_size, out_size))

        self.affine_output = torch.nn.Linear(in_features=config['layers'][-1], out_features=1)
        self.logistic = torch.nn.Sigmoid()

    def forward(self, user_indices, item_indices):

        # (m, latent_dim)
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)

        # the concat latent vector, (m, latent_dim * 2)
        vector = torch.cat([user_embedding, item_embedding], dim=-1)  # Concatenation of vectors

        # (m, latent_dim * 2) --> (m, 64, 32, ..., 8)
        for idx, _ in enumerate(range(len(self.fc_layers))):
            vector = self.fc_layers[idx](vector)
            vector = torch.nn.ReLU()(vector)
            # vector = torch.nn.BatchNorm1d()(vector)
            # vector = torch.nn.Dropout(p=0.5)(vector)

        # (m, 1)
        logits = self.affine_output(vector)
        rating = self.logistic(logits)
        return rating

    def init_weight(self):
        pass

### testing
mlp_model = MLP(mlp_config, gmf_model)
result = mlp_model(user_sample, item_sample)
print(f"Result Size: {result.size()}")

Result Size: torch.Size([256, 1])


## Combining MLP and GMF = NeuMF

In [199]:
neumf_config = {
    'alias': 'pretrain_neumf_factor8neg4',
    'num_epoch': 200,
    'batch_size': 256,
    'optimizer': 'adam',
    'adam_lr': 1e-3,
    'num_users': len(user_pool) + 1,
    'num_items': len(item_pool) + 1,
    'latent_dim_mf': 8,
    'latent_dim_mlp': 8,
    'num_negative': 2,
    'layers': [16, 64, 32, 16, 8,120,200,130],
    'l2_regularization': 0.01,
    'use_cuda': False,
    'device_id': 7,
    'model_dir':'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model',
    'pretrain': True,
    'pretrain_mf_loc': 'checkpoints/{}'.format('gmf_factor8neg4_Epoch100_HR0.6391_NDCG0.2852.model'),
    'pretrain_mlp_loc': 'checkpoints/{}'.format('mlp_factor8neg4_Epoch100_HR0.5606_NDCG0.2463.model'),
    }

class NeuMF(torch.nn.Module):
    def __init__(self, config, gmf_model=None, mlp_model=None):

        super(NeuMF, self).__init__()
        self.config = config
        self.num_users = config['num_users']
        self.num_items = config['num_items']
        self.latent_dim_mf = config['latent_dim_mf']
        self.latent_dim_mlp = config['latent_dim_mlp']

        self.embedding_user_mlp = torch.nn.Embedding(
            num_embeddings=self.num_users, embedding_dim=self.latent_dim_mlp)
        
        self.embedding_item_mlp = torch.nn.Embedding(
            num_embeddings=self.num_items, embedding_dim=self.latent_dim_mlp)
        
        self.embedding_user_mf = torch.nn.Embedding(
            num_embeddings=self.num_users, embedding_dim=self.latent_dim_mf)
        
        self.embedding_item_mf = torch.nn.Embedding(
            num_embeddings=self.num_items, embedding_dim=self.latent_dim_mf)
        
        self.fc_layers = torch.nn.ModuleList() # Same as python list, but for pytorch
        for idx, (in_size, out_size) in enumerate(zip(config['layers'][:-1], config['layers'][1:])):
            self.fc_layers.append(torch.nn.Linear(in_size, out_size))

        self.affine_output = torch.nn.Linear(
            in_features=config['layers'][-1] + config['latent_dim_mf'], out_features=1)
        self.logistic = torch.nn.Sigmoid()


    def forward(self, user_indices, item_indices):

        # (m, latent_dim_mlp)
        user_embedding_mlp = self.embedding_user_mlp(user_indices)
        item_embedding_mlp = self.embedding_item_mlp(item_indices)

        # (m, latent_dim_mf)
        user_embedding_mf = self.embedding_user_mf(user_indices)
        item_embedding_mf = self.embedding_item_mf(item_indices)

        # (m, latent_dim_mlp * 2)
        mlp_vector = torch.cat([user_embedding_mlp, item_embedding_mlp], dim=-1)

        # (m, latent_dim_mf)
        mf_vector = torch.mul(user_embedding_mf, item_embedding_mf)

        # (m, latent_dim_mlp * 2) --> (m, 64) --> (m, latent_dim_mlp)
        for idx, _ in enumerate(range(len(self.fc_layers))):
            mlp_vector = self.fc_layers[idx](mlp_vector)
            mlp_vector = torch.nn.ReLU()(mlp_vector)

        # (m, enumerate + latent_dim_mf)
        vector = torch.cat([mlp_vector, mf_vector], dim=-1)

        # (m, 1)
        logits = self.affine_output(vector)
        rating = self.logistic(logits)
        return rating

    def init_weight(self):
        pass

### testing
neumf_model = NeuMF(neumf_config, gmf_model, mlp_model)
pred = neumf_model(user_sample, item_sample)
print(f"Result Size: {pred.size()}")

Result Size: torch.Size([256, 1])


## Prediction 

In [194]:
### loss function
criterion = torch.nn.BCELoss()
loss = criterion(pred.view(-1), target_sample)
print(loss)

tensor(0.6863, grad_fn=<BinaryCrossEntropyBackward0>)


In [195]:
# for evaluation, give 1 positive sample and N negative sample to each user
# the model will rank the N + 1 items

g_truth = dict(zip(df_test['user_id'], df_test['movie_id']))
d1 = df_test[['user_id', 'negatives']].copy()
d1 = d1.explode('negatives')
d1['rating'] = 0
d1.rename(columns={'negatives' : 'movie_id'}, inplace=True)
df_eval = pd.concat([d1, df_test[['user_id', 'movie_id', 'rating']]])
df_eval.head()

Unnamed: 0,user_id,movie_id,rating
0,3,166,0
0,3,1356,0
1,4,990,0
1,4,1031,0
2,5,1408,0


## Inference on Evaluation Set

In [196]:
### rank according to score
pred = neumf_model(
    torch.LongTensor(df_eval['user_id'].values.tolist()), 
    torch.LongTensor(df_eval['movie_id'].values.tolist())
    )
df_eval['score'] = pred.view(-1).detach().numpy()
df_eval['rank'] = df_eval.groupby('user_id')['score'].rank(method='first', ascending=False)
df_eval.sort_values(['user_id', 'rank'], inplace=True)
df_eval['g_truth'] = df_eval['user_id'].map(g_truth)
df_eval.head()

Unnamed: 0,user_id,movie_id,rating,score,rank,g_truth
815,1,1014,0,0.510884,1.0,74
815,1,74,1,0.498887,2.0,74
815,1,1202,0,0.486119,3.0,74
816,2,281,1,0.498252,1.0,281
816,2,552,0,0.49793,2.0,281


## Evaluation Metrics

In [197]:
# hit ratio
def cal_hit_ratio(df, top_k=2):
    """Hit Ratio @ top_K"""

    top_k = df[df['rank'] <= top_k].copy()
    # golden items hit in the top_K items
    test_in_top_k = top_k[top_k['movie_id'] == top_k['g_truth']]  
    return len(test_in_top_k) * 1.0 / df['user_id'].nunique()

hr_ratio = cal_hit_ratio(df_eval)
print('HR Ratio', hr_ratio)

# ndcg
def cal_ndcg(df, top_k=2):
    top_k = df[df['rank'] <= top_k].copy()
    test_in_top_k = top_k[top_k['movie_id'] == top_k['g_truth']]  

    # the rank starts from 1
    # if rank is 1, then ndcg = 1
    test_in_top_k['ndcg'] = test_in_top_k['rank'].apply(
        lambda x: math.log(2) / math.log(1 + x)
        )
    return test_in_top_k['ndcg'].sum() * 1.0 / df['user_id'].nunique()

ndcg = cal_ndcg(df_eval)
print('NDCG', ndcg)

HR Ratio 0.6606574761399788
NDCG 0.53502486839495


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_in_top_k['ndcg'] = test_in_top_k['rank'].apply(
