#### A study of recommender-system metrics using the Slim version of the X-Wines dataset
#### Rogério Xavier de Azambuja (rogerio.xavier@farroupilha.ifrs.edu.br)
#### Dataset X-Wines from https://github.com/rogerioxavier/X-Wines

In [1]:
# Opening required packages for processing
# basic libs
import sys
import pandas as pd
import numpy as np
import random
from tqdm.notebook import tqdm

# Cornac[1] framework
import cornac
from cornac.eval_methods import RatioSplit
from cornac.models import PMF, NMF, MMMF, BPR, IBPR, ItemKNN, MLP, NeuMF, HFT, CTR, VAECF, BiVAECF 
from cornac.metrics import MAE, RMSE, Precision, Recall, FMeasure, NDCG, AUC, MAP, MRR, MSE
# if the model uses autoencoder may need a word dictionary
from cornac.data import Reader
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer

import tensorflow as tf

# Report the interpreter and framework versions this notebook was run with
for component, version in (
    ("System", sys.version),
    ("Cornac", cornac.__version__),
    ("Tensorflow", tf.__version__),
):
    print(f"{component} version: {version}")

FM model is only supported on Linux.
Windows executable can be found at http://www.libfm.org.
System version: 3.7.15 (default, Nov  9 2022, 10:44:37) [MSC v.1916 64 bit (AMD64)]
Cornac version: 1.14.2
Tensorflow version: 1.15.0


In [2]:
# Load the two X-Wines Slim tables (wines and ratings) with identical reader options
csv_options = {"low_memory": False, "encoding": "utf-8", "memory_map": True}
slimWines = pd.read_csv('XWines_100k_Slim_wines.csv', **csv_options)
slimRatings = pd.read_csv('XWines_150k_Slim_ratings.csv', **csv_options)
# Show the row count of each table
len(slimWines), len(slimRatings)

(1007, 150000)

In [3]:
# Quick summary of the ratings table: distinct users/wines, date span and oldest vintage
n_users = slimRatings.UserID.nunique()
n_wines = slimRatings.WineID.nunique()
first_date, last_date = slimRatings.Date.min(), slimRatings.Date.max()
oldest_vintage = slimRatings.Vintage.min()

print(f"{n_users} users who rated {n_wines} wines")
print(f"in the period between {first_date} and {last_date}")
print(f"Total ratings: {len(slimRatings)} 5-stars on wine vintages since {oldest_vintage}, including non-vintage 'N.V.'")

10561 users who rated 1007 wines
in the period between 2012-04-19 20:46:00 and 2021-12-31 23:58:52
Total ratings: 150000 5-stars on wine vintages since 1950, including non-vintage 'N.V.'


In [4]:
SEED = 42
VERBOSE = False

# X-Wines ratings as a list of (user_id: str, wine_id: str, rating: float) triples.
# The ids are cast to int first and then to str, so each id is the text of its
# integer value, exactly as the tuples displayed below show.
# Plain column casts are used instead of eval() on generated source text and
# instead of a row-by-row iterrows() loop; .tolist() yields native Python values.
print("Sample in list format")
XW_150k = list(zip(
    slimRatings.UserID.astype(int).astype(str).tolist(),
    slimRatings.WineID.astype(int).astype(str).tolist(),
    slimRatings.Rating.astype(float).tolist(),
))

# Show the first five triples and the total number of ratings
XW_150k[:5], len(XW_150k)

Sample in list format


  0%|          | 0/150000 [00:00<?, ?it/s]

([('1160798', '101008', 2.0),
  ('1213955', '111429', 4.5),
  ('1230088', '100064', 4.0),
  ('2052301', '111495', 4.0),
  ('1355634', '168128', 3.5)],
 150000)

In [5]:
print("Dictionary creation for autoencoder uses")

# One row of descriptive text columns per WineID. Only the first row of any
# duplicated WineID is kept, which is the row the per-wine lookup uses.
wine_text = (
    slimWines.drop_duplicates(subset="WineID")
    .set_index("WineID")[["WineName", "WineryName", "RegionName"]]
)

# item_ids[i] is the string id of a rated wine and item_dict[i] its text document
item_ids = []
item_dict = []
for wine in tqdm(slimRatings.WineID.unique()):
    # Same string form as the ids in the rating triples: text of the integer id
    item_ids.append(str(int(wine)))
    # A rated wine missing from slimWines raises KeyError here
    wine_name, winery_name, region_name = wine_text.loc[wine]
    # Join the three fields with single spaces and collapse doubled spaces
    item_dict.append(" ".join((wine_name, winery_name, region_name)).replace("  ", " "))

# Show a sample of ids and documents together with their lengths
item_ids[:5], len(item_ids), item_dict[:5], len(item_dict)

Dictionary creation for autoencoder uses


  0%|          | 0/1007 [00:00<?, ?it/s]

(['101008', '111429', '100064', '111495', '168128'],
 1007,
 ['Campo Largo Branco Seco de Mesa Zanlorenzi Serra Gaúcha',
  'Saint-Estèphe (Grand Cru Classé) Château Montrose Saint-Estèphe',
  'Réserve Brut CHANDON Brazil Serra Gaúcha',
  'Pauillac (Grand Cru Classé) Château Duhart-Milon Pauillac',
  'Brut Rosé CHANDON Argentina Mendoza'],
 1007)

In [6]:
# Wrap the per-wine text documents in a Cornac TextModality, a convenient
# container for working with text auxiliary information about the items
wine_tokenizer = BaseTokenizer(stop_words='english')
item_text_modality = TextModality(
    corpus=item_dict,
    ids=item_ids,
    tokenizer=wine_tokenizer,
    max_vocab=5000,
    max_doc_freq=0.5,
)

In [7]:
# Ratio-based train/test split of the X-Wines ratings (20% held out for testing),
# with the item text modality attached so it is available to the models
rsXW = RatioSplit(
    data=XW_150k,
    test_size=0.2,
    rating_threshold=5.0,
    exclude_unknowns=True,
    item_text=item_text_modality,
    seed=SEED,
    verbose=VERBOSE,
)

In [8]:
# define models
# Hyper-parameters shared by the tuned "(M)" variants below
K=10
LEARNING_RATE = 0.001  # @param
VOCAB_SIZE = 5000  # @param
NUM_FACTORS = 25  # @param
AE_LAYERS  = [500, 100]  # @param  [500, 100]
MLP_LAYERS = [128, 64, 32, 16]  # @param
ACTIVATION = "tanh"  # @param ["tanh", "sigmoid", "relu", "leaky_relu"]
LIKELIHOOD = "bern"  # @param ["bern", "mult", "gaus", "pois"]
NEG_SAMPLES = 3  # @param
NUM_EPOCHS = 20  # @param  (not referenced by any model in this cell)
BATCH_SIZE = 100  # @param

# Each algorithm is evaluated twice:
#   "(D)" - the library's default hyper-parameters
#   "(M)" - manually chosen hyper-parameters
# The "(D)" variants are given seed=SEED as well, so their results are
# reproducible across runs; no hyper-parameter is changed.
# IBPR is left unseeded in both variants, matching the original tuned call.
# NOTE(review): presumably the IBPR constructor has no seed argument - confirm against the Cornac docs.
models = [
    PMF(name='PMF (D)', seed=SEED),
    PMF(name='PMF (M)', k=K, variant='linear', seed=SEED, verbose=VERBOSE),

    NMF(name='NMF (D)', seed=SEED),
    NMF(name='NMF (M)', k=NUM_FACTORS, max_iter=200, learning_rate=LEARNING_RATE, lambda_reg=0.1, use_bias=True,
        seed=SEED, verbose=VERBOSE),

    MMMF(name='MMMF (D)', seed=SEED),
    MMMF(name='MMMF (M)', learning_rate=0.01, seed=SEED, verbose=VERBOSE),

    BPR(name='BPR (D)', seed=SEED),
    BPR(name='BPR (M)', learning_rate=0.01, seed=SEED, verbose=VERBOSE),

    IBPR(name='IBPR (D)'),
    IBPR(name='IBPR (M)',k=NUM_FACTORS, learning_rate=LEARNING_RATE, lamda=0.01, verbose=VERBOSE),

    ItemKNN(name="ItemKNN (D)", seed=SEED),
    ItemKNN(name="ItemKNN (M)", k=K, similarity="pearson", mean_centered=True, seed=SEED, verbose=VERBOSE),

    MLP(name='MLP (D)', seed=SEED),
    MLP(name='MLP (M)', lr=0.01, batch_size=BATCH_SIZE, num_neg=NEG_SAMPLES, seed=SEED, verbose=VERBOSE),

    NeuMF(name='NeuMF/NCF (D)', seed=SEED),
    NeuMF(name='NeuMF/NCF (M)', num_factors=K, layers=MLP_LAYERS, act_fn=ACTIVATION, num_neg=NEG_SAMPLES,
          batch_size=BATCH_SIZE, lr=0.01, seed=SEED, verbose=VERBOSE),

    HFT(name='HFT (D)', seed=SEED),
    HFT(name='HFT (M)', k=NUM_FACTORS, max_iter=40, grad_iter=5, lambda_text=0.01, vocab_size=VOCAB_SIZE,
        seed=SEED, verbose=VERBOSE),

    CTR(name='CTR (D)', seed=SEED),
    CTR(name='CTR (M)', k=NUM_FACTORS, max_iter=50, seed=SEED, verbose=VERBOSE),

    VAECF(name='VAECF (D)', seed=SEED),
    VAECF(name='VAECF (M)', k=NUM_FACTORS, seed=SEED, verbose=VERBOSE, use_gpu=True),

    BiVAECF(name='BiVAECF (D)', seed=SEED),
    BiVAECF(name='BiVAECF (M)', encoder_structure=AE_LAYERS, likelihood=LIKELIHOOD, seed=SEED, verbose=VERBOSE),
]

# define metrics to evaluate the models
# Rating-prediction errors (MAE, RMSE) plus ranking metrics; the @k metrics are cut off at K
metrics = [MAE(), RMSE(), Precision(k=K), Recall(k=K), NDCG(k=K), FMeasure(k=K), AUC(), MAP(), MRR()]

In [9]:
# Bundle the split, every model and every metric into a single Cornac experiment and run it
experiment = cornac.Experiment(
    eval_method=rsXW,
    models=models,
    metrics=metrics,
    user_based=True,
)
experiment.run()

  0%|          | 0/1004 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Learning completed!


  0%|          | 0/100 [00:00<?, ?it/s]

Learning completed!

TEST:
...
              |    MAE |   RMSE |    AUC |  F1@10 |    MAP |    MRR | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
------------- + ------ + ------ + ------ + ------ + ------ + ------ + ------- + ------------ + --------- + --------- + --------
PMF (D)       | 0.3721 | 0.4205 | 0.9110 | 0.0580 | 0.1028 | 0.1104 |  0.1298 |       0.0330 |    0.2718 |    2.4444 |   2.9662
PMF (M)       | 0.4100 | 0.4722 | 0.9179 | 0.0550 | 0.1132 | 0.1244 |  0.1340 |       0.0314 |    0.2524 |    4.3686 |   2.9784
NMF (D)       | 0.3958 | 0.4480 | 0.8619 | 0.0244 | 0.0543 | 0.0614 |  0.0538 |       0.0140 |    0.1105 |    0.4911 |   3.5894
NMF (M)       | 0.3541 | 0.4025 | 0.8953 | 0.0508 | 0.0903 | 0.0992 |  0.1109 |       0.0290 |    0.2348 |    3.4113 |   3.5756
MMMF (D)      | 2.8310 | 2.8665 | 0.8225 | 0.0189 | 0.0300 | 0.0339 |  0.0340 |       0.0108 |    0.0883 |    0.1575 |   3.6599
MMMF (M)      | 2.8310 | 2.8665 | 0.8796 | 0.0311 | 0.0548 | 0.0593 |  0.

In [10]:
# Alternative: to get a separate log for one algorithm, run an experiment
# restricted to that algorithm's models (shown here for NMF)
nmf_default = NMF(name='NMF (D)')
nmf_tuned = NMF(
    name='NMF (M)',
    k=NUM_FACTORS,
    max_iter=200,
    learning_rate=LEARNING_RATE,
    lambda_reg=0.1,
    use_bias=True,
    seed=SEED,
    verbose=VERBOSE,
)
# `models` is rebound to just these two entries
models = [nmf_default, nmf_tuned]
cornac.Experiment(eval_method=rsXW, models=models, metrics=metrics, user_based=True).run()


TEST:
...
        |    MAE |   RMSE |    AUC |  F1@10 |    MAP |    MRR | NDCG@10 | Precision@10 | Recall@10 | Train (s) | Test (s)
------- + ------ + ------ + ------ + ------ + ------ + ------ + ------- + ------------ + --------- + --------- + --------
NMF (D) | 0.3905 | 0.4419 | 0.8665 | 0.0273 | 0.0583 | 0.0649 |  0.0606 |       0.0156 |    0.1243 |    0.5393 |   4.0109
NMF (M) | 0.3541 | 0.4025 | 0.8953 | 0.0508 | 0.0903 | 0.0992 |  0.1109 |       0.0290 |    0.2348 |    3.4504 |   3.6459

